In [1]:
import hashlib
import logging
import pickle
import requests
import json
import numpy as np
from openai import AzureOpenAI
import tiktoken
import requests

In [2]:
import os

# Query-string payloads; both carry the same PROPERTY_PARAM environment value.
property_param = {"id": os.getenv("PROPERTY_PARAM")}
whitelist_param = {"property_id": property_param["id"]}

# Embedding-model and Azure OpenAI settings. Each name on the left receives the
# environment variable at the same position on the right; unset variables
# come back as None.
model_name, api_version, endpoint, subscription_key = (
    os.getenv(env_name)
    for env_name in (
        "EMBEDDING_MODEL_NAME",
        "EMBEDDING_MODEL_API_VERSION",
        "AZ_OPEN_AI_ENDPOINT",
        "AZ_OPEN_AI_KEY",
    )
)
# Name of the tiktoken encoding used for token counting.
encoding = "cl100k_base"

print(whitelist_param)

def num_tokens_from_string(string: str, encoding_name: str = encoding) -> int:
    """Return how many tokens `string` encodes to under a tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to `tiktoken.get_encoding`. The default is
            the value the module-level `encoding` held when this function
            was defined.

    Returns:
        The length of the token sequence produced by the encoder.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    token_ids = tokenizer.encode(string)
    return len(token_ids)

def generate_embeddings(text, model):
    """Request embeddings for `text` from Azure OpenAI.

    A new `AzureOpenAI` client is built on every call from the module-level
    `api_version`, `endpoint` and `subscription_key` settings.

    Args:
        text: Value forwarded as the `input` of the embeddings request.
        model: Value forwarded as the `model` of the embeddings request.

    Returns:
        The `data` attribute of the embeddings response.
    """
    connection_settings = {
        "api_version": api_version,
        "azure_endpoint": endpoint,
        "api_key": subscription_key,
    }
    azure_client = AzureOpenAI(**connection_settings)
    return azure_client.embeddings.create(input=text, model=model).data

{'property_id': 'dbca0987-6794-473a-8053-158c817e5540'}


In [3]:
def _collect_whitelist_names(whitelist_data, field):
    """Return the lower-cased, non-blank string values of `field`."""
    names = []
    for entry in whitelist_data:
        value = entry[field]
        # Skip None / non-string values instead of failing on `.strip()`,
        # and skip names that are empty or whitespace-only.
        if isinstance(value, str) and value.strip() != '':
            names.append(value.lower())
    return names


def _embed_whitelist_names(names, label):
    """Print the token count for `names` and return one (1, dim) array per name."""
    print(f"Number of {label} tokens : {num_tokens_from_string(str(names), encoding)}")
    if not names:
        # Nothing to embed, so skip the remote call entirely.
        return []
    return [
        np.array(item.embedding).reshape(1, -1)
        for item in generate_embeddings(names, model_name)
    ]


def get_embedded_whitelist(whitelist_data):
    """Embed the store and company names found in the whitelist.

    Args:
        whitelist_data: Iterable of mappings that each provide the keys
            "store_name" and "company_name". It is traversed once per key.

    Returns:
        A dict with the keys "merchant_name" (built from "store_name") and
        "company_name". Each value is a list holding one NumPy array of shape
        (1, embedding_length) per non-blank name, in the order returned by
        `generate_embeddings`.

    Raises:
        KeyError: If an entry lacks "store_name" or "company_name".

    Side effects:
        Prints the token count of each name list (measured on the `str()` of
        the list) before that list is embedded.
    """
    # Build both name lists before any remote call so malformed entries
    # surface before embeddings are requested.
    merchant_names = _collect_whitelist_names(whitelist_data, "store_name")
    company_names = _collect_whitelist_names(whitelist_data, "company_name")

    return {
        "merchant_name": _embed_whitelist_names(merchant_names, "merchant"),
        "company_name": _embed_whitelist_names(company_names, "company"),
    }

In [6]:
from collections import Counter

# Fetch the store whitelist. The timeout keeps this cell from hanging
# indefinitely when the service does not answer.
whitelist_data = requests.get(
    "https://uat-obk.tccproptech.com/ob-parking/config/store/whitelist",
    params=whitelist_param,
    timeout=30,
)
# Stop here on a 4xx/5xx status instead of handing an error payload to .json().
whitelist_data.raise_for_status()

# Keep the embeddings so later cells can use them; the call's result was
# previously discarded.
embedded_whitelist = get_embedded_whitelist(whitelist_data.json())

def count_total_entries(data):
    """Count the entries that carry a store name and a company name.

    A string counts only when it contains at least one non-whitespace
    character, matching the `.strip() != ''` filter that
    `get_embedded_whitelist` applies. Non-string values count when truthy.

    Args:
        data: Iterable of mappings that each provide the keys 'store_name'
            and 'company_name'. It is traversed once per key.

    Returns:
        A `(total_stores, total_companies)` tuple of ints.

    Raises:
        KeyError: If an entry lacks 'store_name' or 'company_name'.
    """
    def has_name(value):
        # Whitespace-only strings are treated as missing names.
        if isinstance(value, str):
            return value.strip() != ''
        return bool(value)

    total_stores = sum(1 for item in data if has_name(item['store_name']))
    total_companies = sum(1 for item in data if has_name(item['company_name']))

    return total_stores, total_companies
# Tally the named stores and companies in the fetched whitelist and report both.
total_stores, total_companies = count_total_entries(whitelist_data.json())
print(
    f"\nTotal stores with names: {total_stores}",
    f"Total companies with names: {total_companies}",
    sep="\n",
)

Number of merchant tokens : 3998
Number of company tokens : 3831

Total stores with names: 280
Total companies with names: 280


In [5]:
import pandas as pd

# Load the whitelist payload into a DataFrame for a quick size check.
df = pd.DataFrame(whitelist_data.json())
# Each figure below is a row count: a column's length includes blank names.
print(f"Store number {df['store_name'].size}")
print(f"Company number {df['company_name'].size}")
print(df.shape[0])

Store number 280
Company number 280
280
