In [1]:
import os
from openai import OpenAI

import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm
import numpy as np
from sklearn.decomposition import PCA

from sklearn.metrics.pairwise import cosine_similarity

# Load OPENAI_API_KEY from .env file
load_dotenv()

# Module-level client used by get_embedding. It is constructed without
# arguments, so it presumably picks the API key up from the environment
# populated by load_dotenv() above -- confirm against the OpenAI SDK docs.
client = OpenAI()

## Generate Embeddings

In [2]:
def get_embedding(row, model="text-embedding-3-small"):
    """Embed one record pair as a single vector.

    The left and right titles of ``row`` are joined with " - " and sent to
    the embeddings endpoint of the module-level ``client`` as a one-element
    batch.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            ``title_left`` and ``title_right`` entries.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response's
        ``data`` list.
    """
    left_title, right_title = row['title_left'], row['title_right']
    pair_text = f"{left_title} - {right_title}"

    response = client.embeddings.create(input=[pair_text], model=model)
    return response.data[0].embedding


In [3]:
# Raw training files to embed: (dataset_name, dataset_path) pairs expanded
# into the dict records consumed by the processing loop.
datasets = [
    dict(dataset_name=name, dataset_path=path)
    for name, path in (
        ("abt-buy-train", "../../data/abt-buy/abt-buy-train.json"),
        ("amazon-google-train", "../../data/amazon-google/amazon-google-train.json"),
        ("dblp-acm", "../../data/dblp-acm/dblp-acm-train.json.gz"),
        ("dblp-scholar", "../../data/dblp-scholar/dblp-scholar-train.json.gz"),
        ("walmart-amazon", "../../data/walmart-amazon/walmart-amazon-train.json.gz"),
    )
]

In [8]:
# Register DataFrame.progress_apply once; the registration does not depend on
# the dataset, so it does not need to be repeated per iteration.
tqdm.pandas(desc="Processing Embeddings")

for dataset in datasets:
    dataset_name = dataset["dataset_name"]
    dataset_path = dataset["dataset_path"]

    # Output name is built exactly as before so existing files stay
    # addressable. Only ".json" is stripped, so a ".json.gz" input such as
    # "x-train.json.gz" yields "x-train.gz_embeddings.pkl".
    output_path = f"{dataset_path.replace('.json', '')}_embeddings.pkl"

    # Every row costs one embeddings request (see get_embedding), so a rerun
    # after an interruption must not redo datasets that were already saved.
    # Delete the pickle to force a recomputation.
    if os.path.exists(output_path):
        print(f"Skipping {dataset_name}: {output_path} already exists")
        continue

    print(f"Processing {dataset_name}")
    # Gzipped inputs are read as JSON lines; plain ".json" inputs are read
    # with pandas' default JSON layout.
    if dataset_path.endswith(".json.gz"):
        train_df = pd.read_json(dataset_path, lines=True, compression='gzip')
    else:
        train_df = pd.read_json(dataset_path)

    # Row-wise apply with a progress bar: one embedding per record pair.
    train_df['embedding'] = train_df.progress_apply(get_embedding, axis=1)

    # Save the dataframe with embeddings
    train_df.to_pickle(output_path)

Processing abt-buy-train


Processing Embeddings:   0%|          | 0/7659 [00:00<?, ?it/s]

Processing Embeddings: 100%|██████████| 7659/7659 [36:34<00:00,  3.49it/s]  


Processing amazon-google-train


Processing Embeddings:  60%|██████    | 5535/9167 [27:18<14:24,  4.20it/s]  

## Reduce dimensions

In [23]:
# Pickled frames with embeddings for the "gs" files: (dataset_name,
# dataset_path) pairs expanded into dict records. Rebinds `datasets`.
datasets = [
    dict(dataset_name=name, dataset_path=path)
    for name, path in (
        ("wdc-fullsize", "../../data/wdc/wdcproducts80cc20rnd050un_test_gs_embeddings.pkl"),
        ("abt-buy-full", "../../data/abt-buy/abt-buy-gs_embeddings.pkl"),
        ("amazon-google-full", "../../data/amazon-google/amazon-google-gs_embeddings.pkl"),
        ("dblp-acm", "../../data/dblp-acm/dblp-acm-gs_embeddings.pkl"),
        ("dblp-scholar", "../../data/dblp-scholar/dblp-scholar-gs_embeddings.pkl"),
        ("walmart-amazon", "../../data/walmart-amazon/walmart-amazon-gs_embeddings.pkl"),
    )
]

In [2]:
# Pickled frames with embeddings for the training files: (dataset_name,
# dataset_path) pairs expanded into dict records. Rebinds `datasets`.
# NOTE(review): two names say "full" although their paths point at *-train
# files, and only the walmart-amazon path carries the ".gz" infix -- confirm
# both against the files on disk.
datasets = [
    dict(dataset_name=name, dataset_path=path)
    for name, path in (
        # Disabled entry, kept for reference:
        # ("wdc-fullsize", "../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_small_with_embeddings.pkl.gz"),
        ("abt-buy-full", "../../data/abt-buy/abt-buy-train_embeddings.pkl"),
        ("amazon-google-full", "../../data/amazon-google/amazon-google-train_embeddings.pkl"),
        ("dblp-acm", "../../data/dblp-acm/dblp-acm-train_embeddings.pkl"),
        ("dblp-scholar", "../../data/dblp-scholar/dblp-scholar-train_embeddings.pkl"),
        ("walmart-amazon", "../../data/walmart-amazon/walmart-amazon-train.gz_embeddings.pkl"),
    )
]
    

In [7]:
# Spot check of a stored embedding: load the pickled frame at `file_path` and
# display the length of the vector stored at row position 500.
file_path = "../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_valid_small_embeddings.pkl"
# Binds the notebook-wide name `df`, which other cells also assign and read.
df = pd.read_pickle(file_path)
# Last expression of the cell, so its value is what the cell displays.
len(df["embedding"].iloc[500])

1536

In [6]:
# Single gzip-suffixed pickle to run the reduction loop on. Rebinds `datasets`.
datasets = [
    dict(
        dataset_name="wdc-large",
        dataset_path="../../data/wdc/filtered/large/filtered_large_embeddings.pkl.gz",
    ),
]

In [7]:
# Example function to normalize vectors using L2 norm
def normalize_l2(x):
    """Scale vectors to unit L2 norm, leaving all-zero vectors unchanged.

    Args:
        x: Array-like. A 1-D input is treated as a single vector; any other
            input is normalized along axis 1 (one vector per row for 2-D
            input).

    Returns:
        numpy.ndarray with the same shape as ``x``. Vectors whose norm is 0
        are returned with their original (all-zero) values instead of NaN.
    """
    x = np.array(x)
    if x.ndim == 1:
        norm = np.linalg.norm(x)
        if norm == 0:
            return x
        return x / norm

    norm = np.linalg.norm(x, axis=1, keepdims=True)
    # Divide zero-norm rows by 1 instead of 0. Selecting the result afterwards
    # with np.where would still evaluate 0/0 for those rows, which emits a
    # RuntimeWarning (or raises under a strict np.errstate). `norm` is a fresh
    # array returned by np.linalg.norm, so patching it in place is safe.
    norm[norm == 0] = 1
    return x / norm

# Reduce dimensionality using PCA
def reduce_embeddings_dimensionality(df, target_dim=256, random_state=None):
    """Project the ``embedding`` column of ``df`` onto ``target_dim`` PCA axes.

    The embeddings are stacked into one matrix, L2-normalized row by row with
    ``normalize_l2``, and passed through ``PCA.fit_transform``. The PCA is fit
    on the rows of ``df`` itself.

    Args:
        df: DataFrame with an ``embedding`` column holding one equal-length
            vector per row.
        target_dim: Number of PCA components to keep.
        random_state: Forwarded to ``PCA`` so repeated runs can be made
            reproducible. The default ``None`` keeps the previous behaviour.

    Returns:
        A new DataFrame equal to ``df`` except that ``embedding`` holds the
        reduced vectors (one 1-D numpy array per row). ``df`` itself is left
        untouched, so the caller keeps access to the full-size embeddings.
    """
    # Extract embeddings from DataFrame
    embeddings = np.array(df["embedding"].tolist())

    # Normalize the embeddings
    normalized_embeddings = normalize_l2(embeddings)

    # Apply PCA to reduce dimensionality
    pca = PCA(n_components=target_dim, random_state=random_state)
    reduced_embeddings = pca.fit_transform(normalized_embeddings)

    # assign() works on a copy of the frame, so the input is not overwritten
    # as a side effect of calling this function.
    return df.assign(embedding=list(reduced_embeddings))

# Dimensionality every embedding file is reduced to.
target_dim = 256

for dataset in datasets:
    dataset_name = dataset["dataset_name"]
    dataset_path = dataset["dataset_path"]

    print(f"Processing {dataset_name}")
    # Let pandas infer the compression from the file extension, so both the
    # ".pkl.gz" and the plain ".pkl" paths used in the `datasets` lists load.
    df = pd.read_pickle(dataset_path)

    # The result is written back to the path it was read from. Without this
    # guard, rerunning the cell would normalize and project vectors that were
    # already reduced.
    current_dim = len(df["embedding"].iloc[0])
    if current_dim <= target_dim:
        print(f"Skipping {dataset_name}: embeddings already have {current_dim} dimensions")
        continue

    # Reduce dimensionality of embeddings in the DataFrame
    df_reduced = reduce_embeddings_dimensionality(df, target_dim=target_dim)
    # Check the dimensionality of the first reduced embedding
    reduced_dim = len(df_reduced["embedding"].iloc[0])
    print(f"Reduced dimensionality: {reduced_dim}")
    # Overwrites the input file; compression is again inferred from the name.
    df_reduced.to_pickle(dataset_path)

Processing wdc-large
Reduced dimensionality: 256


## Find examples based on embeddings

In [2]:
# Nearest-neighbour lookup: one vectorized cosine-similarity call against all rows
def find_most_similar_examples(test_embedding, train_df, top_n=6):
    """Return the ``top_n`` rows of ``train_df`` most similar to a query vector.

    Args:
        test_embedding: Query vector (any 1-D array-like).
        train_df: DataFrame with an ``embedding`` column of vectors that have
            the same length as the query.
        top_n: Maximum number of rows to return.

    Returns:
        List of row dicts (``orient='records'``), ordered from highest to
        lowest cosine similarity to ``test_embedding``.
    """
    # Stack the stored vectors into an (n_rows, dim) matrix; the query becomes
    # a single-row matrix so both fit the pairwise API.
    candidate_matrix = np.array(train_df['embedding'].tolist())
    query_matrix = np.array(test_embedding).reshape(1, -1)

    # Row 0 of the result holds the query's score against every candidate.
    scores = cosine_similarity(query_matrix, candidate_matrix)[0]

    # Ascending argsort reversed gives best-first positions; keep the first top_n.
    ranked_positions = np.argsort(scores)[::-1]
    best_rows = train_df.iloc[ranked_positions[:top_n]]
    return best_rows.to_dict(orient='records')

In [8]:
# Demo: print the six rows of `df` closest to the row at position 5. `df` must
# already hold the title_left, title_right, explanation and embedding columns.
examples = find_most_similar_examples(df["embedding"].iloc[5], df, top_n=6)
for example in examples:
    # Same three label/value lines per hit, followed by a separator.
    for label, key in (
        ("Entity 1: ", "title_left"),
        ("Entity 2: ", "title_right"),
        ("Explanation: ", "explanation"),
    ):
        print(label, example[key])
    print("\n ---------------- \n")

Entity 1:  clickfree c2 500gb usb 3.0 portable external hard drive
Entity 2:  iomega skin 500 gb usb 2.0 portable external hard drive 35106 black
Explanation:  No.  
attribute=brand|||importance=0.05|||values=Swiss Military Hanowa###Swiss Military Hanowa|||similarity=1.00  
attribute=model|||importance=-0.95|||values=FLAGSHIP 06-5161.2.04.007###Flagship 06-5161.2.04.003|||similarity=0.20  
attribute=type|||importance=0.00|||values=Herrenuhr###missing|||similarity=0.00  
attribute=product code|||importance=0.90|||values=06-5161.2.04.007###06-5161.2.04.003|||similarity=0.00  

 ---------------- 

Entity 1:  clickfree c2 500gb usb 3.0 portable external hard drive
Entity 2:  iomega ego helium 320 gb usb 2.0 portable external hard drive 34943
Explanation:  No.  
attribute=brand|||importance=0.05|||values=Epson###Ubiquiti Networks|||similarity=0.00  
attribute=model|||importance=0.95|||values=A3 Premium Glossy Photo Paper 255gsm###UVC-G3-LED|||similarity=0.00  
attribute=product type|||impor

In [3]:
# Load the pickled frame whose displayed columns include `embedding` and
# `explanation`, then show it. Binds the notebook-wide name `df`, which the
# nearest-neighbour demo cell reads.
df = pd.read_pickle("../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_small_with_embeddings_with_explanations.pkl")
# Last expression of the cell, so the frame is rendered as the cell output.
df

Unnamed: 0,id_left,title_left,category_left,brand_left,modelno_left,price_left,cluster_id_left,id_right,title_right,category_right,brand_right,modelno_right,price_right,cluster_id_right,label,pair_id,embedding,explanation
0,walmart_1853,sony 16gb class 4 sd memory card,usb drives,sony,sf16n4/tqp,0.00,847,amazon_2139,pny 4gb class 4 navy sd card,car audio video,pny,p-sdhc4g4-ef / navy,11.18,847,0,walmart_1853#amazon_2139,"[-0.15397552868492334, -0.07523027893977886, -...",Yes. \nattribute=brand|||importance=0.05|||va...
1,walmart_621,zotac geforce gt430 1gb ddr3 pci-express 2.0 g...,electronics - general,zotac,zt-40604-10l,88.88,847,amazon_3167,evga geforce gts450 superclocked 1 gb gddr5 pc...,graphics cards,evga,01g-p3-1452-tr,119.88,847,0,walmart_621#amazon_3167,"[-0.06757154576519979, -0.1088042998052574, -0...",No. \nattribute=brand|||importance=0.05|||val...
2,walmart_2115,da-lite high power model b manual screen with ...,electronics - general,da-lite,85303,372.99,847,amazon_8789,da-lite advantage manual with csr - projection...,home audio theater,da-lite,,904.95,415,0,walmart_2115#amazon_8789,"[0.5307267315176386, 0.011071906526856775, -0....",No. \nattribute=brand|||importance=0.05|||valu...
3,walmart_278,da-lite hc cinema vision tensioned advantage e...,electronics - general,da-lite,89939,2595.00,847,amazon_9064,hc da-mat tensioned advantage electrol - av fo...,projection screens,da-lite,,,847,0,walmart_278#amazon_9064,"[0.447079679745628, 0.08560868191549544, -0.06...",No. \nattribute=brand|||importance=0.05|||val...
4,walmart_1673,verbatim 4gb tuff - n - tiny usb 2.0 flash dri...,usb drives,verbatim,,11.98,847,amazon_13279,verbatim clip-it 4 gb usb 2.0 flash drive 9755...,usb flash drives,verbatim,97556,10.98,847,0,walmart_1673#amazon_13279,"[-0.28616665560010707, -0.25351590531632756, -...",No. \nattribute=brand|||importance=0.05|||val...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2044,walmart_1539,lexar platinum ii 16gb sd sdhc memory card,usb drives,lexar,lsd16gbsbna100,32.98,446,amazon_2841,lexar lcf4gbbsbna200 platinum ii 4 gb 200x com...,blank media,lexar,lcf4gbbsbna200,24.06,847,0,walmart_1539#amazon_2841,"[-0.176583721569523, -0.05735348067401766, -0....",No. \nattribute=brand|||importance=0.05|||val...
2045,walmart_612,da-lite video spectra 1.5 advantage manual wit...,electronics - general,da-lite,34716,888.99,847,amazon_14704,da-lite advantage manual with csr - projection...,home audio theater,da-lite,,789.95,847,0,walmart_612#amazon_14704,"[0.4985105756635347, -0.0318777806160994, -0.0...",No. \nattribute=brand|||importance=0.05|||val...
2046,walmart_1257,da-lite da-plex unframed rear projection scree...,electronics - general,da-lite,27503,3561.99,847,amazon_18153,da-lite 27651 da-glas deluxe rear projection s...,projection screens,da-lite,,,571,0,walmart_1257#amazon_18153,"[0.6749821864833128, -0.044683228025034785, -0...",No. \nattribute=brand|||importance=0.05|||val...
2047,walmart_2488,rca cassette adapter,mp3 accessories,rca,ah760r,12.88,472,amazon_11658,rca vca115 vhs-c cassette adapter,blank media,rca,vca115,,847,0,walmart_2488#amazon_11658,"[0.0263399903553608, -0.0059841189988476375, -...",No. \nattribute=brand|||importance=0.05|||val...
