In [1]:
import os
from openai import OpenAI

import pandas as pd
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import json
import helper
import utils
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# Load OPENAI_API_KEY from .env file
load_dotenv()

client = OpenAI()

## Get Scores

In [2]:
def insert_product_descriptions(prompt_template: str, product1: str, product2: str):
    # Replace placeholder texts with actual product descriptions
    prompt = prompt_template.replace("'Entity 1'", product1).replace("'Entity 2'", product2)
    return prompt

In [3]:
def create_prompt(prompt, custom_id, product_1=None, product_2=None):
    if product_1 is not None and product_2 is not None:
        prompt = insert_product_descriptions(prompt, product_1, product_2)
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim::A0Fjwxah",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 5,
            "temperature": 0
        }
    }

In [4]:
full_datasets = [
    {"dataset_name": "wdc-fullsize", "dataset_path": "../../data/wdc/wdcproducts80cc20rnd050un_test_gs.pkl"},
    {"dataset_name": "abt-buy-full", "dataset_path": "../../data/abt-buy/abt-buy-gs.pkl"}, 
    {"dataset_name": "amazon-google-full", "dataset_path": "../../data/amazon-google/amazon-google-gs.pkl"},
    #{"dataset_name": "dblp-acm", "dataset_path": "../../data/dblp-acm/dblp-acm-gs.pkl"},
    #{"dataset_name": "dblp-scholar", "dataset_path": "../../data/dblp-scholar/dblp-scholar-gs.pkl"},
    #{"dataset_name": "walmart-amazon", "dataset_path": "../../data/walmart-amazon/walmart-amazon-gs.pkl"}
    
]

In [3]:
sampeled_datasets = [
    {"dataset_name": "sampled_wdc", "dataset_path": "../../ralph/data_finetuning/wdc/wdcproducts80cc20rnd000un_test_sampled.jsonl"},
    {"dataset_name": "sampled_abt-buy", "dataset_path": "../../ralph/data_finetuning/abt-buy/abt-buy-test-sampled.jsonl"},
    {"dataset_name": "sampled_amazon-google", "dataset_path": "../../ralph/data_finetuning/amazon-google/amazon-google-test-sampled.jsonl"},
    {"dataset_name": "sampled_dblp-acm", "dataset_path": "../../ralph/data_finetuning/dblp-acm/dblp-acm-test-sampled.jsonl"},
    {"dataset_name": "sampled_dblp-scholar", "dataset_path": "../../ralph/data_finetuning/dblp-scholar/dblp-scholar-test-sampled.jsonl"},
    {"dataset_name": "sampled_walmart-amazon", "dataset_path": "../../ralph/data_finetuning/walmart-amazon/walmart-amazon-test-sampled.jsonl"}
]

In [4]:
batch_job = []

# Load the dataset
df = pd.read_pickle("../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_large.pkl.gz", compression='gzip')

# Load all prompts we want to test
with open('../../prompts/test_prompt.json', 'r') as file:
    prompts = json.load(file)

result_rows = []

for task in prompts:
    title = task['title']
    prompt_template = task['prompt']

    for index, row in df.iterrows():
        product1, product2, label = row['title_left'], row['title_right'], row.get('label') 
        
        custom_id = f"{title};{row['pair_id']};{label};{index}"
        try:
            prompt = create_prompt(prompt_template, custom_id, product1, product2)
            batch_job.append(prompt)
        except:
            print(f"Error: {custom_id}")
            continue

In [9]:
batch_job = []

for dataset in full_datasets:
    # Load the dataset
    df = pd.read_pickle(dataset["dataset_path"])

    # Load all prompts we want to test
    with open('../../prompts/domain_promts.json', 'r') as file:
        prompts = json.load(file)

    result_rows = []

    for task in prompts:
        title = task['title']
        prompt_template = task['prompt']

        for index, row in df.iterrows():
            product1, product2, label = row['title_left'], row['title_right'], row.get('label') 
            
            custom_id = f"{dataset['dataset_name']};{title};{row['pair_id']};{label}"
            prompt = create_prompt(prompt_template, custom_id, product1, product2)
            batch_job.append(prompt)
            
len(batch_job)

41056

In [8]:
full_datasets = [
    #{"dataset_name": "wdc-fullsize", "dataset_path": "../../data/wdc/wdcproducts80cc20rnd050un_test_gs.pkl"},
    #{"dataset_name": "abt-buy-full", "dataset_path": "../../data/abt-buy/abt-buy-gs.pkl"}, 
    #{"dataset_name": "amazon-google-full", "dataset_path": "../../data/amazon-google/amazon-google-gs.pkl"},
    {"dataset_name": "dblp-acm", "dataset_path": "../../data/dblp-acm/dblp-acm-gs.pkl"},
    {"dataset_name": "dblp-scholar", "dataset_path": "../../data/dblp-scholar/dblp-scholar-gs.pkl"},
    {"dataset_name": "walmart-amazon", "dataset_path": "../../data/walmart-amazon/walmart-amazon-gs.pkl"}
    
]

In [10]:
batch_file_path = "dblp_walmart_filter.jsonl"
with open(batch_file_path, "w") as f:
    for request in batch_job:
        f.write(json.dumps(request) + "\n")

In [11]:
batch_input_file = client.files.create(
    file=open(batch_file_path, "rb"),
    purpose="batch"
)

batch_input_file_id = batch_input_file.id

batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Test synthetic examples"}
)

# delete the batch input file
os.remove(batch_file_path)



In [9]:
print(f"Batch job created with id: {batch.id}")

Batch job created with id: batch_ECDnc1nkc0yQvrwExqFQERpH


In [3]:
# Dictionary to hold the DataFrames
dataframes = {}

# Load each dataset into a DataFrame and store in the dictionary
for dataset in full_datasets:
    dataset_name = dataset["dataset_name"]
    dataset_path = dataset["dataset_path"]
    try:
        df = pd.read_pickle(dataset_path)
        dataframes[dataset_name] = df
        print(f"Loaded {dataset_name} successfully.")
    except Exception as e:
        print(f"Failed to load {dataset_name} from {dataset_path}. Error: {e}")
        
# Function to lookup label in the original dataframes using pair_id
def lookup_label(row):
    dataset_name = row['dataset']
    pair_id = row['pair_id']
    if dataset_name in dataframes:
        original_df = dataframes[dataset_name]
        # Assuming pair_id is a unique identifier in the original dataframe
        if pair_id in original_df['pair_id'].values:
            return original_df.loc[original_df['pair_id'] == pair_id, 'label'].values[0]
    return None

Loaded wdc-fullsize successfully.
Loaded abt-buy-full successfully.
Loaded amazon-google-full successfully.
Loaded dblp-acm successfully.
Loaded dblp-scholar successfully.
Loaded walmart-amazon successfully.


In [3]:
def parse_response(response):
    body = response.get("body", {})
    usage = body.get("usage", {})
    choices = body.get("choices", [{}])
    message = choices[0].get("message", {}) if choices else {}

    return pd.Series({
        "status_code": response.get("status_code"),
        "request_id": response.get("request_id"),
        "completion_id": body.get("id"),
        "created": body.get("created"),
        "model": body.get("model"),
        "content": message.get("content"),
        "prompt_tokens": usage.get("prompt_tokens"),
        "completion_tokens": usage.get("completion_tokens"),
        "total_tokens": usage.get("total_tokens"),
    })

In [12]:
df_all_synthetics = pd.read_pickle("../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_small.pkl.gz", compression='gzip')

In [14]:
# only keep rows in df_all_synthetics where the index is in df_results_filtered's index
df_all_synthetics = df_all_synthetics[df_all_synthetics.index.isin(df_results_filtered.index)]
df_all_synthetics.to_csv("../../data/wdc/filtered/wdc_small_filtered_.csv", index=False)

In [15]:
df_all_synthetics

Unnamed: 0,id_left,brand_left,title_left,description_left,price_left,priceCurrency_left,specTableContent_left,cluster_id_left,id_right,brand_right,title_right,description_right,price_right,priceCurrency_right,specTableContent_right,cluster_id_right,pair_id,label,is_hard_negative,embedding
0,14654897,,HDD 35 4TB Seagate IronWolf Pro NAS ST4000NE001,,154.10,,,1102119,36425270,,"HD 3,5 4TB 7200RPM IRONWOLF PRO 128 MB SATA3 S...",,153.99,EUR,,1102119,14654897#36425270,1,False,"[0.3275445552152033, 0.01101741506399397, 0.09..."
1,31531912,,Buy Quality Replica Omega Seamaster Planet Oce...,Quality AAA Replica Omega Seamaster Planet Oce...,,,,27649829,60397145,,GIGABYTE Radeon RX 5500 XT OC - 4GB GDDR6 RAM ...,"Grafikkort, AMD Radeon RX 5500 XT Overclocked ...",2322.00,NOK,,1857431,31531912#60397145,0,False,"[0.055900165776579844, 0.2703106646872746, 0.1..."
2,44557157,,Ubiquiti UVC-G3-FLEX-3 UniFi Protect G3 FLEX C...,BackDetailsStylish Full HD (1080p) mini turret...,$‎234.95,USD,,266703,90806148,,AAA Replica Omega Seamaster Planet Ocean 600M ...,AAA Replica Omega Seamaster Planet Ocean 600M ...,,,,2193117,44557157#90806148,0,False,"[-0.14620856094649143, 0.2868684206862582, 0.1..."
3,49605449,Brother,Brother HL-L6300DW Business Laser Printer for ...,The Brother HL-L6300DW is the ultimate monochr...,479.98,USD,,408446,36985401,,Epson T6923 Ultrachrome XD rautt Ink 110ml,Nánari lýsing frá framleiðanda:Barcode: 010343...,16371,ISK,,126198,49605449#36985401,0,False,"[-0.24113111321950417, -0.4079377498217689, 0...."
4,3024917,KINGSTON,KINGSTON 64GB USB 3.0 DataTraveler SE9 G2 (Kov...,"Lightweight, stylish USB 3.0 drive. Store, tra...",369.00,czk,,435008,70174967,,Buy Quality Replica Tag Heuer Monaco Steve McQ...,Quality AAA Replica Tag Heuer Monaco Steve McQ...,,,,556904,3024917#70174967,0,False,"[0.020603531223475077, 0.2448852751114778, 0.4..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2494,84514463,,SSD TRANSCEND SSD230S 128Gb 3D NAND SATA 3 Alu...,,188,RON,,715391,66625804,Samsung,Samsung T7 Touch Black 1TB Portable SSD with F...,"MU-PC1T0K/WW, Portable External SSD, Fingerpri...",217.10,GBP,,1458655,84514463#66625804,0,False,"[0.3272156643014264, 0.014920934565058677, 0.2..."
2495,28465006,,Canon EF-S 18-200mm f/3.5-5.6 IS,"The EF-S 18-200mm is a compact, lightweight le...",699.99,USD,,213315,43932865,Canon,Canon EF-S 18-135/3.5-5.6 IS USM,,5818,SEK,,390091,28465006#43932865,0,True,"[-0.19067569662882541, -0.09847493728761648, -..."
2496,48305046,,Lacie 5TB USB-C Mobile Drive prenosni disk,,,,,500837,36040842,,SRAM NX Eagle PG-1230 Cassetta Pignoni 11-50T ...,,89.90003,EUR,,70762,48305046#36040842,0,True,"[0.18154710826015963, 0.04766450938318995, 0.1..."
2497,80529811,Ubiquiti Networks (UBNT),Ubiquiti UniFi G3 Bullet Video Camera,Ubiquiti UniFi G3 Bullet Video Camera (1080p H...,167.42,GBP,,192122,16871572,UBIQUITI,Ubiquiti UVC-G3-BULLET (Formerly UVC-G3-AF) Un...,Ubiquiti UVC-G3-AF UniFi Video Camera G3 1080p...,118.20,GBP,,192122,80529811#16871572,1,False,"[-0.05494073888898931, -0.03672708128033956, -..."


In [20]:
small_df = pd.read_csv("../../data/wdc/wdcproducts80cc20rnd000un_train_small_simple.csv")
# concat df_all_synthetics and small_df
df_all_synthetics = pd.concat([df_all_synthetics, small_df], ignore_index=True)
df_all_synthetics.to_csv("../../data/wdc/synthetic/4o/textual_explanation/syntheic_filtered_with_small.csv", index=False)

In [13]:
df_results["chat_bot_response_clean"] = df_results["content"].apply(utils.clean_response)
# filter out all -1 chat bot responses
df_results = df_results[df_results["chat_bot_response_clean"] != -1]
# change label to int
df_results["label"] = df_results["label"].astype(int)
# filter out all record if  label and chat bot response are not same
df_results_filtered = df_results[df_results["label"] == df_results["chat_bot_response_clean"]]
df_results_filtered


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_results["label"] = df_results["label"].astype(int)


Unnamed: 0,id,response,error,task,pair_id,label,index,status_code,request_id,completion_id,created,model,content,prompt_tokens,completion_tokens,total_tokens,chat_bot_response_clean
0,batch_req_u82u6V81369L1XAqNHW9oit7,"{'status_code': 200, 'request_id': '838628572d...",,general-complex-free,14654897#36425270,1,0,200,838628572d5e5e2a658ecd83de5ddd16,chatcmpl-9ycwONOe78S3xJRikfiLnJmCk5evp,1724236396,gpt-4o-mini-2024-07-18,"Yes, the two entity",72,5,77,1
1,batch_req_BKc66f2pLhGr9uTHzNMJspc6,"{'status_code': 200, 'request_id': '5b29e55b31...",,general-complex-free,31531912#60397145,0,1,200,5b29e55b31b1c54a0027cecb5afa31a8,chatcmpl-9ycuSpBzbd3lKJDpMSMuTv17QpGu0,1724236276,gpt-4o-mini-2024-07-18,"No, the two entity",82,5,87,0
2,batch_req_55r67UzH7EqcafyG19CQDwXe,"{'status_code': 200, 'request_id': '411422b103...",,general-complex-free,44557157#90806148,0,2,200,411422b103e6004b8844b01f26212d43,chatcmpl-9ycufjfQjSxrurFe63Bzqg0d63Opr,1724236289,gpt-4o-mini-2024-07-18,"No, the two entity",80,5,85,0
3,batch_req_MMaU6gav1tAroKJw0A9Jcuht,"{'status_code': 200, 'request_id': '054390fe9b...",,general-complex-free,49605449#36985401,0,3,200,054390fe9b44462d994358d01f7dd6f3,chatcmpl-9ycvEIzmNv44Zygbx4c2eR3dFKOa0,1724236324,gpt-4o-mini-2024-07-18,"No, the two entity",66,5,71,0
4,batch_req_4xSTtMWS1h0seNQYPnTwNQQp,"{'status_code': 200, 'request_id': 'f6b3d94df0...",,general-complex-free,3024917#70174967,0,4,200,f6b3d94df025b784bde775da3227c071,chatcmpl-9ycvdTTISrMZ5FNuicaVCpLgmK2tl,1724236349,gpt-4o-mini-2024-07-18,"No, the two entity",79,5,84,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2494,batch_req_IShJExf49TqdA8fwK77LtoMk,"{'status_code': 200, 'request_id': 'aa1012bedc...",,general-complex-free,84514463#66625804,0,2494,200,aa1012bedcda163e6cad2df53221e83d,chatcmpl-9yczHSqOdmrHsuSVSDfXtruNXzHTi,1724236575,gpt-4o-mini-2024-07-18,"No, the two entity",69,5,74,0
2495,batch_req_00ycRgEkZJdJoHO1YDjTb3jj,"{'status_code': 200, 'request_id': '0196428f4f...",,general-complex-free,28465006#43932865,0,2495,200,0196428f4ffc50f1168ac66104f1a657,chatcmpl-9yczJUMnZZlURb28I3o2ydkqCnGds,1724236577,gpt-4o-mini-2024-07-18,"No, the two entity",66,5,71,0
2496,batch_req_sxwAo4xN3OQiWcrRyxAFsqIi,"{'status_code': 200, 'request_id': '800d343ba0...",,general-complex-free,48305046#36040842,0,2496,200,800d343ba06525bcce8342b2ad76b6ca,chatcmpl-9yczLnfY97ZtRJN5B7UkJPOHxJt4O,1724236579,gpt-4o-mini-2024-07-18,"No, the two entity",67,5,72,0
2497,batch_req_wu8J6GN1Oihs4Fm1GFcJzw3z,"{'status_code': 200, 'request_id': 'f0ba2b50ac...",,general-complex-free,80529811#16871572,1,2497,200,f0ba2b50ac3356504783065e2a3a214b,chatcmpl-9yczReVdHsXxUkkh8UvMB3mKuQJm5,1724236585,gpt-4o-mini-2024-07-18,"Yes, the two entity",76,5,81,1


In [10]:
## Aggregate the results
df_results = pd.read_json("../../data/wdc/filtered/small_filter_raw.jsonl", lines=True) 
#df_2 = pd.read_json("../../results/ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4ominifirst:9oUSMtcj/baseline_all_datasets/batch_8IuN0dE6AUr63DZ2r0i4BiI2_output.jsonl", lines=True)

#df_results = pd.concat([df_results, df_2])
# split the custom_id into dataset, task and index
df_results[['task', 'pair_id', 'label', 'index']] = df_results.custom_id.str.split(";", expand=True)
df_results = df_results.drop(columns=['custom_id'])

# Apply the function to the response column
parsed_df = df_results["response"].apply(parse_response)

# Concatenate the parsed results with the original dataframe
df_results = pd.concat([df_results, parsed_df], axis=1)


# Apply the lookup function to get labels from original dataframes
df_results['label'] = df_results.apply(lookup_label, axis=1)

df_results["chatbot_response_clean"] = df_results["content"].apply(utils.clean_response)


# Dictionary to hold the stats DataFrames
stats_dataframes = []

# Get the unique datasets from df_results
unique_datasets = df_results['dataset'].unique()

# Iterate over each unique dataset
for dataset_name in unique_datasets:
    # Filter the df_results for the current dataset
    filtered_df = df_results[df_results['dataset'] == dataset_name]
    
    # Calculate stats for the filtered DataFrame
    stats_df = helper.calculate_stats(filtered_df)
    stats_df['Dataset'] = dataset_name  # Add dataset name for reference
    stats_dataframes.append(stats_df)

# Concatenate all the stats DataFrames
results = pd.concat(stats_dataframes, ignore_index=True)
results.to_csv("../../results/ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4ominifirst:9oUSMtcj/baseline_all_datasets/results.csv", index=False)

NameError: name 'lookup_label' is not defined

## Generate Explanations

In [4]:
def create_prompt_explanation(product_1, product_2, label, custom_id):
    
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o",
            "messages": [
                {"role": "user", "content": f"""
                Do the two entity descriptions refer to the same real-world entity?
                Entity 1: {product_1}
                Entity 2: {product_2}

                The correct answer is {label}.

                Please provide an explanation for this answer in a structured format, listing the attributes that you compared for reaching this answer. Each attribute should be accompanied by the attribute values and a score between -1 and 1 that shows the importance of the attribute for the decision. If the attribute influenced the decision towards non-match the importance score should be negative. If the attribute pointed towards a match, the importance score should be positive. Also provide a similarity score for the attribute values. If an attribute only occurs in one item, specify the value of that attribute for the other item as "missing". An example output is the following:

                attribute=brand|||importance=0.05|||values=Logitech###Logitech|||similarity=1.00
                attribute=model|||importance=-0.95|||values=MX G500###MX Master 3S|||similarity=0.20
                attribute=color|||importance=0.00|||values=missing###Graphite|||similarity=0.00
                
                Here is a complete example:
                Do the two product descriptions refer to the same real-world product? Entity 1: 'WD 4TB Black My Passport Portable External Hard Drive - USB 3.0 - WDBYFT0040BBK-WESN'. Entity 2: 'Dysk WD My Passport 1TB USB 3.0 black'.
                "No. 
                attribute=brand|||importance=0.05|||values=Western Digital###Western Digital|||similarity=1.00
                attribute=model|||importance=0.95|||values=My Passport###My Passport|||similarity=1.00
                attribute=storage capacity|||importance=0.9|||values=4TB###1TB|||similarity=0.25
                attribute=color|||importance=0.1|||values=Black###Black|||similarity=1.00
                attribute=USB version|||importance=0.05|||values=USB 3.0###USB 3.0|||similarity=1.00
                
                Do not provide a explanation in a different format. The explanation should be in the format described above. Only provide the answer and explanation dont repeat the question.
                """}
            ],
            "max_tokens": 1000,
            "temperature": 0
        }
    }



In [33]:
def create_prompt(product_1, product_2, label, custom_id):
    label = "MATCH" if label == 1 else "NOT A MATCH"
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "user", "content": f"""
                <s>[INST] Given the following two examples, provide an explanation for the third example for why the two entities do or do not match. [\INST]

                Entity A: [NAME] samsung dlp tv stand in black tr72bx [DESCRIPTION] samsung dlp tv stand in black tr72bx designed to fit samsung hlt7288, hlt7288, hl72a650, and hl67a650 television sets tempered 6mm tinted glass shelves wide audio storage shelves to accommodate 4 or more components wire management system easy to assemble high gloss black finish [PRICE] 369.0
                Entity B: [NAME] samsung tr72b tv stand [DESCRIPTION] glass black [PRICE] 232.14
                Label: MATCH
                Explanation: Both entities refer to samsung TV stand in black and therefore have substantially similar specifications, therefore they’re a match. </s>

                Entity A: [NAME] canon high capacity color ink cartridge color ink cl51 [DESCRIPTION] canon high capacity color ink cartridge cl51 compatible with pixma ip6210d, ip6220d, mp150, mp170 and mp450 printers [PRICE] 35.0
                Entity B: [NAME] canon pg-40 twin pack black ink cartridge 0615b013 [DESCRIPTION] black [PRICE]
                Label: NOT A MATCH
                Explanation: Entity A refers to color ink cartridge while Entity B is a black ink cartridge, therefore they are not a match. </s>

                Entity A: [NAME] {product_1.get("name")} [DESCRIPTION] {product_1.get("description")} [PRICE] {product_1.get("price")}
                Entity B: [NAME] {product_2.get("name")} [DESCRIPTION] {product_2.get("description")} [PRICE] {product_2.get("price")}
                Label: {label}
                Explanation:
                """}
            ],
            "max_tokens": 128,
            "temperature": 0,
            "top_p": 0.95,
        }
    }

In [3]:
# Function to extract the entity strings
def extract_entities(text):
    entity_1 = text.split("Entity 1: '")[1].split("'")[0]
    entity_2 = text.split("Entity 2: '")[1].split("'")[0]
    return entity_1, entity_2

In [5]:
# Load your data
small_df = pd.read_csv(f"../../data/wdc/synthetic/4o/textual_example/interesting/filtered_with_small.csv")

# Create the JSONL file with all requests
requests = []
for index, row in tqdm(small_df.iterrows(), total=small_df.shape[0]):
    product_1, product_2 = extract_entities(row["prompt"])
    label = row["completion"]
    custom_id = str(index)
    prompt = create_prompt_explanation(product_1, product_2, label, custom_id=custom_id)
    requests.append(prompt)

batch_file_path = "wdc_synthetic.jsonl"
with open(batch_file_path, "w") as f:
    for request in requests:
        f.write(json.dumps(request) + "\n")


  0%|          | 0/8900 [00:00<?, ?it/s]

In [6]:
batch_input_file = client.files.create(
  file=open(batch_file_path, "rb"),
  purpose="batch"
)

batch_input_file_id = batch_input_file.id

batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "product matching explanations, aws training small"}
)

Batch(id='batch_04lrbOlf2Wnaxxf0fUDGCDuH', completion_window='24h', created_at=1722774630, endpoint='/v1/chat/completions', input_file_id='file-NGAsKRI1lia0fmJdgwiHFrSQ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1722861030, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'product matching explanations'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [36]:
client.batches.retrieve(batch.id)

Batch(id='batch_1B72RGVspqdiDBUo8KoU45F9', completion_window='24h', created_at=1723023471, endpoint='/v1/chat/completions', input_file_id='file-b52Ejhuuud1vgOnx1zGlZ0RP', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1723109871, failed_at=None, finalizing_at=None, in_progress_at=1723023473, metadata={'description': 'product matching explanations, aws training small'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=2500))

In [9]:
generated_explanations = pd.read_json("../../data/wdc/synthetic/4o/textual_example/interesting/batch_8wemll6kjbeL5e2ru78e7hp6_explanations.jsonl", lines=True)
generated_explanations_parsed = generated_explanations["response"].apply(parse_response)    
generated_explanations = pd.concat([generated_explanations, generated_explanations_parsed], axis=1)

# convert the custom_id to an int
generated_explanations["custom_id"] = generated_explanations["custom_id"].astype(int)

dataset_without_explanations = pd.read_csv("../../data/wdc/synthetic/4o/textual_example/interesting/filtered_with_small.csv")

for index, row in dataset_without_explanations.iterrows():
    custom_id = index
    explanation = generated_explanations[generated_explanations["custom_id"] == custom_id]["content"].values[0]
    dataset_without_explanations.at[index, "completion"] = explanation
    
dataset_without_explanations.to_csv("../../data/wdc/synthetic/4o/textual_example/interesting/filtered_with_small_with_explanations.csv", index=False)

In [7]:
generated_explanations

Unnamed: 0,id,custom_id,response,error,status_code,request_id,completion_id,created,model,content,prompt_tokens,completion_tokens,total_tokens
0,batch_req_K9046zHKu76M5T7GeggZ2QBm,0,"{'status_code': 200, 'request_id': '1c0777ecfc...",,200,1c0777ecfc75c90fe1b981d4e582469d,chatcmpl-A0txEDYTxKxYwY4pFGWJxzz94O1jY,1724778452,gpt-4o-2024-05-13,No.\nattribute=brand|||importance=0.05|||value...,534,185,719
1,batch_req_5Llu5mvVHQNAp09b9hoCfzUp,1,"{'status_code': 200, 'request_id': 'c1c4acc0be...",,200,c1c4acc0bebbee07f94f304d2f99c5d7,chatcmpl-A0txEsaXcXRuIh0IigR9qdjFqZtSw,1724778452,gpt-4o-2024-05-13,No.\nattribute=product type|||importance=1.00|...,544,138,682
2,batch_req_b6QrScjn38lUDYnk7DEQFzsM,2,"{'status_code': 200, 'request_id': 'da97650f7f...",,200,da97650f7fdefbe6da58197f0bb76014,chatcmpl-A0txEsCjdTj8u4VU3OkxSKVCr8dPm,1724778452,gpt-4o-2024-05-13,No.\nattribute=brand|||importance=0.05|||value...,542,161,703
3,batch_req_UeShsRTO7XCatXpuH4oMYAM4,3,"{'status_code': 200, 'request_id': 'b4fcf304a1...",,200,b4fcf304a16adab7bc0960a0739eeb63,chatcmpl-A0txEf4gfJ38STz365XlXah5tXM60,1724778452,gpt-4o-2024-05-13,No.\nattribute=brand|||importance=0.8|||values...,528,108,636
4,batch_req_yfsyWl8fmjodEu0bUTpaQq82,4,"{'status_code': 200, 'request_id': 'c9634a09e8...",,200,c9634a09e855328fb1a00bd8730da70b,chatcmpl-A0txEXdPeE0sEgvD0ziqR2Yu2l9bO,1724778452,gpt-4o-2024-05-13,No.\nattribute=brand|||importance=0.05|||value...,541,164,705
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8895,batch_req_t80SIVtTXKrCMizyjZmxVTYn,8895,"{'status_code': 200, 'request_id': '4fddf32d38...",,200,4fddf32d388d61308971cfd8a14a12b3,chatcmpl-A0u8tRHBxMIDW25VH0jz1xOcqzhHp,1724779175,gpt-4o-2024-05-13,No.\nattribute=brand|||importance=0.05|||value...,527,130,657
8896,batch_req_d17okfDoBQ3XrMqyRVqTKiDw,8896,"{'status_code': 200, 'request_id': '8dfbacc3c7...",,200,8dfbacc3c712a93c8859adbdc839c952,chatcmpl-A0u8tiMIux07JNZGgw23T29wANA1i,1724779175,gpt-4o-2024-05-13,No.\nattribute=brand|||importance=0.05|||value...,544,151,695
8897,batch_req_kceUPfyL5GhyILEM8jW8SSwv,8897,"{'status_code': 200, 'request_id': '5cf902e136...",,200,5cf902e1367b7a49a6274a04b4e2d497,chatcmpl-A0u8w3dZM8lNLQJ7DdwllkWbp4swQ,1724779178,gpt-4o-2024-05-13,No.\nattribute=brand|||importance=0.05|||value...,516,126,642
8898,batch_req_dtTooytLvlFIRP3ZdRssMHvE,8898,"{'status_code': 200, 'request_id': '143f891a42...",,200,143f891a421d08c521c62fac04b88a03,chatcmpl-A0u9015drcNBa0Dxnm22bvaNtXGAe,1724779182,gpt-4o-2024-05-13,No.\nattribute=brand|||importance=0.05|||value...,551,167,718


## Generate new examples

In [91]:
def create_prompt(product_1, product_2, label, explanation, custom_id):
    example = {
        "title_left": product_1,
        "title_right": product_2,
        "label": label,
        "explanation": explanation
    }
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "user", "content": f"""
                Please generate 4 similar examples to this one 3 should be non matches 1 should be a match.
                
                {example}
               
                Only return the title_left, title_right and the label in a JSON format
                """}
            ],
            "max_tokens": 300,
            "temperature": 0
        }
    }



In [8]:
# Load your data
small_df = pd.read_pickle(f"../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_small_explanations_40_mini.pkl.gz", compression="gzip")

# Create the JSONL file with all requests
requests = []
for index, row in tqdm(small_df.iterrows(), total=small_df.shape[0]):
    product_1 = row["title_left"]
    product_2 = row["title_right"]
    label = row["label"]
    explanation = row["explanation"]
    custom_id = row["pair_id"]
    prompt = create_prompt(product_1, product_2, label, explanation, custom_id=custom_id)
    requests.append(prompt)

batch_file_path = "batch_input_new_examples_based_train_small.jsonl"
with open(batch_file_path, "w") as f:
    for request in requests:
        f.write(json.dumps(request) + "\n")


  0%|          | 0/2500 [00:00<?, ?it/s]

In [9]:
batch_input_file = client.files.create(
    file=open(batch_file_path, "rb"),
    purpose="batch"
)

batch_input_file_id = batch_input_file.id

batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Generate new examples based on explanations, small training set"}
)

## Synthetic examples

In [4]:
def create_synthetic_examples(product_1, product_2, label, custom_id, examples=None):
    label = "Yes" if label == 1 else "NO"
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o",
            "messages": [
                {"role": "user", "content": f"""
                I'm currently testing large language, models on the task of entity matching. In this context, I am first fine-tuning them, and then testing their weaknesses and strengths. The example I will show you is wrongly classified by the model and that idea is to generate four new examples three of which should be negative, i.e. non-matches, and one of them match. For context, two products are considered to be a match if the two entity descriptions refer to the same real world entity. This does not mean that the descriptions need to be the same but that the entity the decription refers to needs to match. Secondly products are not a match if the two descriptions refer to different products.  As a model has previously made an error on these two entity descriptions it is important to create examples that present a similar challenge. Please focus on corner cases meaning examples that are quite difficult to get correct. The generated examples should belong to the same category as the presented product and should be very similar to it. However even if they are a match the strings should never match exactly. The results should only be presented as JSON containing degenerated entity, one and entity two as well as information if they are a match or not represented by boolean and value. Only return JSON.
                
                {examples}

                Here is the misclassified example:
                Entity 1: {product_1}
                Entity 2: {product_2}
                Label: {label}
                """}
            ],
            "max_tokens": 2_500,
            "temperature": 0
        }
    }



In [5]:
# Optimized Cosine Similarity with Matrix Operations
def find_most_similar_examples(test_embedding, train_df, top_n=6):
    # Convert lists of embeddings to a numpy array if not already
    train_embeddings = np.array(list(train_df['embedding'].values))
    test_embedding = np.array(test_embedding).reshape(1, -1)
    
    # Calculate cosine similarities for all train embeddings at once
    similarities = cosine_similarity(test_embedding, train_embeddings)
    
    # Get indices of top_n highest similarities
    most_similar_indices = np.argsort(similarities[0])[::-1][:top_n]
    most_similar_examples = train_df.iloc[most_similar_indices].to_dict(orient='records')
    
    return most_similar_examples

In [9]:
def transform_label(label):
    return "Yes" if label == 1 else "No"

In [7]:
validation_df = pd.read_pickle(f"../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_valid_small_embeddings.pkl")
validation_df

Unnamed: 0,id_left,brand_left,title_left,description_left,price_left,priceCurrency_left,specTableContent_left,cluster_id_left,id_right,brand_right,title_right,description_right,price_right,priceCurrency_right,specTableContent_right,cluster_id_right,pair_id,label,is_hard_negative,embedding
0,82659340,,Garmin Black fenix 5S Sapphire Watch with Blac...,Get More From Your Workout with Less on Your W...,949.0,AUD,,1406172,15847024,,Western Digital Blue SN550 1TB NVMe PCIe Gen3....,The new SSD value leader doesn't skimp on perf...,,,,156996,82659340#15847024,0,False,"[0.16355520114084598, 0.2695404750582223, -0.1..."
1,77373298,Maxxis,"Maxxis Maxxis, High Roller II, Tire, 27.5''x2....",,102.00,CAD,,729259,53025257,,Ubiquiti UVC-G3-Bullet UniFi IP Bullet Camera,,$‎197.95,USD,,591042,77373298#53025257,0,False,"[-0.11614268887719063, 0.023930370080093515, 0..."
2,14965636,,HyperX Fury DDR3 1600MHz 8GB,"2x4GB 1600MHz (PC3-12800) DDR3 CL10, Sort",350.00,DKK,,702149,64399529,,Memoria DDR3 1600Mhz Kit 8GB HyperX Kingston (...,,"73,58 €",EUR,,288081,14965636#64399529,0,True,"[0.32378779168143834, -0.11955184833984764, 0...."
3,10229338,,Brother TN200 toner laser x HL720-730-760,TN200 TONER LASER X HL720-730-760,36.07,EUR,,5223239,80827080,Xerox,Xerox Toner Gul 25k - Phaser 7760,,2899,DKK,,219682,10229338#80827080,0,True,"[-0.21823224955430381, -0.35864784439011, 0.00..."
4,89697019,,"AMD RYZEN 5 2400G, W5 3.6 GHZ 65W SOC AM4 RADE...",W5 3.6 GHZ 65W SOC AM4 RADEON RX VEGA GRAPHICS,3670.1,MXN,,1442607,48676771,,CPU AMD Ryzen 5 2400G Desktop Processor of 4 C...,AMD Ryzen™ 5 2400G with Radeon™ RX Vega 11 Gra...,10249,INR,,1442607,89697019#48676771,1,False,"[0.23819084486943, 0.06052992620967666, 0.2481..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,24382278,,Swiss Military Hanowa Undercover 06-4307.30.007,06-430730007,4936.00,CZK,,4246715,60576778,Swiss Military Hanowa,Swiss Military Hanowa 06-4226.30.003.03,"; : ;: , , , , , ; : ,;:(), ;: 12 ;:;;cIP ;;WR...",30500,RUB,,2080188,24382278#60576778,0,True,"[-0.23401700696891956, 0.27543792783601284, -0..."
2496,15417028,,"Samsung CF390 27"" 4ms HDMI Curved Monitor","Samsung CF390 27"" 4ms HDMI Curved Monitor",175.88,GBP,,709380,2596154,,"Monitor curbat LED SAMSUNG C27F390FHU, 27\"", F...",,,,,709380,15417028#2596154,1,False,"[-0.005975582989415096, -0.01320530360606273, ..."
2497,32212233,,Sigma 50mm f/1.4 DG HSM | Art - Nikon F Nikon ...,Sigma 50mm f/1.4 DG HSM | Art - Nikon F Nikon ...,569,GBP,,393711,19473276,Shimano,Shimano PD-R7000 105 Clipless Pedal,The new Shimano 105 R7000 groupset brings a ne...,8770.0,INR,,1588018,32212233#19473276,0,False,"[-0.19529779033010675, 0.054974836746459685, -..."
2498,18046344,,Daniel Wellington Cornwall Classic DW00100150,,,,,2533164,94982317,,"RAM Short Double Socket Arm for 1.5\"" Balls (O...","The RAM-201U-B, short double socket arm, has a...",3.595E1,AUD,,93315,18046344#94982317,0,True,"[-0.09148275848920813, 0.24183877765142345, -0..."


In [10]:
# Load your data
validation_df = pd.read_pickle(f"../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_valid_small_embeddings.pkl")

# Create the JSONL file with all requests
requests = []
for index, row in tqdm(validation_df.iterrows(), total=validation_df.shape[0]):
    product_1 = row["title_left"]
    product_2 = row["title_right"]
    label = row["label"]
    custom_id = row["pair_id"]
    examples = find_most_similar_examples(row["embedding"], validation_df, top_n=6)
    example_1 = ""
    example_2 = ""
    
    for index, example in enumerate(examples):
        if index % 2 == 0:
            example_1 = example_1 + "Entity 1: " + example["title_left"]
            example_1 = example_1 + "Entity 2: " + example["title_right"]
            example_1 = example_1 + "Label: " + transform_label(example["label"])
            example_1 = example_1 + "\n ---------------- \n"
            
        else:
            example_2 = example_2 + "Entity 1: " + example["title_left"]
            example_2 = example_2 + "Entity 2: " + example["title_right"]
            example_2 = example_2 + "Label: " + transform_label(example["label"])
            example_2 = example_2 + "\n ---------------- \n"
        
    prompt_1 = create_synthetic_examples(product_1, product_2, label, custom_id=f"{custom_id}_1", examples=example_1)
    prompt_2 = create_synthetic_examples(product_1, product_2, label, custom_id=f"{custom_id}_2",examples=example_2)
    requests.append(prompt_1)
    requests.append(prompt_2)

batch_file_path = "synthetic_with_explanations.jsonl"
with open(batch_file_path, "w") as f:
    for request in requests:
        f.write(json.dumps(request) + "\n")


  0%|          | 0/2500 [00:00<?, ?it/s]

In [11]:
batch_input_file = client.files.create(
  file=open(batch_file_path, "rb"),
  purpose="batch"
)

batch_input_file_id = batch_input_file.id

batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "product matching explanations, aws training small"}
)

In [69]:
synthetic_examples = pd.read_json("../../data/wdc/synthetic/4o/validation/batch_zzpA3KvaEm1MSmfWY3zbSOrp_output.jsonl", lines=True)

# Parse the results and update your dataframe
parsed_df = synthetic_examples["response"].apply(parse_response)

# Concatenate the parsed results with the original dataframe
df_results = pd.concat([synthetic_examples, parsed_df], axis=1)

In [70]:
df_results["content"].iloc[0]

'```json\n[\n    {\n        "entity_one": "Garmin Forerunner 945 LTE GPS Running Watch with Black Band",\n        "entity_two": "Garmin Fenix 6 Pro Solar, Premium Multisport GPS Watch with Black Band",\n        "match": true\n    },\n    {\n        "entity_one": "Garmin Black fenix 5S Sapphire Watch with Black Band",\n        "entity_two": "Garmin Venu 2 Plus, GPS Smartwatch with Black Band",\n        "match": false\n    },\n    {\n        "entity_one": "Garmin Black fenix 5S Sapphire Watch with Black Band",\n        "entity_two": "Samsung Galaxy Watch 4 Classic, 46mm, Black",\n        "match": false\n    },\n    {\n        "entity_one": "Garmin Black fenix 5S Sapphire Watch with Black Band",\n        "entity_two": "Apple Watch Series 7 GPS, 45mm, Midnight Aluminum Case with Black Sport Band",\n        "match": false\n    }\n]\n```'

In [80]:
# Define the function to parse the content and extract entities and label
def extract_multiple_entities(content):
    try:
        content = content.replace("```json\n", "").replace("\n```", "")
        # Load the JSON string into a Python dictionary (or list if multiple entities are in a list)
        data = json.loads(content)
        # Assuming the content is a list of dictionaries
        rows = []
        for entity in data:
            title_left = entity.get('entity_one')
            title_right = entity.get('entity_two')
            label = entity.get('match')
            rows.append([title_left, title_right, label])
        
        return pd.DataFrame(rows, columns=['title_left', 'title_right', 'label'])
    
    except (json.JSONDecodeError, TypeError):
        print("Error parsing")
        # Handle the case where content is not a valid JSON or is missing
        return pd.DataFrame(columns=['title_left', 'title_right', 'label'])

In [79]:
extract_multiple_entities(df_results["content"].iloc[8])

[
    {
        "entity_one": "Jabra Evolve 20 monaural wired USB headset",
        "entity_two": "Jabra Evolve 20 UC Mono USB Headset with Noise Cancellation",
        "match": true
    },
    {
        "entity_one": "Jabra Evolve 20 monaural wired USB headset",
        "entity_two": "Jabra Evolve 30 Binaural USB Headset with Inline Controls",
        "match": false
    },
    {
        "entity_one": "Jabra Evolve 20 monaural wired USB headset",
        "entity_two": "Jabra Evolve 20 SE Stereo USB Headset with Mute Button",
        "match": false
    },
    {
        "entity_one": "Jabra Evolve 20 monaural wired USB headset",
        "entity_two": "Jabra Evolve 40 Mono USB Headset with Busy Light Indicator",
        "match": false
    }
]
[{'entity_one': 'Jabra Evolve 20 monaural wired USB headset', 'entity_two': 'Jabra Evolve 20 UC Mono USB Headset with Noise Cancellation', 'match': True}, {'entity_one': 'Jabra Evolve 20 monaural wired USB headset', 'entity_two': 'Jabra Evolve 30 Bin

Unnamed: 0,title_left,title_right,label
0,Jabra Evolve 20 monaural wired USB headset,Jabra Evolve 20 UC Mono USB Headset with Noise...,True
1,Jabra Evolve 20 monaural wired USB headset,Jabra Evolve 30 Binaural USB Headset with Inli...,False
2,Jabra Evolve 20 monaural wired USB headset,Jabra Evolve 20 SE Stereo USB Headset with Mut...,False
3,Jabra Evolve 20 monaural wired USB headset,Jabra Evolve 40 Mono USB Headset with Busy Lig...,False


In [85]:
# Initialize an empty DataFrame to store the results
expanded_df = pd.DataFrame(columns=['title_left', 'title_right', 'label'])

# Iterate over each row in df_results
for index, row in df_results.iterrows():
    # Parse and extract the multiple entities from the content
    expanded_rows = extract_multiple_entities(row['content'])
    
    # Optionally, add other columns from df_results to the expanded DataFrame
    for col in df_results.columns:
        if col != 'content':
            expanded_rows[col] = row[col]
    
    # Append the expanded rows to the final DataFrame
    expanded_df = pd.concat([expanded_df, expanded_rows], ignore_index=True)

Error parsing
Error parsing
Error parsing


In [90]:
df = pd.read_json("../../results/meta-llama/Meta-Llama-3.1-8B-Instruct/error/small_explanation/2024-08-12-14-59-11_explanation/checkpoint-294/validation_results.json")

# filter misclassified examples
df = df[df["label"] != df["chatbot_response_clean"]]
df

Unnamed: 0,task,chatbot_question,chatbot_response_raw,chatbot_response_clean,id_left,brand_left,title_left,description_left,price_left,priceCurrency_left,...,brand_right,title_right,description_right,price_right,priceCurrency_right,specTableContent_right,cluster_id_right,pair_id,label,is_hard_negative
2,general-complex-free,"[{'role': 'user', 'content': 'Do the two entit...",Yes. \nattribute=,1,14965636,,HyperX Fury DDR3 1600MHz 8GB,"2x4GB 1600MHz (PC3-12800) DDR3 CL10, Sort",350.00,DKK,...,,Memoria DDR3 1600Mhz Kit 8GB HyperX Kingston (...,,"73,58 €",EUR,,288081,14965636#64399529,0,True
16,general-complex-free,"[{'role': 'user', 'content': 'Do the two entit...",No. \nattribute=,0,13329848,,Daniel Wellington St Mawes Classic DW00100006,,,,...,,St Mawes 40mm Rose Gold Watch - Multi,An integral part of the flagship classic Danie...,169.00,GBP,,667096,13329848#87967305,1,False
20,general-complex-free,"[{'role': 'user', 'content': 'Do the two entit...",Yes. \nattribute=,1,61533965,,WD My Passport USB 3.0 Portable Hard Drive 4TB...,PLEASE NOTE: Online Clearance Product | Includ...,256,NZD,...,WD,WD My Passport 4TB External Portable Hard Driv...,"4TB WD My Passport WDBPKJ0040BBK-WESN, Portabl...",99.98,GBP,,1939560,61533965#12422845,0,True
21,general-complex-free,"[{'role': 'user', 'content': 'Do the two entit...",No. \nattribute=,0,35630494,Western Digital,Western Digital My Book disque dur externe 300...,"Western Digital My Book, 3000 Go, 3.5\"", 3.2 G...",143.59,EUR,...,,Western Digital WDBBGB Black 3 TB Portable Har...,Western Digital My Book desktop storage is des...,213.630,EUR,,707463,35630494#1529645,1,False
38,general-complex-free,"[{'role': 'user', 'content': 'Do the two entit...",No. \nattribute=,0,9221397,,652589-B21 HP G8 G9 900-GB 6G 10K 2.5 SAS,Description:HP 900GB 2.5-inch SFF SAS 6Gb/s 10...,4276850,VND,...,,Сървър HEWLETT PACKARD HP 900GB 6G SAS 10K rpm...,Сървър HEWLETT PACKARD HP 900GB 6G SAS 10K rpm...,890.90,BGN,,603762,9221397#20390207,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2463,general-complex-free,"[{'role': 'user', 'content': 'Do the two entit...",No. \nattribute=,0,32709637,3M,3M MOBILE INTERACTIVE SOLUTION PF220W9F LIGHTW...,Condition : New,178.23,USD,...,,"3M - Privacy Filter 19\""\"" WideS","3M Privacy Filter 19\""\"" WideS (PF319W) - Type...",2020.00,DKK,,3303908,32709637#13519118,1,False
2464,general-complex-free,"[{'role': 'user', 'content': 'Do the two entit...",No. \nattribute=,0,7268238,,Access Point TP-Link N300 WIFI Ceiling Mount -...,TP-LINK EAP110. Velocidade: 300Mbps Wireless N...,40.95,EUR,...,,TL-EAP110 - TP-Link Wireless OUTDOOR 2.4 GH Ac...,The EAP’s elegant appearance and easy mounting...,319,ILS,,2124537,7268238#44019497,1,False
2465,general-complex-free,"[{'role': 'user', 'content': 'Do the two entit...",No. \nattribute=,0,18151425,RAZER,Razer Mamba Elite Ergonomic Optical Gaming Mouse,The Razer Mamba Elite is the iconic gaming mou...,89.99,GBP,...,,"Razer Mamba Elite Wired Gaming Mouse, Black, 5...","Razer Mamba Elite Wired Gaming Mouse, Black, 5...",139.00,AUD,,967973,18151425#10806044,1,False
2486,general-complex-free,"[{'role': 'user', 'content': 'Do the two entit...",No. \nattribute=,0,12319284,Tp-link,TP-LINK TL-WN781ND networking card WLAN 150 Mb...,Internal Wireless PCI Express WLAN 150 Mbit/s,,,...,,TARJETA DE RED PCI EXPRESS X1 INALAMBRICA TP-L...,,189.1728,MXN,,466493,12319284#13666920,1,False


In [87]:
# Convert the label to integer
expanded_df['label'] = expanded_df['label'].apply(lambda x: 1 if x == True else 0)
expanded_df

Unnamed: 0,title_left,title_right,label,id,custom_id,response,error,status_code,request_id,completion_id,created,model,prompt_tokens,completion_tokens,total_tokens
0,Garmin Forerunner 945 LTE GPS Running Watch wi...,"Garmin Fenix 6 Pro Solar, Premium Multisport G...",1,batch_req_C8vhg3R2gskZLG4QhQiHhZCt,82659340#15847024,,,200.0,1fe271007f8829d5234f867cd29ae414,chatcmpl-9wy1SLSuo97GP9mWT4f3mAhoPIRD7,1.723841e+09,gpt-4o-2024-05-13,323.0,217.0,540.0
1,Garmin Black fenix 5S Sapphire Watch with Blac...,"Garmin Venu 2 Plus, GPS Smartwatch with Black ...",0,batch_req_C8vhg3R2gskZLG4QhQiHhZCt,82659340#15847024,,,200.0,1fe271007f8829d5234f867cd29ae414,chatcmpl-9wy1SLSuo97GP9mWT4f3mAhoPIRD7,1.723841e+09,gpt-4o-2024-05-13,323.0,217.0,540.0
2,Garmin Black fenix 5S Sapphire Watch with Blac...,"Samsung Galaxy Watch 4 Classic, 46mm, Black",0,batch_req_C8vhg3R2gskZLG4QhQiHhZCt,82659340#15847024,,,200.0,1fe271007f8829d5234f867cd29ae414,chatcmpl-9wy1SLSuo97GP9mWT4f3mAhoPIRD7,1.723841e+09,gpt-4o-2024-05-13,323.0,217.0,540.0
3,Garmin Black fenix 5S Sapphire Watch with Blac...,"Apple Watch Series 7 GPS, 45mm, Midnight Alumi...",0,batch_req_C8vhg3R2gskZLG4QhQiHhZCt,82659340#15847024,,,200.0,1fe271007f8829d5234f867cd29ae414,chatcmpl-9wy1SLSuo97GP9mWT4f3mAhoPIRD7,1.723841e+09,gpt-4o-2024-05-13,323.0,217.0,540.0
4,"Maxxis, High Roller II, Tire, 27.5''x2.50, Fol...","Maxxis, High Roller II, Tire, 27.5''x2.50, Fol...",1,batch_req_3F7FqnpJttUbQBXCvJUZxKIV,77373298#53025257,,,200.0,55a5d41c2b2ee1026a535f90ebaf7a0d,chatcmpl-9wy0UH1TFS789SLuPYaZxi1FuqHr0,1.723841e+09,gpt-4o-2024-05-13,349.0,453.0,802.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10057,Daniel Wellington Classic Bristol DW00100099,Daniel Wellington Classic York DW00100099,0,batch_req_uIDVEPff1K9xCE2oEk1lWNzn,18046344#94982317,,,200.0,4366a5662a7bb05f7111fea0d7af9be9,chatcmpl-9wy3yiyUN50ZlTiROgkjDVxRWXhU3,1.723841e+09,gpt-4o-2024-05-13,317.0,172.0,489.0
10058,Omega Seamaster Diver 300M Co‑Axial Master Chr...,Omega Seamaster Aqua Terra 150M Co‑Axial Maste...,1,batch_req_AE6fZFPuqWoznp3asQHK74RJ,85806842#323104,,,200.0,1df970416a7eb85c5efab02d178c5b2a,chatcmpl-9wy3Q80AC2ppfo3qx3kDlQR1LIfan,1.723841e+09,gpt-4o-2024-05-13,327.0,216.0,543.0
10059,Omega Seamaster ETNZ Planet Ocean Co‑Axial Mas...,Rolex Submariner Date Men's Watch,0,batch_req_AE6fZFPuqWoznp3asQHK74RJ,85806842#323104,,,200.0,1df970416a7eb85c5efab02d178c5b2a,chatcmpl-9wy3Q80AC2ppfo3qx3kDlQR1LIfan,1.723841e+09,gpt-4o-2024-05-13,327.0,216.0,543.0
10060,Omega Seamaster ETNZ Planet Ocean Co‑Axial Mas...,Seiko Prospex Diver's Automatic Men's Watch,0,batch_req_AE6fZFPuqWoznp3asQHK74RJ,85806842#323104,,,200.0,1df970416a7eb85c5efab02d178c5b2a,chatcmpl-9wy3Q80AC2ppfo3qx3kDlQR1LIfan,1.723841e+09,gpt-4o-2024-05-13,327.0,216.0,543.0
