In [1]:
import os
import openai
import json
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm
import time
from dotenv import load_dotenv
import helper as analytics


In [2]:
# Load environment variables from the .env file; the API key is read from the
# CHATGPTKEY variable when the client is created.
load_dotenv()

True

In [3]:
from openai import OpenAI

# Shared API client used by the request helpers in this notebook. The key comes
# from the CHATGPTKEY environment variable (os.getenv yields None when unset).
client = OpenAI(api_key=os.getenv("CHATGPTKEY"))

In [5]:
# Model under evaluation; its name also selects the results sub-directory.
model_id = "gpt-4o-mini"
# Root folder for this model's result files.
BASE_PATH = f"../../results/{model_id}"

In [4]:
def generate_explanations(product_1:str, product_2:str, label:str, model:str = "gpt-4o", max_tokens:int = 300):
    """Ask a chat model for a structured explanation of a known matching decision.

    The prompt contains both entity descriptions and the correct answer, and
    requests one ``attribute=...|||importance=...|||values=...|||similarity=...``
    line per compared attribute.

    Args:
        product_1: Description of the first entity.
        product_2: Description of the second entity.
        label: The correct answer that is inserted verbatim into the prompt.
        model: Chat model used for the request (defaults to the previously
            hard-coded "gpt-4o").
        max_tokens: Upper bound on generated tokens (defaults to the previously
            hard-coded 300).

    Returns:
        dict with the keys ``answer`` (raw model text), ``total_tokens``,
        ``prompt_tokens`` and ``completion_tokens``.
    """
    # The prompt text is kept exactly as before; only the request settings
    # became parameters.
    prompt = f"""
        Do the two entity descriptions refer to the same real-world entity?
        Entity 1: {product_1}
        Entity 2: {product_2}

        The correct answer is {label}.

        Please provide an explanation for this answer in a structured format, listing the attributes that you compared for reaching this answer. Each attribute should be accompanied by the attribute values and a score between -1 and 1 that shows the importance of the attribute for the decision. If the attribute influenced the decision towards non-match the importance score should be negative. If the attribute pointed towards a match, the importance score should be positive. Also provide a similarity score for the attribute values. If an attribute only occurs in one item, specify the value of that attribute for the other item as "missing". An example output is the following:

        attribute=brand|||importance=0.05|||values=Logitech###Logitech|||similarity=1.00
        attribute=model|||importance=-0.95|||values=MX G500###MX Master 3S|||similarity=0.20
        attribute=color|||importance=0.00|||values=missing###Graphite|||similarity=0.00
        
        Here is a complete example:
        Do the two product descriptions refer to the same real-world product? Entity 1: 'WD 4TB Black My Passport Portable External Hard Drive - USB 3.0 - WDBYFT0040BBK-WESN'. Entity 2: 'Dysk WD My Passport 1TB USB 3.0 black'.
        "No. 
        attribute=brand|||importance=0.05|||values=Western Digital###Western Digital|||similarity=1.00
        attribute=model|||importance=0.95|||values=My Passport###My Passport|||similarity=1.00
        attribute=storage capacity|||importance=0.9|||values=4TB###1TB|||similarity=0.25
        attribute=color|||importance=0.1|||values=Black###Black|||similarity=1.00
        attribute=USB version|||importance=0.05|||values=USB 3.0###USB 3.0|||similarity=1.00
        
        Do not provide a explanation in a different format. The explanation should be in the format described above. Only provide the answer and explanation dont repeat the question.
        
        """
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
        temperature=0
    )
    return {
        "answer": completion.choices[0].message.content,
        "total_tokens": completion.usage.total_tokens,
        "prompt_tokens": completion.usage.prompt_tokens,
        "completion_tokens": completion.usage.completion_tokens
    }
    
# Smoke test: request an explanation for a single pair with the label "No".
print(generate_explanations("WD 4TB Black My Passport Portable External Hard Drive - USB 3.0 - WDBYFT0040BBK-WESN", "Dysk WD My Passport 1TB USB 3.0 black", "No"))

{'answer': 'No. \nattribute=brand|||importance=0.05|||values=WD###WD|||similarity=1.00\nattribute=model|||importance=0.95|||values=My Passport###My Passport|||similarity=1.00\nattribute=storage capacity|||importance=0.9|||values=4TB###1TB|||similarity=0.25\nattribute=color|||importance=0.1|||values=Black###Black|||similarity=1.00\nattribute=USB version|||importance=0.05|||values=USB 3.0###USB 3.0|||similarity=1.00', 'total_tokens': 665, 'prompt_tokens': 536, 'completion_tokens': 129}


In [6]:
small_df = pd.read_pickle("../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_small.pkl.gz", compression="gzip")

# The prompt states the correct answer in words ("Yes"/"No"). Numeric gold
# labels are translated; any other value is passed through unchanged.
# NOTE(review): assumes 1 = match and 0 = non-match, as in the test set - confirm.
LABEL_TO_ANSWER = {1: "Yes", 0: "No"}

# Iterate over the small dataset and generate one explanation per pair.
# Results are gathered in a list and written back positionally afterwards, so
# token counts keep their integer type and duplicate index labels cannot
# overwrite each other.
explanation_records = []
for _, row in tqdm(small_df.iterrows(), total=small_df.shape[0]):
    answer_label = LABEL_TO_ANSWER.get(row["label"], row["label"])
    explanation_records.append(
        generate_explanations(row["title_left"], row["title_right"], answer_label)
    )

# Target column name -> key in the dict returned by generate_explanations
for column, key in [
    ("explanation", "answer"),
    ("total_tokens", "total_tokens"),
    ("prompt_tokens", "prompt_tokens"),
    ("completion_tokens", "completion_tokens"),
]:
    small_df[column] = [record[key] for record in explanation_records]

small_df.to_pickle("../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_small_explanations_40.pkl.gz", compression="gzip")

  0%|          | 0/2500 [00:00<?, ?it/s]

In [12]:
# Persist the current state of small_df under the "_40_mini" file name.
small_df.to_pickle(f"../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_small_explanations_40_mini.pkl.gz", compression="gzip")

In [8]:
def clean_response(response):
    """Map a raw model reply to a binary match prediction.

    Args:
        response: Text returned by the model. Values that are not strings
            (for example ``None``) are accepted and count as "not a match".

    Returns:
        1 if the reply contains "yes" (case-insensitive), otherwise 0.
    """
    # Guard against missing replies instead of failing on ``.lower()``.
    if not isinstance(response, str):
        return 0
    return 1 if "yes" in response.lower() else 0

In [9]:
def insert_product_descriptions(prompt_template: str, product1: str, product2: str):
    """Fill a prompt template with the two product descriptions.

    The quoted placeholders ``'Entity 1'`` and ``'Entity 2'`` (including the
    single quotes) are substituted, in that order, by ``product1`` and
    ``product2``. The filled-in prompt is returned.
    """
    with_first = prompt_template.replace("'Entity 1'", product1)
    filled_prompt = with_first.replace("'Entity 2'", product2)
    return filled_prompt

In [12]:
def generate_answer(prompt: str, model:str = "gpt-3.5-turbo-0125", max_new_tokens=5):
    """Send ``prompt`` as a single user message and return the reply with token usage.

    Returns:
        Tuple ``(content, total_tokens, prompt_tokens, completion_tokens)`` taken
        from the first choice and the usage section of the completion.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        max_tokens=max_new_tokens,
        temperature=0,
    )
    reply_text = completion.choices[0].message.content
    usage = completion.usage
    return reply_text, usage.total_tokens, usage.prompt_tokens, usage.completion_tokens

In [17]:
# Smoke test: the default max_new_tokens of 5 truncates the reply.
generate_answer("Hello world", "gpt-4o-mini")

('Hello! How can I', 14, 9, 5)

# Connect to the ChatGPT API

## Start testing

In [19]:
datasets = [{"dataset_name": "wdc", "dataset_path": "../../data/wdc/preprocessed_wdcproducts80cc20rnd000unsampled250_2_gs_testset_ralph.pkl"}, {"dataset_name": "abt-buy", "dataset_path": "../../data/abt-buy/abt-buy-sampled_gs.pkl"}, {"dataset_name": "amazon-google", "dataset_path": "../../data/amazon-google/amazon-google-sampled_gs.pkl"}]

# Load all prompts we want to test. The file is the same for every dataset,
# so it is read once instead of once per dataset.
with open('../../prompts/domain_promts.json', 'r') as file:
    prompts = json.load(file)

# Columns produced per request, in addition to the original dataset columns.
# The token counts are listed explicitly; otherwise the DataFrame constructor
# below would silently drop them.
result_columns = ['task', 'chatbot_question', 'chatbot_response_raw', 'chatbot_response_clean',
                  'prompt_tokens', 'completion_tokens', 'total_tokens']

for dataset in datasets:
    # Load the dataset
    df = pd.read_pickle(dataset["dataset_path"])

    # One result file per dataset, so start from an empty list each time
    result_rows = []

    for task in prompts:
        title = task['title']
        prompt_template = task['prompt']
        print(f"Processing dataset: {dataset['dataset_name']} \n Processing task:  {title}")

        for index, row in df.iterrows():
            message = insert_product_descriptions(prompt_template, row['title_left'], row['title_right'])

            response, total_tokens, prompt_tokens, completion_tokens = generate_answer(message, model_id, 5)
            # A missing reply is stored as an empty string so that cleaning
            # and serialisation never see None.
            response = response or ""

            # Combine the task info with the original row data
            result_row = {
                'task': title,
                'chatbot_question': message,
                'chatbot_response_raw': response,
                'chatbot_response_clean': clean_response(response),
                'total_tokens': total_tokens,
                'prompt_tokens': prompt_tokens,
                'completion_tokens': completion_tokens
            }
            result_row.update({col: row[col] for col in df.columns})
            result_rows.append(result_row)

            if index % 250 == 0:
                print(f"Processed {index} out of {len(df)} queries")

    all_columns = result_columns + list(df.columns)

    # Convert the list of dictionaries to a DataFrame
    results_df = pd.DataFrame(result_rows, columns=all_columns)

    # Timestamp used to give every run its own result file
    now = datetime.now()

    directory = f"{BASE_PATH}/{dataset['dataset_name']}"
    os.makedirs(directory, exist_ok=True)

    # save the dataframe as a json file
    results_df.to_json(f"{directory}/{now.strftime('%Y-%m-%d-%H-%M-%S')}.json")

Processing dataset: wdc 
 Processing task:  domain-complex-free (Product)
Processed 0 out of 1239 queries
Processed 250 out of 1239 queries
Processed 500 out of 1239 queries
Processed 750 out of 1239 queries
Processed 1000 out of 1239 queries
Processing dataset: wdc 
 Processing task:  domain-simple-free (Product)
Processed 0 out of 1239 queries
Processed 250 out of 1239 queries
Processed 500 out of 1239 queries
Processed 750 out of 1239 queries
Processed 1000 out of 1239 queries
Processing dataset: wdc 
 Processing task:  domain-complex-force (Product)
Processed 0 out of 1239 queries
Processed 250 out of 1239 queries
Processed 500 out of 1239 queries
Processed 750 out of 1239 queries
Processed 1000 out of 1239 queries
Processing dataset: wdc 
 Processing task:  domain-simple-force (Product)
Processed 0 out of 1239 queries
Processed 250 out of 1239 queries
Processed 500 out of 1239 queries
Processed 750 out of 1239 queries
Processed 1000 out of 1239 queries
Processing dataset: abt-buy 

In [5]:
result_paths = analytics.get_all_files_in_directory(f"{BASE_PATH}/")
stats = analytics.calculate_fine_tuned_stats(result_paths, model_id)

# Tag every per-file stats table with its model, benchmark and source file.
# `assign` returns a copy, so the entries in `stats` stay untouched and the cell
# can be re-run safely.
# NOTE(review): assumes each entry['stats'] is a DataFrame - confirm in helper.
stats_frames = [
    entry['stats'].assign(
        model=entry['model'],
        benchmark=entry['benchmark'],
        file_path=entry['file_path'],
    )
    for entry in stats
]

# Concatenate once instead of growing the frame inside a loop
combined_df = pd.concat(stats_frames, ignore_index=True) if stats_frames else pd.DataFrame()

# Reordering columns so that 'model' and 'benchmark' are the first two columns
combined_df = combined_df[['model', 'benchmark', 'Task', 'Accuracy', 'F1 Score', 'Precision', 'Recall', 'Num -1 Responses', 'file_path']]

# Make sure the target folder exists before writing the summary
stats_directory = f"{BASE_PATH}/results"
os.makedirs(stats_directory, exist_ok=True)
combined_df.to_csv(f"{stats_directory}/stats.csv")

In [6]:
# Display the combined statistics table.
combined_df

Unnamed: 0,model,benchmark,Task,Accuracy,F1 Score,Precision,Recall,Num -1 Responses,file_path
0,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,amazon-google,domain-complex-free (Product),0.841977,0.701378,0.546539,0.978632,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...
1,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,amazon-google,domain-simple-free (Product),0.851702,0.712716,0.563275,0.970085,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...
2,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,amazon-google,domain-complex-force (Product),0.851702,0.713615,0.562963,0.974359,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...
3,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,amazon-google,domain-simple-force (Product),0.857374,0.720635,0.573232,0.970085,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...
4,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,wdc,domain-complex-free (Product),0.951574,0.88764,0.834507,0.948,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...
5,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,wdc,domain-simple-free (Product),0.955609,0.896811,0.844523,0.956,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...
6,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,wdc,domain-complex-force (Product),0.954802,0.89434,0.846429,0.948,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...
7,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,wdc,domain-simple-force (Product),0.955609,0.895636,0.851986,0.944,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...
8,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,abt-buy,domain-complex-free (Product),0.970149,0.918552,0.860169,0.985437,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...
9,ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4o...,abt-buy,domain-simple-free (Product),0.975124,0.931193,0.882609,0.985437,0,../../results/ft:gpt-4o-mini-2024-07-18:wbsg-u...


In [18]:
# Load the WDC test dataframe from its pickle file and preview the first two rows.
# NOTE(review): this path starts with '../' while earlier cells use '../../' - confirm the working directory.
df = pd.read_pickle('../data/wdc/preprocessed_wdcproducts80cc20rnd000unsampled250_2_gs_testset_ralph.pkl')
df.head(2)

Unnamed: 0,id_left,brand_left,title_left,description_left,price_left,priceCurrency_left,specTableContent_left,cluster_id_left,id_right,brand_right,title_right,description_right,price_right,priceCurrency_right,specTableContent_right,cluster_id_right,pair_id,label,is_hard_negative
0,67017951,,Ubiquiti UniFi Protect Video Security,Ubiquiti's UniFi Protect offers all the flexib...,,,,1649038,5303451,,Ubiquiti UniFi Video G3 Infrared Range Extender,The IR Range Extender is a 'Plug and Play' acc...,70.8,GBP,,711907,67017951#5303451,0,True
1,41364196,,Kšiltovka New Era Clean Trucker Chicago Bulls,,658.0,CZK,,46857262,29165749,,Kšiltovka New Era New York Yankees MLB 9Fifty,,810.0,CZK,,76491423,41364196#29165749,0,True


In [19]:
base_path = "../results/gpt-4o-mini"
# Fine-tuned model queried in this run
fine_tuned_model = "ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:4ominifirst:9oUSMtcj"

# Load all prompts we want to test
with open('../prompts/domain_promts.json', 'r') as file:
    prompts = json.load(file)

result_rows = []
# Wall-clock duration of every request, in seconds
times = []

for task in prompts:
    title = task['title']
    prompt_template = task['prompt']
    print(f"Processing task:  {title}")

    for index, row in df.iterrows():
        start_time = time.time()
        message = insert_product_descriptions(prompt_template, row['title_left'], row['title_right'])

        response, total_tokens, prompt_tokens, completion_tokens = generate_answer(message, fine_tuned_model, 10)
        end_time = time.time()
        times.append(end_time - start_time)

        # A missing reply is stored as an empty string so that cleaning and
        # serialisation never see None.
        response = response or ""

        # Combine the task info with the original row data
        result_row = {
            'task': title,
            'chatbot_question': message,
            'chatbot_response_raw': response,
            'chatbot_response_clean': clean_response(response),
            'total_tokens': total_tokens,
            'prompt_tokens': prompt_tokens,
            'completion_tokens': completion_tokens
        }
        result_row.update({col: row[col] for col in df.columns})
        result_rows.append(result_row)

        if index % 50 == 0:
            print(f"Processed {index} out of {len(df)} queries")

    # The block below runs after every task: result_rows is never reset, so each
    # file written here holds the cumulative results of all tasks so far.
    all_columns = ['task', 'chatbot_question', 'chatbot_response_raw', 'chatbot_response_clean',"prompt_tokens", "completion_tokens", "total_tokens"] + list(df.columns)

    # Convert the list of dictionaries to a DataFrame
    results_df = pd.DataFrame(result_rows, columns=all_columns)

    # Timestamp used to give every checkpoint its own file
    now = datetime.now()

    directory = base_path
    os.makedirs(directory, exist_ok=True)

    # save the dataframe as a json file
    results_df.to_json(f"{base_path}/{now.strftime('%Y-%m-%d-%H-%M-%S')}_wdc_testset.json")

Processing task:  domain-complex-free (Product)
Processed 0 out of 1239 queries
Processed 50 out of 1239 queries
Processed 100 out of 1239 queries
Processed 150 out of 1239 queries
Processed 200 out of 1239 queries
Processed 250 out of 1239 queries
Processed 300 out of 1239 queries
Processed 350 out of 1239 queries
Processed 400 out of 1239 queries
Processed 450 out of 1239 queries
Processed 500 out of 1239 queries
Processed 550 out of 1239 queries
Processed 600 out of 1239 queries
Processed 650 out of 1239 queries
Processed 700 out of 1239 queries
Processed 750 out of 1239 queries
Processed 800 out of 1239 queries
Processed 850 out of 1239 queries
Processed 900 out of 1239 queries
Processed 950 out of 1239 queries
Processed 1000 out of 1239 queries


In [23]:
# Manually persist whatever has been collected in result_rows so far
# (same layout as the files written by the evaluation loop).
all_columns = ['task', 'chatbot_question', 'chatbot_response_raw', 'chatbot_response_clean',"prompt_tokens", "completion_tokens", "total_tokens"] + list(df.columns)

# Convert the list of dictionaries to a DataFrame
results_df = pd.DataFrame(result_rows, columns=all_columns)

# get the current date and time
now = datetime.now()

# exist_ok avoids the separate existence check and the race that comes with it
directory = f"{base_path}"
os.makedirs(directory, exist_ok=True)

# save the dataframe as a json file
results_df.to_json(f"{base_path}/{now.strftime('%Y-%m-%d-%H-%M-%S')}_wdc_testset.json")

In [21]:
# Inspect the collected result dictionaries.
# NOTE(review): this displays the entire list; consider slicing, e.g. result_rows[:3].
result_rows

[{'task': 'domain-complex-free (Product)',
  'chatbot_question': 'Do the two product descriptions refer to the same real-world product? Entity 1: Ubiquiti UniFi Protect Video Security. Entity 2: Ubiquiti UniFi Video G3 Infrared Range Extender.',
  'chatbot_response_raw': 'No, the two product descriptions refer to different products',
  'chatbot_response_clean': 0,
  'total_tokens': 61,
  'prompt_tokens': 51,
  'completion_tokens': 10,
  'id_left': 67017951,
  'brand_left': nan,
  'title_left': 'Ubiquiti UniFi Protect Video Security',
  'description_left': "Ubiquiti's UniFi Protect offers all the flexibility one would need for their home security needs. Here's our full review.",
  'price_left': nan,
  'priceCurrency_left': nan,
  'specTableContent_left': nan,
  'cluster_id_left': 1649038,
  'id_right': 5303451,
  'brand_right': nan,
  'title_right': 'Ubiquiti UniFi Video G3 Infrared Range Extender',
  'description_right': "The IR Range Extender is a 'Plug and Play' accessory designed to

In [25]:
# Load a saved gpt-3.5-turbo-0125 result file.
# NOTE(review): this rebinds `df`, which other cells use for the test set.
df = pd.read_json(f"../results/gpt-3.5-turbo-0125/2024-06-15-14-03-56_wdc_testset.json")
# Print the total number of prompt tokens and completion tokens
print(df['prompt_tokens'].sum())
print(df['completion_tokens'].sum())


87303
12390


In [17]:
# Persist the collected results of the "complex free" run; only the total token
# count is kept in this layout.
all_columns = ['task', 'chatbot_question', 'chatbot_response_raw', 'chatbot_response_clean',"total_tokens"] + list(df.columns)
# Convert the list of dictionaries to a DataFrame
results_df = pd.DataFrame(result_rows, columns=all_columns)

# get the current date and time
now = datetime.now()

# exist_ok avoids the separate existence check and the race that comes with it
directory = f"{base_path}"
os.makedirs(directory, exist_ok=True)
# save the dataframe as a json file
results_df.to_json(f"{base_path}/{now.strftime('%Y-%m-%d-%H-%M-%S')}_wdc_complex_free.json")

In [33]:
# Accuracy: share of rows whose cleaned chatbot answer equals the gold label.
results_df[results_df["chatbot_response_clean"] == results_df["label"]].shape[0] / results_df.shape[0]

0.9467312348668281

In [13]:
# Load a previously saved gpt-3.5-turbo-0125 result file and display it.
df = pd.read_json("../results/gpt-3.5-turbo-0125/2024-04-06-00-10-00_wdc_complex_free.json")
df

Unnamed: 0,task,chatbot_question,chatbot_response_raw,chatbot_response_clean,total_tokens,id_left,brand_left,title_left,description_left,price_left,...,brand_right,title_right,description_right,price_right,priceCurrency_right,specTableContent_right,cluster_id_right,pair_id,label,is_hard_negative
0,domain-complex-free (Product),Do the two product descriptions refer to the s...,"Yes, both product descriptions",1,73,67017951,,Ubiquiti UniFi Protect Video Security,Ubiquiti's UniFi Protect offers all the flexib...,,...,,Ubiquiti UniFi Video G3 Infrared Range Extender,The IR Range Extender is a 'Plug and Play' acc...,70.80,GBP,,711907,67017951#5303451,0,True
1,domain-complex-free (Product),Do the two product descriptions refer to the s...,"No, the two product",0,85,41364196,,Kšiltovka New Era Clean Trucker Chicago Bulls,,658,...,,Kšiltovka New Era New York Yankees MLB 9Fifty,,810,CZK,,76491423,41364196#29165749,0,True
2,domain-complex-free (Product),Do the two product descriptions refer to the s...,"Yes, both product descriptions",1,136,49744056,,DYMO D1 - Glossy tape - black on white - Roll ...,Dymo 2000/5500 Tape 19mmx7m Black/White 45803.,13.49,...,DYMO,"DYMO 45017 D1 Tape 12mm x 7m sort p rd, S0720570",Originalt DYMO 45017 D1-tape SORT/RD Passer ti...,139.00,DKK,,48435,49744056#66087579,0,True
3,domain-complex-free (Product),Do the two product descriptions refer to the s...,"Yes, both product descriptions",1,85,63587638,Samsung,SAMSUNG T7 Touch Black 2TB Portable SSD with F...,"2TB Samsung T7 Touch MU-PC2T0K/WW, Portable Ex...",365.99,...,,T7 Portable SSD - 2TB Indigo Blue,"Light and pocket-sized, the Portable SSD T7 bo...",639.99,NZD,,1252792,63587638#52588862,0,True
4,domain-complex-free (Product),Do the two product descriptions refer to the s...,"No, the two product",0,86,25138178,,Kingston Canvas 128GB Micro SDXC C10 U1 V10 A1,,39.95,...,,Transcend SSD230S 128GB best price,"Transcend 128GB 2.5\"" SSD230S SATA3 3D NAND Fl...",2.208E1,eur,,715391,25138178#45664586,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4951,domain-simple-force (Product),Do the two product descriptions match? Answer ...,Yes,1,145,16319132,,Sigma 35mm f/1.4 DG HSM Art Lens Canon,67mm Filter Size f/1.4 Aperture N/A Stabilized...,699.00,...,,SIGMA35mm/1.4 DG HSM | ART [Canon EF-mount]E67...,,,,,704991,16319132#54637257,1,False
4952,domain-simple-force (Product),Do the two product descriptions match? Answer ...,Yes,1,134,91699237,,CM / CoolerMaster MasterCase H500 ARGB ATX Cas...,Warranty : N/A,159.00,...,,Cooler Master Mastercase H500 ARGB Midi Tower ...,,11495,EUR,,1473407,91699237#13684723,1,False
4953,domain-simple-force (Product),Do the two product descriptions match? Answer ...,Yes,1,172,69931309,,Corsair DDR4 8GB 2133Mhz CL15 Dimm Valueselect...,"Corsair Value Select 8GB PC4-17000, 8 GB, DDR4...",29.1,...,Corsair,CORSAIR DDR4 2133MHZ 8GB 1x288 DIMM 1.20V Unbu...,,32.15,EUR,,2861749,69931309#38788118,1,False
4954,domain-simple-force (Product),Do the two product descriptions match? Answer ...,No,0,103,92956854,Cooler Master,Cooler Master MasterBox Lite 5 ATX Case Window...,"Motherboard compatibility :ATX, Micro-ATX, Min...",85.0,...,,CM Case MasterBox Lite 5,CM Case MasterBox Lite 5 Kucista,144.00,BAM,,1006147,92956854#42906184,1,False


In [63]:
# Create empty columns for the chatbot answer text, the raw API response and
# the messages that were sent.
df['chatbot_response'] = ''
df['chatbot_response_raw'] = ''
df['chatbot_question'] = ''

In [64]:
# Number of attempts per row and the pause between failed attempts (seconds)
max_attempts = 3
retry_delay_seconds = 60

# loop through the dataframe and send the messages to the chatbot
for index, row in tqdm(df.iterrows(), total=df.shape[0]):
    message = [
        {"role": "system", "content": "Please match these two products"},
        {"role": "user",
            "content": f" product 1: {row['title_left']}, product 2: {row['title_right']}"},
    ]
    # Reset per row: without this a row whose requests all fail would reuse the
    # previous row's reply (or raise NameError on the very first row).
    response = None
    error = ""
    for attempt in range(max_attempts):
        try:
            # NOTE(review): openai.ChatCompletion is the pre-1.0 interface of the
            # openai package, while other cells use the OpenAI client object -
            # confirm which library version this cell is meant to run against.
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=message,
                max_tokens=3,
                temperature=0.0,
            )
            # the call succeeded, stop retrying
            break
        except Exception as e:
            error = e
            print(f"Error: {e}")
            # wait before retrying, but not after the final attempt
            if attempt < max_attempts - 1:
                time.sleep(retry_delay_seconds)

    df.at[index, 'chatbot_question'] = message
    if response:
        answer = response['choices'][0]['message']['content']
        df.at[index, 'chatbot_response_raw'] = response
        df.at[index, 'chatbot_response'] = answer
        print(index, answer)
    else:
        # Store the error text rather than the exception object so the frame
        # stays serialisable; -1 marks the row as unanswered.
        df.at[index, 'chatbot_response_raw'] = str(error)
        df.at[index, 'chatbot_response'] = -1
        print("Error: response was not set successfully")

  0%|          | 0/100 [00:00<?, ?it/s]

0 product 1: Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead: Cycling Equipment, product 2: Maxxis Minion DHR II 3
1 product 1: Office Supplies, product 2: Office Supplies
2 product 1: Crucial Crucial Memory 4GB DDR4 2666 Unbuffered CT4G4SFS8266: Electronics
product 2: CRUCIAL CT4
3 product 1: Crucial MX500 250GB 2.5" SATA III: Electronics
product 2: JABARA Evolve 65 with Link 370 USB - Mono:
4 product 1: Intel Core I3 7th Gen 7100 3.90 Ghz; 2 Core 4 Thread; 3 Mb Smartcache; 51 W Tdp
5 Product 1: Tissot Seastar 1000 T120.417.11.051.00: Watches
Product 2: Samsung Portable SSD T7 Touch MU-PC1
6 product 1: Samsung 970 EVO SSD M.2 2280 - 1TB SSD - 1 TB - M.2 2280 (80mm) - PCI Express 
7 product 1: Ilford Ilford FP4+ 125 120 B&W Film - Single Roll: Photography
product 2: Canon EF-S 18-200mm f3.5
8 product 1: iiyama 27' ETE, ULTRA SLIM LINE, 2560x1440 WQHD, IPS, 5ms, FreeSync, 13
9 product 1: Cycling Equipment, product 2: Cycling Equipment
10 product 1: Kingston Technology DataTraveler SE9 G

In [65]:
# Preview the first rows including the new chatbot columns.
df.head()

Unnamed: 0,id_left,brand_left,title_left,description_left,price_left,priceCurrency_left,specTableContent_left,cluster_id_left,id_right,brand_right,...,pair_id,label,is_hard_negative,roberta-base_logits,roberta-base_prediction,rsupcon-base_logits,rsupcon-base_prediction,chatbot_response,chatbot_response_raw,chatbot_question
0,70108616,Maxxis,Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead,The new incarnation of the Minion DHR. Ready to shred any line.,110.0,CAD,,1084991,45040021,Maxxis,...,70108616#45040021,1,False,"[-4.3633804321, 4.6556377411]",1,[1.0],1,"product 1: Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead: Cycling Equipment, product 2: Maxxis Minion DHR II 3","{'id': 'chatcmpl-7a2PJXD30RMVmVvOJrSzcqSRJ5ouz', 'object': 'chat.completion', 'created': 1688823417, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead: Cycling Equipment, product 2: Maxxis Minion DHR II 3"",  ""ro...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."
1,67977820,,Canon CLI251XL High Yield Black Inkjet Cartridge Remanufactured,"Crazy Inkjets is a leading supplier of high quality printing supplies for your Canon CLI251XL printer cartridge. CrazyInkjets products are guaranteed to meet or exceed the quality, reliability and yield standards of the original equipment remanufacturer. The units are tested to ensure compliance...",6.95,USD,,767463,69787972,Samsung,...,67977820#69787972,0,True,"[3.8366084099, -4.2942962646]",0,[0.0],0,"product 1: Office Supplies, product 2: Office Supplies","{'id': 'chatcmpl-7a2PNNQOnhJRFKKMVzuqggncuJGVG', 'object': 'chat.completion', 'created': 1688823421, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Office Supplies, product 2: Office Supplies"",  ""role"": ""assistant"" }, 'finish_reason': 'stop'}], 'usage': {'...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."
2,79859336,Crucial,Crucial Crucial Memory 4GB DDR4 2666 Unbuffered CT4G4SFS8266,,34.99,CAD,,1892167,65646040,Crucial,...,79859336#65646040,1,False,"[-4.363576889, 4.6508393288]",1,[1.0],1,product 1: Crucial Crucial Memory 4GB DDR4 2666 Unbuffered CT4G4SFS8266: Electronics\nproduct 2: CRUCIAL CT4,"{'id': 'chatcmpl-7a2POoegCmNDoLsZag4KOIztZj4mg', 'object': 'chat.completion', 'created': 1688823422, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Crucial Crucial Memory 4GB DDR4 2666 Unbuffered CT4G4SFS8266: Electronics\nproduct 2: CRUCIAL CT4"",  ""role"":...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."
3,30368884,,"Crucial MX500 250GB 2.5"" SATA III","MX500 250GB SATA 2.5-inch, SATA 6.0Gb/s, 560 MB/s Read, 510 MB/s Write",43.59,EUR,,672125,86893846,,...,30368884#86893846,0,False,"[3.8347194195, -4.2961273193]",0,[0.0],0,"product 1: Crucial MX500 250GB 2.5"" SATA III: Electronics\nproduct 2: JABARA Evolve 65 with Link 370 USB - Mono:","{'id': 'chatcmpl-7a2PSC2hJ9ojRg1tvYYBagWh0Fcuh', 'object': 'chat.completion', 'created': 1688823426, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Crucial MX500 250GB 2.5\"" SATA III: Electronics\nproduct 2: JABARA Evolve 65 with Link 370 USB - Mono:"",  ""r...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."
4,82078171,,Intel Core I3 7th Gen 7100 3.90 Ghz; 2 Core 4 Thread; 3 Mb Smartcache; 51 W Tdp; Lga 1151 S R35 C,"ntel i3-7100, Core. Processor family: 7th gen Intel® Core™ i3, Processor frequency: 3.9 GHz, Processor socket: LGA 1151 (Socket H4). Memory channels: Dual, Maximum internal memory supported by processor: 64 GB, Memory types supported by processor: DDR3L-SDRAM,DDR4-SDRAM. On-board graphics adapte...",2899.0,ZAR,,443612,6914049,WESTERN DIGITAL,...,82078171#6914049,0,False,"[3.8386721611, -4.3010449409]",0,[0.0],0,product 1: Intel Core I3 7th Gen 7100 3.90 Ghz; 2 Core 4 Thread; 3 Mb Smartcache; 51 W Tdp,"{'id': 'chatcmpl-7a2PVnttBxPEhBOnQ6toIGkXLEg1y', 'object': 'chat.completion', 'created': 1688823429, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Intel Core I3 7th Gen 7100 3.90 Ghz; 2 Core 4 Thread; 3 Mb Smartcache; 51 W Tdp"",  ""role"": ""assistant"" }, 'f...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."


In [66]:
# Get the shape of the rows where the chatbot response is not empty
df[df['chatbot_response'] != ''].shape

(100, 26)

In [67]:
# Preview the last rows to check that the loop reached the end of the frame.
df.tail()

Unnamed: 0,id_left,brand_left,title_left,description_left,price_left,priceCurrency_left,specTableContent_left,cluster_id_left,id_right,brand_right,...,pair_id,label,is_hard_negative,roberta-base_logits,roberta-base_prediction,rsupcon-base_logits,rsupcon-base_prediction,chatbot_response,chatbot_response_raw,chatbot_question
95,53525303,Ryze,Ryze Tello Battery Charging Hub G1CH,,129,DKK,,1555405,79449686,,...,53525303#79449686,0,True,"[3.6632392406000003, -4.1180138588]",0,[0.0],0,"product 1: Ryze Tello Battery Charging Hub G1CH: Electronics, product 2: AirPods with Wireless Charging Case: Electronics","{'id': 'chatcmpl-7a2U4IyKM563DbZAJkfvcy2LMdScH', 'object': 'chat.completion', 'created': 1688823712, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Ryze Tello Battery Charging Hub G1CH: Electronics, product 2: AirPods with Wireless Charging Case: Electronic...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."
96,81415527,,2MP 25X Network IR PTZ Camera,"1/2.8\"" progressive scan CMOSUp to 1920 × 1080@30fps resolutionMin. illumination: Color: 0.005 Lux @(F1.6, AGC ON) B/W: 0.001 Lux @(F1.6, AGC ON) 0 Lux with IR25× optical zoom, 16× digital zoomWDR, HLC, BLC, 3D DNR, Defog, EIS, Regional Exposure, Regional FocusUp to 150 m IR distance24 VAC & Hi-...","Incl. BTW€1.035,76",EUR,,2694265,46810114,,...,81415527#46810114,1,False,"[-4.1793274879, 4.4736194611]",1,[1.0],1,product 1: 2MP 25X Network IR PTZ Camera: Electronics\nproduct 2: HIKVISION 2MP 25X NETWORK IR SPEED DOME CAMERA DS-2,"{'id': 'chatcmpl-7a2U7eSkdNrFqE25zBYz9zzBAxjJu', 'object': 'chat.completion', 'created': 1688823715, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: 2MP 25X Network IR PTZ Camera: Electronics\nproduct 2: HIKVISION 2MP 25X NETWORK IR SPEED DOME CAMERA DS-2"", ...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."
97,53053823,,Canon EF-S 18-135mm f/3.5-5.6 IS USM Zoom Lens,"The Canon EF-S 18-135mm f/3.5-5.6 IS USM Zoom Lens is great for photos and movies. With improved Image Stabilisation, it allows hand held shooting even in lower light conditions.Features 16 elements from 12 groups18-135mm Focal LengthBuilt-in Image StabiliserNano USM technologyMaximum magnificat...",799,AUD,,390091,45511817,,...,53053823#45511817,0,True,"[3.7480974197, -4.2194314003]",0,[0.0],0,product 1: Canon EF-S 18-135mm f/3.5-5.6 IS USM Zoom Lens: Electronics\nproduct 2: Nikon Nikkor AF-S 24,"{'id': 'chatcmpl-7a2UAdB9lNQJjjIT15dxMykW1DHn6', 'object': 'chat.completion', 'created': 1688823718, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Canon EF-S 18-135mm f/3.5-5.6 IS USM Zoom Lens: Electronics\nproduct 2: Nikon Nikkor AF-S 24"",  ""role"": ""ass...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."
98,72039756,,"Cooler Master Chassis, MasterBox Lite 5 Tower",,64.52,USD,,1006147,43470746,,...,72039756#43470746,1,False,"[-4.3535614014, 4.6426811218]",1,[1.0],1,"product 1: Cooler Master Chassis, MasterBox Lite 5 Tower: Computer Hardware\nproduct 2: CASE MID-TOWER NO PSU MASTERBOX LITE 5 3USB3 BLACK","{'id': 'chatcmpl-7a2UEQFZQ5peIqDXOWceYq1w3g06j', 'object': 'chat.completion', 'created': 1688823722, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Cooler Master Chassis, MasterBox Lite 5 Tower: Computer Hardware\nproduct 2: CASE MID-TOWER NO PSU MASTERBOX ...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."
99,79205401,,Crucial BX500 120GB 2.5″ SSD,"Ever wonder why your phone responds faster than your computer? It’s because your phone runs on flash memory. Add flash memory to your laptop or desktop computer with the Crucial BX500 120GB 2.5″ SSD, the easiest way to get all the speed of a new computer without the price. This SSD offers sequen...",499.0,ZAR,,744155,31394689,Crucial,...,79205401#31394689,0,True,"[3.8113934994000003, -4.2730379105]",0,[0.0],0,product 1: Crucial BX500 120GB 2.5″ SSD: Electronics\nproduct 2: Crucial Crucial SSD MX500 500GB M.2 2280,"{'id': 'chatcmpl-7a2UHbSgeB781qspK8V2c7sTL4clA', 'object': 'chat.completion', 'created': 1688823725, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Crucial BX500 120GB 2.5\u2033 SSD: Electronics\nproduct 2: Crucial Crucial SSD MX500 500GB M.2 2280"",  ""role...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1..."


In [68]:
def clean_response(response):
    """Map a free-text chatbot answer to a numeric match label.

    The text is searched case-insensitively for the standalone words "yes"
    and "no". If both occur, "yes" takes precedence.

    Args:
        response: Raw answer text. Non-string values (e.g. None or NaN) are
            treated as unparseable.

    Returns:
        1 if the answer contains "yes", 0 if it contains "no", and -1 when
        neither word is present or the input is not a string.
    """
    import re  # local import: keeps the cell runnable without touching the import cell

    if not isinstance(response, str):
        return -1

    text = response.lower()
    # Word boundaries stop "no" from matching inside "not", "know" or "Canon",
    # and "yes" from matching inside "eyes".
    if re.search(r"\byes\b", text):
        return 1
    if re.search(r"\bno\b", text):
        return 0
    return -1

In [83]:
# Add a numeric version of the chatbot answer: clean_response turns each text
# answer into 1 (yes), 0 (no) or -1 (neither found).
df['chatbot_response_clean'] = df['chatbot_response'].apply(clean_response)

In [73]:
# get the current date and time
now = datetime.now()

# Save the dataframe as a JSON file. The timestamp is formatted explicitly
# because str(datetime) contains a space and colons, and colons are not valid
# in Windows file names.
df.to_json(f'data/results/chat_gpt/medium/{now:%Y-%m-%d_%H-%M-%S}_simple_promt_all.json')

In [74]:
# Preview the first rows where the cleaned chatbot answer disagrees with the label.
df.loc[df['chatbot_response_clean'] != df['label']].head()

Unnamed: 0,id_left,brand_left,title_left,description_left,price_left,priceCurrency_left,specTableContent_left,cluster_id_left,id_right,brand_right,...,label,is_hard_negative,roberta-base_logits,roberta-base_prediction,rsupcon-base_logits,rsupcon-base_prediction,chatbot_response,chatbot_response_raw,chatbot_question,chatbot_response_clean
0,70108616,Maxxis,Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead,The new incarnation of the Minion DHR. Ready to shred any line.,110.0,CAD,,1084991,45040021,Maxxis,...,1,False,"[-4.3633804321, 4.6556377411]",1,[1.0],1,"product 1: Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead: Cycling Equipment, product 2: Maxxis Minion DHR II 3","{'id': 'chatcmpl-7a2PJXD30RMVmVvOJrSzcqSRJ5ouz', 'object': 'chat.completion', 'created': 1688823417, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead: Cycling Equipment, product 2: Maxxis Minion DHR II 3"",  ""ro...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1...",
1,67977820,,Canon CLI251XL High Yield Black Inkjet Cartridge Remanufactured,"Crazy Inkjets is a leading supplier of high quality printing supplies for your Canon CLI251XL printer cartridge. CrazyInkjets products are guaranteed to meet or exceed the quality, reliability and yield standards of the original equipment remanufacturer. The units are tested to ensure compliance...",6.95,USD,,767463,69787972,Samsung,...,0,True,"[3.8366084099, -4.2942962646]",0,[0.0],0,"product 1: Office Supplies, product 2: Office Supplies","{'id': 'chatcmpl-7a2PNNQOnhJRFKKMVzuqggncuJGVG', 'object': 'chat.completion', 'created': 1688823421, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Office Supplies, product 2: Office Supplies"",  ""role"": ""assistant"" }, 'finish_reason': 'stop'}], 'usage': {'...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1...",
2,79859336,Crucial,Crucial Crucial Memory 4GB DDR4 2666 Unbuffered CT4G4SFS8266,,34.99,CAD,,1892167,65646040,Crucial,...,1,False,"[-4.363576889, 4.6508393288]",1,[1.0],1,product 1: Crucial Crucial Memory 4GB DDR4 2666 Unbuffered CT4G4SFS8266: Electronics\nproduct 2: CRUCIAL CT4,"{'id': 'chatcmpl-7a2POoegCmNDoLsZag4KOIztZj4mg', 'object': 'chat.completion', 'created': 1688823422, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Crucial Crucial Memory 4GB DDR4 2666 Unbuffered CT4G4SFS8266: Electronics\nproduct 2: CRUCIAL CT4"",  ""role"":...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1...",
3,30368884,,"Crucial MX500 250GB 2.5"" SATA III","MX500 250GB SATA 2.5-inch, SATA 6.0Gb/s, 560 MB/s Read, 510 MB/s Write",43.59,EUR,,672125,86893846,,...,0,False,"[3.8347194195, -4.2961273193]",0,[0.0],0,"product 1: Crucial MX500 250GB 2.5"" SATA III: Electronics\nproduct 2: JABARA Evolve 65 with Link 370 USB - Mono:","{'id': 'chatcmpl-7a2PSC2hJ9ojRg1tvYYBagWh0Fcuh', 'object': 'chat.completion', 'created': 1688823426, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Crucial MX500 250GB 2.5\"" SATA III: Electronics\nproduct 2: JABARA Evolve 65 with Link 370 USB - Mono:"",  ""r...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1...",
4,82078171,,Intel Core I3 7th Gen 7100 3.90 Ghz; 2 Core 4 Thread; 3 Mb Smartcache; 51 W Tdp; Lga 1151 S R35 C,"ntel i3-7100, Core. Processor family: 7th gen Intel® Core™ i3, Processor frequency: 3.9 GHz, Processor socket: LGA 1151 (Socket H4). Memory channels: Dual, Maximum internal memory supported by processor: 64 GB, Memory types supported by processor: DDR3L-SDRAM,DDR4-SDRAM. On-board graphics adapte...",2899.0,ZAR,,443612,6914049,WESTERN DIGITAL,...,0,False,"[3.8386721611, -4.3010449409]",0,[0.0],0,product 1: Intel Core I3 7th Gen 7100 3.90 Ghz; 2 Core 4 Thread; 3 Mb Smartcache; 51 W Tdp,"{'id': 'chatcmpl-7a2PVnttBxPEhBOnQ6toIGkXLEg1y', 'object': 'chat.completion', 'created': 1688823429, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {  ""content"": ""product 1: Intel Core I3 7th Gen 7100 3.90 Ghz; 2 Core 4 Thread; 3 Mb Smartcache; 51 W Tdp"",  ""role"": ""assistant"" }, 'f...","[{'role': 'system', 'content': 'You are a helpful assistant that assigns products to product categories. Please provide them in this format: product 1: category product 2: category They should be at a high level so things like electronics, outdoors etc.'}, {'role': 'user', 'content': ' product 1...",


In [75]:
# Accuracy of the chatbot: share of rows whose cleaned answer equals the label.
n_correct = len(df[df['chatbot_response_clean'] == df['label']])
# An empty frame has no defined accuracy; report NaN instead of dividing by zero.
accuracy = n_correct / len(df) if len(df) else float('nan')
print(f"Accuracy: {accuracy}")

Accuracy: 0.0


In [76]:
# Count the rows whose cleaned answer is -1, i.e. neither "yes" nor "no" was found.
len(df[df['chatbot_response_clean'] == -1])

0

In [77]:
# Widen text columns to 300 characters so more of each raw response is shown;
# longer values are still cut off with "...". The setting stays in effect for
# later cells.
pd.options.display.max_colwidth = 300

# Show the raw chatbot response for the first 3 rows.
df['chatbot_response_raw'].iloc[:3]

0    {'id': 'chatcmpl-7a2PJXD30RMVmVvOJrSzcqSRJ5ouz', 'object': 'chat.completion', 'created': 1688823417, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {
  "content": "product 1: Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead: Cycling Equipment, product 2: Maxxis Minion DHR II 3",
  "ro...
1    {'id': 'chatcmpl-7a2PNNQOnhJRFKKMVzuqggncuJGVG', 'object': 'chat.completion', 'created': 1688823421, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {
  "content": "product 1: Office Supplies, product 2: Office Supplies",
  "role": "assistant"
}, 'finish_reason': 'stop'}], 'usage': {'...
2    {'id': 'chatcmpl-7a2POoegCmNDoLsZag4KOIztZj4mg', 'object': 'chat.completion', 'created': 1688823422, 'model': 'gpt-4-0613', 'choices': [{'index': 0, 'message': {
  "content": "product 1: Crucial Crucial Memory 4GB DDR4 2666 Unbuffered CT4G4SFS8266: Electronics\nproduct 2: CRUCIAL CT4",
  "role":...
Name: chatbot_response_raw, dtype: object

In [78]:
# List every mistake, keeping only the two titles, the label and the cleaned answer.
df.loc[
    df['chatbot_response_clean'] != df['label'],
    ['title_left', 'title_right', 'label', 'chatbot_response_clean'],
]


Unnamed: 0,title_left,title_right,label,chatbot_response_clean
0,Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead,"Maxxis Minion DHR II 3C MaxxTerra/DD TR 29\"" Tire - 29 x 2.3\"" (Folding Bead)",1,
1,Canon CLI251XL High Yield Black Inkjet Cartridge Remanufactured,"Samsung MLT-D103L, High Yield Black Toner/Drum for ML-2950ND / 2955ND / 2955DW, SCX-4728FD / 4729FD / 4729FW (2,500 pages)",0,
2,Crucial Crucial Memory 4GB DDR4 2666 Unbuffered CT4G4SFS8266,CRUCIAL CT4G4SFS8266 4Gb 2666Mhz DDR4 Notebook RAM SODIMM CL19 1.2V (By Micron),1,
3,"Crucial MX500 250GB 2.5"" SATA III",JABARA Evolve 65 with Link 370 USB - Mono,0,
4,Intel Core I3 7th Gen 7100 3.90 Ghz; 2 Core 4 Thread; 3 Mb Smartcache; 51 W Tdp; Lga 1151 S R35 C,DISCO DURO 2.5SSD 1TB SATA3 WD BLUE 3D NAND,0,
...,...,...,...,...
95,Ryze Tello Battery Charging Hub G1CH,AirPods with Wireless Charging Case,0,
96,2MP 25X Network IR PTZ Camera,HIKVISION 2MP 25X NETWORK IR SPEED DOME CAMERA DS-2DE5225IW-AE,1,
97,Canon EF-S 18-135mm f/3.5-5.6 IS USM Zoom Lens,0000071495| Nikon Nikkor AF-S 24-85mm f/3.5-4.5G ED VR Lens,0,
98,"Cooler Master Chassis, MasterBox Lite 5 Tower",CASE MID-TOWER NO PSU MASTERBOX LITE 5 3USB3 BLACK WINDOW PANEL,1,


In [79]:
# List the distinct values (NaN included) found in the 'priceCurrency_left' column.
df.loc[:, 'priceCurrency_left'].unique()

array(['CAD', 'USD', 'EUR', 'ZAR', 'DKK', 'GBP', nan, 'AED', 'MYR', 'AUD',
       'PLN', 'KYD', 'NOK', 'CZK', 'INR', 'HRK', 'SEK', 'CHF', 'NZD',
       'RUB'], dtype=object)

In [80]:
# Count the rows where the left and right price currencies are identical.
# A NaN on either side never compares equal, so such rows are not counted.
len(
    df[df['priceCurrency_left'] == df['priceCurrency_right']]
)

13