# This notebook submits OpenAI Batch API jobs to test fine-tuned OpenAI models and downloads the results

In [7]:
import json
import os
import re
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from tqdm.notebook import tqdm

import helper
from utils import clean_response, parse_response



# Load OPENAI_API_KEY from .env file: load_dotenv() reads the .env file and
# exports its entries as environment variables for this kernel.
load_dotenv()

# No credentials are passed explicitly here, so the client presumably picks up
# the key from the environment populated by load_dotenv() above.
client = OpenAI()

In [2]:
# Fine-tuned model ids to evaluate; one batch job is created per entry.
models = [
    "ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim::A1xT61am",
    "ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim::A1yQPMEC",
]

## Get Scores

In [66]:
# NOTE(review): when the notebook is run top to bottom, `model` is rebound by the
# `for model in models:` loop in the batch-creation cell, so this id is not the
# one used to build the batch requests. Confirm whether this cell is still needed.
model = "ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim:explanations:9rAaVb9c"

In [6]:
def insert_product_descriptions(prompt_template: str, product1: str, product2: str) -> str:
    """Fill the two entity placeholders of a prompt template.

    Every occurrence of the literal placeholder ``'Entity 1'`` (including the
    single quotes) is replaced by ``product1`` and every occurrence of
    ``'Entity 2'`` by ``product2``.

    The substitution is done in a single pass over the template, so text that
    comes from ``product1`` is never re-scanned: a description that happens to
    contain the literal ``'Entity 2'`` is inserted unchanged instead of being
    overwritten with ``product2``.

    Args:
        prompt_template: Prompt text containing the placeholders.
        product1: Description inserted for ``'Entity 1'``.
        product2: Description inserted for ``'Entity 2'``.

    Returns:
        The prompt with both placeholders filled in.
    """
    replacements = {"'Entity 1'": product1, "'Entity 2'": product2}
    # A callable replacement is used so backslashes in the descriptions are
    # inserted literally rather than being interpreted as regex escapes.
    return re.sub(r"'Entity [12]'", lambda match: replacements[match.group(0)], prompt_template)

In [3]:
def create_prompt(prompt, custom_id, model, product_1=None, product_2=None, max_tokens=5, temperature=0):
    """Build one chat-completion request entry for a batch input file.

    Args:
        prompt: Prompt text, or a prompt template when both products are given.
        custom_id: Identifier stored with the request so the response can be
            matched back to its input.
        model: Model id placed in the request body.
        product_1: Optional first entity description.
        product_2: Optional second entity description. The template is only
            filled in when both descriptions are provided.
        max_tokens: Completion length limit sent with the request. Defaults to
            5, the value that was previously hard-coded.
        temperature: Sampling temperature sent with the request. Defaults to 0,
            the value that was previously hard-coded.

    Returns:
        A dict with ``custom_id``, ``method``, ``url`` and ``body`` keys that
        can be serialised as one line of a JSONL batch file.
    """
    if product_1 is not None and product_2 is not None:
        prompt = insert_product_descriptions(prompt, product_1, product_2)

    request_body = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt},
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body,
    }

In [2]:
# Test sets to evaluate on: each entry pairs a short dataset name with the
# path of the pickle file that is read for it.
full_datasets = [
    {"dataset_name": name, "dataset_path": path}
    for name, path in (
        ("wdc-fullsize", "../data/wdc/wdcproducts80cc20rnd050un_test_gs.pkl"),
        ("abt-buy-full", "../data/abt-buy/abt-buy-gs.pkl"),
        ("amazon-google-full", "../data/amazon-google/amazon-google-gs.pkl"),
        ("dblp-acm", "../data/dblp-acm/dblp-acm-gs.pkl"),
        ("dblp-scholar", "../data/dblp-scholar/dblp-scholar-gs.pkl"),
        ("walmart-amazon", "../data/walmart-amazon/walmart-amazon-gs.pkl"),
    )
]

In [9]:
# Load all prompts we want to test. The prompt file depends on neither the
# model nor the dataset, so it is read once instead of once per dataset.
with open('../prompts/domain_promts.json', 'r') as file:
    prompts = json.load(file)

for model in models:
    # One batch per model: every (dataset, prompt, pair) combination becomes
    # one request in this list.
    batch_job = []

    for dataset in full_datasets:
        # Load the dataset
        df = pd.read_pickle(dataset["dataset_path"])

        for task in prompts:
            title = task['title']
            prompt_template = task['prompt']

            for index, row in df.iterrows():
                if "dblp" in dataset["dataset_name"]:
                    # Bibliographic datasets: describe each record by title, authors, venue and year
                    product1 = f"{row['title_left']}; {row['authors_left']}; {row['venue_left']}; {row['year_left']}"
                    product2 = f"{row['title_right']}; {row['authors_right']}; {row['venue_right']}; {row['year_right']}"
                else:
                    product1, product2 = row['title_left'], row['title_right']

                label = row.get('label')

                # The custom_id carries dataset, prompt title, pair id and label so a
                # response can be matched back to its input.
                custom_id = f"{dataset['dataset_name']};{title};{row['pair_id']};{label}"
                prompt = create_prompt(prompt_template, custom_id, model, product1, product2)
                batch_job.append(prompt)

    print(len(batch_job))
    print(batch_job[0])

    # NOTE(review): the file name and the batch description below mention
    # dblp/scholar although every dataset in full_datasets is included —
    # confirm whether they should be renamed.
    batch_file_path = "dblp_filter.jsonl"
    try:
        with open(batch_file_path, "w") as f:
            for request in batch_job:
                f.write(json.dumps(request) + "\n")

        # Upload through a context manager so the handle is closed before the
        # file is deleted (the original left it open).
        with open(batch_file_path, "rb") as upload_file:
            batch_input_file = client.files.create(
                file=upload_file,
                purpose="batch"
            )

        batch_input_file_id = batch_input_file.id

        batch = client.batches.create(
            input_file_id=batch_input_file_id,
            endpoint="/v1/chat/completions",
            completion_window="24h",
            metadata={"description": "Test scholar datasets"}
        )
    finally:
        # delete the batch input file, also when the upload or batch creation failed
        if os.path.exists(batch_file_path):
            os.remove(batch_file_path)



34836
{'custom_id': 'wdc-fullsize;domain-complex-free (Product);61830419#18905357;0', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim::A1xT61am', 'messages': [{'role': 'user', 'content': 'Do the two product descriptions refer to the same real-world product? Entity 1: MultiPlus C 12/2000/80-30. Entity 2: DDR4 16GB 3200 Kingston Fury Black.'}], 'max_tokens': 5, 'temperature': 0}}
34836
{'custom_id': 'wdc-fullsize;domain-complex-free (Product);61830419#18905357;0', 'method': 'POST', 'url': '/v1/chat/completions', 'body': {'model': 'ft:gpt-4o-mini-2024-07-18:wbsg-uni-mannheim::A1yQPMEC', 'messages': [{'role': 'user', 'content': 'Do the two product descriptions refer to the same real-world product? Entity 1: MultiPlus C 12/2000/80-30. Entity 2: DDR4 16GB 3200 Kingston Fury Black.'}], 'max_tokens': 5, 'temperature': 0}}


In [3]:
# Ground-truth DataFrames, keyed by dataset name
dataframes = {}

# Read every configured pickle once. A dataset that cannot be read is
# reported and skipped, so the remaining ones are still loaded.
for dataset in full_datasets:
    dataset_name, dataset_path = dataset["dataset_name"], dataset["dataset_path"]
    try:
        df = pd.read_pickle(dataset_path)
    except Exception as e:
        print(f"Failed to load {dataset_name} from {dataset_path}. Error: {e}")
    else:
        dataframes[dataset_name] = df
        print(f"Loaded {dataset_name} successfully.")
        
# Function to lookup label in the original dataframes using pair_id
def lookup_label(row):
    """Return the label stored for ``row['pair_id']`` in its source dataset.

    Args:
        row: Mapping (for example a DataFrame row) with a ``'dataset'`` entry
            naming a key of the module-level ``dataframes`` dict and a
            ``'pair_id'`` entry to look up in that DataFrame.

    Returns:
        The ``label`` value of the first row whose ``pair_id`` matches, or
        ``None`` when the dataset is not loaded or no row matches.
    """
    dataset_name = row['dataset']
    pair_id = row['pair_id']
    if dataset_name not in dataframes:
        return None

    original_df = dataframes[dataset_name]
    # Compare the pair_id column once and reuse the mask for both the existence
    # check and the selection (the original scanned the column twice per call).
    is_match = original_df['pair_id'] == pair_id
    if not is_match.any():
        return None
    # Assuming pair_id is a unique identifier in the original dataframe;
    # if it is not, the first matching row wins.
    return original_df.loc[is_match, 'label'].values[0]

Loaded wdc-fullsize successfully.
Loaded abt-buy-full successfully.
Loaded amazon-google-full successfully.
Loaded dblp-acm successfully.
Loaded dblp-scholar successfully.
Loaded walmart-amazon successfully.


In [10]:
## Download the results
# NOTE(review): this lists up to 19 batches, which are not necessarily only the
# ones submitted by this notebook — confirm the limit still matches the runs
# you want to download.
batch_list = client.batches.list(limit=19)

results_dir = "../results/gpt-4o-mini/tobedetermined"
# Create the target directory up front so the first download does not fail
# when the folder does not exist yet.
os.makedirs(results_dir, exist_ok=True)

for batch_job in batch_list.data:
    # convert the unix timestamp to a human-readable format (UTC);
    # datetime.utcfromtimestamp is deprecated, so an aware datetime is used instead
    created_at = datetime.fromtimestamp(batch_job.created_at, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
    output_file_id = batch_job.output_file_id
    # Ensure the batch has completed
    if output_file_id:
        # Step 3: Download the output file content
        file_content = client.files.content(output_file_id)

        # Step 4: Write the content to a .jsonl file named after the output file id
        output_path = f"{results_dir}/{output_file_id}.jsonl"
        # Explicit UTF-8 so the result does not depend on the platform's default encoding
        with open(output_path, "w", encoding="utf-8") as file:
            file.write(file_content.text)

        # Report the real destination (the old message named a file that is never written)
        print(f"Batch results saved to {output_path}")
    else:
        print("Batch is not completed or output file is not available.")
    print(batch_job.id, batch_job.status, created_at, batch_job.output_file_id)

Batch results saved to batch_output.jsonl
batch_JpeFAtrkdejXKiyE3lwJVhEe completed 2024-08-30 18:12:49 file-d2eciGLVxRisn6e7Iw2Br3Vh
Batch results saved to batch_output.jsonl
batch_08eGvNHcm23hrJTuQE06IolA completed 2024-08-30 18:12:45 file-5MgFICNPuAyx4fqyVFrGTOuT
Batch results saved to batch_output.jsonl
batch_tZaBH7fYilJlPJgtVnRAMvLh completed 2024-08-30 18:12:35 file-jS8neFCTD78m95q01chFbwJs
Batch results saved to batch_output.jsonl
batch_8Tr18iUYwIzMjygI4VQuxLgB completed 2024-08-30 18:12:30 file-qv6nYdtjyBsFlW2w3akOPqPN
Batch results saved to batch_output.jsonl
batch_3YcqKwIINI9lwROscaUMCWEt completed 2024-08-30 17:01:58 file-uJ1fE2HCryhhXSpDKm4tz2Cl
Batch results saved to batch_output.jsonl
batch_JGuBNsYYvTYRtnvDX6StATqs completed 2024-08-30 17:01:53 file-Sff5nykjAtoiFK9C1NEwpqTu
Batch results saved to batch_output.jsonl
batch_n4wPLxaMXPQ7loOPmuWWRuZI completed 2024-08-30 17:01:48 file-S1NOw4jmLN1SVAw0Zv1NYenl
Batch results saved to batch_output.jsonl
batch_6AZrEkx4LTVKgij81t5JG