# Generate new training examples

In [None]:
import json

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

from utils import parse_response

# Load OPENAI_API_KEY from .env file into the process environment
load_dotenv()

# Shared API client used by the upload cells below. It is constructed without
# arguments, so it presumably picks the key up from the environment loaded above.
client = OpenAI()

In [None]:
# Path to the gzip-compressed pickle of pairs that seed the generation.
# Later cells read it with pd.read_pickle and reuse it to name the output CSV.
file_path = '../data/wdc/train_large/preprocessed_wdcproducts80cc20rnd000un_train_large.pkl.gz'

## Generate new examples

In [None]:
def create_prompt(product_1, product_2, label, explanation, custom_id):
    """Build one batch request asking the model for four pairs similar to a seed pair.

    Args:
        product_1: Title of the left product of the seed pair.
        product_2: Title of the right product of the seed pair.
        label: Match label of the seed pair.
        explanation: Explanation text attached to the seed pair.
        custom_id: Identifier used to match the batch response back to the
            seed pair. It is converted to ``str`` so that numeric ids are
            serialised as JSON strings rather than JSON numbers.

    Returns:
        A dict describing a single ``/v1/chat/completions`` request, ready to
        be dumped as one line of a batch input JSONL file.
    """
    example = {
        "title_left": product_1,
        "title_right": product_2,
        "label": label,
        "explanation": explanation
    }
    # The prompt (including its indentation and the dict repr of `example`) is
    # sent to the model verbatim, so its wording is intentionally left untouched.
    user_message = f"""
                Please generate 4 similar examples to this one 3 should be non matches 1 should be a match.
                
                {example}
                Only return the title_left, title_right and the label in a JSON format
                """
    return {
        "custom_id": str(custom_id),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "user", "content": user_message}
            ],
            "max_tokens": 300,
            "temperature": 0
        }
    }



In [None]:
# Read the pairs (with their explanations) that seed the generation prompts
small_df = pd.read_pickle(file_path, compression="gzip")

# Build one request per row, keyed by the row's pair_id
requests = [
    create_prompt(
        row["title_left"],
        row["title_right"],
        row["label"],
        row["explanation"],
        custom_id=row["pair_id"],
    )
    for _, row in tqdm(small_df.iterrows(), total=len(small_df))
]

# Write the requests as JSON Lines: one serialised request per line
batch_file_path = "batch_input_new_examples_based_train_small.jsonl"
with open(batch_file_path, "w") as f:
    f.writelines(json.dumps(request) + "\n" for request in requests)


In [None]:
# Upload the JSONL request file. The context manager closes the local file
# handle once the upload call returns (it was previously left open).
with open(batch_file_path, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

batch_input_file_id = batch_input_file.id

# Create the batch job that runs every request in the uploaded file
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Generate new examples based on explanations, small training set"}
)

# Generate examples with demonstrations

In [None]:
def create_synthetic_examples(product_1, product_2, label, custom_id, examples=None):
    """Build one batch request asking the model for four hard pairs similar to a seed pair.

    Args:
        product_1: Description of the first entity of the seed pair.
        product_2: Description of the second entity of the seed pair.
        label: Match label of the seed pair; ``1`` is rendered as "Yes",
            anything else as "No".
        custom_id: Identifier used to match the batch response back to the
            seed pair. It is converted to ``str`` so that numeric ids are
            serialised as JSON strings rather than JSON numbers.
        examples: Optional pre-formatted demonstration text inserted into the
            prompt. When omitted, nothing is inserted (previously the literal
            text "None" ended up in the prompt).

    Returns:
        A dict describing a single ``/v1/chat/completions`` request, ready to
        be dumped as one line of a batch input JSONL file.
    """
    # Same spelling as transform_label, so the seed label and the demonstration
    # labels look alike to the model (this used to be "NO").
    label = "Yes" if label == 1 else "No"
    demonstrations = examples or ""
    # Apart from the two fixes above, the prompt is sent to the model verbatim,
    # so its wording is intentionally left untouched.
    user_message = f"""
                I'm currently testing large language, models on the task of entity matching. In this context, I am first fine-tuning them, and then testing their weaknesses and strengths. The example I will show you is wrongly classified by the model and that idea is to generate four new examples three of which should be negative, i.e. non-matches, and one of them match. For context, two products are considered to be a match if the two entity descriptions refer to the same real world entity. This does not mean that the descriptions need to be the same but that the entity the decription refers to needs to match. Secondly products are not a match if the two descriptions refer to different products.  As a model has previously made an error on these two entity descriptions it is important to create examples that present a similar challenge. Please focus on corner cases meaning examples that are quite difficult to get correct. The generated examples should belong to the same category as the presented product and should be very similar to it. However even if they are a match the strings should never match exactly. The results should only be presented as JSON containing degenerated entity, one and entity two as well as information if they are a match or not represented by boolean and value. Only return JSON.
                
                {demonstrations}

                Here is the misclassified example:
                Entity 1: {product_1}
                Entity 2: {product_2}
                Label: {label}
                """
    return {
        "custom_id": str(custom_id),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o",
            "messages": [
                {"role": "user", "content": user_message}
            ],
            "max_tokens": 2_500,
            "temperature": 0
        }
    }



In [None]:
# Nearest-neighbour lookup by cosine similarity, computed in a single matrix call
def find_most_similar_examples(test_embedding, train_df, top_n=6):
    """Return the rows of ``train_df`` whose embeddings are closest to ``test_embedding``.

    Args:
        test_embedding: Embedding vector of the query pair.
        train_df: DataFrame with an ``embedding`` column holding one vector per row.
        top_n: Maximum number of rows to return.

    Returns:
        A list of row dicts (``to_dict(orient='records')``), ordered from the
        most to the least similar.
    """
    # Stack the per-row vectors into a 2-D array; the query becomes a 1-row matrix
    candidate_matrix = np.array(list(train_df['embedding'].values))
    query = np.array(test_embedding).reshape(1, -1)

    # Score the query against every candidate at once and keep its single row of scores
    scores = cosine_similarity(query, candidate_matrix)[0]

    # Ascending argsort reversed gives best-first positions; keep the first top_n
    ranked_positions = np.argsort(scores)[::-1]
    best_rows = train_df.iloc[ranked_positions[:top_n]]

    return best_rows.to_dict(orient='records')

def transform_label(label):
    """Render a match label as prompt text: "Yes" when it equals 1, otherwise "No"."""
    if label == 1:
        return "Yes"
    return "No"

In [None]:
# Load the pairs for which demonstration-based prompts are generated
validation_df = pd.read_pickle(file_path, compression="gzip")


def _format_demonstrations(examples):
    """Render similar pairs as prompt text, one field per line."""
    # Each field ends with a newline; previously the fields were fused into
    # "Entity 1: <a>Entity 2: <b>Label: Yes" with nothing between them.
    return "".join(
        f"Entity 1: {example['title_left']}\n"
        f"Entity 2: {example['title_right']}\n"
        f"Label: {transform_label(example['label'])}"
        "\n ---------------- \n"
        for example in examples
    )


# Create the JSONL file with all requests: two per row, each with its own
# half of the most similar pairs as demonstrations
requests = []
for _, row in tqdm(validation_df.iterrows(), total=validation_df.shape[0]):
    custom_id = row["pair_id"]
    # NOTE(review): the neighbours are searched in the same frame the row comes
    # from, so the row itself is presumably among the results - confirm intended.
    examples = find_most_similar_examples(row["embedding"], validation_df, top_n=6)

    # Even-ranked neighbours go to request "_1", odd-ranked ones to request "_2"
    for suffix, subset in ((1, examples[0::2]), (2, examples[1::2])):
        requests.append(
            create_synthetic_examples(
                row["title_left"],
                row["title_right"],
                row["label"],
                custom_id=f"{custom_id}_{suffix}",
                examples=_format_demonstrations(subset),
            )
        )

batch_file_path = "synthetic_with_explanations.jsonl"
with open(batch_file_path, "w") as f:
    for request in requests:
        f.write(json.dumps(request) + "\n")


In [None]:
# Upload the JSONL request file. The context manager closes the local file
# handle once the upload call returns (it was previously left open).
with open(batch_file_path, "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

batch_input_file_id = batch_input_file.id

# Create the batch job that runs every request in the uploaded file
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Generate synthetic examples"}
)

In [None]:
# Load the synthetic examples from a JSON Lines file (one record per line).
# NOTE: batch_json_file is an empty placeholder and must be set to the path of
# the batch output file before this cell is run.
batch_json_file = ""
synthetic_examples = pd.read_json(batch_json_file, lines=True)

# Parse the results: parse_response (from utils) is applied to each entry of
# the "response" column.
# NOTE(review): presumably this yields a "content" column holding the model's
# reply text, since a later cell reads row['content'] - confirm against utils.
parsed_df = synthetic_examples["response"].apply(parse_response)

# Place the parsed results next to the original columns (column-wise concat)
df_results = pd.concat([synthetic_examples, parsed_df], axis=1)

In [None]:
# Helpers and entry point for turning one model reply into rows of entity pairs
def _strip_code_fence(text):
    """Return the payload inside a Markdown code fence, or the stripped text if there is none."""
    text = text.strip()
    opening = text.find("```")
    if opening == -1:
        return text
    closing = text.rfind("```")
    # A truncated reply may contain only the opening fence; keep everything after it.
    payload = text[opening + 3:closing] if closing > opening else text[opening + 3:]
    # Drop the optional "json" language tag that follows the opening fence.
    if payload[:4].lower() == "json":
        payload = payload[4:]
    return payload.strip()


def _as_entity_list(data):
    """Normalise decoded JSON (list, single object, or wrapper object) to a list of items."""
    if isinstance(data, list):
        return data
    if isinstance(data, dict):
        # A single entity returned without the surrounding list.
        if 'entity_one' in data or 'entity_two' in data:
            return [data]
        # A wrapper such as {"examples": [...]}: flatten its list values.
        nested = [value for value in data.values() if isinstance(value, list)]
        if nested:
            return [item for group in nested for item in group]
        return [data]
    return []


def extract_multiple_entities(content):
    """Parse a model reply into a DataFrame of generated entity pairs.

    Args:
        content: Reply text, expected to be JSON (optionally wrapped in a
            Markdown code fence) holding objects with the keys
            ``entity_one``, ``entity_two`` and ``match``.

    Returns:
        A DataFrame with the columns ``title_left``, ``title_right`` and
        ``label``, one row per parsed entity object. Keys that are absent
        yield ``None``. The frame is empty when ``content`` is missing (not a
        string) or is not valid JSON; in both cases "Error parsing" is printed.
    """
    columns = ['title_left', 'title_right', 'label']

    # Missing replies (None / NaN) used to raise AttributeError on .replace().
    if not isinstance(content, str):
        print("Error parsing")
        return pd.DataFrame(columns=columns)

    try:
        data = json.loads(_strip_code_fence(content))
    except json.JSONDecodeError:
        print("Error parsing")
        return pd.DataFrame(columns=columns)

    # Items that are not objects cannot carry the expected keys and are skipped
    # instead of crashing on .get().
    rows = [
        [entity.get('entity_one'), entity.get('entity_two'), entity.get('match')]
        for entity in _as_entity_list(data)
        if isinstance(entity, dict)
    ]
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Columns produced by extract_multiple_entities, and the df_results columns
# that are copied onto every expanded row
base_columns = ['title_left', 'title_right', 'label']
carried_columns = [col for col in df_results.columns if col != 'content']

# Collect one frame per reply and concatenate once at the end; concatenating
# inside the loop re-copies the accumulated frame on every iteration.
expanded_frames = []
for _, row in df_results.iterrows():
    # Parse and extract the multiple entities from the content
    expanded_rows = extract_multiple_entities(row['content'])

    # Replies that produced no rows contribute nothing to the result
    if expanded_rows.empty:
        continue

    # Copy the remaining df_results columns onto each extracted row.
    # NOTE(review): non-scalar cell values (e.g. dicts) may not broadcast as
    # intended when assigned to a column - verify against the df_results schema.
    for col in carried_columns:
        expanded_rows[col] = row[col]

    expanded_frames.append(expanded_rows)

if expanded_frames:
    expanded_df = pd.concat(expanded_frames, ignore_index=True)
else:
    # Nothing parsed: keep an empty frame with the expected columns
    expanded_df = pd.DataFrame(
        columns=base_columns + [col for col in carried_columns if col not in base_columns]
    )

# Save the expanded DataFrame to a new file
expanded_df.to_csv(f"{file_path.replace('.pkl.gz', '')}expanded_synthetic_examples.csv", index=False)