In [1]:
import pandas as pd
import os
from openai import OpenAI

from dotenv import load_dotenv
import json

# Load OPENAI_API_KEY from .env file
load_dotenv()

client = OpenAI()

## Extended filtration approach 

In [2]:
# Function to extract the entity strings
def extract_entities(text):
    entity_1 = text.split("Entity 1: '")[1].split("'")[0]
    entity_2 = text.split("Entity 2: '")[1].split("'")[0]
    return entity_1, entity_2

In [3]:
gpt_filtered = pd.read_csv("../data/wdc/filtered/small/wdc_train_small_filtered.csv")
initial_length = len(gpt_filtered)

# Apply the function to extract the entities
gpt_filtered[['entity_1', 'entity_2']] = gpt_filtered['prompt'].apply(lambda x: pd.Series(extract_entities(x)))

# Filter the DataFrame to keep only rows where entity_1 and entity_2 are not the same
filtered_df = gpt_filtered[gpt_filtered['entity_1'] != gpt_filtered['entity_2']]

# Drop the temporary entity columns if needed
#filtered_df = filtered_df.drop(columns=['entity_1', 'entity_2'])

print(f"Filtered out {initial_length - len(filtered_df)} rows where entity_1 and entity_2 are the same")

Filtered out 3 rows where entity_1 and entity_2 are the same


In [4]:
filtered_df[["entity_1", "entity_2", "completion"]].iloc[0]

entity_1        HDD 35 4TB Seagate IronWolf Pro NAS ST4000NE001
entity_2      HD 3,5 4TB 7200RPM IRONWOLF PRO 128 MB SATA3 S...
completion                                                  Yes
Name: 0, dtype: object

## Filter using ChatGPT

In [5]:
def create_prompt(prompt, custom_id):
    return {
        "custom_id": custom_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-2024-08-06",
            "messages": [
                {"role": "user", "content": prompt},
            ],
            "max_tokens": 200,
            "temperature": 0
        }
    }

In [6]:
# reset the index
filtered_df = filtered_df.reset_index(drop=True)

# loop through the rows and create the prompts
prompts = []
for index, row in filtered_df.iterrows():
    prompt = f"""
        I am creating an entity matching benchmark and need to develop a training split that helps the model learn the intricacies of entity matching. I will provide you with two entity descriptions. Your task is to evaluate whether they form an interesting pair for training purposes. Please limit your response to ‘Yes’ or ‘No’.
        
        Entity 1: '{row['entity_1']}'
        Entity 2: '{row['entity_2']}'
    """
    prompts.append(create_prompt(prompt, str(index)))
    
# Start a batch request
batch_file_path = "filter.jsonl"
with open(batch_file_path, "w") as f:
    for request in prompts:
        f.write(json.dumps(request) + "\n")
        
batch_input_file = client.files.create(
    file=open(batch_file_path, "rb"),
    purpose="batch"
)

batch_input_file_id = batch_input_file.id

batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Filter dataset for entity matching benchmark"}
)

# delete the batch input file
os.remove(batch_file_path)

In [26]:
print(batch)

Batch(id='batch_wB7usqWGkzTdgAutYd1p1wpH', completion_window='24h', created_at=1724604067, endpoint='/v1/chat/completions', input_file_id='file-WYhSkpzDqdCTcvVzwzRVYfAr', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1724690467, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Filter dataset for entity matching benchmark'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [7]:
def parse_response(response):
    body = response.get("body", {})
    usage = body.get("usage", {})
    choices = body.get("choices", [{}])
    message = choices[0].get("message", {}) if choices else {}

    return pd.Series({
        "status_code": response.get("status_code"),
        "request_id": response.get("request_id"),
        "completion_id": body.get("id"),
        "created": body.get("created"),
        "model": body.get("model"),
        "content": message.get("content"),
        "prompt_tokens": usage.get("prompt_tokens"),
        "completion_tokens": usage.get("completion_tokens"),
        "total_tokens": usage.get("total_tokens"),
    })

In [8]:
gpt_answer = pd.read_json("../data/wdc/filtered/small/interesting/batch_o7Xo2kSLKc0k1wmwyAyR09Ic_output.jsonl", lines=True)

# Apply the function to the response column
parsed_df = gpt_answer["response"].apply(parse_response)

# Concatenate the parsed results with the original dataframe
df_results = pd.concat([gpt_answer, parsed_df], axis=1)

In [12]:
df_results["content"].value_counts()

content
No     1395
Yes     608
Name: count, dtype: int64

In [9]:
# filter filtered_df if the index is in the df_results has a content of 'Yes'
filtered_df = filtered_df[filtered_df.index.isin(df_results[df_results['content'] == 'Yes'].index)]

# reset the index
filtered_df = filtered_df.reset_index(drop=True)

In [1]:
filtered_df = filtered_df.drop(columns=['entity_1', 'entity_2'])
# save the filtered_df to a csv file
filtered_df.to_csv("../data/wdc/filtered/small/interesting/interesting_only.csv", index=False)

NameError: name 'filtered_df' is not defined

In [16]:
small_filtered_df = pd.read_csv("../data/wdc/filtered/small/wdc_train_small_filtered.csv")

# only keep prompt and completion columns from the filtered_df
filtered_df = filtered_df[["prompt", "completion"]]

# concatenate the small_filtered_df and filtered_df
filtered_df = pd.concat([small_filtered_df, filtered_df], axis=0)

# save the filtered_df to a csv file
filtered_df.to_csv("../data/wdc/synthetic/4o/textual_example/interesting/filtered_with_small.csv", index=False)

In [17]:
filtered_df["completion"].value_counts()

completion
No     6718
Yes    2182
Name: count, dtype: int64

In [18]:
filtered_df

Unnamed: 0,prompt,completion
0,Do the two entity descriptions refer to the sa...,Yes
1,Do the two entity descriptions refer to the sa...,No
2,Do the two entity descriptions refer to the sa...,No
3,Do the two entity descriptions refer to the sa...,No
4,Do the two entity descriptions refer to the sa...,No
...,...,...
6889,Do the two entity descriptions refer to the sa...,No
6890,Do the two entity descriptions refer to the sa...,Yes
6891,Do the two entity descriptions refer to the sa...,No
6892,Do the two entity descriptions refer to the sa...,Yes


In [3]:
small_filtered = pd.read_csv("../data/wdc/filtered/small/wdc_small_filtered.csv")
small = pd.read_pickle("../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_small.pkl.gz")

In [8]:
# filter out all pair ids from small that are in small_filtered
small = small[~small["pair_id"].isin(small_filtered["pair_id"])]
# drop embeddings column
small = small.drop(columns=["embedding"])
small.to_csv("../data/wdc/filtered/small/discarded_examples.csv", index=False)

In [11]:
small.iloc[0]["title_right"]

'Swiss Military Hanowa Flagship 06-5161.2.04.003'