In [36]:
import pandas as pd

In [29]:
# Load the validation results; later cells compare the "label" column against
# "chatbot_response_clean" to find misclassified pairs.
validated_df = pd.read_json("../../data/temp/validation_results.json")
# Load the two gzip-compressed, pickled training frames (presumably the "small" and
# "large" training splits, judging by the file names -- TODO confirm).
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only load
# files from a trusted source.
training_small = pd.read_pickle("../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_small.pkl.gz", compression="gzip")
training_large = pd.read_pickle("../../data/wdc/preprocessed_wdcproducts80cc20rnd000un_train_large.pkl.gz", compression="gzip")

In [30]:
# Class balance of `training_large`: number of label-1 rows per label-0 row.
label_column = training_large["label"]
positive_count = int((label_column == 1).sum())
negative_count = int((label_column == 0).sum())
# Plain Python ints keep true-division semantics (ZeroDivisionError if there are no negatives).
ratio = positive_count / negative_count
print("Ratio of positive to negative examples in training data: ", ratio)

Ratio of positive to negative examples in training data:  0.7454241464273144


In [31]:
# Collect every validation row where the cleaned chatbot answer disagrees with the label.
mistakes = validated_df[validated_df["label"] != validated_df["chatbot_response_clean"]]

# Ratio of label-1 mistakes to label-0 mistakes.
positive_mistake_count = int((mistakes["label"] == 1).sum())
negative_mistake_count = int((mistakes["label"] == 0).sum())
if negative_mistake_count:
    ratio = positive_mistake_count / negative_mistake_count
else:
    # No label-0 mistakes: the ratio is unbounded (or undefined when there are no mistakes at all).
    ratio = float("inf") if positive_mistake_count else float("nan")
# The message previously said "training data" (copy-paste from the cell above); this
# number describes the validation mistakes.
print("Ratio of positive to negative examples among validation mistakes: ", ratio)


Ratio of positive to negative examples in training data:  1.5480769230769231


In [32]:
def get_similar_examples(mistake_row, training_data, cluster_id_col, num_samples=5, random_state=None):
    """Sample training rows that share a cluster id with a misclassified row.

    Parameters
    ----------
    mistake_row : mapping or pandas.Series
        Row to look up; must provide a value under ``cluster_id_col``.
    training_data : pandas.DataFrame
        Frame to draw examples from; must contain the column ``cluster_id_col``.
    cluster_id_col : str
        Name of the cluster-id column used for matching (read from both
        ``mistake_row`` and ``training_data``).
    num_samples : int, default 5
        Number of rows to draw. Sampling is done with replacement, so the
        result may contain duplicate rows and may exceed the number of
        distinct matching rows.
    random_state : int, numpy.random.Generator or None, default None
        Forwarded to ``DataFrame.sample`` to make the draw reproducible.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` sampled rows, or an empty frame (same columns as
        ``training_data``) when no training row has the requested cluster id.
    """
    cluster_id = mistake_row[cluster_id_col]

    # Keep only the training rows that belong to the same cluster.
    similar_examples = training_data[training_data[cluster_id_col] == cluster_id]

    # Sampling n > 0 rows from an empty frame raises ValueError even with
    # replace=True, so return the empty selection instead of crashing.
    if similar_examples.empty:
        return similar_examples

    return similar_examples.sample(n=num_samples, replace=True, random_state=random_state)

In [37]:
def create_prompt(product1: str, product2: str, answer: int):
    """Build one prompt/completion record for an entity-matching pair.

    Args:
        product1: Description of the first entity, inserted verbatim into the prompt.
        product2: Description of the second entity, inserted verbatim into the prompt.
        answer: Label for the pair; exactly ``1`` maps to "Yes", anything else to "No".

    Returns:
        dict with the keys "prompt" (the question text) and "completion" ("Yes" or "No").
    """
    question = f"Do the two entity descriptions refer to the same real-world entity? Entity 1: '{product1}'. Entity 2: '{product2}'."

    if answer == 1:
        verdict = 'Yes'
    else:
        verdict = 'No'

    record = {"prompt": question}
    record["completion"] = verdict
    return record

In [38]:
# Split the validation mistakes by their label (0 and 1).
mistakes_0 = mistakes[mistakes["label"] == 0]
mistakes_1 = mistakes[mistakes["label"] == 1]

# Rows sampled per mistake and per cluster-id column (left / right).
NUM_SAMPLES_PER_CLUSTER = 4

# For every mistake, sample training rows sharing its left cluster id and rows sharing
# its right cluster id. The frames are gathered in a list and concatenated once:
# growing a DataFrame with pd.concat inside the loop copies all previous rows on every
# iteration. Label-0 mistakes are processed first, then label-1 mistakes.
# NOTE: sampling is random; seed NumPy's global RNG beforehand for reproducible output.
sampled_frames = []
for mistakes_subset in (mistakes_0, mistakes_1):
    for _, row in mistakes_subset.iterrows():
        for cluster_id_col in ("cluster_id_left", "cluster_id_right"):
            sampled_frames.append(
                get_similar_examples(row, training_large, cluster_id_col, num_samples=NUM_SAMPLES_PER_CLUSTER)
            )

# pd.concat rejects an empty list, so fall back to an empty frame when there are no mistakes.
similar_examples_df = pd.concat(sampled_frames, ignore_index=True) if sampled_frames else pd.DataFrame()

# Append the sampled examples in front of the small training set; ignore_index=True
# already yields a fresh 0..n-1 index, so no separate reset_index is needed.
combined_training_data = pd.concat([similar_examples_df, training_small], ignore_index=True)

# Turn every pair into a prompt/completion record. Building the frame from a list of
# records (with explicit columns) also works when there are no rows at all.
prompt_records = [
    create_prompt(title_left, title_right, label)
    for title_left, title_right, label in zip(
        combined_training_data["title_left"],
        combined_training_data["title_right"],
        combined_training_data["label"],
    )
]
new_training_data = pd.DataFrame(prompt_records, columns=["prompt", "completion"])

new_training_data.to_csv("../../data/temp/preprocessed_wdcproducts80cc20rnd000un_train_small_with_similar_examples.csv", index=False)
