In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
np.random.seed(42)
import random
random.seed(42)

from pathlib import Path
import statistics

from tqdm.notebook import tqdm

from pdb import set_trace

In [3]:
swctest_vanilla = pd.read_json('../../../data/raw/wdc-lspc/gold-standards/computers_new_testset_1500.json.gz', lines=True)
swctest = pd.read_pickle('../../../data/interim/wdc-lspc/gold-standards/preprocessed_computers_new_testset_1500.pkl.gz')

In [4]:
sampling_column = swctest_vanilla[['pair_id', 'sampling']]
swctest_with_sampling = swctest.merge(sampling_column, on='pair_id')

In [5]:
swctest_with_sampling['has_training_data'] = False
swctest_with_sampling['has_training_data'] = swctest_with_sampling.apply(lambda x: True if x['sampling'] is None or x['sampling'] == 'drop' or x['sampling'] == 'typo' or x['sampling'] == 'hard cases for products in provided training set' else False, axis=1)

has_training_data = swctest_with_sampling[swctest_with_sampling['has_training_data'] == True]
cluster_ids_with_training_data = set()
cluster_ids_with_training_data.update(has_training_data['cluster_id_left'].to_list())
cluster_ids_with_training_data.update(has_training_data['cluster_id_right'].to_list())

computers_xlarge = pd.read_json('../../../data/raw/wdc-lspc/training-sets/computers_train_xlarge.json.gz', lines=True)

In [6]:
def get_category_of_training_samples(row):
    left_cluster = row['cluster_id_left']
    right_cluster = row['cluster_id_right']
    try:
        if computers_training_lookup_dict[left_cluster]['positives'] > 10 and computers_training_lookup_dict[right_cluster]['positives'] > 10:
            return 'many'
        elif computers_training_lookup_dict[left_cluster]['positives'] < 5 and computers_training_lookup_dict[right_cluster]['positives'] < 5:
            return 'few'
        else:
            return 'mixed'
    except KeyError:
        return 'none'

computers_training_lookup_dict = dict()
positive_count_list = []
negative_count_list = []
combined_count_list = []

for i in cluster_ids_with_training_data:
    sub_df = computers_xlarge[(computers_xlarge['cluster_id_left'] == i) | (computers_xlarge['cluster_id_right'] == i)]
    value_counts = sub_df['label'].value_counts()
    try:
        negatives = value_counts[0]
    except KeyError:
        negatives = 0
    try:
        positives = value_counts[1]
    except KeyError:
        positives = 0
    if i in computers_training_lookup_dict.keys():
        computers_training_lookup_dict[i]['positives'] += positives
        computers_training_lookup_dict[i]['negatives'] += negatives
    else:
        computers_training_lookup_dict[i] = dict()
        computers_training_lookup_dict[i]['positives'] = positives
        computers_training_lookup_dict[i]['negatives'] = negatives
    computers_training_lookup_dict[i]['combined'] = computers_training_lookup_dict[i]['positives'] + computers_training_lookup_dict[i]['negatives']
    positive_count_list.append(positives)
    negative_count_list.append(negatives)
    combined_count_list.append(positives+negatives)
    
print(f'Positive pairs per cluster: MEAN={statistics.mean(positive_count_list)}, MEDIAN={statistics.median(positive_count_list)}')
print(f'Negative pairs per cluster: MEAN={statistics.mean(negative_count_list)}, MEDIAN={statistics.median(negative_count_list)}')
print(f'Combined pairs per cluster: MEAN={statistics.mean(combined_count_list)}, MEDIAN={statistics.median(combined_count_list)}')

Positive pairs per cluster: MEAN=13, MEDIAN=8
Negative pairs per cluster: MEAN=158, MEDIAN=148
Combined pairs per cluster: MEAN=171, MEDIAN=157


In [7]:
swctest_with_sampling['amount_training_examples_both'] = swctest_with_sampling.apply(get_category_of_training_samples, axis=1)

In [8]:
jointbert_results = pd.read_pickle('../../../src/productbert/saved/models/BT-JointDistilBERT-FT-computers-xlarge-swctest/0829_173424/predictions.pkl.gz')
jointbert_results['label_jointbert'] = jointbert_results['predictions'].apply(lambda x: 1 if x >= 0.5 else 0)
jointbert_results = jointbert_results[['pair_id', 'label_jointbert']]

bert_results = pd.read_pickle('../../../src/productbert/saved/models/BT-DistilBERT-FT-computers-xlarge-swctest/0829_172945/predictions.pkl.gz')
bert_results['label_bert'] = bert_results['predictions'].apply(lambda x: 1 if x >= 0.5 else 0)
bert_results = bert_results[['pair_id', 'label_bert']]

deepmatcher_results = pd.read_csv('../../../data/processed/inspection/wdc-lspc/deepmatcher/rnn_abs-diff_standard_epochs50_ratio6_batch16_lr0.001_lrdecay0.8_fasttext.en.bin_brand-title_preprocessed_computers_trainonly_xlarge_magellan_pairs_run1_preprocessed_computers_new_testset_1500_magellan_pairs.csv.gz')
deepmatcher_results = deepmatcher_results.rename(columns={'label_pred':'label_deepmatcher'})
deepmatcher_results = deepmatcher_results[['pair_id', 'label_deepmatcher']]

magellan_results = pd.read_pickle('../../../data/processed/inspection/wdc-lspc/magellan/new-testset/preprocessed_computers_train_xlarge_magellan_pairs_formatted_preprocessed_computers_new_testset_1500_magellan_pairs_formatted_RandomForest_brand+title_1.pkl.gz')
magellan_results = magellan_results.rename(columns={'pred':'label_magellan'})
magellan_results = magellan_results[['pair_id', 'label_magellan']]

wordcooc_results = pd.read_pickle('../../../data/processed/inspection/wdc-lspc/wordcooc/new-testset/preprocessed_computers_train_xlarge_wordcooc.pkl.gz_preprocessed_computers_train_xlarge_wordcooc_preprocessed_computers_new_testset_1500.pkl.gz_LogisticRegression_brand+title_1.pkl.gz')
wordcooc_results = wordcooc_results.rename(columns={'pred':'label_wordcooc'})
wordcooc_results = wordcooc_results[['pair_id', 'label_wordcooc']]

In [9]:
swctest_with_sampling_and_labels = swctest_with_sampling.merge(jointbert_results, on='pair_id')
swctest_with_sampling_and_labels = swctest_with_sampling_and_labels.merge(bert_results, on='pair_id')
swctest_with_sampling_and_labels = swctest_with_sampling_and_labels.merge(deepmatcher_results, on='pair_id')
swctest_with_sampling_and_labels = swctest_with_sampling_and_labels.merge(magellan_results, on='pair_id')
swctest_with_sampling_and_labels = swctest_with_sampling_and_labels.merge(wordcooc_results, on='pair_id')

In [10]:
df = swctest_with_sampling_and_labels
df['challenge_1'], df['challenge_2'], df['challenge_3'], df['challenge_4'], df['challenge_5'], df['challenge_6'], df['challenge_7'] = [0, 0, 0, 0, 0, 0, 0]
only_bert = df.loc[((df.label == df.label_bert) & (df.label != df.label_deepmatcher) & (df.label != df.label_magellan))]
only_deepmatcher = df.loc[((df.label != df.label_bert) & (df.label == df.label_deepmatcher) & (df.label != df.label_magellan))]
only_magellan = df.loc[((df.label != df.label_bert) & (df.label != df.label_deepmatcher) & (df.label == df.label_magellan))]
bert_and_deepmatcher = df.loc[((df.label == df.label_bert) & (df.label == df.label_deepmatcher) & (df.label != df.label_magellan))]
bert_and_magellan = df.loc[((df.label == df.label_bert) & (df.label != df.label_deepmatcher) & (df.label == df.label_magellan))]
deepmatcher_and_magellan = df.loc[((df.label != df.label_bert) & (df.label == df.label_deepmatcher) & (df.label == df.label_magellan))]
all_correct = df.loc[((df.label == df.label_bert) & (df.label == df.label_deepmatcher) & (df.label == df.label_magellan))]
all_wrong = df.loc[((df.label != df.label_bert) & (df.label != df.label_deepmatcher) & (df.label != df.label_magellan))]

######################################################################

JOINT_only_joint = df.loc[((df.label == df.label_jointbert) & (df.label != df.label_bert))]
JOINT_only_bert = df.loc[((df.label != df.label_jointbert) & (df.label == df.label_bert))]
JOINT_both_correct = df.loc[((df.label == df.label_jointbert) & (df.label == df.label_bert))]
JOINT_both_wrong = df.loc[((df.label != df.label_jointbert) & (df.label != df.label_bert))]

In [11]:
only_bert = only_bert[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
only_deepmatcher = only_deepmatcher[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
only_magellan = only_magellan[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
bert_and_deepmatcher = bert_and_deepmatcher[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
bert_and_magellan = bert_and_magellan[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
deepmatcher_and_magellan = deepmatcher_and_magellan[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
all_correct = all_correct[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
all_wrong = all_wrong[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]

######################################################################

JOINT_only_joint = JOINT_only_joint[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
JOINT_only_bert = JOINT_only_bert[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
JOINT_both_correct = JOINT_both_correct[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]
JOINT_both_wrong = JOINT_both_wrong[['pair_id','sampling','brand_left','title_left','brand_right','title_right','label','challenge_1','challenge_2','challenge_3','challenge_4','challenge_5','challenge_6','challenge_7','amount_training_examples_both','has_training_data']]

In [12]:
print(f'Only BERT: {len(only_bert)}')
print(f'Only Deepmatcher: {len(only_deepmatcher)}')
print(f'Only Magellan: {len(only_magellan)}')
print(f'BERT and Deepmatcher: {len(bert_and_deepmatcher)}')
print(f'BERT and Magellan: {len(bert_and_magellan)}')
print(f'Deepmatcher and Magellan: {len(deepmatcher_and_magellan)}')
print(f'All correct: {len(all_correct)}')
print(f'All wrong: {len(all_wrong)}')

print('#####################################')

print(f'Only Joint: {len(JOINT_only_joint)}')
print(f'Only BERT: {len(JOINT_only_bert)}')
print(f'Both correct: {len(JOINT_both_correct)}')
print(f'Both wrong: {len(JOINT_both_wrong)}')

Only BERT: 71
Only Deepmatcher: 75
Only Magellan: 27
BERT and Deepmatcher: 425
BERT and Magellan: 73
Deepmatcher and Magellan: 30
All correct: 700
All wrong: 99
#####################################
Only Joint: 80
Only BERT: 59
Both correct: 1210
Both wrong: 151


In [14]:
Path('../../../data/processed/explain_labeling/').mkdir(parents=True, exist_ok=True)

only_bert.to_csv('../../../data/processed/explain_labeling/only_bert.csv', index=False)
only_deepmatcher.to_csv('../../../data/processed/explain_labeling/only_deepmatcher.csv', index=False)
only_magellan.to_csv('../../../data/processed/explain_labeling/only_magellan.csv', index=False)
bert_and_deepmatcher.to_csv('../../../data/processed/explain_labeling/bert_and_deepmatcher.csv', index=False)
bert_and_magellan.to_csv('../../../data/processed/explain_labeling/bert_and_magellan.csv', index=False)
deepmatcher_and_magellan.to_csv('../../../data/processed/explain_labeling/deepmatcher_and_magellan.csv', index=False)
all_correct.to_csv('../../../data/processed/explain_labeling/all_correct.csv', index=False)
all_wrong.to_csv('../../../data/processed/explain_labeling/all_wrong.csv', index=False)

##################################

JOINT_only_joint.to_csv('../../../data/processed/explain_labeling/JOINT_only_joint.csv', index=False)
JOINT_only_bert.to_csv('../../../data/processed/explain_labeling/JOINT_only_bert.csv', index=False)
JOINT_both_correct.to_csv('../../../data/processed/explain_labeling/JOINT_both_correct.csv', index=False)
JOINT_both_wrong.to_csv('../../../data/processed/explain_labeling/JOINT_both_wrong.csv', index=False)