In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')

In [21]:
from utils.mojito2 import getMixedTriangles,prepareDataset,createPerturbationsFromTriangle,getPositiveTriangles
from utils.dataset_parser import generate_train_valid_test
import pandas as pd
import numpy as np
import deepmatcher as dm
from utils.distance_measures import smallestDistanceOnAttributes

## Preliminary test

In [40]:
def buildNewSamples(
    mixed_t,
    positive_t,
    attributes,
    maxLenAttributeSet,
    start_id,
    ordered_columns):
    """Build one labelled frame of perturbed samples from two triangle collections.

    Each triangle is handed to ``createPerturbationsFromTriangle`` and only the
    first element of the returned pair is kept. Frames produced from
    ``mixed_t`` receive ``label == 0`` and frames produced from ``positive_t``
    receive ``label == 1``.

    Parameters
    ----------
    mixed_t : iterable
        Triangles perturbed with ``1`` as the last helper argument.
    positive_t : iterable
        Triangles perturbed with ``0`` as the last helper argument.
    attributes, maxLenAttributeSet
        Forwarded unchanged to ``createPerturbationsFromTriangle``.
    start_id : int
        Value of ``id`` for the first returned row; ids are consecutive.
    ordered_columns : list
        Columns (and their order) of the returned frame.

    Returns
    -------
    pandas.DataFrame
        Negative rows followed by positive rows, restricted to
        ``ordered_columns``. When neither collection yields a frame, an empty
        frame with ``ordered_columns`` is returned instead of raising.
    """
    def _perturb_and_label(triangles, flag, label):
        # Returns the concatenated, labelled perturbations, or None when
        # `triangles` is empty (pd.concat refuses an empty list).
        frames = []
        for triangle in triangles:
            perturbed, _ = createPerturbationsFromTriangle(
                triangle, attributes, maxLenAttributeSet, flag)
            frames.append(perturbed)
        if not frames:
            return None
        labelled = pd.concat(frames, ignore_index=True)
        labelled['label'] = label
        return labelled

    # Negatives first, then positives: this fixes the row order and therefore
    # which rows get which ids.
    parts = [
        part for part in (
            _perturb_and_label(mixed_t, 1, 0),
            _perturb_and_label(positive_t, 0, 1),
        )
        if part is not None
    ]
    if not parts:
        return pd.DataFrame(columns=ordered_columns)

    newSamples_df = pd.concat(parts, ignore_index=True)
    newSamples_df['id'] = np.arange(start_id, start_id + len(newSamples_df))
    # Selecting `ordered_columns` also discards helper columns that are not
    # part of the target schema.
    return newSamples_df[ordered_columns]

In [3]:
# iTunes-Amazon: read the three labelled splits and stack them into one frame.
# drop_lrid=False presumably keeps the ltable_/rtable_ id columns — TODO confirm.
DATA_DIR = '../datasets/Structured/itunes-amazon/'
train, validation, test = generate_train_valid_test(
    DATA_DIR,
    ['train.csv', 'valid.csv', 'test.csv'],
    'ltable_',
    'rtable_',
    drop_lrid=False,
)
allSamples = pd.concat([train, validation, test], ignore_index=True)

In [4]:
# Source tables of the two data sources; every column except 'id' is treated
# as a matchable attribute.
tableA = pd.read_csv(DATA_DIR + 'tableA.csv')
tableB = pd.read_csv(DATA_DIR + 'tableB.csv')
attributes = [name for name in tableA.columns if name != 'id']

In [6]:
# Run prepareDataset on all labelled pairs together with both source tables
# and show how many samples come back.
# NOTE(review): 50 is an unexplained positional argument — confirm its meaning
# and give it a name in a configuration cell.
augmentedSamples = prepareDataset(allSamples,tableA,tableB,50)
len(augmentedSamples)

100%|██████████| 117/117 [00:10<00:00, 11.23it/s]


597

In [7]:
# Collect the mixed triangles from the augmented samples and display their count.
# NOTE(review): the other dataset sections pass `allSamples` here instead of
# the output of prepareDataset — confirm the difference is intentional.
mixedTriangles = getMixedTriangles(augmentedSamples,[tableA,tableB])
len(mixedTriangles)

143

In [9]:
# `buildNewSample` is not defined anywhere in this notebook, so the original
# call fails on a fresh kernel. Use `buildNewSamples` with no positive
# triangles, which yields negative perturbations only.
newSamples_itunes = buildNewSamples(
    mixed_t=mixedTriangles,
    positive_t=[],
    attributes=attributes,
    maxLenAttributeSet=1,
    start_id=len(allSamples) + 1,
    # The target schema is the one of the negatives file these rows are appended to.
    ordered_columns=pd.read_csv(DATA_DIR + 'negatives.csv').columns.tolist(),
)
len(newSamples_itunes)

1144

## Add new samples to negatives

In [17]:
from utils.intermediate_layer_extraction import return_layer_input
import deepmatcher as dm
import torch

In [12]:
# Create a deepmatcher model with the 'hybrid' attribute summarizer and restore
# the state saved for iTunes-Amazon.
# NOTE(review): the device is hard-coded to 'cuda'; presumably this cell cannot
# run on a machine without a GPU.
model = dm.MatchingModel(attr_summarizer='hybrid')
model.load_state('../models/itunes_amazon_hybrid.pth')
model = model.to('cuda')

In [74]:
# Run return_layer_input for model.classifier on test_negatives.csv (see the log below).
# NOTE(review): `testneg` is not used again in the iTunes-Amazon section.
testneg = return_layer_input(model,model.classifier,DATA_DIR,'test_negatives')


Reading and processing data from "../datasets/Structured/itunes-amazon//test_negatives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [78]:
# Align the generated samples with the schema of negatives.csv, give them ids
# that start right after the existing samples, and write the union to disk.
negatives = pd.read_csv(DATA_DIR + 'negatives.csv')
# .assign returns a new frame, so no column is set on a selection of another
# frame (which can trigger SettingWithCopyWarning).
newSamples_itunes = newSamples_itunes[negatives.columns.tolist()].assign(
    id=lambda frame: np.arange(len(allSamples) + 1,
                               len(allSamples) + 1 + len(frame)))
extendedNeg = pd.concat([negatives, newSamples_itunes], ignore_index=True)
extendedNeg.to_csv(DATA_DIR + 'extended_negatives.csv', index=False)

In [13]:
# Run return_layer_input for model.classifier on test_positives.csv (see the log below).
testpos = return_layer_input(model,model.classifier,DATA_DIR,'test_positives')


Reading and processing data from "../datasets/Structured/itunes-amazon//test_positives.csv"
0% [########################### ] 100% | ETA: 00:00:00

In [14]:
# Same extraction for extended_negatives.csv (original negatives plus generated samples).
# NOTE(review): this rebinds `negatives`, which the CSV-writing cell uses for
# the DataFrame read from negatives.csv — re-running cells out of order will break.
negatives = return_layer_input(model,model.classifier,DATA_DIR,'extended_negatives')


Reading and processing data from "../datasets/Structured/itunes-amazon//extended_negatives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [22]:
# Same extraction for the unmodified negatives.csv, used as the baseline.
negatives_standard = return_layer_input(model,model.classifier,DATA_DIR,'negatives')


Reading and processing data from "../datasets/Structured/itunes-amazon//negatives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [32]:
# Load a previously saved object for the test positives; what "ri" holds is not visible here.
# NOTE(review): the path is relative to the notebook's working directory, and
# torch.load unpickles the file — only load trusted artifacts.
testpos_ri = torch.load('iTunes-Amazon/experiments/results/testpos_ri_hybrid.pt')

In [33]:
from utils.distance_measures import smallestDistanceOnAttributes

In [36]:
# Length of one testpos entry divided evenly across the attributes.
# Assumes every entry is a flat vector of len(attributes) equal-width chunks — TODO confirm.
attribute_len = len(list(testpos.values())[0]) // len(attributes)
# NOTE(review): `negatives` holds the layer inputs of extended_negatives.csv
# here, yet the result has no `_ext` suffix while the next computation (on
# the standard negatives) does — the two names look swapped.
testpos_nn_distances = smallestDistanceOnAttributes(
    testpos, testpos_ri, negatives, attributes, attribute_len)

100%|██████████| 27/27 [00:00<00:00, 180.75it/s]


In [37]:
# Per-column mean of the distance frame computed in the previous cell.
# NOTE(review): SampleID looks like an identifier, so its mean is not a meaningful statistic.
testpos_nn_distances.mean()

Song_Name        0.833780
Artist_Name      0.533497
Album_Name       0.777133
Genre            0.227974
Price            0.733526
CopyRight        0.410668
Time             0.912086
Released         0.289489
SampleID       680.259259
dtype: float64

In [38]:
# Same distance computation against `negatives_standard` (layer inputs of negatives.csv).
# NOTE(review): despite the `_ext` suffix this result uses the standard
# negatives, not the extended ones — consider renaming.
testpos_nn_distances_ext = smallestDistanceOnAttributes(
    testpos, testpos_ri, negatives_standard, attributes, attribute_len)

100%|██████████| 27/27 [00:00<00:00, 244.23it/s]


In [39]:
# Per-column mean of the distances computed against `negatives_standard`.
# NOTE(review): SampleID looks like an identifier, so its mean is not a meaningful statistic.
testpos_nn_distances_ext.mean()

Song_Name        0.827837
Artist_Name      0.547534
Album_Name       0.775739
Genre            0.193658
Price            0.727098
CopyRight        0.409678
Time             0.909601
Released         0.291465
SampleID       680.259259
dtype: float64

## Amazon-Google

In [40]:
# Amazon-Google: read the three labelled splits and stack them into one frame.
# This rebinds DATA_DIR, train, validation, test and allSamples from the previous section.
DATA_DIR = '../datasets/Structured/Amazon-Google/'
train, validation, test = generate_train_valid_test(
    DATA_DIR,
    ['train.csv', 'valid.csv', 'test.csv'],
    'ltable_',
    'rtable_',
    drop_lrid=False,
)
allSamples = pd.concat([train, validation, test], ignore_index=True)

In [55]:
# Source tables of the two data sources; every column except 'id' is treated
# as a matchable attribute.
tableA = pd.read_csv(DATA_DIR + 'tableA.csv')
tableB = pd.read_csv(DATA_DIR + 'tableB.csv')
attributes = [name for name in tableA.columns if name != 'id']

In [43]:
# Collect the mixed triangles directly from the labelled pairs and display their count.
mixedTriangles = getMixedTriangles(allSamples,[tableA,tableB])
len(mixedTriangles)

16644

In [57]:
# `buildNewSample` is not defined anywhere in this notebook, so the original
# call fails on a fresh kernel. Use `buildNewSamples` with no positive
# triangles, which yields negative perturbations only.
newNegatives = buildNewSamples(
    mixed_t=mixedTriangles,
    positive_t=[],
    attributes=attributes,
    maxLenAttributeSet=1,
    start_id=len(allSamples) + 1,
    # The target schema is the one of the negatives file these rows are appended to.
    ordered_columns=pd.read_csv(DATA_DIR + 'negatives.csv').columns.tolist(),
)
len(newNegatives)

49932

In [61]:
# Align the generated samples with the schema of negatives.csv, give them ids
# that start right after the existing samples, and write the union to disk.
negatives_google = pd.read_csv(DATA_DIR + 'negatives.csv')
# .assign returns a new frame, so no column is set on a selection of
# `newNegatives` (which can trigger SettingWithCopyWarning).
newSamples_google = newNegatives[negatives_google.columns.tolist()].assign(
    id=lambda frame: np.arange(len(allSamples) + 1,
                               len(allSamples) + 1 + len(frame)))
extendedNeg_google = pd.concat([negatives_google, newSamples_google], ignore_index=True)
extendedNeg_google.to_csv(DATA_DIR + 'extended_negatives.csv', index=False)

In [63]:
# Create a deepmatcher model with the 'hybrid' attribute summarizer and restore
# the state saved for Amazon-Google.
# NOTE(review): the device is hard-coded to 'cuda'; presumably this cell cannot
# run on a machine without a GPU.
google_model = dm.MatchingModel(attr_summarizer='hybrid')
google_model.load_state('../models/amazongoogle_hybrid.pth')
google_model = google_model.to('cuda')

In [65]:
# Run return_layer_input for the classifier on test_positives.csv, then load
# the matching saved object from disk.
# NOTE(review): the torch.load path is relative to the notebook's working
# directory, and torch.load unpickles the file — only load trusted artifacts.
testpos = return_layer_input(google_model,google_model.classifier,DATA_DIR,'test_positives')
testpos_ri = torch.load('Amazon-Google/experiment_results/testpos_ri_hybrid.pt')


Reading and processing data from "../datasets/Structured/Amazon-Google//test_positives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [66]:
# Layer inputs for the extended negatives and for the unmodified negatives (baseline).
negatives = return_layer_input(google_model,google_model.classifier,DATA_DIR,'extended_negatives')
negatives_standard = return_layer_input(google_model,google_model.classifier,DATA_DIR,'negatives')


Reading and processing data from "../datasets/Structured/Amazon-Google//extended_negatives.csv"
0% [##############################] 100% | ETA: 00:00:00
Reading and processing data from "../datasets/Structured/Amazon-Google//negatives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [68]:
# Length of one testpos entry divided evenly across the attributes.
# Assumes every entry is a flat vector of len(attributes) equal-width chunks — TODO confirm.
attribute_len = len(list(testpos.values())[0]) // len(attributes)
# Baseline: distances against the layer inputs of the unmodified negatives.csv.
testpos_nn_distances = smallestDistanceOnAttributes(
    testpos, testpos_ri, negatives_standard, attributes, attribute_len)

100%|██████████| 234/234 [00:01<00:00, 209.87it/s]


In [69]:
# Per-column mean of the distances against the standard negatives.
# NOTE(review): SampleID looks like an identifier, so its mean is not a meaningful statistic.
testpos_nn_distances.mean()

title               0.903194
manufacturer        0.789692
price               0.855613
SampleID        10792.645299
dtype: float64

In [70]:
# Distances against the layer inputs of extended_negatives.csv.
# NOTE(review): this overwrites the baseline result stored under the same name.
testpos_nn_distances = smallestDistanceOnAttributes(
    testpos, testpos_ri, negatives, attributes, attribute_len)

100%|██████████| 234/234 [00:04<00:00, 47.54it/s]


In [71]:
# Per-column mean of the distances against the extended negatives.
# NOTE(review): SampleID looks like an identifier, so its mean is not a meaningful statistic.
testpos_nn_distances.mean()

title               0.952535
manufacturer        0.793158
price               0.861843
SampleID        10792.645299
dtype: float64

## Walmart-Amazon

In [3]:
# Walmart-Amazon: read the three labelled splits, stack them into one frame,
# and load both source tables. Every table column except 'id' is an attribute.
DATA_DIR = '../datasets/Structured/Walmart-Amazon/'
train, validation, test = generate_train_valid_test(
    DATA_DIR,
    ['train.csv', 'valid.csv', 'test.csv'],
    'ltable_',
    'rtable_',
    drop_lrid=False,
)
allSamples = pd.concat([train, validation, test], ignore_index=True)
tableA = pd.read_csv(DATA_DIR + 'tableA.csv')
tableB = pd.read_csv(DATA_DIR + 'tableB.csv')
attributes = [name for name in tableA.columns if name != 'id']

In [4]:
# Collect mixed and positive triangles from the labelled pairs and display both counts.
mixedTriangles = getMixedTriangles(allSamples,[tableA,tableB])
positiveTriangles = getPositiveTriangles(allSamples,[tableA,tableB])
len(mixedTriangles),len(positiveTriangles)

(5240, 242)

In [9]:
# Existing negative and positive samples; their column layout is reused for the generated rows.
# NOTE(review): both names are rebound later to return_layer_input results.
negatives = pd.read_csv(DATA_DIR+'negatives.csv')
positives = pd.read_csv(DATA_DIR+'positives.csv')

In [11]:
# Generate perturbed samples from both triangle sets, numbered after the
# existing samples and laid out like negatives.csv.
newSamples = buildNewSamples(
    mixed_t=mixedTriangles,
    positive_t=positiveTriangles,
    attributes=attributes,
    maxLenAttributeSet=1,
    start_id=len(allSamples) + 1,
    ordered_columns=negatives.columns.tolist(),
)

In [13]:
# Split the generated rows by label, append each part to the matching original
# set, and write both extended sets next to the originals.
newNegatives = newSamples.loc[newSamples['label'] == 0]
newPositives = newSamples.loc[newSamples['label'] == 1]
extendedNeg = pd.concat([negatives, newNegatives], ignore_index=True)
extendedPos = pd.concat([positives, newPositives], ignore_index=True)
extendedNeg.to_csv(DATA_DIR + 'extended_negatives.csv', index=False)
extendedPos.to_csv(DATA_DIR + 'extended_positives.csv', index=False)

In [14]:
# Create a deepmatcher model with the 'hybrid' attribute summarizer and restore
# the state saved for Walmart-Amazon.
# NOTE(review): the device is hard-coded to 'cuda'; presumably this cell cannot
# run on a machine without a GPU.
walmart_model = dm.MatchingModel(attr_summarizer='hybrid')
walmart_model.load_state('../models/walmartamazon_hybrid.pth')
walmart_model = walmart_model.to('cuda')

In [18]:
# Run return_layer_input for the classifier on test_positives.csv, then load
# the matching saved object from disk.
# NOTE(review): the torch.load path is relative to the notebook's working
# directory, and torch.load unpickles the file — only load trusted artifacts.
testpos = return_layer_input(walmart_model,walmart_model.classifier,DATA_DIR,'test_positives')
testpos_ri = torch.load('WalmartAmazon/experiment_results/testpos_ri_hybrid.pt')


Reading and processing data from "../datasets/Structured/Walmart-Amazon//test_positives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [19]:
# Layer inputs for the extended negatives and for the unmodified negatives (baseline).
# NOTE(review): `negatives` previously held the DataFrame read from negatives.csv.
negatives = return_layer_input(walmart_model,walmart_model.classifier,DATA_DIR,'extended_negatives')
negatives_standard = return_layer_input(walmart_model,walmart_model.classifier,DATA_DIR,'negatives')


Reading and processing data from "../datasets/Structured/Walmart-Amazon//extended_negatives.csv"
0% [##############################] 100% | ETA: 00:00:00
Reading and processing data from "../datasets/Structured/Walmart-Amazon//negatives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [22]:
# Length of one testpos entry divided evenly across the attributes.
# Assumes every entry is a flat vector of len(attributes) equal-width chunks — TODO confirm.
attribute_len = len(list(testpos.values())[0]) // len(attributes)
# Baseline: distances against the layer inputs of the unmodified negatives.csv.
testpos_nn_distances = smallestDistanceOnAttributes(
    testpos, testpos_ri, negatives_standard, attributes, attribute_len)

100%|██████████| 193/193 [00:01<00:00, 120.35it/s]


In [23]:
# Per-column mean of the distances against the standard negatives.
# NOTE(review): SampleID looks like an identifier, so its mean is not a meaningful statistic.
testpos_nn_distances.mean()

title          0.795817
category       0.620736
brand          0.694728
modelno        0.983838
price          0.673510
SampleID    9569.124352
dtype: float64

In [24]:
# Distances against the layer inputs of extended_negatives.csv.
# NOTE(review): this overwrites the baseline result stored under the same name.
testpos_nn_distances = smallestDistanceOnAttributes(
    testpos, testpos_ri, negatives, attributes, attribute_len)

100%|██████████| 193/193 [00:04<00:00, 39.34it/s]


In [25]:
# Per-column mean of the distances against the extended negatives.
# NOTE(review): SampleID looks like an identifier, so its mean is not a meaningful statistic.
testpos_nn_distances.mean()

title          0.808267
category       0.624711
brand          0.696902
modelno        0.985982
price          0.695742
SampleID    9569.124352
dtype: float64

In [26]:
# Layer inputs for the extended positives and for the unmodified positives (baseline).
# NOTE(review): `positives` previously held the DataFrame read from positives.csv.
positives = return_layer_input(walmart_model,walmart_model.classifier,DATA_DIR,'extended_positives')
positives_standard = return_layer_input(walmart_model,walmart_model.classifier,DATA_DIR,'positives')


Reading and processing data from "../datasets/Structured/Walmart-Amazon//extended_positives.csv"
0% [############################# ] 100% | ETA: 00:00:00
Reading and processing data from "../datasets/Structured/Walmart-Amazon//positives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [27]:
# Run return_layer_input for the classifier on test_negatives.csv, then load
# the matching saved object from disk.
# NOTE(review): the torch.load path is relative to the notebook's working
# directory, and torch.load unpickles the file — only load trusted artifacts.
testneg = return_layer_input(walmart_model,walmart_model.classifier,DATA_DIR,'test_negatives')
testneg_ri = torch.load('WalmartAmazon/experiment_results/testneg_ri_hybrid.pt')


Reading and processing data from "../datasets/Structured/Walmart-Amazon//test_negatives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [28]:
# Baseline for the test negatives: distances against the layer inputs of the
# unmodified positives.csv. Reuses `attribute_len` computed from `testpos`.
testneg_nn_distances = smallestDistanceOnAttributes(
    testneg, testneg_ri, positives_standard, attributes, attribute_len)

100%|██████████| 1856/1856 [00:04<00:00, 412.66it/s]


In [29]:
# Per-column mean of the test-negative distances against the standard positives.
# NOTE(review): SampleID looks like an identifier, so its mean is not a meaningful statistic.
testneg_nn_distances.mean()

title          0.843466
category       0.541637
brand          0.587059
modelno        0.865887
price          0.670304
SampleID    9180.383621
dtype: float64

In [30]:
# Distances of the test negatives against the layer inputs of extended_positives.csv.
# NOTE(review): this overwrites the baseline result stored under the same name.
testneg_nn_distances = smallestDistanceOnAttributes(
    testneg, testneg_ri, positives, attributes, attribute_len)

100%|██████████| 1856/1856 [00:05<00:00, 315.69it/s]


In [31]:
# Per-column mean of the test-negative distances against the extended positives.
# NOTE(review): SampleID looks like an identifier, so its mean is not a meaningful statistic.
testneg_nn_distances.mean()

title          0.846741
category       0.543813
brand          0.588845
modelno        0.868078
price          0.673183
SampleID    9180.383621
dtype: float64

## DBLP-ACM

In [32]:
# DBLP-ACM: read the three labelled splits, stack them into one frame,
# and load both source tables. Every table column except 'id' is an attribute.
DATA_DIR = '../datasets/Structured/DBLP-ACM/'
train, validation, test = generate_train_valid_test(
    DATA_DIR,
    ['train.csv', 'valid.csv', 'test.csv'],
    'ltable_',
    'rtable_',
    drop_lrid=False,
)
allSamples = pd.concat([train, validation, test], ignore_index=True)
tableA = pd.read_csv(DATA_DIR + 'tableA.csv')
tableB = pd.read_csv(DATA_DIR + 'tableB.csv')
attributes = [name for name in tableA.columns if name != 'id']

In [33]:
# Collect mixed and positive triangles from the labelled pairs and display both counts.
# The recorded output shows zero positive triangles for this dataset.
mixedTriangles = getMixedTriangles(allSamples,[tableA,tableB])
positiveTriangles = getPositiveTriangles(allSamples,[tableA,tableB])
len(mixedTriangles),len(positiveTriangles)

(18881, 0)

In [38]:
# Existing negative and positive samples; the negatives' column layout is reused for the generated rows.
# NOTE(review): `negatives` is rebound later to a return_layer_input result.
negatives = pd.read_csv(DATA_DIR+'negatives.csv')
positives = pd.read_csv(DATA_DIR+'positives.csv')

In [41]:
# Generate perturbed samples from both triangle sets, numbered after the
# existing samples and laid out like negatives.csv.
newSamples = buildNewSamples(
    mixed_t=mixedTriangles,
    positive_t=positiveTriangles,
    attributes=attributes,
    maxLenAttributeSet=1,
    start_id=len(allSamples) + 1,
    ordered_columns=negatives.columns.tolist(),
)

In [42]:
# Only negatives are extended here: this dataset yielded zero positive
# triangles, so there are no label == 1 rows to split off.
newNegatives = newSamples.loc[newSamples['label'] == 0]
extendedNeg = pd.concat([negatives, newNegatives], ignore_index=True)
extendedNeg.to_csv(DATA_DIR + 'extended_negatives.csv', index=False)

In [43]:
# Create a deepmatcher model with the 'hybrid' attribute summarizer and restore
# the state saved for DBLP-ACM.
# NOTE(review): the device is hard-coded to 'cuda'; presumably this cell cannot
# run on a machine without a GPU.
dblp_model = dm.MatchingModel(attr_summarizer='hybrid')
dblp_model.load_state('../models/dblp_acm_hybrid.pth')
dblp_model = dblp_model.to('cuda')

In [44]:
# Run return_layer_input for the classifier on test_positives.csv, then load
# the matching saved object from disk.
# NOTE(review): the torch.load path is relative to the notebook's working
# directory, and torch.load unpickles the file — only load trusted artifacts.
testpos = return_layer_input(dblp_model,dblp_model.classifier,DATA_DIR,'test_positives')
testpos_ri = torch.load('DBLP-ACM/experiment_results/testpos_ri_hybrid.pt')


Reading and processing data from "../datasets/Structured/DBLP-ACM//test_positives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [45]:
# Layer inputs for the extended negatives and for the unmodified negatives (baseline).
# NOTE(review): `negatives` previously held the DataFrame read from negatives.csv.
negatives = return_layer_input(dblp_model,dblp_model.classifier,DATA_DIR,'extended_negatives')
negatives_standard = return_layer_input(dblp_model,dblp_model.classifier,DATA_DIR,'negatives')


Reading and processing data from "../datasets/Structured/DBLP-ACM//extended_negatives.csv"
0% [##############################] 100% | ETA: 00:00:00
Reading and processing data from "../datasets/Structured/DBLP-ACM//negatives.csv"
0% [############################# ] 100% | ETA: 00:00:00

In [46]:
# Length of one testpos entry divided evenly across the attributes.
# Assumes every entry is a flat vector of len(attributes) equal-width chunks — TODO confirm.
attribute_len = len(list(testpos.values())[0]) // len(attributes)
# Baseline: distances against the layer inputs of the unmodified negatives.csv.
testpos_nn_distances = smallestDistanceOnAttributes(
    testpos, testpos_ri, negatives_standard, attributes, attribute_len)

100%|██████████| 444/444 [00:02<00:00, 154.08it/s]


In [47]:
# Per-column mean of the distances against the standard negatives.
# NOTE(review): SampleID looks like an identifier, so its mean is not a meaningful statistic.
testpos_nn_distances.mean()

title           0.558767
authors         0.520618
venue           0.383427
year            0.466157
SampleID    11350.813063
dtype: float64

In [None]:
# Distances against the layer inputs of extended_negatives.csv.
# NOTE(review): this overwrites the baseline result stored under the same
# name, and the saved output shows the run at 9% with no execution count —
# the cell apparently never finished, so no extended result is recorded.
testpos_nn_distances = smallestDistanceOnAttributes(
    testpos, testpos_ri, negatives, attributes, attribute_len)

  9%|▉         | 41/444 [00:01<00:18, 21.84it/s]