In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from utils.attacker import attackDatasetEditDist,changeRandomCharacters
import pandas as pd
import deepmatcher as dm
from sklearn.metrics import f1_score,precision_score,recall_score
import shutil
import os

## Define utilities

In [4]:
def getF1PrecisionRecall(testpath,model):
    test = dm.data.process_unlabeled(testpath,model,
                                ignore_columns = ['id','label'])
    predictions = model.run_prediction(test,output_attributes =True)
    true_label = predictions.label
    predictions['predicted'] = predictions.match_score.apply(lambda m:round(m))
    y_pred = predictions.predicted
    return (f1_score(true_label,y_pred),precision_score(true_label,y_pred),
            recall_score(true_label,y_pred))

In [5]:
##given a perturbation model and number of perturbation calculate avg f1,precision and recall 
def getAvgF1PrecisionRecall(testpath,model,nperturbation,editdist=1):
    avgf1,avgrecall,avgprecision = 0,0,0
    for i in range(nperturbation):
        test_df = pd.read_csv(testpath)
        attacked_test = attackDatasetEditDist(test_df,['id','label'],editdist=editdist)
        if not(os.path.exists('temp')):
            os.mkdir('temp')
        attacked_test.to_csv('temp/attacked_test.csv',index=False)
        f1,prec,recall = getF1PrecisionRecall('temp/attacked_test.csv',model)
        avgf1 += f1
        avgprecision += prec
        avgrecall += recall
    shutil.rmtree('temp')
    return (avgf1/nperturbation,avgprecision/nperturbation,avgrecall/nperturbation)

## iTunes-Amazon

In [67]:
itunesamazon_mod = dm.MatchingModel(attr_summarizer='hybrid')
itunesamazon_mod.load_state('models/itunes_amazon_hybrid.pth')

In [69]:
%%capture
avgf1,avgprecision,avgrecall = getAvgF1PrecisionRecall('datasets/Structured/itunes-amazon/merged_test.csv',
                                                       itunesamazon_mod,10)

In [70]:
avgf1,avgprecision,avgrecall

(0.5949118499587777, 0.625925925925926, 0.5728359565736845)

In [71]:
%%capture
avgf1,avgprecision,avgrecall = getAvgF1PrecisionRecall('datasets/Structured/itunes-amazon/merged_test.csv',
                                                       itunesamazon_mod,10,2)

In [72]:
avgf1,avgprecision,avgrecall

(0.5589573654482562, 0.5925925925925926, 0.5335803844409954)

## Amazon-Google

In [73]:
amazongoogle_mod = dm.MatchingModel(attr_summarizer='hybrid')
amazongoogle_mod.load_state('models/amazongoogle_hybrid.pth')

In [75]:
%%capture
avgf1,avgrecall,avgprec = getAvgF1PrecisionRecall('datasets/Structured/Amazon-Google/merged_test.csv'
                                                  ,amazongoogle_mod,10)

In [76]:
avgf1,avgrecall,avgprec

(0.48166537944017784, 0.7688034188034188, 0.3507562261034986)

In [77]:
%%capture
avgf1,avgrecall,avgprec = getAvgF1PrecisionRecall('datasets/Structured/Amazon-Google/merged_test.csv'
                                                  ,amazongoogle_mod,10,editdist=2)

In [78]:
avgf1,avgrecall,avgprec

(0.43333838062459246, 0.629059829059829, 0.3306202891697787)

## Walmart-Amazon

In [79]:
walmartamazon_mod = dm.MatchingModel(attr_summarizer='hybrid')
walmartamazon_mod.load_state('models/walmartamazon_hybrid.pth')

In [80]:
%%capture
avgf1,avgprecision,avrecall = getAvgF1PrecisionRecall('datasets/Structured/Walmart-Amazon/merged_test.csv'
                                                  ,walmartamazon_mod,10)

In [81]:
avgf1,avgprec,avrecall

(0.1335596004771203, 0.3306202891697787, 0.337650805695315)

In [82]:
%%capture
avgf1,avgprecision,avrecall = getAvgF1PrecisionRecall('datasets/Structured/Walmart-Amazon/merged_test.csv'
                                                  ,walmartamazon_mod,10,editdist=2)

In [83]:
avgf1,avgprecision,avrecall

(0.057497899084307025, 0.03367875647668393, 0.20470634540435567)

## DBLP-ACM

In [84]:
dblpacm_mod = dm.MatchingModel(attr_summarizer='hybrid')
dblpacm_mod.load_state('models/dblp_acm_hybrid.pth')

In [87]:
%%capture
avgf1,avgrecall,avgprec = getAvgF1PrecisionRecall('datasets/Structured/DBLP-ACM/merged_test.csv'
                                                  ,dblpacm_mod,10)

In [88]:
avgf1,avgrecall,avgprec

(0.4466801600177579, 0.290990990990991, 0.9622991787108621)

In [89]:
%%capture
avgf1,avgprecision,avgrecall = getAvgF1PrecisionRecall('datasets/Structured/DBLP-ACM/merged_test.csv'
                                                  ,dblpacm_mod,10,editdist=2)

In [90]:
avgf1,avgprecision,avgrecall

(0.2806737303393382, 0.16396396396396398, 0.9783043531856741)

## Beer

In [98]:
beer_mod = dm.MatchingModel(attr_summarizer='hybrid')
beer_mod.load_state('models/beer_hybrid.pth')

In [105]:
%%capture
avgf1,avgprecision,avgrecall = getAvgF1PrecisionRecall('datasets/Structured/Beer/merged_test.csv'
                                                  ,beer_mod,10,editdist=1)

In [106]:
avgf1,avgrecall,avgprec

(0.6605358299362345, 0.5610203427463489, 0.9622991787108621)

In [103]:
%%capture
avgf1,avgprecision,avgrecall = getAvgF1PrecisionRecall('datasets/Structured/Beer/merged_test.csv'
                                                  ,beer_mod,10,editdist=2)

In [104]:
avgf1,avgprecision,avgrecall

(0.5611321223821225, 0.7, 0.47074046281712184)

# Textual data

## Abt-Buy

In [92]:
abtbuy_model = dm.MatchingModel(attr_summarizer='hybrid')
abtbuy_model.load_state('models/abtbuy_hybrid.pth')

In [94]:
%%capture
avgf1,avgprecision,avgrecall = getAvgF1PrecisionRecall('datasets/Textual/Abt-Buy/merged_test.csv'
                                                  ,abtbuy_model,10)

In [95]:
avgf1,avgprecision,avgrecall

(0.5915975259056523, 0.6145631067961165, 0.5704149965608851)

In [96]:
%%capture
avgf1,avgprecision,avgrecall = getAvgF1PrecisionRecall('datasets/Textual/Abt-Buy/merged_test.csv'
                                                  ,abtbuy_model,10,editdist=2)

In [97]:
avgf1,avgprecision,avgrecall

(0.5167520023209786, 0.5393203883495146, 0.4962097058472218)

## Computers WDC

In [107]:
computer_model = dm.MatchingModel(attr_summarizer='rnn')
computer_model.load_state('models/computer_rnn.pth')

AttributeError: Can't get attribute '_default_unk_index' on <module 'torchtext.vocab' from '/home/nvidia/anaconda3/envs/torchenv/lib/python3.6/site-packages/torchtext/vocab.py'>