In [56]:
import numpy as np
import pandas as pd
import deepmatcher as dm
import spacy
import random as rd
from alibi.explainers import AnchorText
from alibi.datasets import fetch_movie_sentiment
from alibi.utils.download import spacy_model

In [140]:
def rowToString(df,row_index,ignore_columns = ['id','label']):
    """Serialize one row of a record-pair DataFrame into a prefixed token string.

    Every whitespace-separated token of every kept column is emitted as
    ``<side><k>_<token>``, where ``<side>`` is ``L`` for columns whose name
    starts with ``ltable_`` and ``R`` for every other column, and ``<k>`` is
    the zero-based position of the column among the kept columns of the same
    side. A column without tokens (empty string, whitespace only, or a
    missing value) is emitted as the bare placeholder ``<side><k>_`` so it
    stays separated from its neighbours and keeps its own side counter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the pairs; column names must be strings.
    row_index : int
        Position of the row (handed to ``df.iloc``), not an index label.
    ignore_columns : list of str
        Column names left out of the serialization. Never mutated here.

    Returns
    -------
    str
        The serialized row; every item, including the last one, is followed
        by a single space.
    """
    row = df.iloc[row_index]
    # One running counter per side, advanced exactly once per kept column.
    counters = {"L": 0, "R": 0}
    pieces = []
    for col in list(df):
        if col in ignore_columns:
            continue
        side = "L" if col.startswith("ltable_") else "R"
        prefix = side + str(counters[side]) + "_"
        counters[side] += 1
        value = row[col]
        # Missing values count as empty; non-string cells are stringified so
        # numeric columns can be tokenized as well.
        tokens = [] if pd.isna(value) else str(value).split()
        if tokens:
            pieces.extend(prefix + token for token in tokens)
        else:
            pieces.append(prefix)
    return "".join(piece + " " for piece in pieces)

In [141]:
def stringToRow(s,attributes):
    """Rebuild an attribute dictionary from a prefixed token string.

    Tokens are expected in the form ``L<k>_<text>`` or ``R<k>_<text>``.
    ``L<k>`` refers to the k-th entry of ``attributes`` whose name starts
    with ``ltable_`` and ``R<k>`` to the k-th of the remaining entries. The
    prefix is split at the first underscore, so indices of any width work.
    Tokens without a recognizable prefix, or with an index outside the
    attribute list, are ignored instead of shifting the other values onto
    the wrong attributes.

    Parameters
    ----------
    s : str
        Whitespace-separated prefixed tokens.
    attributes : iterable of str
        Attribute (column) names, in column order.

    Returns
    -------
    dict
        One key per attribute, in the order of ``attributes``, mapped to its
        tokens joined by single spaces (``""`` when it has none), followed by
        an ``'id'`` key holding a random integer between 0 and 1000 inclusive.
    """
    attributes = list(attributes)
    by_side = {
        "L": [attr for attr in attributes if attr.startswith("ltable_")],
        "R": [attr for attr in attributes if not attr.startswith("ltable_")],
    }
    collected = {attr: [] for attr in attributes}
    for token in s.split():
        prefix, _, text = token.partition("_")
        side_attributes = by_side.get(prefix[:1])
        position = prefix[1:]
        if side_attributes is None or not position.isdecimal():
            continue  # not a L<k>_/R<k>_ token
        position = int(position)
        if position >= len(side_attributes):
            continue
        if text:
            # A bare placeholder such as "L3_" carries no text and adds nothing.
            collected[side_attributes[position]].append(text)
    row = {attr: " ".join(words) for attr, words in collected.items()}
    row['id'] =rd.randint(0,1000)
    return row

In [142]:
# Load the training pairs and serialize the first row (position 0) with
# rowToString; the result is parsed back by stringToRow in a later cell.
train = pd.read_csv('../Structured/itunes-amazon/merged_train.csv')
stringRep = rowToString(train,0)

In [144]:
# Every column of the training frame except 'id' and 'label' is an attribute.
# Parse the serialized row back into a dictionary and display it.
attributes = [attr for attr in list(train) if attr not in ['id','label']]
row = stringToRow(stringRep,attributes)
row

{'ltable_Song_Name': 'Illusion ( feat . Echosmith )',
 'ltable_Artist_Name': 'Zedd',
 'ltable_Album_Name': 'True Colors',
 'ltable_Genre': 'Dance , Music , Electronic',
 'ltable_Price': '$ 1.29',
 'ltable_CopyRight': '2015 Interscope Records',
 'ltable_Time': '6:30',
 'ltable_Released': '18-May-15',
 'rtable_Song_Name': 'Transmission [ feat . X Ambassadors ]',
 'rtable_Artist_Name': 'Zedd',
 'rtable_Album_Name': 'True Colors',
 'rtable_Genre': 'Dance & Electronic',
 'rtable_Price': '$ 1.29',
 'rtable_CopyRight': '( C ) 2015 Interscope Records',
 'rtable_Time': '4:02',
 'rtable_Released': 'May 18 , 2015',
 'id': 272}

In [44]:
# Load the test pairs; missing values are not filled at this point.
test = pd.read_csv('../Structured/itunes-amazon/merged_test.csv')

## Load spaCy model

In [45]:
# spacy_model is alibi's download helper; presumably it fetches the model when
# it is not installed yet (TODO confirm). The loaded pipeline is kept in `nlp`.
model = 'en_core_web_md'
spacy_model(model=model)
nlp = spacy.load(model)

## Load deepmatcher and create wrapper function

In [73]:
!mkdir temp

mkdir: cannot create directory ‘temp’: File exists


In [60]:
# Create a deepmatcher MatchingModel with the 'hybrid' attribute summarizer and
# load its saved state from disk.
hybrid_model = dm.MatchingModel(attr_summarizer='hybrid')
hybrid_model.load_state('../models/hybrid1.pth')

In [145]:
def mergeDictionary(dict_list):
    """Collect the values of several dictionaries into one dictionary of lists.

    Each key seen in any input dictionary maps to the list of values found
    under that key, in the order the dictionaries are given. Keys keep the
    order in which they are first encountered.
    """
    merged = {}
    for entry in dict_list:
        for name, value in entry.items():
            merged.setdefault(name, []).append(value)
    return merged

In [146]:
# Replace missing values with empty strings (fillna already returns a new frame).
test_withoutNan = test.fillna("")
# Note: at this point dict_list holds the serialized strings, one per test row.
dict_list = []
# rowToString looks rows up by position (iloc), so iterate over positions
# rather than over the index labels that iterrows would yield.
for idx in range(len(test_withoutNan)):
    dict_list.append(rowToString(test_withoutNan,idx))

In [147]:
# Parse every serialized row back into a dictionary. This rebinds dict_list
# from strings to dictionaries, so the cell cannot simply be run a second time.
dict_list = [stringToRow(serialized, attributes) for serialized in dict_list]

In [149]:
# Turn the per-row dictionaries into one column-oriented dictionary of lists,
# build a DataFrame from it and preview the first rows.
test_df = pd.DataFrame.from_dict(mergeDictionary(dict_list))
test_df.head()

Unnamed: 0,ltable_Song_Name,ltable_Artist_Name,ltable_Album_Name,ltable_Genre,ltable_Price,ltable_CopyRight,ltable_Time,ltable_Released,rtable_Song_Name,rtable_Artist_Name,rtable_Album_Name,rtable_Genre,rtable_Price,rtable_CopyRight,rtable_Time,rtable_Released,id
0,Elevator ( feat . Timbaland ),Flo Rida,Mail On Sunday ( Deluxe Version ),"Hip-Hop/Rap , Music , Dirty South",$ 1.99,2008 Atlantic Recording Corporation for the Un...,3:55,17-Mar-08,Money Right ( feat . Rick Ross & Brisco ) [ Ex...,Flo Rida,Mail On Sunday [ Explicit ],Rap & Hip-Hop,$ 1.29,2013 Warner Bros. . Records Inc.,3:17,"March 17 , 2008",454
1,The Woodland Realm ( Extended Version ),Howard Shore,The Hobbit : The Desolation of Smaug ( Origina...,"Soundtrack , Music , Soundtrack , Classical , ...",$ 1.29,‰ ãÑ 2013 WaterTower Music,5:14,10-Dec-13,The High Fells ( Extended Version ),Howard Shore,The Hobbit : The Desolation of Smaug ( Origina...,Soundtracks,$ 1.29,2013 WaterTower Music / Warner Bros. . Enterta...,3:38,"December 10 , 2013",493
2,Extra Extra Credit,Wiz Khalifa,Flight School,"Hip-Hop/Rap , Music",$ 0.99,2009 Rostrum Records,4:03,17-Apr-09,Extra Extra Credit [ Explicit ],Wiz Khalifa,Flight School [ Explicit ],Rap & Hip-Hop,$ 0.99,2013 Mad Decent,4:03,"April 17 , 2009",802
3,Toyfriend ( feat . Wynter Gordon ) [ Continuou...,David Guetta,One Love ( Deluxe Version ),"Dance , Music",$ 1.29,2010 Gum Prod licence exclusive Parlophone Mus...,2:51,21-Aug-09,Sound Of Letting Go ( Feat . Chris Willis ),David Guetta,One Love ( Deluxe Version ),Dance & Electronic,$ 1.29,( C ) 2014 Swedish House Mafia Holdings Ltd ( ...,3:47,"August 21 , 2009",249
4,Dangerous ( feat . Sam Martin ) [ Robin Schulz...,David Guetta,Listen ( Deluxe Version ),"Dance , Music , Rock , House , Electronic , Fr...",$ 1.29,2014 What A Music Ltd. under exclusive license...,3:20,24-Nov-14,Missing You ( Feat . Novel ; Continuous Mix Ve...,David Guetta,One Love ( Deluxe Version ),Dance & Electronic,$ 1.29,( C ) 2014 Swedish House Mafia Holdings Ltd ( ...,4:59,"August 21 , 2009",134


In [150]:
def wrap_dm(model,stringTuples,tempPath,attributes):
    """Classify serialized record pairs with a deepmatcher model.

    The strings are parsed with stringToRow, merged column-wise with
    mergeDictionary, written as CSV to ``tempPath`` (without the index) and
    fed through ``dm.data.process_unlabeled`` and ``model.run_prediction``.

    Parameters
    ----------
    model : object passed to ``dm.data.process_unlabeled`` and used for ``run_prediction``
    stringTuples : iterable of str
        Serialized pairs, one string per pair.
    tempPath : str
        Path of the CSV file written on every call (overwritten).
    attributes : list of str
        Attribute names forwarded to stringToRow.

    Returns
    -------
    list of int
        1 where the entry of ``predictions['match_score']`` is strictly
        greater than 0.5, otherwise 0, in prediction order.
    """
    parsed_rows = [stringToRow(serialized, attributes) for serialized in stringTuples]
    frame = pd.DataFrame.from_dict(mergeDictionary(parsed_rows))
    frame.to_csv(tempPath, index=False)
    unlabeled = dm.data.process_unlabeled(tempPath, model)
    scored = model.run_prediction(unlabeled)
    return [1 if score > 0.5 else 0 for score in scored['match_score']]

In [152]:
# Replace missing values with empty strings (fillna already returns a new frame).
test_withoutNan = test.fillna("")
stringTuples = []
# rowToString looks rows up by position (iloc), so iterate over positions
# rather than over the index labels that iterrows would yield.
for idx in range(len(test_withoutNan)):
    stringTuples.append(rowToString(test_withoutNan,idx))
# Classify every serialized test pair; the intermediate CSV goes to temp/wrapdm.csv.
predictions = wrap_dm(hybrid_model,stringTuples,'temp/wrapdm.csv',attributes)

===>  PREDICT Epoch 7
Finished Epoch 7 || Run Time:    0.5 | Load Time:    0.4 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00



## Explain predictions

In [154]:
# Human-readable labels, indexed by the 0/1 class returned by wrap_dm.
class_names = ["non-matching","matching"]

In [155]:
# Serialize from the NaN-free frame, like the batch cells do, so a missing
# value in the chosen row cannot reach the tokenizer.
tupleTest = rowToString(test_withoutNan,0)
print(tupleTest)

L0_Elevator L0_( L0_feat L0_. L0_Timbaland L0_) L1_Flo L1_Rida L2_Mail L2_On L2_Sunday L2_( L2_Deluxe L2_Version L2_) L3_Hip-Hop/Rap L3_, L3_Music L3_, L3_Dirty L3_South L4_$ L4_1.99 L5_2008 L5_Atlantic L5_Recording L5_Corporation L5_for L5_the L5_United L5_States L5_and L5_WEA L5_International L5_Inc. L5_for L5_the L5_world L5_outside L5_of L5_the L5_United L5_States L6_3:55 L7_17-Mar-08 R0_Money R0_Right R0_( R0_feat R0_. R0_Rick R0_Ross R0_& R0_Brisco R0_) R0_[ R0_Explicit R0_] R1_Flo R1_Rida R2_Mail R2_On R2_Sunday R2_[ R2_Explicit R2_] R3_Rap R3_& R3_Hip-Hop R4_$ R4_1.29 R5_2013 R5_Warner R5_Bros. R5_. R5_Records R5_Inc. R6_3:17 R7_March R7_17 R7_, R7_2008 


In [156]:
def predict_fn(tuples):
    """Classify serialized pairs with hybrid_model via wrap_dm, using temp/wrapdm.csv as scratch file."""
    return wrap_dm(hybrid_model,tuples,'temp/wrapdm.csv',attributes)

In [157]:
# Run the model once and derive both labels from the same prediction instead
# of predicting the same tuple twice.
predicted_class = predict_fn([tupleTest])[0]
pred = class_names[predicted_class]
alternative = class_names[1 - predicted_class]
print('Prediction: %s' % pred)

===>  PREDICT Epoch 7
Finished Epoch 7 || Run Time:    0.1 | Load Time:    0.0 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

===>  PREDICT Epoch 7
Finished Epoch 7 || Run Time:    0.1 | Load Time:    0.0 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

Prediction: non-matching


In [161]:
# Serialize from the NaN-free frame, like the batch cells do, then classify
# the first two test pairs in a single call.
tuplesTest = [rowToString(test_withoutNan,0),rowToString(test_withoutNan,1)]
predict_fn(tuplesTest)

===>  PREDICT Epoch 7
Finished Epoch 7 || Run Time:    0.1 | Load Time:    0.0 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00



[0, 0]

In [158]:
# Build the anchor explainer from the spaCy pipeline and the prediction function.
# NOTE(review): the recorded run of this cell ended in an AssertionError whose
# origin is not visible here. Possibly the constructor probes predict_fn with a
# plain sentence whose tokens lack the L<k>_/R<k>_ prefixes — confirm against
# the alibi and deepmatcher versions in use.
explainer = AnchorText(nlp, predict_fn)

AssertionError: 