In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')

In [3]:
import numpy as np
import pandas as pd
import deepmatcher as dm
import spacy
import random as rd
from collections import defaultdict
from alibi.explainers import AnchorText
from alibi.datasets import fetch_movie_sentiment
from alibi.utils.download import spacy_model
from utils.deepmatcher_utils import wrapDm

In [None]:
def makeAttr(attribute,idx,isLeft):
    attr_prefixed = []
    for token in attribute.split():
        if isLeft:
            attr_prefixed.append('L'+str(idx)+'_'+token)
        else:
            attr_prefixed.append('R'+str(idx)+'_'+token)
    return " ".join(attr_prefixed)

In [None]:
makeAttr('imperial red ale',2,isLeft=True)

In [None]:
def pairs_to_string(df,lprefix,rprefix,ignore_columns = ['id','label']):
    pairs_string = []
    l_columns = [col for col in list(df) if (col.startswith(lprefix)) and (col not in ignore_columns)]
    r_columns = [col for col in list(df) if col.startswith(rprefix) and (col not in ignore_columns)]
    df = df.fillna("")
    for i in range(len(df)):
        this_row = df.iloc[i]
        this_row_str = []
        for j,lattr in enumerate(l_columns):
            this_attr = makeAttr(this_row[lattr],j,isLeft=True)
            this_row_str.append(this_attr)
        for k,rattr in enumerate(r_columns):
            this_attr = makeAttr(this_row[rattr],k,isLeft=False)
            this_row_str.append(this_attr)
        pairs_string.append(" ".join(this_row_str))
    return pairs_string

In [None]:
def makeRow(pair_str,attributes,lprefix,rprefix):
    row_map = defaultdict(list)
    for token in pair_str.split():
        row_map[token[:2]].append(token[3:])
    row = {}
    for key in row_map.keys():
        if key.startswith('L'):
            ## key[1] is the index of attribute
            this_attr = lprefix+attributes[int(key[1])]
            row[this_attr] = " ".join(row_map[key])
        else:
            this_attr = rprefix+attributes[int(key[1])]
            row[this_attr] = " ".join(row_map[key])
    return pd.Series(row)

In [None]:
def pairs_str_to_df(pairs_str_l,columns,lprefix,rprefix):
    tuples = {}
    lschema = list(filter(lambda x: x.startswith(lprefix),columns))
    schema = {}
    for i, s in enumerate(lschema):
        schema[i] = s.replace(lprefix, "")
    allTuples = []
    for pair_str in pairs_str_l:
        row = makeRow(pair_str,schema,'ltable_','rtable_')
        allTuples.append(row)
    df = pd.DataFrame(allTuples)
    df['id'] = np.arange(len(df))
    return df

In [None]:
def pair_str_to_df(pair_str,columns,lprefix,rprefix):
    lschema = list(filter(lambda x: x.startswith(lprefix),columns))
    schema = {}
    for i, s in enumerate(lschema):
        schema[i] = s.replace(lprefix, "")
    row = makeRow(pair_str,schema,'ltable_','rtable_')
    row['id'] = 0
    return pd.DataFrame( data = [row.values],columns= row.index)

In [None]:
test_df = pd.read_csv('../datasets/Structured/itunes-amazon/merged_test.csv')
pairs_str_test = pairs_to_string(test_df,'ltable_','rtable_')

In [None]:
test_df_beer = pd.read_csv('../datasets/Structured/Beer/merged_test.csv')
pairs_str_testBeeer = pairs_to_string(test_df_beer,'ltable_','rtable_')
pairs_str_testBeeer[0]

In [None]:
pairs_str_to_df(pairs_str_testBeeer,test_df_beer.columns,'ltable_','rtable_')

## Load spacy model

In [4]:
model = 'en_core_web_md'
spacy_model(model=model)
nlp = spacy.load(model)

## Load deepmatcher and create wrapper function

In [7]:
from mojito import Mojito

In [5]:
hybrid_model = dm.MatchingModel(attr_summarizer='hybrid')
hybrid_model.load_state('../models/itunes_amazon_hybrid.pth')

In [8]:
test_df = pd.read_csv('../datasets/Structured/itunes-amazon/merged_test.csv')
mojito_itunes = Mojito(test_df.columns)

In [16]:
test_tuples_str = mojito_itunes.pair_of_tuples_to_str(test_df)
test_tuples_str[0]

'L0|Elevator L0|( L0|feat L0|. L0|Timbaland L0|) L1|Flo L1|Rida L2|Mail L2|On L2|Sunday L2|( L2|Deluxe L2|Version L2|) L3|Hip-Hop/Rap L3|, L3|Music L3|, L3|Dirty L3|South L4|$ L4|1.99 L5|2008 L5|Atlantic L5|Recording L5|Corporation L5|for L5|the L5|United L5|States L5|and L5|WEA L5|International L5|Inc. L5|for L5|the L5|world L5|outside L5|of L5|the L5|United L5|States L6|3:55 L7|17-Mar-08 R0|Money R0|Right R0|( R0|feat R0|. R0|Rick R0|Ross R0|& R0|Brisco R0|) R0|[ R0|Explicit R0|] R1|Flo R1|Rida R2|Mail R2|On R2|Sunday R2|[ R2|Explicit R2|] R3|Rap R3|& R3|Hip-Hop R4|$ R4|1.29 R5|2013 R5|Warner R5|Bros. R5|. R5|Records R5|Inc. R6|3:17 R7|March R7|17 R7|, R7|2008'

In [51]:
def wrap_dm(model,stringTuples):
    ##if stringTuples[0]=='Hello world':
        ##return np.array([1,0])
    df = mojito_itunes.str_to_pair_of_tuples(stringTuples)
    df['id'] = np.arange(len(df))
    predictions = wrapDm(df,model)
    if predictions.shape==(2,):
        return np.array([np.argmax(predictions)])
    else:
        return np.argmax(predictions,axis=1)

## Explain predictions

In [34]:
!pip install -q anchor_exp

In [35]:
from anchor import anchor_text

In [14]:
class_names = ["non-matching","matching"]
predict_fn = lambda tuples : wrap_dm(hybrid_model,tuples)

In [36]:
explainer = anchor_text.AnchorText(nlp,['non-matching','matching'],use_unk_distribution=False)

In [66]:
exp = explainer.explain_instance(test_tuples_str[0], predict_fn, threshold=0.95, use_proba=False)

  self.as_list = [s for s in splitter.split(self.raw) if s]


In [58]:
pred = explainer.class_names[predict_fn([test_tuples_str[0]])[0]]
alternative =  explainer.class_names[1 -predict_fn([test_tuples_str[0]])[0]]

  self.as_list = [s for s in splitter.split(self.raw) if s]


In [67]:
print('Anchor: %s' % (' AND '.join(exp.names())))
print('Precision: %.2f' % exp.precision())
print()
print('Examples where anchor applies and model predicts %s:' % pred)
print()
print('\n'.join([x[0] for x in exp.examples(only_same_prediction=True)]))
print()
print('Examples where anchor applies and model predicts %s:' % alternative)
print()
print('\n'.join([x[0] for x in exp.examples(partial_index=0, only_different_prediction=True)]))

Anchor: R7|2008 AND R7|17 AND L5|for AND L4|1.99 AND ( AND L5|and AND R0|Right AND L5|Inc AND ) AND R0|Explicit AND R0| AND R0| AND L4|$ AND L5|of AND L3| AND L0|Timbaland AND L2|Deluxe AND L2| AND L5|States AND L5|States AND R1|Rida AND L5|outside AND R0| AND R5|2013 AND . AND ) AND - AND R2|Mail AND ] AND . AND L5|Recording AND L5|International AND . AND R2| AND [ AND R5|Bros AND Rap AND . AND L0| AND ) AND R0|Money AND R4|1.29 AND L0| AND L0| AND R5|Warner AND R2| AND L2| AND R5|Records AND , AND R0| AND , AND ] AND L5|the AND L7|17-Mar-08 AND L6|3:55 AND Hop AND R2|On AND . AND R0|feat AND L5|United AND R7| AND L5|United AND L5|Corporation AND - AND L5|for AND R1|Flo AND / AND L3|Dirty AND L2|Version AND L5|WEA AND L5|the AND L3|Hip AND R7|March AND & AND R3|Hip AND L2|Sunday AND L3| AND R3|Rap AND R0| AND R0|Ross AND R6|3:17 AND L5|2008 AND L0|feat AND R5|Inc AND R4|$ AND L0|Elevator AND . AND R5| AND L1|Flo AND , AND ( AND L5|Atlantic AND L3|Music AND R3| AND R2|Explicit AND L5|w

In [68]:
pred = explainer.class_names[predict_fn([test_tuples_str[20]])[0]]
alternative =  explainer.class_names[1 -predict_fn([test_tuples_str[20]])[0]]

  self.as_list = [s for s in splitter.split(self.raw) if s]


In [70]:
exp = explainer.explain_instance(test_tuples_str[20], predict_fn, threshold=0.95, use_proba= False)

  self.as_list = [s for s in splitter.split(self.raw) if s]


In [71]:
print('Anchor: %s' % (' AND '.join(exp.names())))
print('Precision: %.2f' % exp.precision())
print()
print('Examples where anchor applies and model predicts %s:' % pred)
print()
print('\n'.join([x[0] for x in exp.examples(only_same_prediction=True)]))
print()
print('Examples where anchor applies and model predicts %s:' % alternative)
print()
print('\n'.join([x[0] for x in exp.examples(partial_index=0, only_different_prediction=True)]))

Anchor: 
Precision: 1.00

Examples where anchor applies and model predicts non-matching:



Examples where anchor applies and model predicts matching:



IndexError: list index out of range