In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')

In [3]:
import numpy as np
import pandas as pd
import deepmatcher as dm
import spacy
import random as rd
from collections import defaultdict
from utils.deepmatcher_utils import wrapDm

In [None]:
def makeAttr(attribute,idx,isLeft):
    """Prefix every whitespace-separated token of an attribute value.

    Each token becomes ``L<idx>_<token>`` (left record) or ``R<idx>_<token>``
    (right record), and the prefixed tokens are joined with single spaces.

    Parameters
    ----------
    attribute : the attribute value. Non-string values (e.g. numbers read
        from a CSV without ``dtype=str``) are converted with ``str``;
        ``None`` and float NaN are treated as an empty value.
    idx : position of the attribute in its record's schema.
    isLeft : True for the left record, False for the right one.

    Returns
    -------
    str : the prefixed tokens, or ``""`` when there are no tokens.
    """
    # Missing values contribute no tokens. NaN is the only float that is
    # not equal to itself.
    if attribute is None or (isinstance(attribute, float) and attribute != attribute):
        return ""
    prefix = ('L' if isLeft else 'R') + str(idx) + '_'
    # str() keeps string input unchanged and avoids AttributeError on numbers.
    return " ".join(prefix + token for token in str(attribute).split())

In [None]:
# Quick check of the prefixing helper: three tokens, attribute index 2, left side.
makeAttr('imperial red ale',2,isLeft=True)

In [None]:
def pairs_to_string(df,lprefix,rprefix,ignore_columns = ['id','label']):
    """Serialize every row of a pair DataFrame into one prefixed-token string.

    Columns whose name starts with ``lprefix`` / ``rprefix`` (and is not in
    ``ignore_columns``) are taken as the left / right attributes. Each
    attribute value is passed through ``makeAttr`` with its position inside
    its own side, left attributes first, and the pieces are joined with a
    single space. Missing values are replaced by ``""`` beforehand.

    Returns a list with one string per row of ``df``.
    """
    def _columns_with(prefix):
        # Column names on one side of the pair, in DataFrame order.
        return [name for name in list(df)
                if name.startswith(prefix) and name not in ignore_columns]

    left_cols = _columns_with(lprefix)
    right_cols = _columns_with(rprefix)
    filled = df.fillna("")

    serialized = []
    for pos in range(len(filled)):
        record = filled.iloc[pos]
        pieces = [makeAttr(record[name], n, isLeft=True)
                  for n, name in enumerate(left_cols)]
        pieces += [makeAttr(record[name], n, isLeft=False)
                   for n, name in enumerate(right_cols)]
        serialized.append(" ".join(pieces))
    return serialized

In [None]:
def makeRow(pair_str,attributes,lprefix,rprefix):
    """Rebuild one record pair from its prefixed-token string.

    Every token is expected to look like ``L<idx>_<word>`` or
    ``R<idx>_<word>``. Words sharing the same ``L<idx>`` / ``R<idx>`` tag are
    joined with spaces, in order of appearance, and stored under
    ``lprefix + attributes[idx]`` or ``rprefix + attributes[idx]``.

    Parameters
    ----------
    pair_str : whitespace-separated prefixed tokens.
    attributes : mapping (or sequence) from integer attribute index to the
        attribute name without its side prefix.
    lprefix, rprefix : column-name prefixes for the left / right record.

    Returns
    -------
    pandas.Series keyed by the rebuilt column names. Attributes that have no
    token in ``pair_str`` are absent from the result.
    """
    row_map = defaultdict(list)
    for token in pair_str.split():
        # Split on the FIRST underscore only: this keeps multi-digit indices
        # ("L10_x") intact and preserves underscores inside the word itself.
        tag, _, word = token.partition('_')
        row_map[tag].append(word)
    row = {}
    for tag, words in row_map.items():
        side_prefix = lprefix if tag.startswith('L') else rprefix
        # tag[1:] is the (possibly multi-digit) attribute index.
        row[side_prefix + attributes[int(tag[1:])]] = " ".join(words)
    return pd.Series(row)

In [None]:
def pairs_str_to_df(pairs_str_l,columns,lprefix,rprefix):
    """Convert a list of prefixed-token pair strings back into a DataFrame.

    The attribute schema is derived from the entries of ``columns`` that
    start with ``lprefix``: their position among the left columns is the
    attribute index, and their name with the leading ``lprefix`` removed is
    the attribute name. Each string is rebuilt with ``makeRow`` using the
    ``lprefix`` / ``rprefix`` given here.

    Returns a DataFrame with one row per input string plus an ``id`` column
    numbered 0..n-1.
    """
    left_columns = [col for col in columns if col.startswith(lprefix)]
    # Slice instead of str.replace so only the leading prefix is removed.
    schema = {pos: col[len(lprefix):] for pos, col in enumerate(left_columns)}
    # Forward the caller's prefixes rather than hard-coding 'ltable_'/'rtable_'.
    rows = [makeRow(pair_str, schema, lprefix, rprefix) for pair_str in pairs_str_l]
    df = pd.DataFrame(rows)
    df['id'] = np.arange(len(df))
    return df

In [None]:
def pair_str_to_df(pair_str,columns,lprefix,rprefix):
    """Convert a single prefixed-token pair string into a one-row DataFrame.

    The attribute schema is derived from the entries of ``columns`` that
    start with ``lprefix`` (position among the left columns -> name without
    the leading prefix). The row is rebuilt with ``makeRow`` using the
    ``lprefix`` / ``rprefix`` given here, and an ``id`` of 0 is added.
    """
    left_columns = [col for col in columns if col.startswith(lprefix)]
    # Slice instead of str.replace so only the leading prefix is removed.
    schema = {pos: col[len(lprefix):] for pos, col in enumerate(left_columns)}
    # Forward the caller's prefixes rather than hard-coding 'ltable_'/'rtable_'.
    row = makeRow(pair_str, schema, lprefix, rprefix)
    row['id'] = 0
    return pd.DataFrame(data=[row.values], columns=row.index)

In [None]:
# Load the iTunes-Amazon test pairs and serialize each pair into one
# prefixed-token string (left/right columns are identified by their prefix).
test_df = pd.read_csv('../datasets/Structured/itunes-amazon/merged_test.csv')
pairs_str_test = pairs_to_string(test_df,'ltable_','rtable_')

In [None]:
# Same serialization on the Beer test pairs; the last line displays the
# first serialized pair for inspection.
test_df_beer = pd.read_csv('../datasets/Structured/Beer/merged_test.csv')
pairs_str_testBeeer = pairs_to_string(test_df_beer,'ltable_','rtable_')
pairs_str_testBeeer[0]

In [None]:
# Round-trip check: rebuild the Beer pairs from their string form.
# Show only the first rows instead of dumping the whole frame.
pairs_str_to_df(pairs_str_testBeeer,test_df_beer.columns,'ltable_','rtable_').head()

## Load spaCy model

In [6]:
%%capture
# Run this cell only if the en_core_web_lg model is not installed yet;
# %%capture keeps the download log out of the notebook.
!python -m spacy download en_core_web_lg

In [7]:
# spaCy pipeline handed to AnchorText when the explainers are built.
nlp = spacy.load('en_core_web_lg')

## Load DeepMatcher and create wrapper function

In [None]:
from mojito import Mojito

In [None]:
# Build a DeepMatcher model with the 'hybrid' attribute summarizer and
# restore its saved state from disk.
hybrid_model = dm.MatchingModel(attr_summarizer='hybrid')
hybrid_model.load_state('../models/itunes_amazon_hybrid.pth')

In [None]:
# Re-read the iTunes-Amazon test pairs and build a Mojito helper from the
# frame's column names.
test_df = pd.read_csv('../datasets/Structured/itunes-amazon/merged_test.csv')
mojito_itunes = Mojito(test_df.columns)

In [None]:
# Serialize every test pair to a string with Mojito and display the first one.
test_tuples_str = mojito_itunes.pair_of_tuples_to_str(test_df)
test_tuples_str[0]

In [None]:
def wrap_dm(model,stringTuples):
    """Predict a class index for each serialized pair in ``stringTuples``.

    The strings are turned back into a DataFrame with the notebook-level
    ``mojito_itunes`` helper, given sequential ids, and scored with
    ``wrapDm``. The result is always a 1-D array of argmax indices: a single
    prediction of shape ``(2,)`` is wrapped into a one-element array,
    anything else is reduced along axis 1.
    """
    pairs_df = mojito_itunes.str_to_pair_of_tuples(stringTuples)
    pairs_df['id'] = np.arange(len(pairs_df))
    predictions = wrapDm(pairs_df,model)

    single_prediction = predictions.shape==(2,)
    if not single_prediction:
        return np.argmax(predictions,axis=1)
    return np.array([np.argmax(predictions)])

## Explain predictions

In [None]:
!pip install -q anchor_exp

In [None]:
from anchor import anchor_text

In [None]:
# Label names, listed in the order of the class index returned by wrap_dm.
class_names = ["non-matching","matching"]
# Classifier callable for Anchor: list of serialized pairs -> class indices.
predict_fn = lambda tuples : wrap_dm(hybrid_model,tuples)

In [None]:
# Text explainer built on the spaCy pipeline; the label list repeats the
# order used for class_names.
explainer = anchor_text.AnchorText(nlp,['non-matching','matching'],use_unk_distribution=False)

In [None]:
# Explain the model's prediction for the first serialized test pair.
exp = explainer.explain_instance(test_tuples_str[0], predict_fn, threshold=0.95, use_proba=False)

In [None]:
# Run the model once and reuse the predicted class index for both labels
# (two classes, so the alternative is index 1 - pred_idx).
pred_idx = predict_fn([test_tuples_str[0]])[0]
pred = explainer.class_names[pred_idx]
alternative = explainer.class_names[1 - pred_idx]

In [None]:
# Run the model once and reuse the predicted class index for both labels
# (two classes, so the alternative is index 1 - pred_idx).
pred_idx = predict_fn([test_tuples_str[20]])[0]
pred = explainer.class_names[pred_idx]
alternative = explainer.class_names[1 - pred_idx]

In [None]:
# Explain the model's prediction for the serialized test pair at position 20.
exp = explainer.explain_instance(test_tuples_str[20], predict_fn, threshold=0.95, use_proba= False)

## Test another wrapper function

### Define functions

In [8]:
from anchor import anchor_text

In [9]:
def records_to_texts(df):
    """Flatten every row of ``df`` into a single space-joined string.

    All columns except ``'id'`` and ``'label'`` are concatenated in column
    order. Values must already be strings (they are handed to ``str.join``).

    Returns a list with one string per row.
    """
    skipped = ['id','label']
    texts = []
    for pos in range(len(df.index)):
        record = df.iloc[pos]
        values = [record[name] for name in record.index if name not in skipped]
        texts.append(" ".join(values))
    return texts

In [10]:
def getMapping(df):
    """Record how many whitespace tokens each attribute of each row holds.

    For every row, the token counts of all columns except ``'id'`` and
    ``'label'`` are collected in column order and stored under the row's
    ``'id'`` value.

    Returns a dict ``{row id: [token count per attribute]}``.
    """
    skipped = ['id','label']
    token_counts = {}
    for pos in range(len(df.index)):
        record = df.iloc[pos]
        token_counts[record['id']] = [
            len(record[name].split())
            for name in record.index
            if name not in skipped
        ]
    return token_counts

In [11]:
def buildPairFromMapping(text,mapping,attributes):
    """Cut a flattened record back into its attribute values.

    ``mapping`` lists, per attribute, how many whitespace tokens of ``text``
    belong to it; consecutive token runs of those lengths are re-joined with
    spaces.

    Returns a pandas.Series indexed by ``attributes`` holding the rebuilt
    values.
    """
    tokens = text.split()
    values = []
    start = 0
    for count in mapping:
        stop = start + count
        values.append(" ".join(tokens[start:stop]))
        start = stop
    return pd.Series(data=values,index=attributes)

In [12]:
# Read everything as strings and blank out missing values so the text
# helpers can call str.split / str.join on every cell.
test = pd.read_csv('../datasets/Structured/itunes-amazon/merged_test.csv',dtype=str).fillna("")
attributes = [attr for attr in test.columns if attr not in ['id','label']]
# Token counts per attribute, keyed by record id.
mapping = getMapping(test)

In [13]:
# Flatten every test pair into one string; display the one at position 10.
text_records = records_to_texts(test)
text_records[10]

'VHS Outro ( Interlude ) X Ambassadors VHS Alternative , Music , Rock , Adult Alternative $ 1.29 2015 KIDinaKORNER/Interscope Records 1:25 30-Jun-15 Moving Day ( Interlude ) X Ambassadors VHS [ Explicit ] Alternative Rock $ 1.29 ( C ) 2015 KIDinaKORNER/Interscope Records 0:19 June 30 , 2015'

In [14]:
# Round-trip check: rebuild the pair at position 10 from its flat text and
# its own token-count mapping.
row0 = buildPairFromMapping(text_records[10],mapping[test.id.values[10]],attributes)
row0

ltable_Song_Name                             VHS Outro ( Interlude )
ltable_Artist_Name                                     X Ambassadors
ltable_Album_Name                                                VHS
ltable_Genre          Alternative , Music , Rock , Adult Alternative
ltable_Price                                                  $ 1.29
ltable_CopyRight                2015 KIDinaKORNER/Interscope Records
ltable_Time                                                     1:25
ltable_Released                                            30-Jun-15
rtable_Song_Name                            Moving Day ( Interlude )
rtable_Artist_Name                                     X Ambassadors
rtable_Album_Name                                   VHS [ Explicit ]
rtable_Genre                                        Alternative Rock
rtable_Price                                                  $ 1.29
rtable_CopyRight          ( C ) 2015 KIDinaKORNER/Interscope Records
rtable_Time                       

In [15]:
# Same reconstruction for the pair at position 5.
row1 = buildPairFromMapping(text_records[5],mapping[test.id.values[5]],attributes)

In [66]:
def predictFromMapping(model,texts,mapping,attributes):
    """Predict a class index for each flattened record text.

    Every text is cut back into attribute values with
    ``buildPairFromMapping`` (all texts share the same ``mapping``), the rows
    are assembled into a DataFrame and scored with ``wrapDm``.

    Returns a 1-D array of argmax indices. When exactly one text is given,
    the argmax is taken along axis 0 and wrapped in an array; otherwise it
    is taken along axis 1.
    """
    rebuilt_rows = [buildPairFromMapping(text,mapping,attributes) for text in texts]
    pairs_df = pd.DataFrame(data=rebuilt_rows)
    predictions = wrapDm(pairs_df,model)

    if len(texts)!=1:
        return np.argmax(predictions,axis=1)
    return np.array([np.argmax(predictions,axis=0)])

In [17]:
# Hybrid DeepMatcher model restored from the same state file that is
# loaded into hybrid_model earlier in the notebook.
itunes_model = dm.MatchingModel(attr_summarizer='hybrid')
itunes_model.load_state('../models/itunes_amazon_hybrid.pth')

In [68]:
# Batch call: the first record twice, to exercise the multi-text branch.
pred = predictFromMapping(itunes_model,[text_records[0],text_records[0]],mapping[test.id.values[0]],attributes)
pred

array([0, 0])

In [55]:
# Single-text call, which takes the len(texts)==1 branch.
# NOTE(review): this cell's execution count (55) is lower than the one on the
# predictFromMapping definition (66), so its saved output may come from
# an earlier version of the function. Re-run to confirm.
pred = predictFromMapping(itunes_model,[text_records[2]],mapping[test.id.values[2]],attributes)
pred

array([0.01244676, 0.98755324])

In [69]:
# Class names are listed in the order of the class index returned by the
# predict function. Keep the same order as the first explainer in this
# notebook (index 0 = 'non-matching', index 1 = 'matching'), which uses the
# same model state file.
explainer = anchor_text.AnchorText(nlp, ['non-matching', 'matching'], use_unk_distribution=False)

In [70]:
# Classifier callable for Anchor, bound to the token-count mapping of the
# record at position 0 (every perturbed text is cut with that mapping).
predict_fn = lambda texts: predictFromMapping(itunes_model,texts,mapping[test.id.values[0]],attributes)

In [71]:
# Explain the prediction for the record at position 0; predict_fn must be
# the one bound to this same record's mapping.
text = text_records[0]
exp = explainer.explain_instance(text, predict_fn, threshold=0.95, use_proba=True)

  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _

In [74]:
# Names reported by the explanation object for the record at position 0.
exp.names()

[]

In [75]:
# Rebind the classifier callable to the mapping of the record at position 2.
predict_fn = lambda texts: predictFromMapping(itunes_model,texts,mapping[test.id.values[2]],attributes)

In [76]:
# Explain the prediction for the record at position 2; predict_fn must be
# the one bound to this same record's mapping.
text = text_records[2]
exp = explainer.explain_instance(text, predict_fn, threshold=0.95, use_proba=True)

  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _

In [78]:
# Number of names reported by the explanation for the record at position 2.
len(exp.names())

49

In [82]:
# Rebind the classifier callable to the mapping of the record at position 10.
predict_fn = lambda texts: predictFromMapping(itunes_model,texts,mapping[test.id.values[10]],attributes)

In [None]:
# Explain the prediction for the record at position 10; predict_fn must be
# the one bound to this same record's mapping.
text = text_records[10]
exp = explainer.explain_instance(text, predict_fn, threshold=0.95, use_proba=True)

  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _run_code(code, main_globals, None,
  return _