In [1]:
# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
np.random.seed(42)
import random
random.seed(42)

import time
from pathlib import Path

from pdb import set_trace

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
from model.model import DistilBertModelLogit, BertModelLogit, JointBertModelLogit, RobertaModelLogit

from transformers import AutoTokenizer

import deepmatcher as dm
import string
import contextlib

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
# Columns dropped from a pair frame before it reaches a model wrapper: the label,
# the pair key, and the per-side attributes listed below (brand and title are kept,
# since the wrappers build their input text from them).
IGNORE_COLUMNS = ['label', 'pair_id'] + [
    f'{side}_{attribute}'
    for attribute in (
        'identifiers',
        'cluster_id',
        'category',
        'id',
        'price',
        'description',
        'specTableContent',
        'keyValuePairs',
    )
    for side in ('ltable', 'rtable')
]

# Run on the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

# Inference batch size; keep exactly one line active, matching the model in use.
#BATCH_SIZE = 5096 #deepmatcher
BATCH_SIZE = 64 #bert
#BATCH_SIZE = 32 #jointbert
#BATCH_SIZE = 256 #distilbert
#BATCH_SIZE = 64 #roberta



def _brand_title_sequences(dataframe, ignore_columns):
    """Build the left/right text inputs for the tokenizer.

    Drops every column of ``ignore_columns`` that is present, then concatenates
    ``<side>_brand + ' ' + <side>_title`` for both sides and collapses runs of
    whitespace. The caller's frame is not modified (``drop`` returns a new frame).

    Returns:
        (left, right): two equally long lists of strings, one entry per row.
    """
    frame = dataframe.drop([c for c in ignore_columns if c in dataframe.columns], axis = 1)
    sequences = []
    for side in ('ltable', 'rtable'):
        text = frame[side + '_brand'] + ' ' + frame[side + '_title']
        # split() + join(' ') strips the ends and squeezes repeated whitespace.
        sequences.append(text.str.split().str.join(' ').tolist())
    return sequences[0], sequences[1]


def _binary_head_of_three(outputs):
    """Return the first of the three outputs produced by the JointBERT model."""
    output_binary, _output_multi1, _output_multi2 = outputs
    return output_binary


def _wrap_transformer(model, tokenizer_name, ignore_columns, use_token_type_ids, extract_logits=None):
    """Shared implementation behind the ``wrap_*`` transformer wrappers.

    Args:
        model: callable taking ``(input_ids, [token_type_ids,] attention_mask)``.
        tokenizer_name: name passed to ``AutoTokenizer.from_pretrained``.
        ignore_columns: columns dropped from the incoming frame when present.
        use_token_type_ids: whether ``token_type_ids`` is passed as the second
            positional argument of ``model``.
        extract_logits: optional callable applied to the raw model output to
            select the tensor that is passed through the sigmoid.

    Returns:
        ``wrapper(dataframe) -> np.ndarray`` of shape ``(len(dataframe), 2)``
        whose columns are ``1 - p`` and ``p``, with ``p = sigmoid(model output)``.
    """
    # Loaded once per wrapper instead of once per batch: it does not depend on the data.
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def wrapper(dataframe):
        left, right = _brand_title_sequences(dataframe, ignore_columns)

        chunks = []
        # Inference only: skip autograd bookkeeping so activations are not retained.
        with torch.no_grad():
            # BATCH_SIZE is read at call time, so changing the config constant takes effect.
            for start in range(0, len(left), BATCH_SIZE):
                stop = start + BATCH_SIZE
                batch = tokenizer(left[start:stop], right[start:stop], padding=True, truncation='longest_first', return_tensors="pt")
                inputs = [batch['input_ids'].to(DEVICE)]
                if use_token_type_ids:
                    inputs.append(batch['token_type_ids'].to(DEVICE))
                inputs.append(batch['attention_mask'].to(DEVICE))
                output = model(*inputs)
                if extract_logits is not None:
                    output = extract_logits(output)
                chunks.append(output.sigmoid().detach().to('cpu'))

        if not chunks:
            # Empty input frame: return an empty (0, 2) result instead of failing.
            return np.empty((0, 2), dtype=np.float32)

        out_proba = torch.cat(chunks, dim=0).reshape(-1).numpy()
        # column_stack always yields (n_rows, 2); the previous dstack(...).squeeze()
        # collapsed a single-row input to shape (2,).
        return np.column_stack((1 - out_proba, out_proba))
    return wrapper


def wrap_distilbert(model, ignore_columns = IGNORE_COLUMNS):
    """Wrap a DistilBERT-style model as ``wrapper(dataframe) -> (n_rows, 2)`` probabilities.

    Uses the 'distilbert-base-uncased' tokenizer and calls
    ``model(input_ids, attention_mask)``.
    """
    return _wrap_transformer(model, 'distilbert-base-uncased', ignore_columns, use_token_type_ids=False)


def wrap_bert(model, ignore_columns = IGNORE_COLUMNS):
    """Wrap a BERT-style model as ``wrapper(dataframe) -> (n_rows, 2)`` probabilities.

    Uses the 'bert-base-uncased' tokenizer and calls
    ``model(input_ids, token_type_ids, attention_mask)``.
    """
    return _wrap_transformer(model, 'bert-base-uncased', ignore_columns, use_token_type_ids=True)


def wrap_jointbert(model, ignore_columns = IGNORE_COLUMNS):
    """Wrap a JointBERT-style model as ``wrapper(dataframe) -> (n_rows, 2)`` probabilities.

    Uses the 'bert-base-uncased' tokenizer and calls
    ``model(input_ids, token_type_ids, attention_mask)``; the model must return
    exactly three outputs, of which only the first is used.
    """
    return _wrap_transformer(model, 'bert-base-uncased', ignore_columns, use_token_type_ids=True, extract_logits=_binary_head_of_three)


def wrap_roberta(model, ignore_columns = IGNORE_COLUMNS):
    """Wrap a RoBERTa-style model as ``wrapper(dataframe) -> (n_rows, 2)`` probabilities.

    Uses the 'roberta-base' tokenizer and calls ``model(input_ids, attention_mask)``.
    """
    return _wrap_transformer(model, 'roberta-base', ignore_columns, use_token_type_ids=False)

def wrap_deepmatcher(model, ignore_columns = IGNORE_COLUMNS):
    """Wrap a trained deepmatcher model as ``wrapper(dataframe) -> (n_rows, 2)`` probabilities.

    The wrapper writes the frame (minus ``ignore_columns``, plus a running ``_id``
    column) to a temporary CSV in the working directory, lets deepmatcher process
    and score it with stdout silenced, and returns ``[1 - match_score, match_score]``
    per row. The temporary file is removed even when prediction raises.
    """
    def wrapper(dataframe):
        import tempfile  # local import: only this wrapper needs it

        dataframe = dataframe.drop([c for c in ignore_columns if c in dataframe.columns], axis = 1)

        dataframe['_id'] = np.arange(len(dataframe))

        # mkstemp guarantees a unique name and, unlike random.choice, does not
        # consume the seeded global `random` stream.
        fd, tmp_name = tempfile.mkstemp(suffix='.csv', dir='.')
        os.close(fd)
        try:
            dataframe.to_csv(tmp_name, index = False)

            with open(os.devnull, 'w') as devnull:
                with contextlib.redirect_stdout(devnull):
                    data_processed = dm.data.process_unlabeled(tmp_name, trained_model = model)
                    out_proba = model.run_prediction(data_processed, batch_size=BATCH_SIZE, output_attributes = True)
                    out_proba = out_proba['match_score'].values.reshape(-1)
        finally:
            # Always clean up, including when deepmatcher fails part-way.
            os.remove(tmp_name)

        # column_stack keeps the result 2-D even for a single row, where the
        # previous dstack(...).squeeze() returned shape (2,).
        multi_proba = np.column_stack((1 - out_proba, out_proba))
        return multi_proba
    return wrapper

In [5]:
# Load the new computers gold standard, blank out missing values and lower-case
# every string cell.
# NOTE(review): read_pickle executes pickled code; only load trusted files.
computers_newgs = (
    pd.read_pickle('../../data/interim/wdc-lspc/gold-standards/preprocessed_computers_new_testset_1500.pkl.gz')
    .fillna('')
    .applymap(lambda s:s.lower() if type(s) == str else s)
)

# Rename "<attr>_left" / "<attr>_right" to "ltable_<attr>" / "rtable_<attr>";
# columns without either marker keep their name.
columns = computers_newgs.columns
renamed_columns = []
for col in columns:
    for suffix, prefix in (('_left', 'ltable_'), ('_right', 'rtable_')):
        if suffix in col:
            col = prefix + col.replace(suffix, '')
            break
    renamed_columns.append(col)

computers_newgs.columns = renamed_columns
# Index by pair_id but keep it available as a regular column too.
computers_newgs = computers_newgs.set_index('pair_id', drop=False)

computers_newgs.head()

Unnamed: 0_level_0,ltable_id,ltable_cluster_id,ltable_category,ltable_title,ltable_description,ltable_brand,ltable_price,ltable_keyValuePairs,ltable_specTableContent,rtable_id,rtable_cluster_id,rtable_category,rtable_title,rtable_description,rtable_brand,rtable_price,rtable_keyValuePairs,rtable_specTableContent,label,pair_id
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
4210789#6428629,4210789,533129,computers_and_accessories,intel x520-da2 pci express 2.0 network adapter,"intel's family of adapter, the intel' ethernet...",,"$, 333.16",,,6428629,533129,computers_and_accessories,intel x520-da2 dual port gigabit sfp ethernet ...,2 port intel e10g42btda 10 gigabit sfp etherne...,,,,,1,4210789#6428629
8813989#4435647,8813989,55385,computers_and_accessories,zotac geforce gtx 1070ti amp extreme 8192mb gd...,"zt-p10710b-10p, core clock: 1607mhz, boost clo...",zotac,,,,4435647,55385,computers_and_accessories,zotac nvidia geforce gtx 1070 ti 8gb amp! extr...,zotac geforce gtx 1070 ti amp! extreme edition...,,,,,1,8813989#4435647
4132226#2095262,4132226,69927,computers_and_accessories,tp-link tl-wps510u 150mbps wireless print serv...,tp-link 54mbps pocket-sized wireless print ser...,tp-link,,,,2095262,69927,computers_and_accessories,tp-link tl-wps510u - print server,print servers give businesses the ability to s...,,"$, 71.81",,,1,4132226#2095262
12056462#11152603,12056462,1380893,computers_and_accessories,logitech - mk550 wireless wave keyboard and mo...,logitech mk550 wireless wave keyboard and mous...,,,,,11152603,1380893,computers_and_accessories,logitech mk550 wireless wave keyboard and mous...,the logitech mk550 wireless wave keyboard and ...,logitech,,,,1,12056462#11152603
8636740#16302898,8636740,104025,computers_and_accessories,western digital - blue 500gb 2.5 solid state d...,,western digital,,,,16302898,104025,computers_and_accessories,buy online | wd blue pc ssd 500gb sata iii 2.5...,wd blue pc ssd 500gb sata iii 2.5″ wds500g1b0a,,,,weight 0.1 kg brand western digital model wds5...,1,8636740#16302898


In [6]:
# Load the original computers gold standard, blank out missing values and
# lower-case every string cell.
# NOTE(review): read_pickle executes pickled code; only load trusted files.
computers_oldgs = (
    pd.read_pickle('../../data/interim/wdc-lspc/gold-standards/preprocessed_computers_gs.pkl.gz')
    .fillna('')
    .applymap(lambda s:s.lower() if type(s) == str else s)
)

# Rename "<attr>_left" / "<attr>_right" to "ltable_<attr>" / "rtable_<attr>";
# columns without either marker keep their name.
columns = computers_oldgs.columns
renamed_columns = []
for col in columns:
    for suffix, prefix in (('_left', 'ltable_'), ('_right', 'rtable_')):
        if suffix in col:
            col = prefix + col.replace(suffix, '')
            break
    renamed_columns.append(col)

computers_oldgs.columns = renamed_columns
# Index by pair_id but keep it available as a regular column too.
computers_oldgs = computers_oldgs.set_index('pair_id', drop=False)

computers_oldgs.head()

Unnamed: 0_level_0,ltable_id,ltable_category,ltable_cluster_id,ltable_identifiers,rtable_id,rtable_category,rtable_cluster_id,rtable_identifiers,label,pair_id,...,ltable_description,rtable_description,ltable_keyValuePairs,rtable_keyValuePairs,ltable_price,rtable_price,ltable_specTableContent,rtable_specTableContent,ltable_title,rtable_title
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
581109#16637861,581109,computers_and_accessories,1324529,"[{'/mpn': '[gvrx480g1gaming4gd]'}, {'/gtin13':...",16637861,computers_and_accessories,107415,"[{'/mpn': '[gvrx550gamingoc2gd]'}, {'/gtin13':...",0,581109#16637861,...,"gv-rx480g1 gaming-4gd, core clock: 1202mhz, bo...","gv-rx550gaming oc-2gd, boost: 1219mhz, memory:...",,,,,,,gigabyte radeon rx 480 g1 gaming 4096mb gddr5 ...,gigabyte radeon rx 550 gaming oc 2048mb gddr5 ...
3083228#3424944,3083228,computers_and_accessories,640007,"[{'/mpn': '[9hlf4lbdbe]'}, {'/gtin13': '[47187...",3424944,computers_and_accessories,640007,"[{'/mpn': '[9hlf4lbdbe]'}, {'/gtin13': '[47187...",1,3083228#3424944,...,more>>> short summary description benq zowie r...,every detail of the rl-series is designed and ...,,,,,,,benq zowie rl2455 24 full hd tn grey computer ...,zowie rl2455 e-sports 24 full hd led monitor 2...
5942105#770253,5942105,computers_and_accessories,1464841,[{'/sku': '[mgeq2ypa]'}],770253,computers_and_accessories,83732,[{'/sku': '[mgen2ypa]'}],0,5942105#770253,...,apple mac mini - dts - 1 x core i5 2.8 ghz - r...,,,,,,,,apple mac mini 2.8ghz intel core i5 mac mini q...,mac mini qc i5 2.6ghz/8gb/1tb/iris graphics hp...
1282014#16999524,1282014,computers_and_accessories,173820,[{'/mpn': '[tlsf1008d]'}],16999524,other_electronics,14668779,"[{'/gtin8': '[39121011]'}, {'/mpn': '[eco750up...",0,1282014#16999524,...,8 port switch for adding more ports to your ro...,this product is energy star qualified for its ...,,,,"$, 95.99",,,tp-link 8-port fast ethernet desktop switch (t...,tripp lite 750va 450w ups eco green battery ba...
7969280#6000979,7969280,computers_and_accessories,224463,"[{'/sku': '[dt100g332gb]'}, {'/gtin13': '[7406...",6000979,computers_and_accessories,1438223,[{'/productID': '[dt100g3128gb]'}],0,7969280#6000979,...,a drive usb flash kingston datatraveler® 100 g...,,,,,,,,pen kingston datatraveler 100 g3 32gb usb3.0 k...,kingston datatraveler 100 g3 128gb usb3.0 usb3...


In [6]:
# Pairs whose words were manually labelled with word classes; indexed by pair_id
# (the column itself is kept as well).
instances_exp_labeled = (
    pd.read_csv('../../data/processed/explain_labeling/wordclass_labeling_labeled.csv')
    .set_index('pair_id', drop=False)
)
display(instances_exp_labeled.head())

# Pull the same pairs, in the same order, out of the preprocessed new gold standard.
instances_to_explain = computers_newgs.loc[instances_exp_labeled.index]
instances_to_explain.head()

Unnamed: 0_level_0,pair_id,label,brand_left,brand_left_wordclasses,title_left,title_left_wordclasses,brand_right,brand_right_wordclasses,title_right,title_right_wordclasses
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3483338#13088419,3483338#13088419,1,LOGITECH,['2'],LOGITECH 920-004536 MK270 WIRELESS COMBO WITH ...,"['2', '1', '3', '4', '7', '5', '7', '5', '7', ...",Logitech,['2'],Logitech MK270 Wireless Keyboard and Mouse Com...,"['2', '3', '6', '6', '5', '6', '6', '5', '5', ..."
16723236#1379306,16723236#1379306,1,HP Enterprise,"['2', '2']","300682-B21 HP 4GB (2x2GB) 266MHz SDRAM Kit, Nu...","['1', '2', '4', '4', '4', '4', '7', '5', '5', ...",NETCNA,['5'],300682-B21 4GB (2x2GB) Compaq Proliant BL/DL/M...,"['1', '4', '4', '3', '3', '7', '4', '5', '6', ..."
2900676#8501437,2900676#8501437,0,HP,['2'],HP Chromebook 14 G4 - 14 Celeron N2840 2 GB RA...,"['2', '3', '3', '3', '5', '3', '4', '4', '4', ...",HP,['2'],HP Chromebook 14 G4 - 14 Celeron N2940 4 GB RA...,"['2', '3', '3', '3', '5', '4', '4', '4', '4', ..."
1613113#14761048,1613113#14761048,0,,[],Seagate Guardian BarraCuda ST4000LM024 - hard ...,"['2', '3', '3', '1', '5', '6', '6', '4', '4', ...",,[],Seagate Guardian BarraCuda ST2000LM015 - hard ...,"['2', '3', '3', '1', '5', '6', '6', '4', '4', ..."
11050100#5511322,11050100#5511322,1,,[],Liebert PSI-XR PS2200RT3,"['2', '3', '1']",,[],Liebert PSI XR 1920VA 1920W 120V Line-interact...,"['2', '3', '3', '3', '4', '4', '7', '7', '7', ..."


Unnamed: 0_level_0,ltable_id,ltable_cluster_id,ltable_category,ltable_title,ltable_description,ltable_brand,ltable_price,ltable_keyValuePairs,ltable_specTableContent,rtable_id,rtable_cluster_id,rtable_category,rtable_title,rtable_description,rtable_brand,rtable_price,rtable_keyValuePairs,rtable_specTableContent,label,pair_id
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3483338#13088419,3483338,585018,computers_and_accessories,logitech 920-004536 mk270 wireless combo with ...,overviewlogitech advanced 2.4 ghz wireless giv...,logitech,,,,13088419,585018,computers_and_accessories,logitech mk270 wireless keyboard and mouse com...,the stylish logitech mk270 wireless keyboard a...,logitech,,,,1,3483338#13088419
16723236#1379306,16723236,4335322,computers_and_accessories,"300682-b21 hp 4gb (2x2gb) 266mhz sdram kit, nu...",description:genuine hpe 4gb (2x2gb) registered...,hp enterprise,,"{'Category': 'Proliant Memory', 'Sub-Category'...",specifications: category proliant memory sub-c...,1379306,4335322,computers_and_accessories,300682-b21 4gb (2x2gb) compaq proliant bl/dl/m...,improve your pc's performance and boost applic...,netcna,,,,1,16723236#1379306
2900676#8501437,2900676,985505,computers_and_accessories,hp chromebook 14 g4 - 14 celeron n2840 2 gb ra...,hp chromebook 14 g4 - celeron n2840 / 2.16 ghz...,hp,,,spec value graphics processor intel hd graphic...,8501437,1493118,computers_and_accessories,hp chromebook 14 g4 - 14 celeron n2940 4 gb ra...,hp chromebook 14 g4 - celeron n2940 / 1.83 ghz...,hp,,,spec value graphics processor intel hd graphic...,0,2900676#8501437
1613113#14761048,1613113,326383,computers_and_accessories,seagate guardian barracuda st4000lm024 - hard ...,barracuda leads the industry with the highest ...,,"$, 174.99",,,14761048,334870,computers_and_accessories,seagate guardian barracuda st2000lm015 - hard ...,barracuda leads the industry with the highest ...,,"$, 87.99",,,0,1613113#14761048
11050100#5511322,11050100,4677375,computers_and_accessories,liebert psi-xr ps2200rt3,"liebert psi-xr is a compact, line-interactive ...",,"$, 1,209.99",,,5511322,4677375,computers_and_accessories,liebert psi xr 1920va 1920w 120v line-interact...,,,,,,1,11050100#5511322


In [7]:
# Select the model to explain: keep exactly one model/checkpoint pair active and
# make sure BATCH_SIZE, the wrap_* call and the output file name match it.
#
# map_location=DEVICE lets a checkpoint saved on a GPU be restored when DEVICE
# falls back to the CPU.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.

# model = DistilBertModelLogit()
# checkpoint = torch.load('saved/models/BT-DistilBERT-FT-computers-xlarge/0827_180838/model_best.pth', map_location=DEVICE)

model = BertModelLogit()
checkpoint = torch.load('saved/models/BT-BERT-FT-computers-xlarge/1022_102200/model_best.pth', map_location=DEVICE)

# model = JointBertModelLogit(745)
# checkpoint = torch.load('saved/models/BT-JointBERT-FT-computers-xlarge/1023_052259/model_best.pth', map_location=DEVICE)

# model = RobertaModelLogit()
# checkpoint = torch.load('saved/models/BT-RoBERTa-FT-computers-xlarge/1205_080330/model_best.pth', map_location=DEVICE)

# model_name = '../../cache/deepmatcher/wdc-lspc/models/rnn_abs-diff_standard_epochs50_ratio6_batch16_lr0.003_lrdecay0.8_fasttext.en.bin_brand-title_preprocessed_computers_trainonly_xlarge_magellan_pairs_formatted_run1_model.pth'
# model_type='rnn'
# model = dm.MatchingModel(attr_summarizer=model_type)
# model.load_state(model_name)

# The checkpoint stores the weights under the 'state_dict' key.
state_dict = checkpoint['state_dict']

model.load_state_dict(state_dict)
model.to(DEVICE)
# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

RobertaModelLogit(
  (bert_layer): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps

In [9]:
# Build the explainer over the columns of the pairs to explain.
# NOTE(review): `Mojito` is not imported in any cell visible here, so this cell
# fails on a fresh kernel — add the import to the import cell (TODO confirm module path).
# IGNORE_COLUMNS is the same list the model wrappers drop; class_names are given
# in the column order of the wrappers' output ([1 - p, p]).
MOJITO = Mojito(instances_to_explain.columns,
                attr_to_copy = 'left',
                ignore_columns = IGNORE_COLUMNS,
                split_expression = " ",
                class_names = ['no_match', 'match'], 
                feature_selection = "lasso_path",
                bow=False,
                random_state=0)

In [10]:
# Display the first rows of the pairs that will be explained.
instances_to_explain.head()

Unnamed: 0_level_0,ltable_id,ltable_cluster_id,ltable_category,ltable_title,ltable_description,ltable_brand,ltable_price,ltable_keyValuePairs,ltable_specTableContent,rtable_id,rtable_cluster_id,rtable_category,rtable_title,rtable_description,rtable_brand,rtable_price,rtable_keyValuePairs,rtable_specTableContent,label,pair_id
pair_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3483338#13088419,3483338,585018,computers_and_accessories,logitech 920-004536 mk270 wireless combo with ...,overviewlogitech advanced 2.4 ghz wireless giv...,logitech,,,,13088419,585018,computers_and_accessories,logitech mk270 wireless keyboard and mouse com...,the stylish logitech mk270 wireless keyboard a...,logitech,,,,1,3483338#13088419
16723236#1379306,16723236,4335322,computers_and_accessories,"300682-b21 hp 4gb (2x2gb) 266mhz sdram kit, nu...",description:genuine hpe 4gb (2x2gb) registered...,hp enterprise,,"{'Category': 'Proliant Memory', 'Sub-Category'...",specifications: category proliant memory sub-c...,1379306,4335322,computers_and_accessories,300682-b21 4gb (2x2gb) compaq proliant bl/dl/m...,improve your pc's performance and boost applic...,netcna,,,,1,16723236#1379306
2900676#8501437,2900676,985505,computers_and_accessories,hp chromebook 14 g4 - 14 celeron n2840 2 gb ra...,hp chromebook 14 g4 - celeron n2840 / 2.16 ghz...,hp,,,spec value graphics processor intel hd graphic...,8501437,1493118,computers_and_accessories,hp chromebook 14 g4 - 14 celeron n2940 4 gb ra...,hp chromebook 14 g4 - celeron n2940 / 1.83 ghz...,hp,,,spec value graphics processor intel hd graphic...,0,2900676#8501437
1613113#14761048,1613113,326383,computers_and_accessories,seagate guardian barracuda st4000lm024 - hard ...,barracuda leads the industry with the highest ...,,"$, 174.99",,,14761048,334870,computers_and_accessories,seagate guardian barracuda st2000lm015 - hard ...,barracuda leads the industry with the highest ...,,"$, 87.99",,,0,1613113#14761048
11050100#5511322,11050100,4677375,computers_and_accessories,liebert psi-xr ps2200rt3,"liebert psi-xr is a compact, line-interactive ...",,"$, 1,209.99",,,5511322,4677375,computers_and_accessories,liebert psi xr 1920va 1920w 120v line-interact...,,,,,,1,11050100#5511322


In [None]:
# Explain every selected pair and time the run.
# The active wrapper must match the model loaded above: that cell builds
# BertModelLogit, so wrap_bert is used here (wrap_roberta would tokenize with the
# RoBERTa vocabulary and call the model without token_type_ids).
start = time.time()
explanation = MOJITO.drop(
                #wrap_distilbert(model),
                #wrap_deepmatcher(model),
                wrap_bert(model),
                #wrap_jointbert(model),
                #wrap_roberta(model),
                instances_to_explain,
                num_features = 1000,
                num_perturbation = 5000)
end = time.time()
# Wall-clock duration of the explanation run, in seconds.
passed_time = end-start
print(passed_time)

Explaining 0/295
Explaining 1/295
Explaining 2/295
Explaining 3/295
Explaining 4/295
Explaining 5/295
Explaining 6/295
Explaining 7/295
Explaining 8/295
Explaining 9/295
Explaining 10/295
Explaining 11/295
Explaining 12/295
Explaining 13/295
Explaining 14/295
Explaining 15/295
Explaining 16/295
Explaining 17/295
Explaining 18/295
Explaining 19/295
Explaining 20/295
Explaining 21/295
Explaining 22/295
Explaining 23/295
Explaining 24/295
Explaining 25/295
Explaining 26/295
Explaining 27/295
Explaining 28/295
Explaining 29/295
Explaining 30/295
Explaining 31/295
Explaining 32/295
Explaining 33/295
Explaining 34/295
Explaining 35/295
Explaining 36/295
Explaining 37/295
Explaining 38/295
Explaining 39/295
Explaining 40/295
Explaining 41/295
Explaining 42/295
Explaining 43/295
Explaining 44/295
Explaining 45/295
Explaining 46/295
Explaining 47/295
Explaining 48/295
Explaining 49/295
Explaining 50/295
Explaining 51/295
Explaining 52/295
Explaining 53/295
Explaining 54/295
Explaining 55/295
Ex

In [None]:
# Display the result returned by MOJITO.drop.
explanation

In [None]:
# Make sure the output folder exists (parents included), then store the
# explanation under the name of the model that produced it. Keep exactly one
# to_pickle line active, matching the model/wrapper used above.
explained_dir = Path('../../data/processed/explain_labeling/explained/')
explained_dir.mkdir(parents=True, exist_ok=True)

#explanation.to_pickle('../../data/processed/explain_labeling/explained/distilbert.pkl.gz')
explanation.to_pickle('../../data/processed/explain_labeling/explained/bert.pkl.gz')
#explanation.to_pickle('../../data/processed/explain_labeling/explained/jointbert.pkl.gz')
#explanation.to_pickle('../../data/processed/explain_labeling/explained/deepmatcher.pkl.gz')
#explanation.to_pickle('../../data/processed/explain_labeling/explained/roberta.pkl.gz')