## Generazione nuovi esempi di training

In [55]:
import sys
sys.path.append('../..')

In [3]:
import os
import pandas as pd
dir_path ='../../datasets/Structured/itunes-amazon/'
tableA_df = pd.read_csv(os.path.join(dir_path,'tableA.csv'))
tableB_df = pd.read_csv(os.path.join(dir_path,'tableB.csv'))

In [4]:
tableA_df['Artist_Name']= tableA_df['Artist_Name'].apply(lambda s: s.lower())
tableA_df['Song_Name']= tableA_df['Song_Name'].apply(lambda s: s.lower())

In [5]:
tableB_df['Artist_Name']= tableB_df['Artist_Name'].apply(lambda s: s.lower())
tableB_df['Song_Name']= tableB_df['Song_Name'].apply(lambda s: s.lower())

In [6]:
def minimumEditDistance(s1,s2):
    if len(s1) > len(s2):
        s1,s2 = s2,s1
    distances = range(len(s1) + 1)
    for index2,char2 in enumerate(s2):
        newDistances = [index2+1]
        for index1,char1 in enumerate(s1):
            if char1 == char2:
                newDistances.append(distances[index1])
            else:
                newDistances.append(1 + min((distances[index1],
                                             distances[index1+1],
                                             newDistances[-1])))
        distances = newDistances
    return distances[-1]

In [7]:
def jaccard_distance(list1, list2):
    intersection = len(list(set(list1).intersection(list2)))
    #print(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float((union-intersection) / union)

In [8]:
import nltk
stopwords = nltk.download("stopwords")

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])

[nltk_data] Downloading package stopwords to /home/nvidia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def get_clean_tokens(s):
    tokens =tokenizer.tokenize(s)
    return [word for word in tokens if word not in stop_words]

## Ricerca di canzoni non matching tramite jaccard similarity

In [10]:
import random as rd

In [41]:
def generateNewNegatives(source1,source2,nsamples):
    newsamples = []
    rd.seed(0)
    while(len(newsamples)<nsamples):
        k = rd.randint(0,source1.shape[0]-1)
        l_row = source1.iloc[k]
        ltime = l_row['Time']
        lartist_cleaned = get_clean_tokens(l_row['Artist_Name'])
        lsong_cleanead = get_clean_tokens(l_row['Song_Name'])
        source2_sametime = source2[source2.Time==ltime]
        source2_sametime_ids = source2_sametime.id.values
        ##per avere più variabilità
        found = False
        while (not(found) and len(source2_sametime_ids)>0):
            idx = rd.choice(source2_sametime_ids)
            current_rrow = source2.iloc[idx]
            rsong_cleaned = get_clean_tokens(current_rrow['Song_Name'])
            rartist_clean = get_clean_tokens(current_rrow['Artist_Name'])
            if (jaccard_distance(lartist_cleaned,rsong_cleaned)>=0.7 or 
                jaccard_distance(lsong_cleanead,rsong_cleaned) >=0.7):
                newsamples.append((l_row['id'],current_rrow['id'],0))
                found = True
    return newsamples

In [61]:
train_ids = pd.read_csv('../../datasets/Structured/itunes-amazon/train.csv')
train_df = pd.read_csv('../../datasets/Structured/itunes-amazon/merged_train.csv')
len(train_df[train_df.label==1]),len(train_df)

(78, 321)

In [42]:
newSamples = generateNewNegatives(tableA_df,tableB_df,50)
newSamples_df = pd.DataFrame(data = newSamples,columns=['ltable_id','rtable_id','label'])

In [43]:
newSamples_df.head(10)

Unnamed: 0,ltable_id,rtable_id,label
0,3155,54858,0
1,3445,6114,0
2,2121,53987,0
3,3980,30544,0
4,6420,34453,0
5,3904,27626,0
6,4779,17280,0
7,4134,10367,0
8,2308,9588,0
9,6191,7981,0


In [44]:
tableA_df.iloc[[3155]]

Unnamed: 0,id,Song_Name,Artist_Name,Album_Name,Genre,Price,CopyRight,Time,Released
3155,3155,inside your heaven,carrie underwood,Some Hearts,"Pop , Music , Rock , Country , Contemporary Co...",$ 1.29,2005 19 Recordings Limited,3:44,14-Nov-05


In [46]:
tableB_df.iloc[[54858]]

Unnamed: 0,id,Song_Name,Artist_Name,Album_Name,Genre,Price,CopyRight,Time,Released
54858,54858,willow in the wind,kathy mattea,Willow In The Wind,Country,$ 0.99,"( C ) 1989 Mercury Records , a Division of UMG...",3:44,"April 4 , 1989"


In [24]:
tableA_df.iloc[[1923]]

Unnamed: 0,id,Song_Name,Artist_Name,Album_Name,Genre,Price,CopyRight,Time,Released
1923,1923,i have seen the rain ( feat . james t. moore ),p!nk,I 'm Not Dead,"Pop , Music , Electronic , R&B / Soul , Dance ...",$ 0.99,2006 LaFace Records LLC,3:30,3-Apr-06


In [25]:
tableB_df.iloc[[51340]]

Unnamed: 0,id,Song_Name,Artist_Name,Album_Name,Genre,Price,CopyRight,Time,Released
51340,51340,biscotte et makoko,joìçlle lì © andre | benoì ¨ t delbecq | carna...,Tout va monter,"Modern Postbebop , Jazz , Bebop",$ 0.99,2015 nato,3:30,"June 22 , 2015"


In [50]:
newSamples_df.to_csv('exp26/extended_train.csv',index=False)

In [29]:
import os
if not(os.path.exists('exp26')):
       os.mkdir('exp26')

In [51]:
new_train_df = pd.concat([train_ids,newSamples_df],ignore_index=True)

In [52]:
new_train_df.to_csv('exp26/extended_train.csv',index=False)

## Retrain model

In [64]:
from utils.dataset_parser import generate_train_valid_test
import deepmatcher as dm

In [57]:
!cp ../../datasets/Structured/itunes-amazon/valid.csv exp26
!cp ../../datasets/Structured/itunes-amazon/test.csv exp26
!cp ../../datasets/Structured/itunes-amazon/tableA.csv exp26
!cp ../../datasets/Structured/itunes-amazon/tableB.csv exp26

In [78]:
train_df,valid_df,test_df = generate_train_valid_test('exp26/',['extended_train.csv','valid.csv','test.csv'],
                                                     'ltable_','rtable_')

In [71]:
train_df.to_csv('exp26/merged_train.csv',index=False)
test_df.to_csv('exp26/merged_test.csv',index=False)
valid_df.to_csv('exp26/merged_validation.csv',index=False)

In [72]:
train,valid,test = dm.data.process('exp26/',train='merged_train.csv',validation='merged_validation.csv',
                                  test='merged_test.csv',left_prefix='ltable_',right_prefix='rtable_')


Reading and processing data from "exp26/merged_train.csv"
0% [############################# ] 100% | ETA: 00:00:00
Reading and processing data from "exp26/merged_validation.csv"
0% [############################# ] 100% | ETA: 00:00:00
Reading and processing data from "exp26/merged_test.csv"
0% [############################# ] 100% | ETA: 00:00:00
Building vocabulary
0% [#] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00

Computing principal components
0% [#] 100% | ETA: 00:00:00
Total time elapsed: 00:00:00


In [73]:
model = dm.MatchingModel(attr_summarizer='hybrid')
model.run_train(train,valid,best_save_path='../../models/itunes_amazon_impr3.pth',pos_neg_ratio=5,
               epochs=25)

* Number of trainable parameters: 17757810
===>  TRAIN Epoch 1


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 1 || Run Time:    6.0 | Load Time:    2.3 || F1:  37.04 | Prec:  22.94 | Rec:  96.15 || Ex/s:  44.78

===>  EVAL Epoch 1
Finished Epoch 1 || Run Time:    0.8 | Load Time:    0.7 || F1:  58.82 | Prec:  48.78 | Rec:  74.07 || Ex/s:  69.84

* Best F1: tensor(58.8235, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 2


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 2 || Run Time:    5.7 | Load Time:    2.3 || F1:  48.75 | Prec:  33.83 | Rec:  87.18 || Ex/s:  46.25

===>  EVAL Epoch 2
Finished Epoch 2 || Run Time:    0.6 | Load Time:    0.6 || F1:  48.15 | Prec:  32.10 | Rec:  96.30 || Ex/s:  94.08

---------------------

===>  TRAIN Epoch 3


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 3 || Run Time:    5.4 | Load Time:    2.1 || F1:  53.68 | Prec:  37.63 | Rec:  93.59 || Ex/s:  49.14

===>  EVAL Epoch 3
Finished Epoch 3 || Run Time:    0.6 | Load Time:    0.6 || F1:  65.79 | Prec:  51.02 | Rec:  92.59 || Ex/s:  93.89

* Best F1: tensor(65.7895, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 4


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 4 || Run Time:    5.6 | Load Time:    2.3 || F1:  58.96 | Prec:  42.77 | Rec:  94.87 || Ex/s:  47.34

===>  EVAL Epoch 4
Finished Epoch 4 || Run Time:    0.6 | Load Time:    0.6 || F1:  67.57 | Prec:  53.19 | Rec:  92.59 || Ex/s:  94.31

* Best F1: tensor(67.5676, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 5


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 5 || Run Time:    5.6 | Load Time:    2.3 || F1:  70.81 | Prec:  56.49 | Rec:  94.87 || Ex/s:  46.77

===>  EVAL Epoch 5
Finished Epoch 5 || Run Time:    0.6 | Load Time:    0.6 || F1:  73.24 | Prec:  59.09 | Rec:  96.30 || Ex/s:  93.43

* Best F1: tensor(73.2394, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 6


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 6 || Run Time:    4.7 | Load Time:    1.8 || F1:  84.78 | Prec:  73.58 | Rec: 100.00 || Ex/s:  57.70

===>  EVAL Epoch 6
Finished Epoch 6 || Run Time:    0.5 | Load Time:    0.5 || F1:  72.97 | Prec:  57.45 | Rec: 100.00 || Ex/s: 110.39

---------------------

===>  TRAIN Epoch 7


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 7 || Run Time:    5.1 | Load Time:    1.9 || F1:  89.14 | Prec:  80.41 | Rec: 100.00 || Ex/s:  53.04

===>  EVAL Epoch 7
Finished Epoch 7 || Run Time:    0.6 | Load Time:    0.6 || F1:  77.14 | Prec:  62.79 | Rec: 100.00 || Ex/s:  93.97

* Best F1: tensor(77.1429, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 8


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 8 || Run Time:    5.1 | Load Time:    2.0 || F1:  91.76 | Prec:  84.78 | Rec: 100.00 || Ex/s:  51.97

===>  EVAL Epoch 8
Finished Epoch 8 || Run Time:    0.6 | Load Time:    0.6 || F1:  77.61 | Prec:  65.00 | Rec:  96.30 || Ex/s:  93.99

* Best F1: tensor(77.6119, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 9


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 9 || Run Time:    5.6 | Load Time:    2.3 || F1:  97.50 | Prec:  95.12 | Rec: 100.00 || Ex/s:  47.20

===>  EVAL Epoch 9
Finished Epoch 9 || Run Time:    0.6 | Load Time:    0.6 || F1:  77.61 | Prec:  65.00 | Rec:  96.30 || Ex/s:  94.03

---------------------

===>  TRAIN Epoch 10


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 10 || Run Time:    5.5 | Load Time:    2.2 || F1:  97.50 | Prec:  95.12 | Rec: 100.00 || Ex/s:  48.34

===>  EVAL Epoch 10
Finished Epoch 10 || Run Time:    0.6 | Load Time:    0.6 || F1:  77.61 | Prec:  65.00 | Rec:  96.30 || Ex/s:  94.50

---------------------

===>  TRAIN Epoch 11


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:06


Finished Epoch 11 || Run Time:    5.5 | Load Time:    2.2 || F1:  98.73 | Prec:  97.50 | Rec: 100.00 || Ex/s:  48.49

===>  EVAL Epoch 11
Finished Epoch 11 || Run Time:    0.6 | Load Time:    0.6 || F1:  77.61 | Prec:  65.00 | Rec:  96.30 || Ex/s:  93.22

---------------------

===>  TRAIN Epoch 12


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 12 || Run Time:    4.6 | Load Time:    1.7 || F1:  98.73 | Prec:  97.50 | Rec: 100.00 || Ex/s:  58.91

===>  EVAL Epoch 12
Finished Epoch 12 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 110.17

* Best F1: tensor(78.7879, device='cuda:0')
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 13


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 13 || Run Time:    4.6 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  58.49

===>  EVAL Epoch 13
Finished Epoch 13 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 110.83

---------------------

===>  TRAIN Epoch 14


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 14 || Run Time:    4.6 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  58.58

===>  EVAL Epoch 14
Finished Epoch 14 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 109.56

---------------------

===>  TRAIN Epoch 15


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 15 || Run Time:    5.3 | Load Time:    2.1 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  50.06

===>  EVAL Epoch 15
Finished Epoch 15 || Run Time:    0.6 | Load Time:    0.6 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s:  94.03

---------------------

===>  TRAIN Epoch 16


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 16 || Run Time:    4.6 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  58.68

===>  EVAL Epoch 16
Finished Epoch 16 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 110.79

---------------------

===>  TRAIN Epoch 17


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 17 || Run Time:    4.7 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  58.38

===>  EVAL Epoch 17
Finished Epoch 17 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 109.87

---------------------

===>  TRAIN Epoch 18


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 18 || Run Time:    4.7 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  58.27

===>  EVAL Epoch 18
Finished Epoch 18 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 110.18

---------------------

===>  TRAIN Epoch 19


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 19 || Run Time:    4.8 | Load Time:    1.8 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  56.87

===>  EVAL Epoch 19
Finished Epoch 19 || Run Time:    0.6 | Load Time:    0.6 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s:  94.18

---------------------

===>  TRAIN Epoch 20


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 20 || Run Time:    4.7 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  58.43

===>  EVAL Epoch 20
Finished Epoch 20 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 110.15

---------------------

===>  TRAIN Epoch 21


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 21 || Run Time:    4.6 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  58.58

===>  EVAL Epoch 21
Finished Epoch 21 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 110.22

---------------------

===>  TRAIN Epoch 22


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 22 || Run Time:    4.6 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  58.99

===>  EVAL Epoch 22
Finished Epoch 22 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 110.45

---------------------

===>  TRAIN Epoch 23


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:04


Finished Epoch 23 || Run Time:    4.6 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  59.18

===>  EVAL Epoch 23
Finished Epoch 23 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 110.72

---------------------

===>  TRAIN Epoch 24


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 24 || Run Time:    4.7 | Load Time:    1.7 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  58.07

===>  EVAL Epoch 24
Finished Epoch 24 || Run Time:    0.5 | Load Time:    0.5 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s: 109.98

---------------------

===>  TRAIN Epoch 25


0% [██] 100% | ETA: 00:00:00
Total time elapsed: 00:00:05


Finished Epoch 25 || Run Time:    5.4 | Load Time:    2.1 || F1:  99.36 | Prec:  98.73 | Rec: 100.00 || Ex/s:  49.50

===>  EVAL Epoch 25
Finished Epoch 25 || Run Time:    0.6 | Load Time:    0.6 || F1:  78.79 | Prec:  66.67 | Rec:  96.30 || Ex/s:  94.18

---------------------

Loading best model...
Training done.


tensor(78.7879, device='cuda:0')

In [74]:
model.run_eval(test)

===>  EVAL Epoch 12
Finished Epoch 12 || Run Time:    0.7 | Load Time:    0.7 || F1:  81.97 | Prec:  73.53 | Rec:  92.59 || Ex/s:  75.61



tensor(81.9672, device='cuda:0')

## Test bias on retrained

In [81]:
from utils.deepmatcher_utils import wrapDm
import numpy as np

In [90]:
improved_1 = dm.MatchingModel(attr_summarizer='hybrid')
improved_1.load_state('../../models/itunesamazon_impr1.pth')

In [92]:
test_neg_fake = test_df[test_df.label==0].copy()
test_neg_fake['ltable_Time'] = test_neg_fake['rtable_Time']

In [88]:
predictions = wrapDm(test_neg,model)
np.count_nonzero(np.argmax(predictions,axis=1)==0)

73

In [94]:
predictions = wrapDm(test_neg_fake,improved_1)
np.count_nonzero(np.argmax(predictions,axis=1)==0)

70