# DeepER Classic 

## Step 0: Caricamento dati, preprocessing e strutture ausiliarie

In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# executing each cell), so edits to imported modules are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from deeper.DeepER import init_embeddings_index, init_embeddings_model, init_DeepER_model, train_model_ER, model_statistics
from deeper.data import process_data_aligned
from keras.models import load_model
from keras.layers import Dense
from deeper.csv2dataset import splitting_dataSet
from plotly import graph_objs as go
import plotly.offline as pyo

Using TensorFlow backend.


## Load embedding model

In [3]:
# Load the auxiliary data structures and models.
EMBEDDING_FILEPATH = 'embeddings/glove.840B.300d.txt'
embeddings_index = init_embeddings_index(EMBEDDING_FILEPATH)

# Embedding dimensionality, read off the vector stored for the probe word 'cat'.
emb_dim = len(embeddings_index['cat'])

embeddings_model, tokenizer = init_embeddings_model(embeddings_index)

* Costruzione indice degli embeddings.....Fatto. 2196016 embeddings totali.
* Creazione del modello per il calcolo degli embeddings....
* Inizializzo il tokenizzatore.....Fatto: 1702926 parole totali.
* Preparazione della matrice di embedding.....Fatto. Dimensioni matrice embeddings: (1702927, 300)

°°° EMBEDDING MODEL °°°
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Tupla_A (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
Tupla_B (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
Embedding_lookup (Embedding)    (None, None, 300)    510878100  

## Walmart-Amazon

In [7]:
# Set manually to False to rebuild the file containing the chosen dataset.
LOAD_FROM_DISK_DATASET = False
# Set manually to False to re-run every training.
LOAD_FROM_DISK_MODEL = False

# Dataset location and the two source tables.
DATASET_DIR = 'datasets/walmart_amazon/'
TABLE1_FILE = 'walmart.csv'
TABLE2_FILE = 'amazon.csv'
# Name used to label the files produced for this dataset.
DATASET_NAME = 'walmart-amazon'

In [8]:
# Prepare the Walmart-Amazon train/test data.
# LOAD_FROM_DISK_DATASET is passed explicitly so that the switch set in the
# configuration cell actually controls whether the dataset file is rebuilt.
deeper_train, deeper_test = process_data_aligned(
    DATASET_DIR,
    DATASET_NAME,
    ground_truth='walmart-amazon_perfectMapping.csv',
    table1=TABLE1_FILE,
    table2=TABLE2_FILE,
    load_from_disk_dataset=LOAD_FROM_DISK_DATASET,
)

min cosine similarity of matches is 0.11669000700233413


### Step 1: Addestramento

In [8]:
def get_DeepER(perc,train,load_model_from_disk,dataset_name):
    """Return a DeepER model for the chosen share of the training data.

    Args:
        perc: Fraction of the training data to use for training (1 = all of it).
        train: Training data, passed to ``splitting_dataSet`` when training.
        load_model_from_disk: If true, load the previously saved model
            ``models/DeepER_best_model_<pct>_<dataset_name>.h5`` instead of
            training a new one.
        dataset_name: Label embedded in the model file name.

    Returns:
        The model loaded from disk, or the model returned by ``train_model_ER``.
    """
    # round(), not int(): perc * 100 is a float product and int() truncates it,
    # e.g. 0.29 * 100 == 28.999999999999996 would be labelled 28 instead of 29.
    model_tag = f'_{round(perc * 100)}_{dataset_name}'

    if load_model_from_disk:
        # Load from disk; the training split is not needed on this path.
        print(f'Loading DeepER_best_model{model_tag}.h5', end='', flush=True)
        deeper_model = load_model(f'models/DeepER_best_model{model_tag}.h5')
        print('  ->  Done')
        return deeper_model

    # Cut the training data down to the requested share.
    sub_data = splitting_dataSet(perc, train)
    # Initialise a fresh model and train it; `end` is the suffix that labels
    # the files produced for this run.
    deeper_model = init_DeepER_model(emb_dim)
    return train_model_ER(sub_data,
                          deeper_model,
                          embeddings_model,
                          tokenizer,
                          pretraining=False,
                          metric='val_accuracy',
                          end=model_tag)

In [10]:
# Train on the full training set, or load the saved model from disk.
# get_DeepER requires both `load_model_from_disk` and `dataset_name`;
# it has no `LOAD_FROM_DISK_MODEL` keyword.
deeper_model_100 = get_DeepER(
    1,
    deeper_train,
    load_model_from_disk=LOAD_FROM_DISK_MODEL,
    dataset_name=DATASET_NAME,
)


°°° DeepER Model °°°
Batch size: 29
Train on 1476 samples, validate on 370 samples
Epoch 1/64

Epoch 00001: val_accuracy improved from -inf to 0.91351, saving model to models/DeepER_best_model_100_walmart-amazon.h5
Epoch 2/64

Epoch 00002: val_accuracy improved from 0.91351 to 0.97027, saving model to models/DeepER_best_model_100_walmart-amazon.h5
Epoch 3/64

Epoch 00003: val_accuracy did not improve from 0.97027
Epoch 4/64

Epoch 00004: val_accuracy did not improve from 0.97027
Epoch 5/64

Epoch 00005: val_accuracy did not improve from 0.97027
Epoch 6/64

Epoch 00006: val_accuracy did not improve from 0.97027
Epoch 7/64

Epoch 00007: val_accuracy did not improve from 0.97027
Epoch 8/64

Epoch 00008: val_accuracy did not improve from 0.97027
Epoch 9/64

Epoch 00009: val_accuracy did not improve from 0.97027
Epoch 00009: early stopping


### Calcolo F-Measure dopo addestramento

In [11]:
# Measure the F-measure of the trained model on the test set.
f1_score = model_statistics(
    deeper_test,
    deeper_model_100,
    embeddings_model,
    tokenizer,
)
print(f1_score)

* Avvio test metriche....
-- Corpus size: 462
-- Non Match: 250
-- Match: 212
Precision: 0.9759615384615384, Recall: 0.9575471698113207, f1-score: 0.9666666666666666
Total retrieved: 208, retrieved/total matches: 203/212
0.9666666666666666


### Visualizzazione F-Measure

In [None]:
# Enable notebook mode so that the charts are displayed correctly.
pyo.init_notebook_mode()

splits = ['100% split', '75% split', '50% split', '25% split', '10% split', '5% split']
total_tup = len(deeper_train)

# Hover text: number of tuple pairs corresponding to each split, with a description.
tuplecount = [
    f'{int(total_tup * fraction)} coppie di tuple'
    for fraction in (1, 0.75, 0.5, 0.25, 0.1, 0.05)
]

# NOTE(review): `fm_model_standard` is not defined in any cell visible in this
# notebook; presumably it should hold one F-measure per split — confirm it is
# computed before this cell is run.
fig = go.Figure(
    data=[
        go.Bar(name='DeepER', x=splits, y=fm_model_standard, hovertext=tuplecount),
    ]
)

# Plot the chart. The original note says it is saved as features_standard.html
# for embedding in the HTML report.
# NOTE(review): the name passed here is 'fmeasures-standard' — confirm which is intended.
pyo.iplot(fig, filename='fmeasures-standard')

##### Al passaggio del mouse il grafico mostra:
- Il numero di coppie di tuple utilizzate per l'addestramento;
- La percentuale di split (quantità di tuple utilizzate per addestrare il modello);
- Il valore di F-Measure (media armonica tra precision e recall).

## iTunes-Amazon

In [4]:
# iTunes-Amazon: dataset location, the two source tables, and the label
# used for the files produced.
ITUNES_DIR = 'datasets/itunes_amazon/'
TABLE1_FILE = 'itunes.csv'
TABLE2_FILE = 'amazon.csv'
DATASET_NAME = 'itunes-amazon'

In [5]:
# Prepare the iTunes-Amazon train/test data, reusing the dataset stored on disk.
itunes_train, itunes_test = process_data_aligned(
    ITUNES_DIR,
    DATASET_NAME,
    ground_truth='matches_itunes_amazon.csv',
    table1=TABLE1_FILE,
    table2=TABLE2_FILE,
    load_from_disk_dataset=True,
)

match_number: 132
len all dataset: 264


In [None]:
# Train on the full iTunes-Amazon training set (no loading from disk).
# get_DeepER requires both `load_model_from_disk` and `dataset_name`;
# it has no `LOAD_FROM_DISK_MODEL` keyword.
deeper_model_itunes_100 = get_DeepER(
    1,
    itunes_train,
    load_model_from_disk=False,
    dataset_name=DATASET_NAME,
)


°°° DeepER Model °°°
Batch size: 4
Train on 168 samples, validate on 43 samples
Epoch 1/64

Epoch 00001: val_accuracy improved from -inf to 0.79070, saving model to models/DeepER_best_model_100_itunes-amazon.h5
Epoch 2/64

Epoch 00002: val_accuracy improved from 0.79070 to 0.88372, saving model to models/DeepER_best_model_100_itunes-amazon.h5
Epoch 3/64

Epoch 00003: val_accuracy did not improve from 0.88372
Epoch 4/64

Epoch 00004: val_accuracy did not improve from 0.88372
Epoch 5/64

Epoch 00005: val_accuracy did not improve from 0.88372
Epoch 6/64

Epoch 00006: val_accuracy did not improve from 0.88372
Epoch 7/64

Epoch 00007: val_accuracy did not improve from 0.88372
Epoch 8/64

In [19]:
# Measure the F-measure of the iTunes-Amazon model on its test set.
f1_score = model_statistics(
    itunes_test,
    deeper_model_itunes_100,
    embeddings_model,
    tokenizer,
)
f1_score

* Avvio test metriche....
-- Corpus size: 53
-- Non Match: 31
-- Match: 22
Precision: 0.7916666666666666, Recall: 0.8636363636363636, f1-score: 0.8260869565217391
Total retrieved: 24, retrieved/total matches: 19/22


0.8260869565217391

## Amazon-Google

In [20]:
# Amazon-Google: dataset location, the two source tables, and the label
# used for the files produced.
DATA_DIR = 'datasets/Amazon-GoogleProducts/'
TABLE1_FILE = 'Amazon.csv'
TABLE2_FILE = 'Google.csv'
DATASET_NAME = 'amazon-google'

In [21]:
# Prepare the Amazon-Google train/test data, reusing the dataset stored on disk.
train, test = process_data_aligned(
    DATA_DIR,
    DATASET_NAME,
    ground_truth='amazon_google_matches.csv',
    table1=TABLE1_FILE,
    table2=TABLE2_FILE,
    load_from_disk_dataset=True,
)

match_number: 1300
len all dataset: 2600


In [25]:
# Train on the full Amazon-Google training set (no loading from disk).
# get_DeepER requires both `load_model_from_disk` and `dataset_name`;
# it has no `LOAD_FROM_DISK_MODEL` keyword.
deeper_model_amazongoogle_100 = get_DeepER(
    1,
    train,
    load_model_from_disk=False,
    dataset_name=DATASET_NAME,
)


°°° DeepER Model °°°
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Embeddings_seq_a (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Embeddings_seq_b (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Composition (Bidirectional)     (None, 300)          541200      Embeddings_seq_a[0][0]           
                                                                 Embeddings_seq_b[0][0]           
__________________________________________________________________________________________________
Similarity (Lambda)             (None, 300)          0           Composition[0][0]     

## DBLP-ACM

In [22]:
# DBLP-ACM: dataset location, the two source tables, and the label
# used for the files produced.
DATA_DIR = 'datasets/DBLP-ACM/'
TABLE1_FILE = 'DBLP2.csv'
TABLE2_FILE = 'ACM.csv'
DATASET_NAME = 'dblp-acm'

In [23]:
# Prepare the DBLP-ACM train/test data, reusing the dataset stored on disk.
train, test = process_data_aligned(
    DATA_DIR,
    DATASET_NAME,
    ground_truth='DBLP-ACM_perfectMapping.csv',
    table1=TABLE1_FILE,
    table2=TABLE2_FILE,
    load_from_disk_dataset=True,
)

match_number: 2224
len all dataset: 4448


In [24]:
# Train on the full DBLP-ACM training set (no loading from disk).
# get_DeepER requires both `load_model_from_disk` and `dataset_name`;
# it has no `LOAD_FROM_DISK_MODEL` keyword.
deeper_model_dblpacm_100 = get_DeepER(
    1,
    train,
    load_model_from_disk=False,
    dataset_name=DATASET_NAME,
)


°°° DeepER Model °°°
Batch size: 54
Train on 2846 samples, validate on 712 samples
Epoch 1/64

Epoch 00001: val_accuracy improved from -inf to 0.90590, saving model to models/DeepER_best_model_100_dblp-acm.h5
Epoch 2/64

Epoch 00002: val_accuracy improved from 0.90590 to 0.97472, saving model to models/DeepER_best_model_100_dblp-acm.h5
Epoch 3/64

Epoch 00003: val_accuracy did not improve from 0.97472
Epoch 4/64

Epoch 00004: val_accuracy improved from 0.97472 to 0.97893, saving model to models/DeepER_best_model_100_dblp-acm.h5
Epoch 5/64

Epoch 00005: val_accuracy improved from 0.97893 to 0.98174, saving model to models/DeepER_best_model_100_dblp-acm.h5
Epoch 6/64

Epoch 00006: val_accuracy did not improve from 0.98174
Epoch 7/64

Epoch 00007: val_accuracy did not improve from 0.98174
Epoch 8/64

Epoch 00008: val_accuracy did not improve from 0.98174
Epoch 9/64

Epoch 00009: val_accuracy did not improve from 0.98174
Epoch 10/64

Epoch 00010: val_accuracy did not improve from 0.98174


In [25]:
# Measure the F-measure of the DBLP-ACM model on its test set.
f1_score = model_statistics(
    test,
    deeper_model_dblpacm_100,
    embeddings_model,
    tokenizer,
)
f1_score

* Avvio test metriche....
-- Corpus size: 890
-- Non Match: 463
-- Match: 427
Precision: 0.9742388758782201, Recall: 0.9742388758782201, f1-score: 0.9742388758782201
Total retrieved: 427, retrieved/total matches: 416/427


0.9742388758782201

## Fodors-Zagats

In [3]:
# Fodors-Zagats: dataset location, source tables, ground-truth file, and the
# label used for the files produced.
DATA_DIR = 'datasets/fodors_zagats/'
TABLE1_FILE = 'fodors.csv'
TABLE2_FILE = 'zagats.csv'
GROUND_TRUTH = 'matches_fodors_zagats.csv'
DATASET_NAME = 'fodors-zagats'

In [4]:
# Prepare the Fodors-Zagats train/test data, rebuilding the dataset file
# rather than loading it from disk.
train, test = process_data_aligned(
    DATA_DIR,
    DATASET_NAME,
    ground_truth=GROUND_TRUTH,
    table1=TABLE1_FILE,
    table2=TABLE2_FILE,
    load_from_disk_dataset=False,
)

min cosine similarity of matches is 0.5669467095138409


In [8]:
# Train on the full Fodors-Zagats training set (no loading from disk).
# get_DeepER requires both `load_model_from_disk` and `dataset_name`;
# it has no `LOAD_FROM_DISK_MODEL` keyword.
foodors_zagats_100 = get_DeepER(
    1,
    train,
    load_model_from_disk=False,
    dataset_name=DATASET_NAME,
)


°°° DeepER Model °°°
Batch size: 4
Train on 143 samples, validate on 36 samples
Epoch 1/64

Epoch 00001: val_accuracy improved from -inf to 0.94444, saving model to models/DeepER_best_model_100_fodors-zagats.h5
Epoch 2/64

Epoch 00002: val_accuracy improved from 0.94444 to 0.97222, saving model to models/DeepER_best_model_100_fodors-zagats.h5
Epoch 3/64

Epoch 00003: val_accuracy did not improve from 0.97222
Epoch 4/64

Epoch 00004: val_accuracy did not improve from 0.97222
Epoch 5/64

Epoch 00005: val_accuracy did not improve from 0.97222
Epoch 6/64

Epoch 00006: val_accuracy did not improve from 0.97222
Epoch 7/64

Epoch 00007: val_accuracy did not improve from 0.97222
Epoch 8/64

Epoch 00008: val_accuracy did not improve from 0.97222
Epoch 9/64

Epoch 00009: val_accuracy did not improve from 0.97222
Epoch 00009: early stopping


In [29]:
# Measure the F-measure of the Fodors-Zagats model on its test set.
f1_score = model_statistics(
    test,
    foodors_zagats_100,
    embeddings_model,
    tokenizer,
)
f1_score

* Avvio test metriche....
-- Corpus size: 45
-- Non Match: 24
-- Match: 21
Precision: 0.8181818181818182, Recall: 0.8571428571428571, f1-score: 0.8372093023255814
Total retrieved: 22, retrieved/total matches: 18/21


0.8372093023255814

## DBLP-Scholar

In [4]:
# Prepare the DBLP-Scholar train/test data, reusing the dataset stored on disk.
train, test = process_data_aligned(
    'datasets/DBLP-Scholar/',
    'dblp-scholar',
    ground_truth='dblp_scholar_matches.csv',
    table1='DBLP.csv',
    table2='Scholar.csv',
    load_from_disk_dataset=True,
)

match_number: 5347
len all dataset: 10694


In [10]:
# Train on the full DBLP-Scholar training set (no loading from disk).
dblp_scholar_100 = get_DeepER(
    1,
    train,
    load_model_from_disk=False,
    dataset_name='DBLP_Scholar',
)


°°° DeepER Model °°°
Batch size: 129
Train on 6844 samples, validate on 1711 samples
Epoch 1/64

Epoch 00001: val_accuracy improved from -inf to 0.94097, saving model to models/DeepER_best_model_100_DBLP_Scholar.h5
Epoch 2/64

Epoch 00002: val_accuracy improved from 0.94097 to 0.96786, saving model to models/DeepER_best_model_100_DBLP_Scholar.h5
Epoch 3/64

Epoch 00003: val_accuracy improved from 0.96786 to 0.97721, saving model to models/DeepER_best_model_100_DBLP_Scholar.h5
Epoch 4/64

Epoch 00004: val_accuracy did not improve from 0.97721
Epoch 5/64

Epoch 00005: val_accuracy improved from 0.97721 to 0.97779, saving model to models/DeepER_best_model_100_DBLP_Scholar.h5
Epoch 6/64

Epoch 00006: val_accuracy improved from 0.97779 to 0.98071, saving model to models/DeepER_best_model_100_DBLP_Scholar.h5
Epoch 7/64

Epoch 00007: val_accuracy did not improve from 0.98071
Epoch 8/64

Epoch 00008: val_accuracy did not improve from 0.98071
Epoch 9/64

Epoch 00009: val_accuracy improved from

In [11]:
# Measure the F-measure of the DBLP-Scholar model on its test set.
model_statistics(
    test,
    dblp_scholar_100,
    embeddings_model,
    tokenizer,
)

* Avvio test metriche....
-- Corpus size: 2139
-- Non Match: 1044
-- Match: 1095
Precision: 0.9862132352941176, Recall: 0.9799086757990868, f1-score: 0.983050847457627
Total retrieved: 1088, retrieved/total matches: 1073/1095


0.983050847457627

## Test

In [15]:
from keras.models import load_model
from deeper.deeper_utils import wrap_deeper
import pandas as pd

In [13]:
# Reload the saved Fodors-Zagats model from disk.
FODORS_MODEL_PATH = 'models/DeepER_best_model_100_fodors-zagats.h5'
fodors_model = load_model(FODORS_MODEL_PATH)

In [33]:
# Read the augmented Fodors-Zagats test file and preview its last ten rows.
test = pd.read_csv('datasets/fodors_zagats/augmented_test.csv')
test.tail(10)

Unnamed: 0,label,ltable_name,ltable_addr,ltable_city,ltable_phone,ltable_restype,ltable_class,rtable_name,rtable_addr,rtable_city,rtable_phone,rtable_restype,rtable_class,id
88,0,'second avenue deli','156 2nd ave. at 10th st.','new york',212/677-0606,delicatessen,58,'grill the','9560 dayton way','beverly hills',310-276-0615,'american (traditional)',9,592#228
89,0,'second avenue deli','156 2nd ave. at 10th st.','new york',212/677-0606,delicatessen,58,'la caravelle','33 w. 55th st.','new york city',212-586-4252,'french (classic)',39,592#258
90,0,'pano\'s and paul\'s','1232 w. paces ferry rd.',atlanta,404/261-3662,international,88,'stefano\'s','129 fremont st.','las vegas',702-385-7111,italian,672,622#139
91,0,'nikolai\'s roof','255 courtland st. at harris st.',atlanta,404/221-6362,continental,87,'hedgerose heights inn the','490 e. paces ferry rd. ne',atlanta,404-233-7673,continental,82,621#301
92,0,'gramercy tavern','42 e. 20th st. between park ave. s and broad...,'new york',212/477-0777,american,36,'hedgerose heights inn the','490 e. paces ferry rd. ne',atlanta,404-233-7673,continental,82,570#301
93,0,valentino,'3115 pico blvd.','santa monica',310/829-4313,italian,21,'second avenue deli','156 second ave.','new york city',212-677-0606,delis,58,555#277
94,0,valentino,'3115 pico blvd.','santa monica',310/829-4313,italian,21,'nikolai\'s roof','255 courtland st.',atlanta,404-221-6362,continental,87,555#306
95,0,'grill on the alley','9560 dayton way','los angeles',310/276-0615,american,9,moongate,'3400 las vegas blvd. s.','las vegas',702-791-7352,chinese,666,543#133
96,0,'khan toke thai house','5937 geary blvd.','san francisco',415/668-6654,asian,103,moongate,'3400 las vegas blvd. s.','las vegas',702-791-7352,chinese,666,637#133
97,0,'mesa grill','102 5th ave. between 15th and 16th sts.','new york',212/807-7400,american,47,moongate,'3400 las vegas blvd. s.','las vegas',702-791-7352,chinese,666,581#133


In [34]:
# Run the Fodors-Zagats model over the test frame; 'ltable_' and 'rtable_'
# are the column prefixes of the left and right records.
predictions = wrap_deeper(
    test,
    'ltable_',
    'rtable_',
    fodors_model,
    tokenizer,
    embeddings_model,
)

In [30]:
import numpy as np

# Predicted label per row = index of the highest score in that row.
predicted_labels = np.argmax(predictions, axis=1)
# Count how many rows were assigned label 0.
np.count_nonzero(predicted_labels == 0)

array([[0.00638888, 0.99361116],
       [0.07675514, 0.92324483],
       [0.13355955, 0.8664405 ],
       [0.02017914, 0.97982085],
       [0.35222998, 0.64777005],
       [0.40655485, 0.5934452 ],
       [0.37101018, 0.6289898 ],
       [0.37485528, 0.6251448 ],
       [0.3572836 , 0.64271635],
       [0.3477838 , 0.6522162 ]], dtype=float32)