# DeepER Classic 

## Step 0: Caricamento dati, preprocessing e strutture ausiliarie

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from deeper.DeepER import init_embeddings_index, init_embeddings_model, init_DeepER_model, train_model_ER, model_statistics
from deeper.data import process_data
from keras.models import load_model
from keras.layers import Dense
from deeper.csv2dataset import splitting_dataSet
from plotly import graph_objs as go
import plotly.offline as pyo

Using TensorFlow backend.


## Load embedding model

In [3]:
# Load the auxiliary data structures and models.
# The path uses forward slashes: the previous backslash-separated literal
# contained the invalid escape sequence "\g" and was not portable across
# operating systems.
EMBEDDING_FILEPATH = 'embeddings/glove.6B/glove.6B.300d.txt'
embeddings_index = init_embeddings_index(EMBEDDING_FILEPATH)
# Embedding dimensionality, taken from the length of one entry of the index.
emb_dim = len(embeddings_index['cat'])
embeddings_model, tokenizer = init_embeddings_model(embeddings_index)

* Costruzione indice degli embeddings.....Fatto. 400000 embeddings totali.
* Creazione del modello per il calcolo degli embeddings....
* Inizializzo il tokenizzatore.....Fatto: 400000 parole totali.
* Preparazione della matrice di embedding.....Fatto. Dimensioni matrice embeddings: (400001, 300)

°°° EMBEDDING MODEL °°°
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Tupla_A (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
Tupla_B (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
Embedding_lookup (Embedding)    (None, None, 300)    120000300   Tu

## Walmart-Amazon

In [6]:
# Set manually to False to re-create the file containing the chosen dataset.
LOAD_FROM_DISK_DATASET=False
# Set manually to False to re-run all the trainings.
LOAD_FROM_DISK_MODEL = False
# Passed as the first argument to process_data for this dataset.
DATASET_DIR = 'datasets/walmart_amazon/'
# The name used to label the files produced (get_DeepER embeds it in the
# model file names).
DATASET_NAME ='walmart-amazon'
# File names handed to process_data as table1 / table2.
TABLE1_FILE = 'walmart.csv'
TABLE2_FILE = 'amazon.csv'

In [7]:
# Run process_data on the Walmart-Amazon tables; its two return values are
# bound here as the training and test sets used by the cells below.
deeper_train,deeper_test = process_data(DATASET_DIR,DATASET_NAME,ground_truth='walmart-amazon_perfectMapping.csv',
                         table1=TABLE1_FILE,table2=TABLE2_FILE,indici=[])

In [8]:
# NOTE(review): emb_dim is already derived from the loaded embeddings in the
# embedding-loading cell; this hard-coded value overrides it — confirm that
# the override is still needed.
emb_dim =300

## Step 1: Addestramento standard

In [9]:
def get_DeepER(perc,train,LOAD_FROM_DISK_MODEL=False):
    """Return a DeepER model for the chosen fraction of the training data.

    The function reads the notebook globals DATASET_NAME, emb_dim,
    embeddings_model and tokenizer at call time.

    Args:
        perc: Fraction of the data considered for training (1 means 100%).
            Its integer percentage is also embedded in the model file name.
        train: Training data, forwarded to splitting_dataSet together with
            perc.
        LOAD_FROM_DISK_MODEL: When true, load
            models/DeepER_best_model_<perc*100>_<DATASET_NAME>.h5 instead of
            training. Defaults to False (train a new model), so callers that
            omit the argument no longer fail with a TypeError.

    Returns:
        The model loaded from disk, or the model returned by train_model_ER.
    """
    sub_data = splitting_dataSet(perc,train)

    # Suffix shared by the saved-model file name and the training label.
    model_tag = f'{int(perc*100)}_{DATASET_NAME}'

    if LOAD_FROM_DISK_MODEL:

        # Load from disk.
        print(f'Loading DeepER_best_model_{model_tag}.h5', end='', flush=True)
        deeper_model = load_model(f'models/DeepER_best_model_{model_tag}.h5')
        print('  ->  Done')

    else:

        # Initialise the model.
        deeper_model = init_DeepER_model(emb_dim)
        deeper_model.summary()
        # Start the training.
        deeper_model = train_model_ER(sub_data,
                                      deeper_model,
                                      embeddings_model,
                                      tokenizer,
                                      pretraining=False,
                                      metric='val_accuracy',
                                      end=f'_{model_tag}')

    return deeper_model

In [11]:
# Start the training, or load the model from disk, as selected by the
# LOAD_FROM_DISK_MODEL flag of the configuration cell (previously the flag
# was ignored because False was hard-coded here).
deeper_model_100 = get_DeepER(1,deeper_train,LOAD_FROM_DISK_MODEL=LOAD_FROM_DISK_MODEL)


°°° DeepER Model °°°
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Embeddings_seq_a (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Embeddings_seq_b (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Composition (Bidirectional)     (None, 300)          541200      Embeddings_seq_a[0][0]           
                                                                 Embeddings_seq_b[0][0]           
__________________________________________________________________________________________________
Similarity (Lambda)             (None, 300)          0           Compo

### Calcolo F-Measure dopo addestramento standard

In [30]:
# Measure the F-measure of the model trained on 100% of the training data,
# using the test set produced by process_data.
f1_score= model_statistics(deeper_test, deeper_model_100, embeddings_model, tokenizer)
print(f1_score)

* Avvio test metriche....
-- Corpus size: 462
-- Non Match: 237
-- Match: 225
* Preparazione input......Fatto. 462 tuple totali, esempio label: 0 -> [1. 0.], Table1 shape: (462, 33), Table2 shape: (462, 49)
Precision: 0.9817351598173516, Recall: 0.9555555555555556, f1-score: 0.9684684684684685
Total retrieved: 219, retrieved/total matches: 215/225
0.9684684684684685


### Visualizzazione F-Measure: primi risultati

In [None]:
# Enable notebook mode so that the charts are displayed correctly.
pyo.init_notebook_mode()

splits = ['100% split', '75% split', '50% split', '25% split', '10% split', '5% split']
# Training fractions, in the same order as the labels in `splits`.
split_fractions = [1, 0.75, 0.5, 0.25, 0.1, 0.05]

# F-measure of each trained model, in the same order as `splits`.
# This list was previously used without ever being defined, which raised a
# NameError on a fresh kernel. Only the 100% model is trained and evaluated
# in this notebook; append further scores here when more splits are trained.
fm_model_standard = [f1_score]

# Hover text: number of tuple pairs used for training at each split.
total_tup = len(deeper_train)
tuplecount = [f'{int(total_tup*fraction)} coppie di tuple' for fraction in split_fractions]

# Plot only the splits that actually have a score, so that the x values, the
# y values and the hover texts always have the same length.
n_scores = len(fm_model_standard)
fig = go.Figure(data=[go.Bar(name='DeepER',
                             x=splits[:n_scores],
                             y=fm_model_standard,
                             hovertext=tuplecount[:n_scores])],
                layout=go.Layout(title='F-Measure dopo addestramento standard',
                                 xaxis=dict(title='Split di addestramento'),
                                 yaxis=dict(title='F-Measure')))

# Render the chart inline in the notebook.
pyo.iplot(fig, filename='fmeasures-standard')

##### Al passaggio del mouse il grafico mostra:
- Il numero di coppie di tuple utilizzate per l'addestramento;
- La percentuale di split (quantità di tuple utilizzate per addestrare il modello);
- Il valore di F-Measure (media armonica tra precision e recall).

## iTunes-Amazon

In [31]:
# Configuration for the iTunes-Amazon dataset.
# NOTE: DATASET_NAME, TABLE1_FILE and TABLE2_FILE overwrite the values set
# for the previous dataset; get_DeepER reads DATASET_NAME at call time to
# build the model file names, so this cell must run before training.
ITUNES_DIR = 'datasets/itunes_amazon/'
DATASET_NAME ='itunes-amazon'
TABLE1_FILE = 'itunes.csv'
TABLE2_FILE = 'amazon.csv'

In [33]:
# Run process_data on the iTunes-Amazon tables; its two return values are
# bound here as the training and test sets.
# NOTE(review): unlike the other process_data calls in this notebook, the
# `indici` argument is not passed here — confirm that this is intentional.
itunes_train,itunes_test = process_data(ITUNES_DIR,DATASET_NAME,ground_truth='matches_itunes_amazon.csv',
                         table1=TABLE1_FILE,table2=TABLE2_FILE)

la coseno similarity minima is 0.3446561747421316


In [35]:
# Start the training, or load the model from disk, as selected by the
# LOAD_FROM_DISK_MODEL flag. The flag is passed explicitly: the previous call
# omitted this argument, which get_DeepER declares as required.
deeper_model_itunes_100 = get_DeepER(1,itunes_train,LOAD_FROM_DISK_MODEL=LOAD_FROM_DISK_MODEL)


°°° DeepER Model °°°
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Embeddings_seq_a (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Embeddings_seq_b (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Composition (Bidirectional)     (None, 300)          541200      Embeddings_seq_a[0][0]           
                                                                 Embeddings_seq_b[0][0]           
__________________________________________________________________________________________________
Similarity (Lambda)             (None, 300)          0           Compo

## Amazon-Google

In [12]:
# Configuration for the Amazon-Google dataset.
# NOTE: DATASET_NAME, TABLE1_FILE and TABLE2_FILE overwrite the values set
# for the previous dataset; get_DeepER reads DATASET_NAME at call time to
# build the model file names, so this cell must run before training.
DATA_DIR = 'datasets/Amazon-GoogleProducts/'
DATASET_NAME ='amazon-google'
TABLE1_FILE = 'Amazon.csv'
TABLE2_FILE = 'Google.csv'

In [13]:
# Run process_data on the Amazon-Google tables; its two return values are
# bound here as the training and test sets.
train,test = process_data(DATA_DIR,DATASET_NAME,ground_truth='amazon_google_matches.csv',
                         table1=TABLE1_FILE,table2=TABLE2_FILE,indici=[])

In [14]:
# Start the training, or load the model from disk, as selected by the
# LOAD_FROM_DISK_MODEL flag of the configuration cell (previously the flag
# was ignored because False was hard-coded here).
# NOTE(review): the recorded run of this cell ended with a GPU out-of-memory
# error (ResourceExhaustedError) — the cause is not visible from this cell.
deeper_model_amazongoogle_100 = get_DeepER(1,train,LOAD_FROM_DISK_MODEL=LOAD_FROM_DISK_MODEL)


°°° DeepER Model °°°
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Embeddings_seq_a (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Embeddings_seq_b (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Composition (Bidirectional)     (None, 300)          541200      Embeddings_seq_a[0][0]           
                                                                 Embeddings_seq_b[0][0]           
__________________________________________________________________________________________________
Similarity (Lambda)             (None, 300)          0           Compo

ResourceExhaustedError:  OOM when allocating tensor with shape[32,600] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node Composition_6/while/body/_1/MatMul_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_keras_scratch_graph_26305]

Function call stack:
keras_scratch_graph
