# DeepER Classic 

## Step 0: Caricamento dati, preprocessing e strutture ausiliarie

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from deeper.DeepER import init_embeddings_index, init_embeddings_model, init_DeepER_model, train_model_ER, replace_last_layer, model_statistics
from deeper.data import process_data
from keras.models import load_model
from keras.layers import Dense
from deeper.csv2dataset import splitting_dataSet
from plotly import graph_objs as go
import plotly.offline as pyo

Using TensorFlow backend.


In [3]:
# Set manually to False to recreate the file containing the chosen dataset.
LOAD_FROM_DISK_DATASET=False
# Set manually to False to re-run all the trainings.
LOAD_FROM_DISK_MODEL = False
# Forward slashes work on every platform; the previous backslash form relied on
# '\g' being an unrecognised escape sequence and only resolved on Windows.
EMBEDDING_FILEPATH = 'embeddings/glove.6B/glove.6B.300d.txt'
DATASET_DIR = 'datasets/itunes_amazon/'
# Name used to label the files that are produced.
DATASET_NAME ='itunes_amazon'
TABLE1_FILE = 'itunes.csv'
TABLE2_FILE = 'amazon.csv'

In [16]:
from deeper.csv2dataset import csv_2_datasetALTERNATE
# Build the pair dataset from the two tables and the matches file.
# The (i, j) tuples presumably pair a column of table 1 with a column of table 2 — TODO confirm
# against csv_2_datasetALTERNATE.
# sim_function is a stub here: it ignores both arguments and always returns [1, 1].
dataset = csv_2_datasetALTERNATE(DATASET_DIR,'matches_itunes_amazon.csv',TABLE1_FILE,TABLE2_FILE, 
                             [(1,1),(2,2),(3,3)],sim_function=lambda x, y: [1, 1])

In [18]:
# Peek at one entry (index 1) to check the structure of the generated records.
dataset[1]

(['Bless the Broken Road',
  'Rascal Flatts',
  'Hannah Montana : The Movie ( Original Motion Picture Soundtrack )'],
 ['The Movie Theater',
  'Brian Tyler',
  'The Final Destination ( Original Motion Picture Soundtrack )'],
 [1, 1],
 0)

In [7]:
# Build the train and test pair collections used by every later cell.
# The dataset label comes from DATASET_NAME so it cannot drift from the
# name used when saving and loading model checkpoints.
deeper_train,deeper_test = process_data(DATASET_DIR, DATASET_NAME,
                                        ground_truth='matches_itunes_amazon.csv',
                                        table1=TABLE1_FILE, table2=TABLE2_FILE)

0.8824418982027676
0.7768985960673559
0.8680003787638844
0.8279925162221707
0.5316095330711951
0.7452413135250994
0.8581278705797843
0.6183469424008424
0.703989469396047
0.53813823519705
0.8717100546930083
0.42640143271122094
0.6350528962771202
0.6984538909891174
0.3682298471593294
0.9067647005823629
0.36288736930121157
0.6351073488299558
0.42808634473904467
0.876416745784994
0.8660254037844386
0.8414349769963337
0.7858252779857413
0.5232559521341829
0.917463421851129
0.3149290838726275
0.4456688116249245
0.529150262212918
0.7452413135250994
0.5218624584427538
0.7492686492653553
0.7405316311773547
0.8404244281272969
0.7278602642426106
0.6445033866354896
0.6004805767690768
0.835995575171225
0.6669729688499156
0.9082682792306083
0.5337605126836238
0.8770580193070293
0.5360562674188974
0.7200822998230957
0.7115124735378853
0.5766967882001443
0.8095238095238095
0.7926290870042667
0.8668451156610704
0.5766967882001443
0.37062465833055064
0.7559289460184544
0.6444022325288264
0.8718572905786

In [9]:
# Peek at one training entry (index 1) to check the structure of the records.
deeper_train[1]

(['Summer Love',
  'One Direction',
  'Take Me Home',
  'Pop , Music , Pop/Rock , Teen Pop , Rock , Dance , World',
  '$ 1.29',
  '2012 Simco Limited under exclusive licence to Sony Music Entertainment UK Limited'],
 ['TÌ 1/4 , Entre Mis Cosas',
  'Danny Cabuche',
  'Y Hoy Me Recuerdas',
  'International , Latin Music , Pop , Latin Pop',
  '$ 1.29',
  '( c ) 2007 Relapse Records , Inc.'],
 0)

In [4]:
# Load the auxiliary data structures and models.
embeddings_index = init_embeddings_index(EMBEDDING_FILEPATH)
# Embedding dimensionality, taken as the length of the entry stored under the key 'cat'.
emb_dim = len(embeddings_index['cat'])
embeddings_model, tokenizer = init_embeddings_model(embeddings_index)

* Costruzione indice degli embeddings.....Fatto. 400000 embeddings totali.
* Creazione del modello per il calcolo degli embeddings....
* Inizializzo il tokenizzatore.....Fatto: 400000 parole totali.
* Preparazione della matrice di embedding.....Fatto. Dimensioni matrice embeddings: (400001, 300)

°°° EMBEDDING MODEL °°°
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Tupla_A (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
Tupla_B (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
Embedding_lookup (Embedding)    (None, None, 300)    120000300   Tu

In [8]:
# NOTE(review): `perc` is not read by any cell visible in this notebook
# (get_DeepER takes its own `perc` parameter).
perc = 0.8
# NOTE(review): hard-coded value; the embeddings-loading cell also derives emb_dim
# from the loaded index, so the two assignments must be kept in agreement.
emb_dim =300

## Step 1: Addestramento standard

In [9]:
def get_DeepER(perc):
    """Return a DeepER model for the requested share of the training data.

    Args:
        perc: fraction of ``deeper_train`` considered for training; it is
            forwarded to ``splitting_dataSet`` and, as ``int(perc*100)``,
            becomes part of the checkpoint file name.

    Returns:
        The model read from disk when ``LOAD_FROM_DISK_MODEL`` is true,
        otherwise the model returned by ``train_model_ER``.
    """
    # The split is computed up front in both modes.
    training_subset = splitting_dataSet(perc, deeper_train)

    if not LOAD_FROM_DISK_MODEL:
        # Train from scratch: build a fresh model, show it, then fit it.
        fresh_model = init_DeepER_model(emb_dim)
        fresh_model.summary()
        return train_model_ER(
            training_subset,
            fresh_model,
            embeddings_model,
            tokenizer,
            pretraining=False,
            metric='val_accuracy',
            end=f'_{int(perc*100)}_{DATASET_NAME}',
        )

    # Load from disk.
    checkpoint_file = f'DeepER_best_model_{int(perc*100)}_{DATASET_NAME}.h5'
    print(f'Loading {checkpoint_file}', end='', flush=True)
    restored_model = load_model(checkpoint_file)
    print('  ->  Done')
    return restored_model

In [15]:
# Start training, or load from disk (depending on LOAD_FROM_DISK_MODEL).
# NOTE(review): the variable is named `_100` but the call passes perc=0.5 —
# confirm which split is intended.
deeper_model_100 = get_DeepER(0.5)


°°° DeepER Model °°°
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Embeddings_seq_a (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Embeddings_seq_b (InputLayer)   (None, None, 300)    0                                            
__________________________________________________________________________________________________
Composition (Bidirectional)     (None, 300)          541200      Embeddings_seq_a[0][0]           
                                                                 Embeddings_seq_b[0][0]           
__________________________________________________________________________________________________
Similarity (Lambda)             (None, 300)          0           Compo

### Calcolo F-Measure dopo addestramento standard

In [None]:
# Measure the F-measure on the test set (only deeper_model_100 is evaluated here).
# NOTE(review): presumably model_statistics returns the F-measure — confirm against its definition.
f1_score= model_statistics(deeper_test, deeper_model_100, embeddings_model, tokenizer)
print(f1_score)

### Visualizzazione F-Measure: primi risultati

In [None]:
# Enable notebook mode so that the charts render correctly.
pyo.init_notebook_mode()

splits = ['100% split', '75% split', '50% split', '25% split', '10% split', '5% split']
total_tup = len(deeper_train)

# Hover labels: number of training pairs behind each split
# (the full set first, then each fraction of it), with a description appended.
tuplecount = [
    f'{pair_count} coppie di tuple'
    for pair_count in [total_tup] + [int(total_tup * fraction)
                                     for fraction in (0.75, 0.5, 0.25, 0.1, 0.05)]
]

# NOTE(review): fm_model_standard is not assigned in any cell visible in this
# notebook; it must hold one F-measure per entry of `splits` before this cell runs.
fig = go.Figure(
    data=[
        go.Bar(name='DeepER', x=splits, y=fm_model_standard, hovertext=tuplecount),
    ]
)

# Render the chart inline, labelled 'fmeasures-standard'.
pyo.iplot(fig, filename='fmeasures-standard')

##### Al passaggio del mouse il grafico mostra:
- Il numero di coppie di tuple utilizzate per l'addestramento; 
- La percentuale di split (Quantità di tuple utilizzate per addestrare il modello);
- Il valore di F-Measure (media armonica tra precision e recall);

## Evaluate model with SHAP

In [None]:
import shap
import numpy as np

In [None]:
from keras.models import load_model
# Load a previously saved model checkpoint for the analysis below.
# NOTE(review): this is the walmart_amazon checkpoint, while DATASET_NAME is
# 'itunes_amazon' — confirm the model matches the data fed to it in later cells.
walmart_amazon_model = load_model('models/DeepER_best_model_100_walmart_amazon.h5')

In [None]:
# Import through the `deeper` package, like every other project import in this notebook.
from deeper.DeepER import data2Inputs
# Convert the training pairs into model inputs, then run them through the embedding model.
table1, table2, labels = data2Inputs(deeper_train, tokenizer, categorical=False)
embeddings = embeddings_model.predict([table1,table2])

In [None]:
# Same conversion and embedding step, applied to the test pairs.
# NOTE(review): `labels` is rebound here, so the value returned for deeper_train
# is no longer reachable under that name.
table1_test,table2_test,labels = data2Inputs(deeper_test,tokenizer,categorical=False)
testembeddings = embeddings_model.predict([table1_test,table2_test])

In [None]:
from keras.models import Model
def get_layer_output(model,layer_name,data):
    """Return what the layer called ``layer_name`` outputs when ``model`` is fed ``data``.

    A temporary model is built that shares ``model``'s input and ends at the
    requested layer; its ``predict`` result on ``data`` is returned.
    """
    probe = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
    return probe.predict(data)

In [None]:
from keras import backend as K
def get_layer_output_grad(model, inputs, outputs, layer_name):
    """Evaluate the gradient of the model's total loss w.r.t. a layer's output.

    Args:
        model: compiled Keras model (its optimizer and ``total_loss`` are used).
        inputs: input data for the model.
        outputs: target data paired with ``inputs``.
        layer_name: name of the layer whose output the gradient is taken against.

    Returns:
        Whatever the backend function yields for the gradient tensors
        (presumably a list of arrays — confirm against the Keras backend in use).
    """
    # NOTE: relies on private Keras attributes (_feed_*, _standardize_user_data).
    layer_out = model.get_layer(layer_name).output
    grad_tensors = model.optimizer.get_gradients(model.total_loss, layer_out)
    feed_tensors = model._feed_inputs + model._feed_targets + model._feed_sample_weights
    grad_fn = K.function(feed_tensors, grad_tensors)
    x, y, sample_weight = model._standardize_user_data(inputs, outputs)
    # The concatenation order matches feed_tensors: inputs, targets, sample weights.
    return grad_fn(x + y + sample_weight)

In [None]:
# Run the loaded model on the training-pair embeddings.
predictions = walmart_amazon_model.predict(embeddings)

In [None]:
# Gradients at the output of layer 'Dense1'; the model's own predictions are passed as the targets.
grads = get_layer_output_grad(walmart_amazon_model,embeddings,predictions,'Dense1')

In [None]:
from keras.layers import Input
idx = 4  # index of the first layer kept in the sub-model
# Input shape of that layer as recorded by Keras; the leading entry is the batch axis.
input_shape = walmart_amazon_model.layers[idx].get_input_shape_at(0)
# New input tensor with the per-sample shape the layer expects, read from
# input_shape instead of a hard-coded 300.
layer_input = Input(shape=input_shape[1:])
# Re-apply every layer from idx onwards, in order, on top of the new input.
x = layer_input
for layer in walmart_amazon_model.layers[idx:]:
    x = layer(x)

# create the model
new_model = Model(layer_input, x)

In [None]:
# Print the architecture of the sub-model built from the tail layers.
new_model.summary()

In [None]:
# The sub-model starts after the 'Similarity' layer, so it is fed that layer's
# activations (layer name taken from the DeepER model summary — confirm it is the
# same in the loaded checkpoint).
similarity_output = get_layer_output(walmart_amazon_model, 'Similarity', embeddings)
new_model.predict(similarity_output)[0:4]

In [None]:
# Predictions of the full model on the first four pairs, for comparison with the sub-model output.
walmart_amazon_model.predict(embeddings)[0:4]

## Explanation

In [None]:
# Explain the model loaded in this section: the training embeddings serve as the
# background data, and SHAP values are computed for the test embeddings.
explainer = shap.DeepExplainer(walmart_amazon_model,embeddings)
shap_values = explainer.shap_values(testembeddings)

In [None]:
# Print the architecture of the embedding model.
embeddings_model.summary()