# DeepER Classic 

## Step 0: Caricamento dati, preprocessing e strutture ausiliarie

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from DeepER import init_embeddings_index, init_embeddings_model, init_DeepER_model, train_model_ER, replace_last_layer, model_statistics
from experimental_similarity import mono_vector, cosine_similarity_vector, distance_similarity_vector
from csv2dataset import splitting_dataSet, csv_2_datasetALTERNATE, csvTable2datasetRANDOM,parsing_anhai_data
from generate_similarity_vector import generate_similarity_vector
from data_reg import sim_hamming
from keras.models import load_model
from keras.layers import Dense
from plotly import graph_objs as go
import plotly.offline as pyo
from random import shuffle
import utilist as uls
import os

Using TensorFlow backend.


In [3]:
# Imposta manualmente a False per ricreare il file contenente il dataset scelto. 
LOAD_FROM_DISK_DATASET=False
# Imposta manualmente a False per ri-eseguire tutti gli addestramenti.
LOAD_FROM_DISK_MODEL = False
# Il nome con cui saranno etichettati i files prodotti

DATASET_DIR = 'itunes_amazon'# Esempio: 'WA'
DATASET_NAME ='itunes_amazon'
TABLE1_FILE = os.path.join(DATASET_DIR,'itunes.csv')# Esempio: 'walmart.csv'
TABLE2_FILE = os.path.join(DATASET_DIR,'amazon.csv')# Esempio: 'amazon.csv'

In [4]:
# Caricamento strutture dati e modelli ausiliari.
embeddings_index = init_embeddings_index('embeddings/glove.6B.100d.txt')
emb_dim = len(embeddings_index['cat']) # :3
embeddings_model, tokenizer = init_embeddings_model(embeddings_index)

* Costruzione indice degli embeddings.....Fatto. 400000 embeddings totali.
* Creazione del modello per il calcolo degli embeddings....
* Inizializzo il tokenizzatore.....Fatto: 400000 parole totali.
* Preparazione della matrice di embedding.....Fatto. Dimensioni matrice embeddings: (400001, 100)

°°° EMBEDDING MODEL °°°
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Tupla_A (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
Tupla_B (InputLayer)            (None, None)         0                                            
_____________________________________________________________________________________

In [31]:
# Caricamento dati e split iniziale.
if LOAD_FROM_DISK_DATASET:
    
    # Carica dataset salvato su disco.
    data = uls.load_list(f'dataset_{DATASET_NAME}')
    match_number=sum(map(lambda x : x[3] == 1, data))
    print("match_number: " + str(match_number))
    print("len all dataset: "+ str(len(data)))

else:
    
    GROUND_TRUTH_FILE = os.path.join(DATASET_DIR,f'matches_{DATASET_NAME}.csv')# Esempio: 'matches_walmart_amazon.csv'
    # Necessario inserire le tabelle nell'ordine corrispondente alle coppie della ground truth.

    # Coppie di attributi considerati allineati.
    att_indexes = [(1, 1), (2, 2), (3, 3), (4, 4),(5, 5), (6, 6),(7,7),(8,8)]# Esempio: [(5, 9), (4, 5), (3, 3), (14, 4), (6, 11)]


    # Crea il dataset.
    data = csv_2_datasetALTERNATE(GROUND_TRUTH_FILE, TABLE1_FILE, TABLE2_FILE, att_indexes)
    #per i dataset di Anhai
    #data=parsing_anhai_data(GROUND_TRUTH_FILE, TABLE1_FILE, TABLE2_FILE, att_indexes)
    
    # Salva dataset su disco.
    uls.save_list(data, f'dataset_{DATASET_NAME}')

    
# Dataset per DeepER classico: [(tupla1, tupla2, label), ...].
deeper_data = list(map(lambda q: (q[0], q[1], q[3]), data))


# Taglia attributi se troppo lunghi
# Alcuni dataset hanno attributi con descrizioni molto lunghe.
# Questo filtro limita il numero di caratteri di un attributo a 1000.
def shrink_data(data):
    
    def cut_string(s):
        if len(s) >= 1000:
            return s[:1000]
        else:
            return s
    
    temp = []
    for t1, t2, lb in data:
        t1 = list(map(cut_string, t1))
        t2 = list(map(cut_string, t2))
        temp.append((t1, t2, lb))
        
    return temp

deeper_data = shrink_data(deeper_data)


# Split in training set e test set.
def split_training_test(data, SPLIT_FACTOR = 0.8):     
    bound = int(len(data) * SPLIT_FACTOR)
    train = data[:bound]
    test = data[bound:]
    
    return train, test


# Tutti i successivi addestramenti partiranno dal 100% di deeper_train (80% di tutti i dati).
# Le tuple in deeper_test non verranno mai usate per addestrare ma solo per testare i modelli.
deeper_train, deeper_test = split_training_test(deeper_data)

## Step 1: Addestramento standard

In [32]:
# InPut: Percentuale di dati considerata per l'addestramento. 
# OutPut: DeepER addestrato sul taglio scelto.
def get_DeepER(perc):
   
    sub_data = splitting_dataSet(perc, deeper_train)    
    
    if LOAD_FROM_DISK_MODEL:
        
        # Carica da disco.
        print(f'Loading DeepER_best_model_{int(perc*100)}_{DATASET_NAME}.h5', end='', flush=True)
        deeper_model = load_model(f'DeepER_best_model_{int(perc*100)}_{DATASET_NAME}.h5')
        print('  ->  Done')        
                
    else:
        
        # Inizializza il modello.
        deeper_model = init_DeepER_model(emb_dim)
        deeper_model.summary()
        # Avvio addestramento.
        deeper_model = train_model_ER(sub_data, 
                                      deeper_model, 
                                      embeddings_model, 
                                      tokenizer, 
                                      pretraining=False,
                                      metric='val_accuracy',
                                      end=f'_{int(perc*100)}_{DATASET_NAME}')
        
    return deeper_model

In [33]:
# Avvio addestramenti o carica da disco.
deeper_model_100 = get_DeepER(1)


°°° DeepER Model °°°
Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Embeddings_seq_a (InputLayer)   (None, None, 100)    0                                            
__________________________________________________________________________________________________
Embeddings_seq_b (InputLayer)   (None, None, 100)    0                                            
__________________________________________________________________________________________________
Composition (Bidirectional)     (None, 300)          301200      Embeddings_seq_a[0][0]           
                                                                 Embeddings_seq_b[0][0]           
__________________________________________________________________________________________________
Similarity (Lambda)             (None, 300)          0           Compo

Epoch 00003: val_accuracy did not improve from 0.88372
Epoch 4/64

Epoch 00004: val_accuracy did not improve from 0.88372
Epoch 5/64

Epoch 00005: val_accuracy did not improve from 0.88372
Epoch 6/64

Epoch 00006: val_accuracy did not improve from 0.88372
Epoch 7/64

Epoch 00007: val_accuracy did not improve from 0.88372
Epoch 8/64



Epoch 00008: val_accuracy did not improve from 0.88372
Epoch 9/64

Epoch 00009: val_accuracy improved from 0.88372 to 0.93023, saving model to models\DeepER_best_model_100_itunes_amazon.h5
Epoch 10/64

Epoch 00010: val_accuracy did not improve from 0.93023
Epoch 11/64

Epoch 00011: val_accuracy improved from 0.93023 to 0.95349, saving model to models\DeepER_best_model_100_itunes_amazon.h5
Epoch 12/64



Epoch 00012: val_accuracy did not improve from 0.95349
Epoch 13/64

Epoch 00013: val_accuracy did not improve from 0.95349
Epoch 14/64

Epoch 00014: val_accuracy did not improve from 0.95349
Epoch 15/64



Epoch 00015: val_accuracy did not improve from 0.95349
Epoch 16/64

Epoch 00016: val_accuracy did not improve from 0.95349
Epoch 17/64

Epoch 00017: val_accuracy did not improve from 0.95349
Epoch 18/64



Epoch 00018: val_accuracy did not improve from 0.95349
Epoch 00018: early stopping


### Calcolo F-Measure dopo addestramento standard

In [34]:
# Misurazione dell'f-measure sullo stesso test set con i diversi modelli.
f1_score= model_statistics(deeper_test, deeper_model_100, embeddings_model, tokenizer)
print(f1_score)

* Avvio test metriche....
-- Corpus size: 53
-- Non Match: 27
-- Match: 26
* Preparazione input......Fatto. 53 tuple totali, esempio label: 0 -> [1. 0.], Table1 shape: (53, 62), Table2 shape: (53, 54)
Precision: 0.9259259259259259, Recall: 0.9615384615384616, f1-score: 0.9433962264150944
Total retrieved: 27, retrieved/total matches: 25/26
0.9433962264150944


## Step 2: Estrazione dei layer intermedi 

### Ri Discovery

In [35]:
import tensorflow as tf
import numpy as np
import keras.backend as K
from keras.models import load_model

In [36]:
MODEL_DIR ='models'
#itunes_amazon_model = load_model(os.path.join(MODEL_DIR,'DeepER_best_model_100_itunes_amazon.h5'))
itunes_amazon_model = deeper_model_100

In [37]:
itunes_amazon_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Embeddings_seq_a (InputLayer)   (None, None, 100)    0                                            
__________________________________________________________________________________________________
Embeddings_seq_b (InputLayer)   (None, None, 100)    0                                            
__________________________________________________________________________________________________
Composition (Bidirectional)     (None, 300)          301200      Embeddings_seq_a[0][0]           
                                                                 Embeddings_seq_b[0][0]           
__________________________________________________________________________________________________
Similarity (Lambda)             (None, 300)          0           Composition[0][0]          

In [38]:
from DeepER import data2Inputs
train_negatives = list(filter(lambda t:t[2]==0,deeper_train))
table1, table2, labels = data2Inputs(train_negatives, tokenizer, categorical=False)
embeddings_neg = embeddings_model.predict([table1,table2])

* Preparazione input......Fatto. 105 tuple totali, esempio label: 0 -> 0, Table1 shape: (105, 82), Table2 shape: (105, 61)


### Create model for only classifier

In [39]:
selected_weights = []
for layer in walmart_amazon_model.layers[-3:]:
  selected_weights.append(layer.get_weights())

In [40]:
from keras.models import Sequential
from keras.layers import Dense

classifier = Sequential()
classifier.add(Dense(300,activation='relu',input_shape=(300,)))
classifier.add(Dense(300,activation='relu',input_shape=(300,)))
classifier.add(Dense(2,activation='softmax',input_shape=(300,)))

classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [41]:
for weights,layer in zip(selected_weights,classifier.layers):
  layer.set_weights(weights)

In [42]:
classifier.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_5 (Dense)              (None, 300)               90300     
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 602       
Total params: 181,202
Trainable params: 181,202
Non-trainable params: 0
_________________________________________________________________


### Define some utility functions

In [43]:
from keras.models import Model
def get_layer_output(model,layer_name,data):
    intermediate_layer_model = Model(inputs=model.input,outputs=model.get_layer(layer_name).output)
    intermediate_output = intermediate_layer_model.predict(data)
    return intermediate_output

In [44]:
from keras import backend as K
def get_layer_output_grad(model, inputs, outputs, layer_name):
    """ Gets gradient a layer output for given inputs and outputs"""
    grads = model.optimizer.get_gradients(model.total_loss, model.get_layer(layer_name).output)
    symb_inputs = (model._feed_inputs + model._feed_targets + model._feed_sample_weights)
    f = K.function(symb_inputs, grads)
    x, y, sample_weight = model._standardize_user_data(inputs, outputs)
    output_grad = f(x + y + sample_weight)
    return output_grad

In [45]:
neg_predictions = walmart_amazon_model.predict(embeddings_neg)
neg_similarity_output = get_layer_output(walmart_amazon_model,'Similarity',embeddings_neg)

In [46]:
outputTensor = classifier.output
custom_loss = K.sum(outputTensor,axis=1)
gradients = K.gradients(custom_loss,classifier.input)
example_batch = neg_similarity_output[0:8]
session = K.get_session()
evaluated_gradients = session.run(gradients,feed_dict={classifier.input : example_batch})

In [47]:
session.run(custom_loss,feed_dict={classifier.input: example_batch})

array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.99999994, 1.        ], dtype=float32)

In [48]:
from numpy.linalg import norm

def find_smallest_variation_to_change(model,input_batch,sample_idx,classifier_length,class_to_reach,
                                     tf_session):
    xi = input_batch[sample_idx]
    ##Copia necessaria per non alterare il batch di input
    input_batch_copy = input_batch.copy()
    if class_to_reach ==1:
        label_tensor = tf.constant([[0.,1.]]*input_batch.shape[0])
    else:
        label_tensor = tf.constant([[1.,0.]]*input_batch.shape[0])
    custom_loss = K.categorical_crossentropy(label_tensor,model.output)
    gradients = K.gradients(custom_loss,classifier.input)
    current_probabilities = model.predict(input_batch_copy)[sample_idx]
    sum_ri = np.zeros(classifier_length)
    iterations = 0
    while(round(current_probabilities[1])!=class_to_reach and iterations<50):
        current_probabilities = model.predict(input_batch_copy)[sample_idx]
        if class_to_reach == 1:
            fx = current_probabilities[0]
        else:
            fx = current_probabilities[1]
        current_gradient = tf_session.run(gradients,feed_dict={model.input: input_batch_copy})[0][sample_idx]
        current_norm = norm(current_gradient)
        if current_norm==0:
            sum_ri = np.zeros(classifier_length)
            print("Gradient is null")
            break
        ri = (fx/(current_norm**2)) * (-current_gradient)
        xi = xi+ri
        input_batch_copy[sample_idx] = xi
        
        sum_ri += ri
        iterations+=1
    if iterations>=50:
        sum_ri = np.zeros(classifier_length)
        print("can't converge ")
        
    return sum_ri

## Step 3: Calcolo dei ranking

### Creazione dei dataset sui singoli attributi per Campioni negativi

In [58]:
def extractAttribute(sample,attribute_idx):
    return ([sample[0][attribute_idx]],[sample[1][attribute_idx]],sample[2])

In [61]:
attributes = ['songname','artistname','albumname','genre','price','copyright','time','released']
dataset_single_attribute = []
for att in attributes:
    attribute_idx = attributes.index(att)
    dataset_single_attribute.append(list(map(lambda t:extractAttribute(t,attribute_idx),deeper_train)))

In [68]:
from DeepER import data2Inputs
def createEmbeddings(dataset,tokenizer,embeddings_model,label):
    filtered_tuples = list(filter(lambda t:t[2]==label,dataset))
    table1, table2, labels = data2Inputs(filtered_tuples, tokenizer, categorical=False)
    embeddings = embeddings_model.predict([table1,table2])
    return embeddings

In [71]:
neg_embeddings_single_attribute = []
for dataset in dataset_single_attribute:
    embeddings = createEmbeddings(dataset,tokenizer,embeddings_model,0)
    neg_embeddings_single_attribute.append(embeddings)

* Preparazione input......Fatto. 105 tuple totali, esempio label: 0 -> 0, Table1 shape: (105, 21), Table2 shape: (105, 21)
* Preparazione input......Fatto. 105 tuple totali, esempio label: 0 -> 0, Table1 shape: (105, 9), Table2 shape: (105, 10)
* Preparazione input......Fatto. 105 tuple totali, esempio label: 0 -> 0, Table1 shape: (105, 22), Table2 shape: (105, 20)
* Preparazione input......Fatto. 105 tuple totali, esempio label: 0 -> 0, Table1 shape: (105, 11), Table2 shape: (105, 14)
* Preparazione input......Fatto. 105 tuple totali, esempio label: 0 -> 0, Table1 shape: (105, 2), Table2 shape: (105, 2)
* Preparazione input......Fatto. 105 tuple totali, esempio label: 0 -> 0, Table1 shape: (105, 38), Table2 shape: (105, 31)
* Preparazione input......Fatto. 105 tuple totali, esempio label: 0 -> 0, Table1 shape: (105, 1), Table2 shape: (105, 1)
* Preparazione input......Fatto. 105 tuple totali, esempio label: 0 -> 0, Table1 shape: (105, 1), Table2 shape: (105, 3)


In [72]:
similarity_output_single_attribute = []
for embeddings in neg_embeddings_single_attribute:
    similarity_output = get_layer_output(itunes_amazon_model,'Similarity',embeddings)
    similarity_output_single_attribute.append(similarity_output)

In [103]:
def createInputForClassifier(dataset,model,tokenizer,embeddings_model,label,attribute_idx):
    dataset_single_attribute = (list(map(lambda t:extractAttribute(t,attribute_idx),dataset)))
    filtered_tuples = list(filter(lambda t:t[2]==label,dataset_single_attribute))
    table1, table2, labels = data2Inputs(filtered_tuples, tokenizer, categorical=False)
    embeddings = embeddings_model.predict([table1,table2])
    similarity_output = get_layer_output(model,'Similarity',embeddings)
    return similarity_output

### Esecuzione dell'algoritmo

In [76]:
from tqdm import tqdm
tf_session = K.get_session()
ri_map = {}
for att in attributes:
    similarity_output = similarity_output_single_attribute[attributes.index(att)]
    ris = []
    for i in tqdm(range(len(similarity_output))):
        ris.append(find_smallest_variation_to_change(classifier,input_batch = similarity_output,
                                                     sample_idx=i,classifier_length=300,
                                                     class_to_reach=1,tf_session = tf_session))
    ri_map[att] = ris




  0%|                                                                                          | 0/105 [00:00<?, ?it/s]


  1%|▊                                                                                 | 1/105 [00:01<02:32,  1.47s/it]


  3%|██▎                                                                               | 3/105 [00:01<01:46,  1.05s/it]


  4%|███                                                                               | 4/105 [00:03<01:58,  1.18s/it]


  6%|████▋                                                                             | 6/105 [00:04<01:44,  1.06s/it]


  7%|█████▍                                                                            | 7/105 [00:06<01:56,  1.19s/it]


  8%|██████▏                                                                           | 8/105 [00:07<02:03,  1.27s/it]


 10%|███████▋                                                                         | 10/105 [00:09<01:47,  1.13s/it]


 11%|█████████▎      

  2%|█▌                                                                                | 2/105 [00:01<01:38,  1.05it/s]


  3%|██▎                                                                               | 3/105 [00:03<02:03,  1.21s/it]


  4%|███                                                                               | 4/105 [00:05<02:19,  1.38s/it]


  5%|███▉                                                                              | 5/105 [00:07<02:30,  1.50s/it]


  7%|█████▍                                                                            | 7/105 [00:09<02:10,  1.33s/it]


  8%|██████▏                                                                           | 8/105 [00:10<02:22,  1.47s/it]


 10%|███████▋                                                                         | 10/105 [00:11<01:40,  1.06s/it]


 10%|████████▍                                                                        | 11/105 [00:13<02:03,  1.31s/it]


 11%|█████████▎         

 70%|█████████████████████████████████████████████████████████                        | 74/105 [01:54<01:08,  2.21s/it]


 71%|█████████████████████████████████████████████████████████▊                       | 75/105 [01:57<01:06,  2.21s/it]


 72%|██████████████████████████████████████████████████████████▋                      | 76/105 [01:59<01:03,  2.19s/it]


 73%|███████████████████████████████████████████████████████████▍                     | 77/105 [02:01<01:00,  2.17s/it]


 74%|████████████████████████████████████████████████████████████▏                    | 78/105 [02:03<00:59,  2.22s/it]


 75%|████████████████████████████████████████████████████████████▉                    | 79/105 [02:05<00:57,  2.22s/it]


 76%|█████████████████████████████████████████████████████████████▋                   | 80/105 [02:08<00:55,  2.21s/it]


 77%|██████████████████████████████████████████████████████████████▍                  | 81/105 [02:08<00:37,  1.58s/it]


 78%|███████████████████

 33%|███████████████████████████                                                      | 35/105 [00:40<01:39,  1.42s/it]


 34%|███████████████████████████▊                                                     | 36/105 [00:40<01:10,  1.03s/it]


 35%|████████████████████████████▌                                                    | 37/105 [00:42<01:34,  1.38s/it]


 36%|█████████████████████████████▎                                                   | 38/105 [00:42<01:07,  1.00s/it]


 37%|██████████████████████████████                                                   | 39/105 [00:42<00:48,  1.35it/s]


 38%|██████████████████████████████▊                                                  | 40/105 [00:43<00:36,  1.80it/s]


 39%|███████████████████████████████▋                                                 | 41/105 [00:43<00:27,  2.35it/s]


 40%|████████████████████████████████▍                                                | 42/105 [00:45<01:01,  1.03it/s]


 41%|███████████████████

 97%|█████████████████████████████████████████████████████████████████████████████▋  | 102/105 [01:48<00:03,  1.33s/it]


 98%|██████████████████████████████████████████████████████████████████████████████▍ | 103/105 [01:51<00:03,  1.67s/it]


 99%|███████████████████████████████████████████████████████████████████████████████▏| 104/105 [01:51<00:01,  1.21s/it]


100%|████████████████████████████████████████████████████████████████████████████████| 105/105 [01:54<00:00,  1.61s/it]


  0%|                                                                                          | 0/105 [00:00<?, ?it/s]


  1%|▊                                                                                 | 1/105 [00:00<00:16,  6.41it/s]


  2%|█▌                                                                                | 2/105 [00:00<00:15,  6.69it/s]


  3%|██▎                                                                               | 3/105 [00:02<01:24,  1.21it/s]


  4%|███                

 60%|████████████████████████████████████████████████▌                                | 63/105 [00:18<00:22,  1.86it/s]


 61%|█████████████████████████████████████████████████▎                               | 64/105 [00:18<00:17,  2.38it/s]


 62%|██████████████████████████████████████████████████▏                              | 65/105 [00:18<00:13,  2.95it/s]


 63%|██████████████████████████████████████████████████▉                              | 66/105 [00:19<00:11,  3.54it/s]


 64%|███████████████████████████████████████████████████▋                             | 67/105 [00:19<00:09,  4.13it/s]


 65%|████████████████████████████████████████████████████▍                            | 68/105 [00:19<00:07,  4.65it/s]


 66%|█████████████████████████████████████████████████████▏                           | 69/105 [00:19<00:07,  5.11it/s]


 67%|██████████████████████████████████████████████████████                           | 70/105 [00:19<00:06,  5.50it/s]


 68%|███████████████████

 23%|██████████████████▌                                                              | 24/105 [00:09<01:28,  1.09s/it]


 24%|███████████████████▎                                                             | 25/105 [00:09<01:04,  1.23it/s]


 25%|████████████████████                                                             | 26/105 [00:09<00:48,  1.62it/s]


 26%|████████████████████▊                                                            | 27/105 [00:09<00:37,  2.08it/s]


 27%|█████████████████████▌                                                           | 28/105 [00:09<00:29,  2.60it/s]


 28%|██████████████████████▎                                                          | 29/105 [00:09<00:24,  3.15it/s]


 29%|███████████████████████▏                                                         | 30/105 [00:10<00:20,  3.68it/s]


 30%|███████████████████████▉                                                         | 31/105 [00:10<00:17,  4.17it/s]


 30%|███████████████████

 87%|██████████████████████████████████████████████████████████████████████▏          | 91/105 [00:28<00:06,  2.10it/s]


 88%|██████████████████████████████████████████████████████████████████████▉          | 92/105 [00:29<00:05,  2.59it/s]


 89%|███████████████████████████████████████████████████████████████████████▋         | 93/105 [00:29<00:03,  3.11it/s]


 90%|████████████████████████████████████████████████████████████████████████▌        | 94/105 [00:29<00:03,  3.58it/s]


 90%|█████████████████████████████████████████████████████████████████████████▎       | 95/105 [00:29<00:02,  4.03it/s]


 91%|██████████████████████████████████████████████████████████████████████████       | 96/105 [00:29<00:02,  4.42it/s]


 92%|██████████████████████████████████████████████████████████████████████████▊      | 97/105 [00:30<00:01,  4.76it/s]


 93%|███████████████████████████████████████████████████████████████████████████▌     | 98/105 [00:30<00:01,  4.97it/s]


 94%|███████████████████

 50%|████████████████████████████████████████                                         | 52/105 [00:15<00:09,  5.32it/s]


 50%|████████████████████████████████████████▉                                        | 53/105 [00:15<00:09,  5.31it/s]


 51%|█████████████████████████████████████████▋                                       | 54/105 [00:15<00:09,  5.28it/s]


 52%|██████████████████████████████████████████▍                                      | 55/105 [00:16<00:09,  5.29it/s]


 53%|███████████████████████████████████████████▏                                     | 56/105 [00:16<00:09,  5.30it/s]


 54%|███████████████████████████████████████████▉                                     | 57/105 [00:16<00:09,  5.23it/s]


 55%|████████████████████████████████████████████▋                                    | 58/105 [00:16<00:08,  5.23it/s]


 56%|█████████████████████████████████████████████▌                                   | 59/105 [00:16<00:08,  5.22it/s]


 57%|███████████████████

 12%|██████████                                                                       | 13/105 [00:47<05:38,  3.68s/it]


 13%|██████████▊                                                                      | 14/105 [00:51<05:36,  3.70s/it]


 14%|███████████▌                                                                     | 15/105 [00:55<05:31,  3.69s/it]


 15%|████████████▎                                                                    | 16/105 [00:59<05:28,  3.69s/it]


 16%|█████████████                                                                    | 17/105 [01:02<05:24,  3.69s/it]


 17%|█████████████▉                                                                   | 18/105 [01:06<05:22,  3.70s/it]


 18%|██████████████▋                                                                  | 19/105 [01:10<05:19,  3.72s/it]


 19%|███████████████▍                                                                 | 20/105 [01:14<05:18,  3.74s/it]


 20%|████████████████▏  

 76%|█████████████████████████████████████████████████████████████▋                   | 80/105 [05:10<01:42,  4.10s/it]


 77%|██████████████████████████████████████████████████████████████▍                  | 81/105 [05:14<01:38,  4.11s/it]


 78%|███████████████████████████████████████████████████████████████▎                 | 82/105 [05:19<01:38,  4.30s/it]


 79%|████████████████████████████████████████████████████████████████                 | 83/105 [05:23<01:32,  4.23s/it]


 80%|████████████████████████████████████████████████████████████████▊                | 84/105 [05:27<01:27,  4.18s/it]


 81%|█████████████████████████████████████████████████████████████████▌               | 85/105 [05:31<01:23,  4.20s/it]


 82%|██████████████████████████████████████████████████████████████████▎              | 86/105 [05:35<01:19,  4.17s/it]


 83%|███████████████████████████████████████████████████████████████████              | 87/105 [05:40<01:14,  4.14s/it]


 84%|███████████████████

 39%|███████████████████████████████▋                                                 | 41/105 [02:54<04:30,  4.23s/it]


 40%|████████████████████████████████▍                                                | 42/105 [02:58<04:26,  4.24s/it]


 41%|█████████████████████████████████▏                                               | 43/105 [03:02<04:21,  4.21s/it]


 42%|█████████████████████████████████▉                                               | 44/105 [03:06<04:15,  4.19s/it]


 43%|██████████████████████████████████▋                                              | 45/105 [03:11<04:10,  4.18s/it]


 44%|███████████████████████████████████▍                                             | 46/105 [03:15<04:05,  4.17s/it]


 45%|████████████████████████████████████▎                                            | 47/105 [03:19<04:01,  4.17s/it]


 46%|█████████████████████████████████████                                            | 48/105 [03:23<03:57,  4.16s/it]


 47%|███████████████████

In [82]:
ri_norm_map = {}
for key in tqdm(ri_map.keys()):
    ri_norms = []
    for ri in ri_map[key]:
        ri_norms.append(norm(ri))
    ri_norm_map[key] = ri_norms




  0%|                                                                                            | 0/8 [00:00<?, ?it/s]


100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 499.99it/s]

In [90]:
import pandas as pd
neg_variation_df = pd.DataFrame.from_dict(ri_norm_map)
neg_variation_df.head(10)

Unnamed: 0,songname,artistname,albumname,genre,price,copyright,time,released
0,0.098968,0.0,0.0,0.0,0.0,0.0,0.070826,0.108948
1,0.0,0.080765,0.069098,0.0,0.0,0.0,0.049478,0.109079
2,0.0,0.083444,0.063296,0.06784,0.0,0.0,0.0497,0.108924
3,0.062224,0.079471,0.0,0.0,0.0,0.0,0.049192,0.109184
4,0.0,0.078595,0.0,0.0,0.0,0.0,0.109227,0.109137
5,0.089401,0.0,0.046992,0.0,0.0,0.0,0.071478,0.108931
6,0.109263,0.080707,0.081769,0.0,0.0,0.085336,0.072416,0.109128
7,0.08993,0.114489,0.0,0.0,0.0,0.0,0.050892,0.108931
8,0.0,0.0,0.0,0.0,0.0,0.0,0.074066,0.108948
9,0.063684,0.0,0.0,0.0,0.0,0.0,0.049992,0.108948


In [106]:
!mkdir experiment_results
neg_variation_df.to_csv('experiment_results/neg_ri_norms.csv',index=False)

### Campioni positivi

In [104]:
posClassifierInput_single_attribute = []
for att in attributes:
    attribute_idx = attributes.index(att)
    posClassifierInput_single_attribute.append(createInputForClassifier(deeper_train,itunes_amazon_model,tokenizer,
                                                                       embeddings_model,1,attribute_idx))

* Preparazione input......Fatto. 106 tuple totali, esempio label: 1 -> 1, Table1 shape: (106, 20), Table2 shape: (106, 21)
* Preparazione input......Fatto. 106 tuple totali, esempio label: 1 -> 1, Table1 shape: (106, 5), Table2 shape: (106, 4)
* Preparazione input......Fatto. 106 tuple totali, esempio label: 1 -> 1, Table1 shape: (106, 22), Table2 shape: (106, 21)
* Preparazione input......Fatto. 106 tuple totali, esempio label: 1 -> 1, Table1 shape: (106, 11), Table2 shape: (106, 6)
* Preparazione input......Fatto. 106 tuple totali, esempio label: 1 -> 1, Table1 shape: (106, 2), Table2 shape: (106, 2)
* Preparazione input......Fatto. 106 tuple totali, esempio label: 1 -> 1, Table1 shape: (106, 35), Table2 shape: (106, 32)
* Preparazione input......Fatto. 106 tuple totali, esempio label: 1 -> 1, Table1 shape: (106, 1), Table2 shape: (106, 1)
* Preparazione input......Fatto. 106 tuple totali, esempio label: 1 -> 1, Table1 shape: (106, 1), Table2 shape: (106, 3)


In [105]:
from tqdm import tqdm
tf_session = K.get_session()
pos_ri_map = {}
for att in attributes:
    similarity_output = posClassifierInput_single_attribute[attributes.index(att)]
    ris = []
    for i in tqdm(range(len(similarity_output))):
        ris.append(find_smallest_variation_to_change(classifier,input_batch = similarity_output,
                                                     sample_idx=i,classifier_length=300,
                                                     class_to_reach=0,tf_session = tf_session))
    pos_ri_map[att] = ris




  0%|                                                                                          | 0/106 [00:00<?, ?it/s]


  1%|▊                                                                                 | 1/106 [00:04<07:50,  4.48s/it]


  2%|█▌                                                                                | 2/106 [00:04<05:34,  3.21s/it]


  3%|██▎                                                                               | 3/106 [00:09<06:08,  3.58s/it]


  4%|███                                                                               | 4/106 [00:13<06:35,  3.87s/it]


  5%|███▊                                                                              | 5/106 [00:18<06:46,  4.02s/it]


  6%|████▋                                                                             | 6/106 [00:22<06:57,  4.18s/it]


  7%|█████▍                                                                            | 7/106 [00:27<07:02,  4.27s/it]


  8%|██████▏         

 63%|███████████████████████████████████████████████████▏                             | 67/106 [04:37<03:01,  4.66s/it]


 64%|███████████████████████████████████████████████████▉                             | 68/106 [04:42<02:59,  4.72s/it]


 65%|████████████████████████████████████████████████████▋                            | 69/106 [04:46<02:53,  4.70s/it]


 66%|█████████████████████████████████████████████████████▍                           | 70/106 [04:51<02:49,  4.71s/it]


 67%|██████████████████████████████████████████████████████▎                          | 71/106 [04:56<02:46,  4.75s/it]


 68%|███████████████████████████████████████████████████████                          | 72/106 [05:01<02:41,  4.74s/it]


 69%|███████████████████████████████████████████████████████▊                         | 73/106 [05:05<02:37,  4.76s/it]


 70%|████████████████████████████████████████████████████████▌                        | 74/106 [05:10<02:32,  4.77s/it]


 71%|███████████████████

 25%|████████████████████▋                                                            | 27/106 [01:52<05:06,  3.88s/it]


 26%|█████████████████████▍                                                           | 28/106 [01:57<05:29,  4.22s/it]


 27%|██████████████████████▏                                                          | 29/106 [01:57<03:53,  3.04s/it]


 28%|██████████████████████▉                                                          | 30/106 [01:57<02:47,  2.21s/it]


 29%|███████████████████████▋                                                         | 31/106 [01:58<02:02,  1.63s/it]


 30%|████████████████████████▍                                                        | 32/106 [02:02<03:13,  2.61s/it]


 31%|█████████████████████████▏                                                       | 33/106 [02:07<04:00,  3.30s/it]


 32%|█████████████████████████▉                                                       | 34/106 [02:08<02:52,  2.40s/it]


 33%|███████████████████

 89%|███████████████████████████████████████████████████████████████████████▊         | 94/106 [06:29<00:40,  3.34s/it]


 90%|████████████████████████████████████████████████████████████████████████▌        | 95/106 [06:34<00:43,  3.92s/it]


 91%|█████████████████████████████████████████████████████████████████████████▎       | 96/106 [06:40<00:43,  4.39s/it]


 92%|██████████████████████████████████████████████████████████████████████████       | 97/106 [06:45<00:42,  4.68s/it]


 92%|██████████████████████████████████████████████████████████████████████████▉      | 98/106 [06:51<00:38,  4.87s/it]


 93%|███████████████████████████████████████████████████████████████████████████▋     | 99/106 [06:56<00:35,  5.05s/it]


 94%|███████████████████████████████████████████████████████████████████████████▍    | 100/106 [07:01<00:30,  5.11s/it]


 95%|████████████████████████████████████████████████████████████████████████████▏   | 101/106 [07:07<00:25,  5.16s/it]


 96%|███████████████████

 51%|█████████████████████████████████████████▎                                       | 54/106 [04:17<04:25,  5.11s/it]


 52%|██████████████████████████████████████████                                       | 55/106 [04:17<03:07,  3.67s/it]


 53%|██████████████████████████████████████████▊                                      | 56/106 [04:23<03:32,  4.24s/it]


 54%|███████████████████████████████████████████▌                                     | 57/106 [04:28<03:47,  4.64s/it]


 55%|████████████████████████████████████████████▎                                    | 58/106 [04:34<03:59,  5.00s/it]


 56%|█████████████████████████████████████████████                                    | 59/106 [04:40<04:09,  5.31s/it]


 57%|█████████████████████████████████████████████▊                                   | 60/106 [04:41<02:55,  3.82s/it]


 58%|██████████████████████████████████████████████▌                                  | 61/106 [04:46<03:15,  4.35s/it]


 58%|███████████████████

 13%|██████████▋                                                                      | 14/106 [01:17<07:50,  5.12s/it]


 14%|███████████▍                                                                     | 15/106 [01:23<08:07,  5.36s/it]


 15%|████████████▏                                                                    | 16/106 [01:29<08:19,  5.55s/it]


 16%|████████████▉                                                                    | 17/106 [01:35<08:20,  5.62s/it]


 17%|█████████████▊                                                                   | 18/106 [01:41<08:27,  5.77s/it]


 18%|██████████████▌                                                                  | 19/106 [01:47<08:26,  5.82s/it]


 19%|███████████████▎                                                                 | 20/106 [01:53<08:21,  5.83s/it]


 20%|████████████████                                                                 | 21/106 [01:59<08:19,  5.88s/it]


 21%|████████████████▊  

 76%|█████████████████████████████████████████████████████████████▉                   | 81/106 [08:00<02:33,  6.15s/it]


 77%|██████████████████████████████████████████████████████████████▋                  | 82/106 [08:06<02:26,  6.12s/it]


 78%|███████████████████████████████████████████████████████████████▍                 | 83/106 [08:13<02:20,  6.10s/it]


 79%|████████████████████████████████████████████████████████████████▏                | 84/106 [08:19<02:15,  6.15s/it]


 80%|████████████████████████████████████████████████████████████████▉                | 85/106 [08:25<02:08,  6.12s/it]


 81%|█████████████████████████████████████████████████████████████████▋               | 86/106 [08:31<02:01,  6.10s/it]


 82%|██████████████████████████████████████████████████████████████████▍              | 87/106 [08:37<01:55,  6.10s/it]


 83%|███████████████████████████████████████████████████████████████████▏             | 88/106 [08:43<01:49,  6.08s/it]


 84%|███████████████████

 39%|███████████████████████████████▎                                                 | 41/106 [04:17<07:01,  6.49s/it]


 40%|████████████████████████████████                                                 | 42/106 [04:23<06:53,  6.46s/it]


 41%|████████████████████████████████▊                                                | 43/106 [04:29<06:46,  6.45s/it]


 42%|█████████████████████████████████▌                                               | 44/106 [04:36<06:38,  6.42s/it]


 42%|██████████████████████████████████▍                                              | 45/106 [04:42<06:31,  6.41s/it]


 43%|███████████████████████████████████▏                                             | 46/106 [04:49<06:23,  6.40s/it]


 44%|███████████████████████████████████▉                                             | 47/106 [04:55<06:20,  6.45s/it]


 45%|████████████████████████████████████▋                                            | 48/106 [05:02<06:14,  6.45s/it]


 46%|███████████████████

  1%|▊                                                                                 | 1/106 [00:06<11:54,  6.80s/it]


  2%|█▌                                                                                | 2/106 [00:13<11:56,  6.89s/it]


  3%|██▎                                                                               | 3/106 [00:20<11:47,  6.87s/it]


  4%|███                                                                               | 4/106 [00:27<11:38,  6.85s/it]


  5%|███▊                                                                              | 5/106 [00:34<11:38,  6.92s/it]


  6%|████▋                                                                             | 6/106 [00:41<11:31,  6.92s/it]


  7%|█████▍                                                                            | 7/106 [00:41<08:10,  4.95s/it]


  8%|██████▏                                                                           | 8/106 [00:48<09:04,  5.55s/it]


  8%|██████▉            

 64%|███████████████████████████████████████████████████▉                             | 68/106 [06:56<04:25,  6.99s/it]


 65%|████████████████████████████████████████████████████▋                            | 69/106 [07:04<04:29,  7.28s/it]


 66%|█████████████████████████████████████████████████████▍                           | 70/106 [07:11<04:20,  7.24s/it]


 67%|██████████████████████████████████████████████████████▎                          | 71/106 [07:20<04:24,  7.57s/it]


 68%|███████████████████████████████████████████████████████                          | 72/106 [07:27<04:15,  7.52s/it]


 69%|███████████████████████████████████████████████████████▊                         | 73/106 [07:34<04:06,  7.47s/it]


 70%|████████████████████████████████████████████████████████▌                        | 74/106 [07:43<04:06,  7.71s/it]


 71%|█████████████████████████████████████████████████████████▎                       | 75/106 [07:51<04:02,  7.84s/it]


 72%|███████████████████

 26%|█████████████████████▍                                                           | 28/106 [00:12<00:31,  2.46it/s]


 27%|██████████████████████▏                                                          | 29/106 [00:12<00:31,  2.47it/s]


 28%|██████████████████████▉                                                          | 30/106 [00:13<00:30,  2.47it/s]


 29%|███████████████████████▋                                                         | 31/106 [00:13<00:30,  2.48it/s]


 30%|████████████████████████▍                                                        | 32/106 [00:13<00:29,  2.48it/s]


 31%|█████████████████████████▏                                                       | 33/106 [00:14<00:29,  2.48it/s]


 32%|█████████████████████████▉                                                       | 34/106 [00:14<00:29,  2.48it/s]


 33%|██████████████████████████▋                                                      | 35/106 [00:15<00:28,  2.47it/s]


 34%|███████████████████

 90%|████████████████████████████████████████████████████████████████████████▌        | 95/106 [00:39<00:04,  2.40it/s]


 91%|█████████████████████████████████████████████████████████████████████████▎       | 96/106 [00:40<00:04,  2.40it/s]


 92%|██████████████████████████████████████████████████████████████████████████       | 97/106 [00:40<00:03,  2.40it/s]


 92%|██████████████████████████████████████████████████████████████████████████▉      | 98/106 [00:41<00:03,  2.39it/s]


 93%|███████████████████████████████████████████████████████████████████████████▋     | 99/106 [00:41<00:02,  2.40it/s]


 94%|███████████████████████████████████████████████████████████████████████████▍    | 100/106 [00:41<00:02,  2.39it/s]


 95%|████████████████████████████████████████████████████████████████████████████▏   | 101/106 [00:42<00:02,  2.39it/s]


 96%|████████████████████████████████████████████████████████████████████████████▉   | 102/106 [00:42<00:01,  2.39it/s]


 97%|███████████████████

 52%|██████████████████████████████████████████                                       | 55/106 [00:30<00:21,  2.35it/s]


 53%|██████████████████████████████████████████▊                                      | 56/106 [00:30<00:21,  2.36it/s]


 54%|███████████████████████████████████████████▌                                     | 57/106 [00:31<00:20,  2.36it/s]


 55%|████████████████████████████████████████████▎                                    | 58/106 [00:31<00:20,  2.36it/s]


 56%|█████████████████████████████████████████████                                    | 59/106 [00:31<00:19,  2.36it/s]


 57%|█████████████████████████████████████████████▊                                   | 60/106 [00:32<00:19,  2.36it/s]


 58%|██████████████████████████████████████████████▌                                  | 61/106 [00:32<00:19,  2.36it/s]


 58%|███████████████████████████████████████████████▍                                 | 62/106 [00:33<00:18,  2.36it/s]


 59%|███████████████████

In [107]:
pos_ri_norm_map = {}
for key in tqdm(pos_ri_map.keys()):
    ri_norms = []
    for ri in pos_ri_map[key]:
        ri_norms.append(norm(ri))
    pos_ri_norm_map[key] = ri_norms




  0%|                                                                                            | 0/8 [00:00<?, ?it/s]


100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 571.27it/s]

In [108]:
import pandas as pd
pos_variation_df = pd.DataFrame.from_dict(pos_ri_norm_map)
pos_variation_df.head(10)

Unnamed: 0,songname,artistname,albumname,genre,price,copyright,time,released
0,0.22318,0.196187,0.126219,0.059551,0.096792,0.076953,0.0,0.0
1,0.0,0.062629,0.0,0.075595,0.096792,0.180779,0.0,0.0
2,0.40493,0.082776,0.037709,0.100385,0.096792,0.154337,0.0,0.0
3,0.181793,0.073895,0.172309,0.108144,0.096792,0.191968,0.0,0.0
4,0.037986,0.035041,0.085041,0.073286,0.096792,0.074962,0.0,0.0
5,0.696329,0.0,0.325307,0.048261,0.096792,0.051999,0.0,0.0
6,0.772014,0.08062,0.328076,0.082081,0.091597,0.0,0.0,0.0
7,0.477839,0.081413,0.049461,0.077376,0.096792,0.090107,0.0,0.0
8,0.401962,0.089044,0.401962,0.083638,0.096792,0.098535,0.0,0.0
9,0.253327,0.035859,0.251735,0.077987,0.0,0.051933,0.0,0.0


In [109]:
pos_variation_df.to_csv('experiment_results/pos_ri_norms.csv',index=False)

In [112]:
classifier.predict(posClassifierInput_single_attribute[0])

array([[1.02539863e-02, 9.89745975e-01],
       [8.75456810e-01, 1.24543138e-01],
       [3.38449201e-04, 9.99661565e-01],
       [1.48904501e-02, 9.85109568e-01],
       [3.81542355e-01, 6.18457675e-01],
       [8.12567805e-07, 9.99999166e-01],
       [1.71933237e-07, 9.99999881e-01],
       [1.97613954e-05, 9.99980211e-01],
       [3.19896964e-04, 9.99680161e-01],
       [4.76012239e-03, 9.95239854e-01],
       [5.08235693e-01, 4.91764337e-01],
       [1.46735227e-02, 9.85326409e-01],
       [1.74077197e-08, 1.00000000e+00],
       [6.22266065e-03, 9.93777394e-01],
       [2.53570318e-01, 7.46429682e-01],
       [1.25449442e-04, 9.99874592e-01],
       [2.98540443e-02, 9.70146000e-01],
       [1.24544121e-01, 8.75455916e-01],
       [8.64010975e-02, 9.13598955e-01],
       [3.36444759e-06, 9.99996662e-01],
       [2.85195708e-01, 7.14804232e-01],
       [3.22508179e-02, 9.67749238e-01],
       [2.40736660e-02, 9.75926340e-01],
       [1.13988679e-03, 9.98860121e-01],
       [3.744116

In [115]:
train_positives = list(filter(lambda t:t[2]==1,deeper_train))
dataset_on_time = (list(map(lambda t:extractAttribute(t,6),train_positives)))
table1, table2, labels = data2Inputs(dataset_on_time, tokenizer, categorical=False)
embeddings = embeddings_model.predict([table1,table2])
walmart_amazon_model.predict(embeddings)

* Preparazione input......Fatto. 106 tuple totali, esempio label: 1 -> 1, Table1 shape: (106, 1), Table2 shape: (106, 1)


array([[0.84663683, 0.1533632 ],
       [0.6751965 , 0.32480347],
       [0.28318837, 0.7168116 ],
       [0.48801813, 0.5119819 ],
       [0.7867873 , 0.21321276],
       [0.8473045 , 0.15269549],
       [0.4141535 , 0.58584654],
       [0.8039826 , 0.1960174 ],
       [0.619256  , 0.38074398],
       [0.7905999 , 0.20940006],
       [0.75553626, 0.24446374],
       [0.41592982, 0.58407015],
       [0.6963813 , 0.30361873],
       [0.7813253 , 0.21867475],
       [0.5366174 , 0.46338263],
       [0.7310176 , 0.26898247],
       [0.70067745, 0.29932252],
       [0.81975764, 0.18024236],
       [0.5223175 , 0.47768247],
       [0.77942586, 0.22057414],
       [0.6767654 , 0.32323462],
       [0.3569392 , 0.64306074],
       [0.45026335, 0.5497367 ],
       [0.5700004 , 0.42999956],
       [0.77114254, 0.22885744],
       [0.63762695, 0.36237305],
       [0.770066  , 0.22993398],
       [0.63864565, 0.36135435],
       [0.8473045 , 0.15269549],
       [0.8046234 , 0.19537659],
       [0.

In [117]:
dataset_on_time[1]

(['3:24'], ['3:24'], 1)