In [1]:
import deepmatcher as dm
import pandas as pd
import numpy as np
from scipy.spatial import distance
from utils import Hook,return_layer_input_output

In [2]:
import torch
torch.cuda.is_available()

True

In [3]:
data_dir = 'sample_data/itunes-amazon'
datasets = dm.data.process(data_dir, train='train.csv', validation='validation.csv',
                           test='price-test.csv')
train = datasets[0]
validation = datasets[1]
test = datasets[2]

In [4]:
train_table = train.get_raw_table()
train_table.head()

Unnamed: 0,id,label,left_Song_Name,left_Artist_Name,left_Album_Name,left_Genre,left_Price,left_CopyRight,left_Time,left_Released,right_Song_Name,right_Artist_Name,right_Album_Name,right_Genre,right_Price,right_CopyRight,right_Time,right_Released
0,448,0,baby when the light ( david guetta & fred rist...,david guetta,pop life ( extended version ) [ bonus version ],"dance , music , rock , pop , house , electroni...",$ 1.29,‰ ãñ 2007 gum records,6:17,18-sep-07,revolver ( madonna vs. david guetta feat . lil...,david guetta,one love ( deluxe version ),dance & electronic,$ 1.29,( c ) 2014 swedish house mafia holdings ltd ( ...,3:18,"august 21 , 2009"
1,287,1,outversion,mark ronson,version,"pop , music , r & b / soul , soul , dance , ro...",$ 0.99,2007 mark ronson under exclusive license to so...,1:50,10-jul-07,outversion,mark ronson,version [ explicit ],pop,$ 0.99,( c ) 2011 j'adore records,1:50,"july 10 , 2007"
2,534,0,peer pressure ( feat . traci nelson ),snoop dogg,doggumentary,"hip-hop/rap , music , rock , gangsta rap , wes...",$ 1.29,"‰ ãñ 2011 capitol records , llc . all rights r...",4:07,29-mar-11,boom ( ( feat . t-pain ) [ edited ] ),snoop dogg,doggumentary [ edited ],"rap & hip-hop , west coast",$ 1.29,"( c ) 2011 capitol records , llc",3:50,"march 29 , 2011"
3,181,1,stars come out ( tim mason remix ),zedd,stars come out ( remixes ) - ep,"dance , music , electronic , house",$ 1.29,2012 dim mak inc .,5:49,20-may-14,stars come out ( dillon francis remix ),zedd,stars come out [ dillon francis remix ],dance & electronic,$ 1.29,2012 dim mak inc .,4:08,"may 20 , 2014"
4,485,0,jump ( feat . nelly furtado ),flo rida,r.o.o.t.s . ( deluxe version ),"hip-hop/rap , music",$ 1.29,‰ ãñ 2009 atlantic recording corporation for t...,3:28,30-mar-09,"yayo [ feat . brisco , billy blue , ball greez...",flo rida,r.o.o.t.s . ( route of overcoming the struggle...,rap & hip-hop,$ 1.29,"( c ) 2012 motown records , a division of umg ...",7:53,"march 30 , 2009"


## Define neural network models

In [3]:
hybrid_model = dm.MatchingModel(attr_summarizer='hybrid')

In [4]:
#run this cell if you want to load pre-trained model
hybrid_model.load_state('models/hybrid_model.pth')
hybrid_model.cuda()

MatchingModel(
  (attr_summarizers): ModuleMap(
    (Song_Name): Hybrid(
      (word_contextualizer): RNN(
        (rnn_groups): ModuleList(
          (0): GRU(300, 150, batch_first=True, bidirectional=True)
        )
        (dropouts): ModuleList(
          (0): Dropout(p=0)
        )
        (bypass_networks): ModuleList(
          (0): None
        )
        (input_dropout): NoMeta(
          (module): Dropout(p=0)
        )
      )
      (word_comparator): Attention(
        (alignment_networks): ModuleList(
          (0): AlignmentNetwork(
            (transform): Transform(
              (transforms): ModuleList(
                (0): Linear(in_features=300, out_features=300, bias=True)
                (1): Linear(in_features=300, out_features=300, bias=True)
              )
              (bypass_networks): ModuleList(
                (0): Bypass(
                  (highway_gate): Linear(in_features=300, out_features=300, bias=True)
                )
                (1): Bypass(


## Train models

In [None]:
hybrid_model.run_train(
    train,
    validation,
    epochs=10,
    batch_size=16,
    best_save_path='models/hybrid_model.pth',
    pos_neg_ratio=4)

In [None]:
true_predictions = hybrid_model.run_prediction(test)

In [None]:
altered_pred = hybrid_model.run_prediction(test)

In [None]:
test_df = pd.read_csv('sample_data/itunes-amazon/test.csv')
test_df.loc[test_df['label'] == 0, 'left_Time'] = test_df['right_Time']
test_df_negatives = test_df.loc[test_df['label']==0]

In [None]:
test_df_negatives.to_csv('sample_data/itunes-amazon/altered_negatives.csv',index=False)

In [None]:
altered_negatives = dm.data.process_unlabeled('sample_data/itunes-amazon/altered_negatives.csv',
                                           hybrid_model,ignore_columns=['label'])

In [None]:
hybrid_model.run_prediction(altered_negatives,output_attributes=True)

# Analyze intermediate layers
In this step we want to evaluate the output of intermediate layers. For this purpose we use some utility functions from utility module

## Experiment 1
we want to evaluate the differences between the output of the summarizers of positive samples and the output of the summarizers of altered positive samples

In [6]:
attributes = ['Song_Name','Artist_Name','Album_Name','Genre','Price','CopyRight','Time','Released']

In [None]:
comparators_datasets = dm.data.process(path='sample_data/itunes-amazon/',train='test.csv',
                            validation='test_positives.csv',test='altered_positive_samples.csv',
                                       cache='summarizer_cache.pth')

In [None]:
from deepmatcher.data import MatchingIterator
batch_size = 32
splits = MatchingIterator.splits(comparators_datasets,batch_size=batch_size)

In [None]:
positive_batches = []
for batch in splits[1]:
    positive_batches.append(batch)

In [None]:
altered_positive_batches = []
for batch in splits[2]:
    altered_positive_batches.append(batch)

In [None]:
positive_batches[0].id, altered_positive_batches[0].id

In [None]:
summarizers = []
#comparators useful only for debugging
comparators = []
for attr in attributes:
    summarizers.append(hybrid_model.attr_summarizers[attr])
    comparators.append(hybrid_model.attr_comparators[attr])

In [None]:
hookF_summarizer = []
for summ in summarizers:
    hookF_summarizer.append(Hook(summ))
hookF_comparator = []
for comp in comparators:
    hookF_comparator.append(Hook(comp))

In [None]:
classifier = hybrid_model.classifier
hookF_classifier = []
hookF_classifier.append(Hook(classifier))

In [None]:
positives_batch_layer_inputs,positives_batch_layer_outputs = return_layer_input_output(hookF_summarizer,
                                                                                     positive_batches[0],hybrid_model)

In [None]:
altered_batch_layer_inputs, altered_batch_layer_outputs = return_layer_input_output(hookF_summarizer,
                                                                                    altered_positive_batches[0],hybrid_model)

In [None]:
positives_summarizers_left_output = list(map(lambda x: x[0].data,positives_batch_layer_outputs))
positives_summarizers_right_output = list(map(lambda x: x[1].data,positives_batch_layer_outputs))

In [None]:
altered_summarizers_left_output = list(map(lambda x: x[0].data,altered_batch_layer_outputs))
altered_summarizers_right_output = list(map(lambda x:x[1].data,altered_batch_layer_outputs))

In [None]:
def calculate_distance_matrix(summarizers_left_output,summarizers_right_output):
    distance_mat = []
    for i in range(len(summarizers_left_output)):
        distances = []
        for j in range(31):
            l_out = summarizers_left_output[i][j].data
            r_out = summarizers_right_output[i][j].data
            dist = distance.euclidean(l_out,r_out)
            distances.append(dist)
        distance_mat.append(distances)
    distance_mat = np.matrix(distance_mat)
    return distance_mat

In [None]:
distance_mat_positives = calculate_distance_matrix(positives_summarizers_left_output,positives_summarizers_right_output)
distance_mat_positives = distance_mat_positives.reshape((31,8))

In [None]:
distances_positives_df = pd.DataFrame(data = distance_mat_positives,columns=attributes)

In [None]:
distances_positives_df.head(16)

In [None]:
distances_positives_df.to_csv('distances_positives_batch1.csv',index=False)

In [None]:
distance_mat_altered = calculate_distance_matrix(altered_summarizers_left_output,altered_summarizers_right_output)
distance_mat_altered = distance_mat_altered.reshape((31,8))

In [None]:
distances_altered_df = pd.DataFrame(data=distance_mat_altered,columns = attributes)

In [None]:
distances_altered_df.head()

In [None]:
distances_altered_df.to_csv('distances_altered_batch1.csv',index=False)

In [None]:
differences_df = distances_positives_df.subtract(distances_altered_df)

In [None]:
differences_df.head(31)

## Experiment 2
We want to evaluate the distance between positive and negative example respect to the classifier input

In [7]:
from distance_measures import calculate_closer_vector

In [8]:
classifier_datasets = dm.data.process(path='sample_data/itunes-amazon/',train='negative_samples.csv',
                            validation='positives_samples.csv',test='all_samples.csv',cache='pcache.pth')

In [9]:
from deepmatcher.data import MatchingIterator
batch_size = 32
splits = MatchingIterator.splits(classifier_datasets,batch_size=batch_size)

In [10]:
negative_batches = []
for batch in splits[0]:
    negative_batches.append(batch)

In [10]:
positive_batches = [] 
for batch in splits[1]:
    positive_batches.append(batch)

In [11]:
classifier = hybrid_model.classifier

In [12]:
hookF_classifier = []
hookF_classifier.append(Hook(classifier))

In [13]:
positive_classifier_inputs = []
positive_classifier_outputs = []
for batch in positive_batches:
    classifier_input,classifier_output = return_layer_input_output(hookF_classifier,batch,hybrid_model)
    positive_classifier_inputs.append(classifier_input)
    positive_classifier_outputs.append(classifier_output)

In [13]:
negative_classifier_inputs = []
negative_classifier_outputs = []
for batch in negative_batches:
    classifier_input,classifier_output = return_layer_input_output(hookF_classifier,batch,hybrid_model)
    negative_classifier_inputs.append(classifier_input)
    negative_classifier_outputs.append(classifier_output)

In [15]:
positive_classifier_inputs = list(map(lambda x: x[0][0],positive_classifier_inputs))
positive_classifier_outputs = list(map(lambda x: x[0][0],positive_classifier_outputs))

In [14]:
negative_classifier_inputs = list(map(lambda x: x[0][0],negative_classifier_inputs))
negative_classifier_outputs = list(map(lambda x: x[0][0],negative_classifier_outputs))

In [17]:
calculate_closer_vector(positive_classifier_inputs,negative_classifier_inputs)

proccessing vector


TypeError: 'function' object is not iterable

## Experiment 3
Find attribute more sensible to variation inspecting classifier input and its gradient

In [18]:
from distance_measures import find_smallest_variation_to_change

In [None]:
attribute_lenght= len(attributes)
variation_list = []
current_sample = 0
negative_classifier_inputs_subsample = negative_classifier_inputs[0:1]
for batch in negative_classifier_inputs_subsample:
    for index in range(len(batch)):
        variation_norms = []
        for j,attribute in enumerate(attributes):
            print('Processing sample {} with attribute {}'.format(current_sample,attribute))
            it,variation = find_smallest_variation_to_change(hybrid_model.classifier,batch,index,j,1)
            variation_norms.append(torch.norm(variation))
        variation_list.append(variation_norms)
        current_sample+=1

Processing sample 0 with attribute Song_Name
Processing sample 0 with attribute Artist_Name
Processing sample 0 with attribute Album_Name
Processing sample 0 with attribute Genre
Processing sample 0 with attribute Price
Processing sample 0 with attribute CopyRight
Processing sample 0 with attribute Time
Processing sample 0 with attribute Released
Processing sample 1 with attribute Song_Name
Processing sample 1 with attribute Artist_Name
Processing sample 1 with attribute Album_Name
Processing sample 1 with attribute Genre
Processing sample 1 with attribute Price
Processing sample 1 with attribute CopyRight


In [29]:
attribute_lenght= len(attributes)
variation_list = []
current_sample = 0
for batch in negative_classifier_inputs:
    for index in range(len(batch)):
        variation_norms = []
        for j,attribute in enumerate(attributes):
            print('Processing sample {} with attribute {}'.format(current_sample,attribute))
            it,variation = find_smallest_variation_to_change(hybrid_model.classifier,batch,index,j,1)
            variation_norms.append(torch.norm(variation))
        variation_list.append(variation_norms)
        current_sample+=1

Processing sample 0 with attribute Song_Name
Processing sample 0 with attribute Artist_Name
Processing sample 0 with attribute Album_Name
Processing sample 0 with attribute Genre
Processing sample 0 with attribute Price
Processing sample 0 with attribute CopyRight
Processing sample 0 with attribute Time
Processing sample 0 with attribute Released
Processing sample 1 with attribute Song_Name
Processing sample 1 with attribute Artist_Name
Processing sample 1 with attribute Album_Name
Processing sample 1 with attribute Genre
Processing sample 1 with attribute Price
Processing sample 1 with attribute CopyRight
Processing sample 1 with attribute Time
Processing sample 1 with attribute Released
Processing sample 2 with attribute Song_Name
Processing sample 2 with attribute Artist_Name
Processing sample 2 with attribute Album_Name
Processing sample 2 with attribute Genre
Processing sample 2 with attribute Price
Processing sample 2 with attribute CopyRight
Processing sample 2 with attribute Ti

KeyboardInterrupt: 

In [19]:
from distance_measures import get_probabilites

In [21]:
out = hybrid_model.classifier.forward(negative_classifier_inputs[0])

In [39]:
it,variation =find_smallest_variation_to_change(hybrid_model.classifier,negative_classifier_inputs[1],31,0,1)

In [40]:
it

2

In [None]:
variations_list = []
for variationl in variation_list:
    variations_list.append(list(map(lambda x:x.data[0],variationl)))

# Attribute variation analysis

In [16]:
classifier_gradients = []

In [31]:
def get_gradient(grad):
    classifier_gradients.append(grad)
    grad.data.zero_()
    return 

In [32]:
g = negative_classifier_inputs[0].register_hook(get_gradient)

In [33]:
def crossentropy_gradient(softmax_output,true_label):
    return (softmax_output-true_label).data

In [45]:
out = hybrid_model.classifier.forward(negative_classifier_inputs[0])

In [46]:
out

Variable containing:
-0.1730 -1.8395
-0.1688 -1.8624
-0.1395 -2.0387
-0.1383 -2.0470
-0.1736 -1.8365
-0.1287 -2.1142
-0.1421 -2.0216
-0.1549 -1.9413
-0.1677 -1.8685
-0.1239 -2.1497
-0.1505 -1.9684
-0.1603 -1.9096
-0.1540 -1.9470
-0.1395 -2.0387
-0.1615 -1.9031
-0.1863 -1.7723
-0.3900 -1.1302
-0.1517 -1.9609
-0.1508 -1.9665
-0.1734 -1.8374
-0.1538 -1.9481
-0.5170 -0.9071
-0.1599 -1.9122
-0.2014 -1.7014
-0.3147 -1.3094
-0.1456 -1.9991
-4.0174 -0.0182
-0.1551 -1.9403
-0.1454 -2.0000
-0.1373 -2.0531
-0.1543 -1.9451
-0.1730 -1.8395
[torch.FloatTensor of size 32x2]

In [36]:
from torch.nn.functional import softmax
from torch.autograd import Variable

In [50]:
true_labels = Variable(torch.FloatTensor([1,0]))

In [51]:
probabilites = softmax(out[0],dim=0)
gradients = crossentropy_gradient(probabilites,true_labels)

In [47]:
out[0].backward(gradients,retain_graph=True)

In [49]:
classifier_gradients[2]

Variable containing:
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
[torch.FloatTensor of size 32x1200]

In [71]:
new_input = negative_classifier_inputs[0].clone()
new_input[0].data= new_input[0].data.copy_(negative_classifier_inputs[0][0].data+classifier_gradients[0][0].data)

In [72]:
out2 = hybrid_model.classifier.forward(new_input)

In [73]:
probabilites = softmax(out2[0],dim=0)
gradients = crossentropy_gradient(probabilites,true_labels)

In [74]:
out2[0].backward(gradients,retain_graph=True)

In [79]:
classifier_gradients[1][0][0:20]

Variable containing:
1.00000e-03 *
 -0.8652
 -1.1622
  0.6350
 -2.6656
 -0.8711
  0.8956
 -3.5022
 -0.5527
  0.6460
  0.1270
 -0.2658
  3.7434
 -5.0618
 -2.3652
 -1.3532
  1.5847
  2.7413
 -0.3356
 -0.0844
  0.0341
[torch.FloatTensor of size 20]

In [77]:
new_input2 = new_input.clone()
new_input2[0].data= new_input[0].data.copy_(new_input[0].data+classifier_gradients[1][0].data)

In [80]:
out3 = hybrid_model.classifier.forward(new_input2)

In [85]:
probabilites = softmax(out3[0],dim=0)
gradients = crossentropy_gradient(probabilites,true_labels)

In [93]:
gradients


-0.1424
 0.1424
[torch.FloatTensor of size 2]

In [87]:
out3[0].backward(gradients,retain_graph=True)

In [92]:
new_input[0][10:20]

Variable containing:
-0.0628
 0.7316
 0.7216
-0.0432
 0.6927
-0.0598
 0.3550
-0.1800
 0.3194
-0.0465
[torch.FloatTensor of size 10]

In [91]:
new_input2[0][10:20]

Variable containing:
-0.0625
 0.7279
 0.7266
-0.0409
 0.6941
-0.0613
 0.3523
-0.1797
 0.3194
-0.0465
[torch.FloatTensor of size 10]

In [30]:
g.remove()