In [1]:
import deepmatcher as dm
import pandas as pd
import numpy as np
from scipy.spatial import distance
from utils import Hook,return_layer_input_output

In [2]:
import torch
torch.cuda.is_available()

True

In [3]:
data_dir = 'sample_data/itunes-amazon'
datasets = dm.data.process(data_dir, train='train.csv', validation='validation.csv',
                           test='price-test.csv')
train = datasets[0]
validation = datasets[1]
test = datasets[2]

In [None]:
train_table = train.get_raw_table()
train_table.head()

## Define neural network models

In [4]:
hybrid_model = dm.MatchingModel(attr_summarizer='hybrid')

In [5]:
#run this cell if you want to load pre-trained model
hybrid_model.load_state('models/hybrid_model.pth')
hybrid_model.cuda()

MatchingModel(
  (attr_summarizers): ModuleMap(
    (Song_Name): Hybrid(
      (word_contextualizer): RNN(
        (rnn_groups): ModuleList(
          (0): GRU(300, 150, batch_first=True, bidirectional=True)
        )
        (dropouts): ModuleList(
          (0): Dropout(p=0)
        )
        (bypass_networks): ModuleList(
          (0): None
        )
        (input_dropout): NoMeta(
          (module): Dropout(p=0)
        )
      )
      (word_comparator): Attention(
        (alignment_networks): ModuleList(
          (0): AlignmentNetwork(
            (transform): Transform(
              (transforms): ModuleList(
                (0): Linear(in_features=300, out_features=300, bias=True)
                (1): Linear(in_features=300, out_features=300, bias=True)
              )
              (bypass_networks): ModuleList(
                (0): Bypass(
                  (highway_gate): Linear(in_features=300, out_features=300, bias=True)
                )
                (1): Bypass(


## Train models

In [None]:
hybrid_model.run_train(
    train,
    validation,
    epochs=10,
    batch_size=16,
    best_save_path='models/hybrid_model.pth',
    pos_neg_ratio=4)

In [None]:
true_predictions = hybrid_model.run_prediction(test)

In [None]:
altered_pred = hybrid_model.run_prediction(test)

In [None]:
test_df = pd.read_csv('sample_data/itunes-amazon/test.csv')
test_df.loc[test_df['label'] == 0, 'left_Time'] = test_df['right_Time']
test_df_negatives = test_df.loc[test_df['label']==0]

In [None]:
test_df_negatives.to_csv('sample_data/itunes-amazon/altered_negatives.csv',index=False)

In [None]:
altered_negatives = dm.data.process_unlabeled('sample_data/itunes-amazon/altered_negatives.csv',
                                           hybrid_model,ignore_columns=['label'])

In [None]:
hybrid_model.run_prediction(altered_negatives,output_attributes=True)

# Analyze intermediate layers
In this step we want to evaluate the output of intermediate layers. For this purpose we use some utility functions from utility module

## Experiment 1
we want to evaluate the differences between the output of the summarizers of positive samples and the output of the summarizers of altered positive samples

In [24]:
attributes = ['Song_Name','Artist_Name','Album_Name','Genre','Price','CopyRight','Time','Released']

In [None]:
comparators_datasets = dm.data.process(path='sample_data/itunes-amazon/',train='test.csv',
                            validation='test_positives.csv',test='altered_positive_samples.csv',
                                       cache='summarizer_cache.pth')

In [None]:
from deepmatcher.data import MatchingIterator
batch_size = 32
splits = MatchingIterator.splits(comparators_datasets,batch_size=batch_size)

In [None]:
positive_batches = []
for batch in splits[1]:
    positive_batches.append(batch)

In [None]:
altered_positive_batches = []
for batch in splits[2]:
    altered_positive_batches.append(batch)

In [None]:
positive_batches[0].id, altered_positive_batches[0].id

In [None]:
summarizers = []
#comparators useful only for debugging
comparators = []
for attr in attributes:
    summarizers.append(hybrid_model.attr_summarizers[attr])
    comparators.append(hybrid_model.attr_comparators[attr])

In [None]:
hookF_summarizer = []
for summ in summarizers:
    hookF_summarizer.append(Hook(summ))
hookF_comparator = []
for comp in comparators:
    hookF_comparator.append(Hook(comp))

In [None]:
classifier = hybrid_model.classifier
hookF_classifier = []
hookF_classifier.append(Hook(classifier))

In [None]:
positives_batch_layer_inputs,positives_batch_layer_outputs = return_layer_input_output(hookF_summarizer,
                                                                                     positive_batches[0],hybrid_model)

In [None]:
altered_batch_layer_inputs, altered_batch_layer_outputs = return_layer_input_output(hookF_summarizer,
                                                                                    altered_positive_batches[0],hybrid_model)

In [None]:
positives_summarizers_left_output = list(map(lambda x: x[0].data,positives_batch_layer_outputs))
positives_summarizers_right_output = list(map(lambda x: x[1].data,positives_batch_layer_outputs))

In [None]:
altered_summarizers_left_output = list(map(lambda x: x[0].data,altered_batch_layer_outputs))
altered_summarizers_right_output = list(map(lambda x:x[1].data,altered_batch_layer_outputs))

In [None]:
def calculate_distance_matrix(summarizers_left_output,summarizers_right_output):
    distance_mat = []
    for i in range(len(summarizers_left_output)):
        distances = []
        for j in range(31):
            l_out = summarizers_left_output[i][j].data
            r_out = summarizers_right_output[i][j].data
            dist = distance.euclidean(l_out,r_out)
            distances.append(dist)
        distance_mat.append(distances)
    distance_mat = np.matrix(distance_mat)
    return distance_mat

In [None]:
distance_mat_positives = calculate_distance_matrix(positives_summarizers_left_output,positives_summarizers_right_output)
distance_mat_positives = distance_mat_positives.reshape((31,8))

In [None]:
distances_positives_df = pd.DataFrame(data = distance_mat_positives,columns=attributes)

In [None]:
distances_positives_df.head(16)

In [None]:
distances_positives_df.to_csv('distances_positives_batch1.csv',index=False)

In [None]:
distance_mat_altered = calculate_distance_matrix(altered_summarizers_left_output,altered_summarizers_right_output)
distance_mat_altered = distance_mat_altered.reshape((31,8))

In [None]:
distances_altered_df = pd.DataFrame(data=distance_mat_altered,columns = attributes)

In [None]:
distances_altered_df.head()

In [None]:
distances_altered_df.to_csv('distances_altered_batch1.csv',index=False)

In [None]:
differences_df = distances_positives_df.subtract(distances_altered_df)

In [None]:
differences_df.head(31)

## Experiment 2
We want to evaluate the distance between positive and negative example respect to the classifier input

In [6]:
from distance_measures import calculate_closer_vector

In [7]:
classifier_datasets = dm.data.process(path='sample_data/itunes-amazon/',train='negative_samples.csv',
                            validation='positives_samples.csv',test='all_samples.csv',cache='pcache.pth')

In [8]:
from deepmatcher.data import MatchingIterator
batch_size = 32
splits = MatchingIterator.splits(classifier_datasets,batch_size=batch_size)

In [9]:
negative_batches = []
for batch in splits[0]:
    negative_batches.append(batch)

In [10]:
positive_batches = [] 
for batch in splits[1]:
    positive_batches.append(batch)

In [11]:
classifier = hybrid_model.classifier

In [12]:
hookF_classifier = []
hookF_classifier.append(Hook(classifier))

In [13]:
positive_classifier_inputs = []
positive_classifier_outputs = []
for batch in positive_batches:
    classifier_input,classifier_output = return_layer_input_output(hookF_classifier,batch,hybrid_model)
    positive_classifier_inputs.append(classifier_input)
    positive_classifier_outputs.append(classifier_output)

In [14]:
negative_classifier_inputs = []
negative_classifier_outputs = []
for batch in negative_batches:
    classifier_input,classifier_output = return_layer_input_output(hookF_classifier,batch,hybrid_model)
    negative_classifier_inputs.append(classifier_input)
    negative_classifier_outputs.append(classifier_output)

In [15]:
positive_classifier_inputs = list(map(lambda x: x[0][0],positive_classifier_inputs))
positive_classifier_outputs = list(map(lambda x: x[0][0],positive_classifier_outputs))

In [16]:
negative_classifier_inputs = list(map(lambda x: x[0][0],negative_classifier_inputs))
negative_classifier_outputs = list(map(lambda x: x[0][0],negative_classifier_outputs))

In [17]:
calculate_closer_vector(positive_classifier_inputs,negative_classifier_inputs)

proccessing vector


KeyboardInterrupt: 

## Experiment 3
Find attribute more sensible to variation inspecting classifier input and its gradient

In [18]:
from distance_measures import find_smallest_variation_to_change

In [27]:
attribute_lenght= len(attributes)
variation_list = []
current_sample = 0
for batch in negative_classifier_inputs:
    for index in range(len(batch)):
        variation_norms = []
        for j,attribute in enumerate(attributes):
            print('Processing sample {} with attribute {}'.format(current_sample,attribute))
            it,variation = find_smallest_variation_to_change(hybrid_model.classifier,batch,index,j,1)
            variation_norms.append(torch.norm(variation))
        variation_list.append(variation_norms)
        current_sample+=1

Processing sample 0 with attribute Song_Name
Processing sample 0 with attribute Artist_Name
Processing sample 0 with attribute Album_Name
Processing sample 0 with attribute Genre
Processing sample 0 with attribute Price
Processing sample 0 with attribute CopyRight
Processing sample 0 with attribute Time
Processing sample 0 with attribute Released
Processing sample 1 with attribute Song_Name
Processing sample 1 with attribute Artist_Name
Processing sample 1 with attribute Album_Name
Processing sample 1 with attribute Genre
Processing sample 1 with attribute Price
Processing sample 1 with attribute CopyRight
Processing sample 1 with attribute Time
Processing sample 1 with attribute Released
Processing sample 2 with attribute Song_Name
Processing sample 2 with attribute Artist_Name
Processing sample 2 with attribute Album_Name
Processing sample 2 with attribute Genre
Processing sample 2 with attribute Price
Processing sample 2 with attribute CopyRight
Processing sample 2 with attribute Ti

KeyboardInterrupt: 

In [26]:
attribute_lenght= len(attributes)
variation_list = []
current_sample = 0
for batch in positive_classifier_inputs:
    for index in range(len(batch)):
        variation_norms = []
        for j,attribute in enumerate(attributes):
            print('Processing sample {} with attribute {}'.format(current_sample,attribute))
            it,variation = find_smallest_variation_to_change(hybrid_model.classifier,batch,index,grads,j)
            variation_norms.append(torch.norm(variation))
        variation_list.append(variation_norms)
        current_sample+=1

Processing sample 0 with attribute Song_Name


NameError: name 'grads' is not defined

In [None]:
variations_list = []
for variationl in variation_list:
    variations_list.append(list(map(lambda x:x.data[0],variationl)))

# Attribute variation analysis