In [7]:
import sys
sys.path.append("../..")

In [8]:
import pandas as pd
import deepmatcher as dm
from utilities_functions.intermediate_layer_extraction import return_layer_input
from utilities_functions.ri_calculator import find_smallest_variation_to_change
import torch

In [9]:
newTraingPositives = pd.read_csv('newPositiveSamples.csv')
defaultTrain = pd.read_csv('../../Structured/Beer/merged_train.csv')
defaultTrain.shape

(268, 10)

In [10]:
extendedTrain = pd.concat([defaultTrain,newTraingPositives])
extendedTrain.shape

(279, 10)

In [11]:
extendedTrain.to_csv('../../Structured/Beer/extended_train.csv',index=False)

In [12]:
extendedTrain[extendedTrain['label']==0].shape

(228, 10)

## Train models

In [13]:
train,valid,test = dm.data.process('../../Structured/Beer',train='extended_train.csv',validation='merged_valid.csv',
                                  test='merged_test.csv',left_prefix='ltable_',right_prefix='rtable_')



In [47]:
hybrid_model = dm.MatchingModel(attr_summarizer='hybrid')
#hybrid_model.run_train(train,valid,best_save_path='../../models/beer_balanced_hybrid.pth',epochs=30,pos_neg_ratio=4,batch_size=16)
hybrid_model.load_state('../../models/beer_balanced_hybrid.pth')
hybrid_model.cuda()

MatchingModel(
  (attr_summarizers): ModuleMap(
    (Beer_Name): Hybrid(
      (word_contextualizer): RNN(
        (rnn_groups): ModuleList(
          (0): GRU(300, 150, batch_first=True, bidirectional=True)
        )
        (dropouts): ModuleList(
          (0): Dropout(p=0)
        )
        (bypass_networks): ModuleList(
          (0): None
        )
        (input_dropout): NoMeta(
          (module): Dropout(p=0)
        )
      )
      (word_comparator): Attention(
        (alignment_networks): ModuleList(
          (0): AlignmentNetwork(
            (transform): Transform(
              (transforms): ModuleList(
                (0): Linear(in_features=300, out_features=300, bias=True)
                (1): Linear(in_features=300, out_features=300, bias=True)
              )
              (bypass_networks): ModuleList(
                (0): Bypass(
                  (highway_gate): Linear(in_features=300, out_features=300, bias=True)
                )
                (1): Bypass(


In [27]:
attention_model = dm.MatchingModel(attr_summarizer='attention')
#attention_model.run_train(train,valid,best_save_path='../../models/beer_balanced_attention.pth',epochs=30,pos_neg_ratio=4,batch_size=16)
attention_model.load_state('../../models/beer_balanced_attention.pth')

In [None]:
hybrid_model.run_eval(test)

In [48]:
positives = pd.read_csv('../../Structured/Beer/positives.csv')
positives['rtable_Beer_Name'] = positives['rtable_Beer_Name']+" imperial red ale"
positives['rtable_Beer_Name'] = positives['rtable_Beer_Name']+" imperial red ale"
positives['ltable_Beer_Name'] = positives['ltable_Beer_Name']+" imperial red ale"
positives.to_csv('temp/altered_positives.csv',index=False)
positives.shape

(68, 10)

In [49]:
unlabeled = dm.data.process_unlabeled('temp/altered_positives.csv',hybrid_model,ignore_columns=['id','label'])

In [50]:
pred = hybrid_model.run_prediction(unlabeled)
pred[pred['match_score']>0.5].shape

===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.2 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00



(26, 1)

## Compute Ri

In [None]:
testneg_classifier_inputs,testneg_ids = return_layer_input('../../Structured/Beer/'
                                                                         ,'test_negatives',32,hybrid_model,
                                                                        hybrid_model.classifier)

In [None]:
neg_classifier_inputs,neg_ids = return_layer_input('../../Structured/Beer/'
                                                                         ,'negatives',32,hybrid_model,
                                                                        hybrid_model.classifier)

In [None]:
testpos_classifier_inputs,testpos_ids = return_layer_input('../../Structured/Beer',
                                                                       'test_positives',32,hybrid_model,
                                                                       hybrid_model.classifier)

In [None]:
pos_classifier_inputs,pos_ids = return_layer_input('../../Structured/Beer',
                                                                       'positives',32,hybrid_model,
                                                                       hybrid_model.classifier)

In [None]:
test_negative_classifier_inputs = list(map(lambda x: x[0],testneg_classifier_inputs))
test_positive_classifier_inputs = list(map(lambda x: x[0],testpos_classifier_inputs))
negative_classifier_inputs = list(map(lambda x: x[0],neg_classifier_inputs))
positive_classifier_inputs = list(map(lambda x: x[0],pos_classifier_inputs))

In [None]:
attributes =['Beer_Name','Brew_Factory_Name','Style','ABV']

In [None]:
attribute_length = int((300*3)/len(attributes))
classifier_length = int(attribute_length*len(attributes))

In [None]:
current_sample = 0
#each column of this matrix is related to a specific attribute
negatives_ri_matrix = []
for batch in test_negative_classifier_inputs:
    for sample_index in range(len(batch)):
        print('Processing sample number {}'.format(current_sample))
        current_sample_ris = list(map(lambda att: find_smallest_variation_to_change(hybrid_model.classifier,
                                                                                    classifier_length=classifier_length,
                                                                                    attribute_length=attribute_length,
                                                                                    input_matrix=batch,
                                                                                    vector_index=sample_index,
                                                                                    attributes=[attributes.index(att)]
                                                                                    ,class_to_reach=1),attributes))
        negatives_ri_matrix.append(current_sample_ris)
        current_sample+=1

In [None]:
ri_norms_negative_samples = []
for ri_list in negatives_ri_matrix:
    ri_norms_negative_samples.append(list(map(lambda x:torch.norm(x).data[0],ri_list)))

In [None]:
negatives_variation_df = pd.DataFrame(data= ri_norms_negative_samples,columns=attributes)
negatives_variation_df.head()

In [None]:
ri_sums_negatives = negatives_variation_df.sum(numeric_only=True)
ri_sums_negatives.plot.bar()

In [None]:
negatives_variation_df.to_csv('experiment-results/negatives_ri_extendedData.csv',index=False)

## Positive samples analysis

In [None]:
current_sample = 0
#each column of this matrix is related to a specific attribute
positives_ri_matrix = []
for batch in test_positive_classifier_inputs:
    for sample_index in range(len(batch)):
        print('Processing sample number {}'.format(current_sample))
        current_sample_ris = list(map(lambda att: find_smallest_variation_to_change(hybrid_model.classifier,
                                                                                    classifier_length=classifier_length,
                                                                                    attribute_length=attribute_length,
                                                                                    input_matrix=batch,
                                                                                    vector_index=sample_index,
                                                                                    attributes=[attributes.index(att)]
                                                                                    ,class_to_reach=0),attributes))
        positives_ri_matrix.append(current_sample_ris)
        current_sample+=1

In [None]:
ri_norms_positives = []
for ri_list in positives_ri_matrix:
    ri_norms_positives.append(list(map(lambda x:torch.norm(x).data[0],ri_list)))

In [None]:
positives_variation_df = pd.DataFrame(data= ri_norms_positives,columns=attributes)

In [None]:
ri_sums_positives = positives_variation_df.sum(numeric_only=True)
ri_sums_positives.plot.bar()

In [None]:
positives_variation_df.to_csv('experiment-results/ri_positives_extendedData.csv',index=False)

## Find nearest neighbours

In [None]:
negatives = pd.read_csv('../../Structured/Beer/negatives.csv')
positives = pd.read_csv('../../Structured/Beer/positives.csv')

In [None]:
from utilities_functions.distance_measures import nearest_neighbour,nearest_neighbour_onAttribute
#lista di tuple: vettore più vicino considerando tutti gli elementi e closer solo secondo un attributo
negatives_closer_vectors = []
i = 0
for batch in test_negative_classifier_inputs:
    for sample in batch:
        current_sample_closer_vectors = list(map(lambda att: nearest_neighbour_onAttribute
                                                 (sample+negatives_ri_matrix[i][attributes.index(att)]
                                                                ,positive_classifier_inputs,attributes.index(att),
                                                                attribute_length,'cosine'),attributes))
        negatives_closer_vectors.append(current_sample_closer_vectors)
        i += 1 

In [None]:
# per ogni attributo trovo il più vicino
positives_closer_vectors = []
i = 0
for batch in test_positive_classifier_inputs:
    for sample in batch:
        current_sample_closer_vectors = list(map(lambda att: nearest_neighbour_onAttribute
                                                 (sample+positives_ri_matrix[i][attributes.index(att)]
                                                                ,negative_classifier_inputs,attributes.index(att),
                                                                attribute_length,'cosine'),attributes))
        positives_closer_vectors.append(current_sample_closer_vectors)
        i += 1 

In [None]:
positives_closer_vectors_df = pd.DataFrame(data= positives_closer_vectors,columns =attributes)
positives_closer_vectors_df = positives_closer_vectors_df.applymap(lambda c:neg_ids[c])
negatives_closer_vectors_df = pd.DataFrame(data = negatives_closer_vectors, columns = attributes)
negatives_closer_vectors_df = negatives_closer_vectors_df.applymap(lambda c:pos_ids[c])

In [None]:
positives_closer_vectors_df['SampleID'] = testpos_ids
negatives_closer_vectors_df['SampleID'] = testneg_ids
positives_closer_vectors_df.to_csv('experiment-results/positives_nn_balanced.csv',index=False)
negatives_closer_vectors_df.to_csv('experiment-results/negatives_nn_balanced.csv',index=False)

## Bias testing

In [4]:
from utilities_functions.explainer import generateExplanations,analyze_valueDistribution

In [6]:
testpos_nn = pd.read_csv('experiment-results/positives_nn_balanced.csv')
negatives = pd.read_csv('../../Structured/Beer/negatives.csv')

In [8]:
!mkdir temp

In [9]:
top5NN = generateExplanations(testpos_nn,5,negatives,"Beer_Name",hybrid_model,
                    "../../Structured/Beer/positives.csv",1)

===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.2 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

The standard true positives are 65
Photobomb Imperial Red Ale Fulton The Libertine Imperial Red Ale - Heaven Hill Rye Whiskey Barrel Aged
===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.1 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

5
Red Dirt Imperial Red Ale AleSmith YuleSmith &#40; Winter &#41; Imperial Red Ale
===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.1 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

2
Ro Shampo Imperial Red Ale Ballast Point Tongue Buckler Imperial Red Ale - Bourbon Barrel Aged
===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.1 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

4
Ballistic Brewing Co. - Gorgon Red Ale Red Horn Coffee House and Brewing Company Red Horn Drink Well Red Ale
===>  PREDICT E

In [16]:
testneg_nn = pd.read_csv('experiment-results/negatives_nn_balanced.csv')
positives = pd.read_csv('../../Structured/Beer/positives.csv')

In [17]:
top5NN_neg = generateExplanations(testneg_nn,5,positives,"Beer_Name",hybrid_model,
                    "../../Structured/Beer/positives.csv",1)

===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.2 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

The standard true positives are 65
Royal Amber Ale Kalamazoo Royal Amber Ale
===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.2 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

51
Willoughby American Amber Ale Willoughby American Amber Ale
===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.2 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

52
Radeau Red Ale Coopers Cave Radeau Red Ale
===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.2 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

51
Amber Waves Ale Capitol City Amber Waves Ale
===>  PREDICT Epoch 3
Finished Epoch 3 || Run Time:    0.2 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00

51
Hearthstone Red Rye Ale Hearthstone Red Rye Ale
===>  PREDICT Epoch 