In [1]:
import pandas as pd
import deepmatcher as dm

In [20]:
train = pd.read_csv('sample_data/itunes-amazon/train.csv')
validation = pd.read_csv('sample_data/itunes-amazon/validation.csv')
test = pd.read_csv('sample_data/itunes-amazon/test.csv')

In [3]:
train_positives = train[train['label']==1]
train_negatives = train[train['label']==0]

In [4]:
validation_positives = validation[validation['label']==1]
validation_negatives = validation[validation['label']==0]

In [5]:
def get_price_distribution(df):
    prices = {}
    for idx,row in df.iterrows():
        current_prices = row['left_Price']+ " "+ row['right_Price']
        if current_prices in prices:
            prices[current_prices] += 1
        else:
            prices[current_prices] = 1
    return prices

In [6]:
price_negatives = get_price_distribution(train_negatives)
price_negatives

{'$ 1.29 $ 1.29': 153,
 'Album Only $ 1.29': 20,
 '$ 1.29 $ 0.99': 18,
 '$ 1.99 $ 1.29': 25,
 '$ 1.29 $ 0.69': 3,
 '$ 0.99 $ 0.99': 11,
 '$ 1.99 $ 0.99': 3,
 '$ 0.99 $ 1.29': 11,
 '$ 0.99 $ 0.89': 2}

In [7]:
price_positives = get_price_distribution(train_positives)
price_positives

{'$ 0.99 $ 0.99': 17,
 '$ 1.29 $ 1.29': 58,
 '$ 1.99 $ 1.29': 1,
 '$ 1.29 $ 0.89': 1}

### Evaluation changing prices

In [3]:
hybrid_model = dm.MatchingModel(attr_summarizer='hybrid')

In [9]:
hybrid_model.load_state('models/hybrid_model.pth')
hybrid_model.cuda()

MatchingModel(
  (attr_summarizers): ModuleMap(
    (Song_Name): Hybrid(
      (word_contextualizer): RNN(
        (rnn_groups): ModuleList(
          (0): GRU(300, 150, batch_first=True, bidirectional=True)
        )
        (dropouts): ModuleList(
          (0): Dropout(p=0)
        )
        (bypass_networks): ModuleList(
          (0): None
        )
        (input_dropout): NoMeta(
          (module): Dropout(p=0)
        )
      )
      (word_comparator): Attention(
        (alignment_networks): ModuleList(
          (0): AlignmentNetwork(
            (transform): Transform(
              (transforms): ModuleList(
                (0): Linear(in_features=300, out_features=300, bias=True)
                (1): Linear(in_features=300, out_features=300, bias=True)
              )
              (bypass_networks): ModuleList(
                (0): Bypass(
                  (highway_gate): Linear(in_features=300, out_features=300, bias=True)
                )
                (1): Bypass(


### Evaluate model on standard test set

In [18]:
test_pos = dm.data.process_unlabeled('sample_data/itunes-amazon/test_positives.csv',trained_model=hybrid_model,
                                ignore_columns = ['id','label'])

In [21]:
standard_pred = hybrid_model.run_prediction(test_pos,output_attributes=True)

===>  PREDICT Epoch 2
Finished Epoch 2 || Run Time:    0.2 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00



In [22]:
standard_pred_neg = standard_pred[standard_pred['match_score']<=0.5]
standard_pred_neg.shape

(6, 18)

### Evaluate model on altered test set (on price)

In [29]:
test_positives = pd.read_csv('sample_data/itunes-amazon/test_positives.csv')

In [30]:
test_pos_altered = alter_price(test_positives,'Album Only','$ 1.29')

NameError: name 'alter_price' is not defined

In [45]:
def alter_price(df,left_pr,right_pr):
    df['left_Price'] = left_pr
    df['right_Price'] = right_pr
    return df

In [28]:
test_pos_altered.to_csv('sample_data/itunes-amazon/exp6/test_positives_altered_price.csv',index=False)

In [31]:
test_pos_altered_price = dm.data.process_unlabeled('sample_data/itunes-amazon/exp6/test_positives_altered_price.csv'
                                                   ,trained_model=hybrid_model,ignore_columns=['id','label'])

In [30]:
altered_pred = hybrid_model.run_prediction(test_pos_altered_price,output_attributes=True)

===>  PREDICT Epoch 2
Finished Epoch 2 || Run Time:    0.1 | Load Time:    0.2 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00



In [31]:
altered_pred_neg = altered_pred[altered_pred['match_score'] <=0.5]
altered_pred_neg.shape

(8, 18)

### Evaluate closer vectors on classifier space

In [32]:
all_neg = pd.read_csv('sample_data/itunes-amazon/exp3/negative_samples.csv')

In [41]:
closer_negatives = pd.read_csv('experiments-results/exp3/positives_closer_vectors_on_attribute.csv',sep=';')
closer_negatives.tail()

Unnamed: 0,Song_Name,Artist_Name,Album_Name,Genre,Price,CopyRight,Time,Released,Pos_sample_ID
127,206,422,83,422,281,297,472,199,502
128,438,418,491,398,281,420,290,397,250
129,426,402,341,401,281,199,472,290,313
130,180,418,492,182,281,420,256,397,276
131,244,418,491,421,281,420,233,397,503


In [38]:
join_df =pd.merge(left=all_neg,right=closer_negatives,left_on='id',right_on='Price')

In [45]:
interested_columns = join_df.loc[:, ['id','left_Price','right_Price']]

Unnamed: 0,id,left_Price,right_Price
0,243,$ 0.99,$ 1.29
1,328,$ 1.99,$ 0.99
2,328,$ 1.99,$ 0.99
3,328,$ 1.99,$ 0.99
4,281,Album Only,$ 1.29
5,281,Album Only,$ 1.29
6,281,Album Only,$ 1.29
7,281,Album Only,$ 1.29
8,281,Album Only,$ 1.29
9,281,Album Only,$ 1.29


In [46]:
interested_columns.to_csv('experiments-results/exp6/closer_negatives_prices.csv',index=False)

## Create new training set with different price distribution

In [21]:
for idx,row in train.iterrows():
    if row['label']==1:
        if idx % 2 ==0:
            train.at[idx,'left_Price'] = '$ 0.99'
            train.at[idx,'right_Price'] = '$ 0.99'
        else:
            train.at[idx,'left_Price'] = '$ 1.29'
            train.at[idx,'right_Price'] = '$ 1.29'
    else:
        if idx % 2 == 0:
            train.at[idx,'left_Price'] = 'Album Only'
            train.at[idx,'right_Price'] = '$ 1.29'
        else:
            train.at[idx,'left_Price'] = '$ 1.29'
            train.at[idx,'right_Price'] = '$ 1.29'

In [22]:
for idx,row in validation.iterrows():
    if row['label']==1:
        if idx % 2 ==0:
            validation.at[idx,'left_Price'] = '$ 0.99'
            validation.at[idx,'right_Price'] = '$ 0.99'
        else:
            validation.at[idx,'left_Price'] = '$ 1.29'
            validation.at[idx,'right_Price'] = '$ 1.29'
    else:
        if idx % 2 == 0:
            validation.at[idx,'left_Price'] = 'Album Only'
            validation.at[idx,'right_Price'] = '$ 1.29'
        else:
            validation.at[idx,'left_Price'] = '$ 1.29'
            validation.at[idx,'right_Price'] = '$ 1.29'

In [23]:
train.to_csv('sample_data/itunes-amazon/exp6/train_newprices.csv',index=False)

In [24]:
validation.to_csv('sample_data/itunes-amazon/exp6/validation_newprices.csv',index=False)

In [25]:
train,valid,test = dm.data.process('sample_data/itunes-amazon/exp6',train='train_newprices.csv',
                                   validation='validation_newprices.csv',test='test.csv',)



In [26]:
hybrid_model = dm.MatchingModel(attr_summarizer='hybrid')

In [27]:
hybrid_model.run_train(train,valid,best_save_path='models/hybrid_pricebias.pth',pos_neg_ratio=4,batch_size=16)

* Number of trainable parameters: 17757810
===>  TRAIN Epoch 1
Finished Epoch 1 || Run Time:    7.8 | Load Time:    1.2 || F1:  43.01 | Prec:  29.70 | Rec:  77.92 || Ex/s:  36.00

===>  EVAL Epoch 1
Finished Epoch 1 || Run Time:    1.0 | Load Time:    0.4 || F1:  52.17 | Prec:  35.29 | Rec: 100.00 || Ex/s:  78.33

* Best F1: 52.17391304347826
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 2
Finished Epoch 2 || Run Time:    7.7 | Load Time:    1.2 || F1:  74.74 | Prec:  62.83 | Rec:  92.21 || Ex/s:  36.04

===>  EVAL Epoch 2
Finished Epoch 2 || Run Time:    1.1 | Load Time:    0.4 || F1:  77.19 | Prec:  66.67 | Rec:  91.67 || Ex/s:  73.15

* Best F1: 77.19298245614036
Saving best model...
Done.
---------------------

===>  TRAIN Epoch 3
Finished Epoch 3 || Run Time:    7.7 | Load Time:    1.2 || F1:  85.06 | Prec:  76.29 | Rec:  96.10 || Ex/s:  36.19

===>  EVAL Epoch 3
Finished Epoch 3 || Run Time:    0.9 | Load Time:    0.4 || F1:  77.97 | Prec:  65.71 | Rec:  95.

Finished Epoch 27 || Run Time:    0.9 | Load Time:    0.4 || F1:  90.20 | Prec:  85.19 | Rec:  95.83 || Ex/s:  84.62

---------------------

===>  TRAIN Epoch 28
Finished Epoch 28 || Run Time:    7.4 | Load Time:    1.2 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  37.49

===>  EVAL Epoch 28
Finished Epoch 28 || Run Time:    0.9 | Load Time:    0.4 || F1:  90.20 | Prec:  85.19 | Rec:  95.83 || Ex/s:  81.81

---------------------

===>  TRAIN Epoch 29
Finished Epoch 29 || Run Time:    7.4 | Load Time:    1.2 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  37.61

===>  EVAL Epoch 29
Finished Epoch 29 || Run Time:    0.9 | Load Time:    0.4 || F1:  90.20 | Prec:  85.19 | Rec:  95.83 || Ex/s:  88.04

---------------------

===>  TRAIN Epoch 30
Finished Epoch 30 || Run Time:    7.0 | Load Time:    1.2 || F1: 100.00 | Prec: 100.00 | Rec: 100.00 || Ex/s:  39.25

===>  EVAL Epoch 30
Finished Epoch 30 || Run Time:    0.9 | Load Time:    0.4 || F1:  90.20 | Prec:  85.19 | Rec:  95.83

95.83333333333333

In [28]:
hybrid_model.run_eval(test)

===>  EVAL Epoch 5
Finished Epoch 5 || Run Time:    0.6 | Load Time:    0.4 || F1:  75.76 | Prec:  71.43 | Rec:  80.65 || Ex/s: 102.66



75.75757575757575

In [32]:
test_pos_altered_price = dm.data.process_unlabeled('sample_data/itunes-amazon/exp6/test_positives_altered_price.csv'
                                                   ,trained_model=hybrid_model,ignore_columns=['id','label'])

In [33]:
pred_new_model = hybrid_model.run_prediction(test_pos_altered_price,output_attributes=True)

===>  PREDICT Epoch 5
Finished Epoch 5 || Run Time:    0.1 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00



In [34]:
pred_neg_new_model = pred_new_model[pred_new_model['match_score']<=0.5]
pred_neg_new_model.shape

(13, 18)

In [35]:
test_pos = dm.data.process_unlabeled('sample_data/itunes-amazon/test_positives.csv',
                                    trained_model=hybrid_model,ignore_columns=['id','label'])

In [40]:
pred_standardtest_newmodel = hybrid_model.run_prediction(test_pos,output_attributes=True)

===>  PREDICT Epoch 5
Finished Epoch 5 || Run Time:    0.2 | Load Time:    0.2 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00



In [43]:
pred_standardtest_newmodel_negative = pred_standardtest_newmodel[pred_standardtest_newmodel['match_score']<=0.5]
pred_neg_new_model.shape

(13, 18)

## Try to alter price in test set,

In [89]:
test_pos = pd.read_csv('sample_data/itunes-amazon/test_positives.csv')

In [90]:
alter_price(test_pos,'$ 0.99','$ 0.99')
test_pos.to_csv('sample_data/itunes-amazon/exp6/test_pos_099price.csv',index=False)

In [91]:
test_pos.shape

(31, 18)

In [92]:
test_pos_099price = dm.data.process_unlabeled('sample_data/itunes-amazon/exp6/test_pos_099price.csv'
                                                   ,trained_model=hybrid_model,ignore_columns=['id','label'])

In [93]:
preds = hybrid_model.run_prediction(test_pos_099price,output_attributes=True)
negative_preds = preds[preds['match_score']<=0.5]

===>  PREDICT Epoch 5
Finished Epoch 5 || Run Time:    0.1 | Load Time:    0.1 || F1:   0.00 | Prec:   0.00 | Rec:   0.00 || Ex/s:   0.00



In [95]:
negative_preds.shape

(0, 18)