In [1]:
import pickle
import numpy as np
import pandas as pd
import re
import os

# Notebook-level seed value.
# NOTE(review): no cell visible in this notebook reads `seed`; the hyperparameter
# search is given the separate `seeds` list instead — confirm whether this is still needed.
seed = 2023

In [2]:
from nlpsig_networks.scripts.ffn_baseline_functions import (
    histories_baseline_hyperparameter_search
)

In [3]:
# Directory that the result CSVs of this notebook are written into
# (it is interpolated into the `results_output` path of the search call).
output_dir = "rumours_output"
# exist_ok=True makes the cell idempotent on re-runs and removes the
# check-then-create race of a separate `os.path.isdir` test.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [4]:
# Execute the loader script in this notebook's namespace.
# NOTE(review): later cells use `df_rumours`, `sbert_embeddings`, `y_data` and
# `output_dim`, none of which are defined in a visible cell — presumably this
# script creates them; confirm against load_sbert-embeddings.py.
%run load_sbert-embeddings.py

In [5]:
# Preview the first rows of the rumours dataframe; the columns used later in this
# notebook are `label`, `timeline_id` and `set` (train/dev/test partition).
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


In [6]:
# Sanity check on the embeddings: the first dimension should match the number of
# rows in df_rumours, the second is the embedding width.
# NOTE(review): the row-count match is assumed, not asserted here — consider an explicit assert.
sbert_embeddings.shape

(5568, 384)

# Baseline: Averaging the history and using an FFN

Here, we average the full history of a path and concatenate it to the current embedding (the total number of features that are passed into the FFN is 2 * sbert_embeddings.shape[1], i.e. 2 * 384 = 768).

We run the hyperparameter search for this FFN using the same hyperparameter settings as the standard FFN baseline on the sentence embeddings.

In [7]:
# --- Search grid: every combination of these three lists is evaluated ---
# Each candidate network has two hidden layers of equal width.
hidden_dim_sizes = [[width, width] for width in (64, 128, 256, 512)]
dropout_rates = [0.5, 0.1]
learning_rates = [0.001, 0.0001, 0.0005]

# --- Repetition: each configuration is run once per seed ---
seeds = [1, 12, 123]

# --- Training settings shared by all runs ---
num_epochs = 100
patience = 5
validation_metric = "f1"

# --- Loss ---
# `gamma` is passed alongside the "focal" loss
# (presumably the focal-loss focusing parameter — confirm in nlpsig_networks).
loss = "focal"
gamma = 2
# Fixed train/dev/test partition: for each value of the `set` column, collect the
# index labels of the matching rows. The tuple order is (train, dev, test).
split_indices = tuple(
    df_rumours.loc[df_rumours["set"] == partition].index
    for partition in ("train", "dev", "test")
)

In [8]:
# Grid search for the mean-history FFN baseline.
# The first two return values are the dataframes displayed below (all runs, and the
# runs of the best configuration); the remaining two are discarded.
# NOTE(review): `y_data` and `output_dim` are not defined in any visible cell —
# presumably they come from load_sbert-embeddings.py; confirm.
ffn_mean_history, best_ffn_mean_history, _, __ = histories_baseline_hyperparameter_search(
    # --- data ---
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    use_signatures=False,
    # --- data split: explicit train/dev/test indices, no k-fold ---
    # No group ids are supplied (an earlier variant passed the integer
    # `timeline_id` column as a tensor here); `split_indices` is used instead.
    split_ids=None,
    split_indices=split_indices,
    k_fold=False,
    # --- hyperparameter grid ---
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # --- training / model selection ---
    num_epochs=num_epochs,
    patience=patience,
    validation_metric=validation_metric,
    loss=loss,
    gamma=gamma,
    # --- output ---
    results_output=f"{output_dir}/ffn_mean_history_focal_{gamma}.csv",
    verbose=False,
)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
Computing the mean history for each item in the dataframe


  0%|          | 0/5568 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
Computing the mean history for each item in the dataframe


  0%|          | 0/5568 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in rumours_output/ffn_mean_history_focal_2.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/ffn_mean_history_focal_2_best_model.csv


In [9]:
# All runs of the search: one row per (hyperparameter configuration, seed), holding
# the evaluation metrics, the `valid_*` metrics and the hyperparameters of that run.
# NOTE(review): the un-prefixed metrics are presumably test-set metrics — confirm.
ffn_mean_history

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size,model_id,input_dim
0,0.338222,0.510963,0.510707,"[0.49951219512195116, 0.521901211556384]",0.513236,"[0.5378151260504201, 0.4886561954624782]",0.513151,"[0.4663023679417122, 0.56]",0.258686,0.665480,...,0.5,0.0010,1,focal,2,False,,64,0.00,768
0,0.300500,0.532888,0.530859,"[0.5000000000000001, 0.5617173524150268]",0.538268,"[0.568445475638051, 0.5080906148867314]",0.537133,"[0.44626593806921677, 0.628]",0.250418,0.665480,...,0.5,0.0010,12,focal,2,False,,64,0.00,768
0,0.321668,0.530982,0.530671,"[0.5427509293680297, 0.518590998043053]",0.530871,"[0.5540796963946869, 0.5076628352490421]",0.530938,"[0.5318761384335154, 0.53]",0.248420,0.672598,...,0.5,0.0010,123,focal,2,False,,64,0.00,768
0,0.308341,0.517636,0.517632,"[0.5162523900573613, 0.5190114068441064]",0.518912,"[0.5432595573440644, 0.4945652173913043]",0.518902,"[0.4918032786885246, 0.546]",0.244130,0.672598,...,0.5,0.0001,1,focal,2,False,,64,0.10,768
0,0.315495,0.511916,0.511916,"[0.5114503816793894, 0.5123809523809524]",0.513083,"[0.5370741482965932, 0.4890909090909091]",0.513080,"[0.48816029143898, 0.538]",0.256388,0.679715,...,0.5,0.0001,12,focal,2,False,,64,0.10,768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.285014,0.530982,0.528169,"[0.5646017699115043, 0.49173553719008267]",0.528800,"[0.5490533562822719, 0.5085470085470085]",0.528528,"[0.581056466302368, 0.476]",0.238764,0.665480,...,0.1,0.0001,12,focal,2,False,,64,0.22,768
0,0.295988,0.513823,0.513812,"[0.5114942528735632, 0.5161290322580646]",0.515184,"[0.5393939393939394, 0.49097472924187724]",0.515169,"[0.48633879781420764, 0.544]",0.240908,0.661922,...,0.1,0.0001,123,focal,2,False,,64,0.22,768
0,0.340347,0.517636,0.511131,"[0.4547413793103448, 0.5675213675213675]",0.526125,"[0.5567282321899736, 0.4955223880597015]",0.524168,"[0.3843351548269581, 0.664]",0.267414,0.644128,...,0.1,0.0005,1,focal,2,False,,64,0.23,768
0,0.318954,0.485224,0.478048,"[0.5392491467576791, 0.4168466522678186]",0.480137,"[0.507223113964687, 0.45305164319248825]",0.480796,"[0.575591985428051, 0.386]",0.254116,0.661922,...,0.1,0.0005,12,focal,2,False,,64,0.23,768


In [10]:
# Runs of the selected (best) configuration only: one row per seed.
best_ffn_mean_history

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size,input_dim
0,0.322381,0.503336,0.502751,"[0.4856860809476801, 0.519815668202765]",0.506112,"[0.5301724137931034, 0.48205128205128206]",0.506044,"[0.44808743169398907, 0.564]",0.259724,0.661922,...,"(256, 256)",0.1,0.0005,1,focal,2,False,,64,768
0,0.286459,0.528122,0.527875,"[0.5386766076421249, 0.5170731707317073]",0.528144,"[0.5515267175572519, 0.5047619047619047]",0.528206,"[0.5264116575591985, 0.53]",0.240372,0.679715,...,"(256, 256)",0.1,0.0005,12,focal,2,False,,64,768
0,0.285234,0.523356,0.511264,"[0.5881383855024711, 0.4343891402714932]",0.518421,"[0.5368421052631579, 0.5]",0.517137,"[0.6502732240437158, 0.384]",0.23304,0.701068,...,"(256, 256)",0.1,0.0005,123,focal,2,False,,64,768


In [11]:
# F1 of the best configuration, averaged over its per-seed rows.
best_ffn_mean_history["f1"].mean()

0.5139631755497068

In [12]:
# Precision of the best configuration, averaged over its per-seed rows.
best_ffn_mean_history["precision"].mean()

0.5175590705711166

In [13]:
# Recall of the best configuration, averaged over its per-seed rows.
best_ffn_mean_history["recall"].mean()

0.5171287188828172

In [14]:
# Per-class F1: stack the per-seed score vectors into a (seeds, classes) array
# and average over the seed axis.
np.mean(np.stack(best_ffn_mean_history["f1_scores"]), axis=0)

array([0.53750036, 0.49042599])

In [15]:
# Per-class precision: stack the per-seed score vectors into a (seeds, classes)
# array and average over the seed axis.
np.mean(np.stack(best_ffn_mean_history["precision_scores"]), axis=0)

array([0.53951375, 0.4956044 ])

In [16]:
# Per-class recall: stack the per-seed score vectors into a (seeds, classes)
# array and average over the seed axis.
np.mean(np.stack(best_ffn_mean_history["recall_scores"]), axis=0)

array([0.54159077, 0.49266667])