In [1]:
import pickle
import numpy as np
import pandas as pd
import re
import os

seed = 2023

In [2]:
import torch

# Choose the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.ffn_baseline_functions import (
    histories_baseline_hyperparameter_search,
)

In [4]:
# Directory that receives the result CSVs written by the hyperparameter search.
output_dir = "rumours_output"
# exist_ok=True makes the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Execute load_sbert_embeddings.py from the notebook session.
# NOTE(review): df_rumours, sbert_embeddings, y_data, output_dim and split_ids
# are used by later cells but are not defined anywhere in this notebook, so they
# are presumably created by this script -- confirm.
%run load_sbert_embeddings.py

In [6]:
# Preview the first rows of the rumours dataframe.
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


In [7]:
# Shape of the embedding matrix; the row count matches the 5568 items processed
# by the search below.
sbert_embeddings.shape

(5568, 384)

# Baseline: Averaging the history and using an FFN

Here, we average the full history of a path and concatenate it to the current embedding, so the total number of features passed into the FFN is `2 * sbert_embeddings.shape[1]` (i.e. 2 × 384 = 768).

We run a hyperparameter search for this FFN using the same hyperparameters as the standard FFN baseline on the sentence embeddings.

In [8]:
# --- Training budget ---
num_epochs = 100
patience = 3  # NOTE(review): presumably the early-stopping patience -- confirm
validation_metric = "f1"

# --- Search grid ---
# Two hidden layers of equal width for each candidate width.
hidden_dim_sizes = [[width, width] for width in (64, 128, 256, 512)]
dropout_rates = [0.1]
learning_rates = [0.001, 0.0005, 0.0001]
seeds = [1, 12, 123]

# --- Loss ---
# gamma is also embedded in the results file name ("focal_{gamma}").
loss, gamma = "focal", 2

In [9]:
# Keyword arguments forwarded unchanged to
# histories_baseline_hyperparameter_search via **kwargs.
kwargs = dict(
    # training budget
    num_epochs=num_epochs,
    # data: dataframe, its id/label columns, embeddings and targets
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # search grid
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # loss configuration
    loss=loss,
    gamma=gamma,
    # execution / evaluation settings
    device=device,
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    verbose=False,
)

In [10]:
# Run the hyperparameter search for the mean-history FFN baseline
# (use_signatures=False) and write the results under output_dir.
#
# The call returns four values; only the first two (the full results table and
# the best-configuration results) are used in the cells below. The other two
# are bound to explicit throwaway names instead of `_` / `__`: IPython uses
# those names for its output history and stops updating them once user code
# reassigns them.
(
    ffn_mean_history,
    best_ffn_mean_history,
    _unused_third,
    _unused_fourth,
) = histories_baseline_hyperparameter_search(
    use_signatures=False,
    results_output=f"{output_dir}/ffn_mean_history_focal_{gamma}_kfold.csv",
    **kwargs,
)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
Computing the mean history for each item in the dataframe


  0%|          | 0/5568 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
Computing the mean history for each item in the dataframe


  0%|          | 0/5568 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in rumours_output/ffn_mean_history_focal_2_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/ffn_mean_history_focal_2_kfold_best_model.csv


In [11]:
# Display the full results table returned by the hyperparameter search.
ffn_mean_history

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size,model_id,input_dim
0,,0.649767,0.617148,"[0.7288991487519839, 0.5053961568833903]",0.616571,"[0.7336625036305547, 0.4994797086368366]",0.617826,"[0.724197247706422, 0.5114544485881726]",,0.797008,...,0.1,0.001,1,focal,2,True,5,64,0.0,768
0,,0.641193,0.614894,"[0.715531254617999, 0.5142568761039616]",0.613415,"[0.7383348581884721, 0.4884947267497603]",0.618491,"[0.6940940366972477, 0.5428875865743207]",,0.787438,...,0.1,0.001,12,focal,2,True,5,64,0.0,768
0,,0.631128,0.611565,"[0.6987364895722332, 0.524393174717616]",0.611279,"[0.744888023369036, 0.4776707530647986]",0.619608,"[0.6579701834862385, 0.5812466702184337]",,0.779216,...,0.1,0.001,123,focal,2,True,5,64,0.0,768
0,,0.634669,0.611318,"[0.7065868263473053, 0.5160493827160494]",0.610125,"[0.7393483709273183, 0.4809019788311091]",0.616672,"[0.676605504587156, 0.5567394778902504]",,0.780429,...,0.1,0.0005,1,focal,2,True,5,64,0.1,768
0,,0.638024,0.616644,"[0.7071773220747889, 0.5261102977061981]",0.615616,"[0.7458651399491094, 0.4853669518235029]",0.623313,"[0.6723050458715596, 0.5743207245604688]",,0.784338,...,0.1,0.0005,12,focal,2,True,5,64,0.1,768
0,,0.636533,0.611218,"[0.7104247104247103, 0.512012012012012]",0.609841,"[0.7369069624152803, 0.4827748938178386]",0.615399,"[0.6857798165137615, 0.5450186467767715]",,0.783259,...,0.1,0.0005,123,focal,2,True,5,64,0.1,768
0,,0.644548,0.614319,"[0.7222950342216398, 0.506342221071706]",0.613196,"[0.7339449541284404, 0.49244712990936557]",0.616027,"[0.7110091743119266, 0.5210442194992009]",,0.761423,...,0.1,0.0001,1,focal,2,True,5,64,0.2,768
0,,0.638956,0.61025,"[0.7160240433953965, 0.5044768482987977]",0.608955,"[0.7326732673267327, 0.48523622047244097]",0.612711,"[0.7001146788990825, 0.5253063399041022]",,0.76695,...,0.1,0.0001,12,focal,2,True,5,64,0.2,768
0,,0.643616,0.615657,"[0.7193188490898414, 0.5119959162838182]",0.614245,"[0.737063778580024, 0.49142577168054874]",0.618386,"[0.7024082568807339, 0.5343633457645178]",,0.76978,...,0.1,0.0001,123,focal,2,True,5,64,0.2,768
0,,0.648276,0.616244,"[0.7271149674620391, 0.5053735255570118]",0.615503,"[0.7335862270207179, 0.49742002063983487]",0.617171,"[0.720756880733945, 0.5135855087906234]",,0.787303,...,0.1,0.001,1,focal,2,True,5,64,0.3,768


In [12]:
# Display the best-model results table (one row per seed in the output below).
best_ffn_mean_history

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size,input_dim
0,,0.652936,0.63068,"[0.721340915893445, 0.5400197628458497]",0.628997,"[0.754539762053851, 0.5034546292031322]",0.636626,"[0.6909403669724771, 0.582312200319659]",,0.790538,...,"(512, 512)",0.1,0.0005,1,focal,2,True,5,64,768
0,,0.651258,0.621082,"[0.7280127925570578, 0.5141521682679823]",0.619973,"[0.7384252432910646, 0.5015197568389058]",0.622664,"[0.7178899082568807, 0.527437400106553]",,0.792829,...,"(512, 512)",0.1,0.0005,12,focal,2,True,5,64,768
0,,0.653681,0.622979,"[0.7305684454756382, 0.5153886280646844]",0.621999,"[0.7391431924882629, 0.5048543689320388]",0.624281,"[0.7221903669724771, 0.5263718700053277]",,0.788651,...,"(512, 512)",0.1,0.0005,123,focal,2,True,5,64,768


In [13]:
# Mean of the `f1` column over the rows of the best-model results.
best_ffn_mean_history["f1"].mean()

0.6249137855174428

In [14]:
# Mean of the `precision` column over the rows of the best-model results.
best_ffn_mean_history["precision"].mean()

0.6236561588012092

In [15]:
# Mean of the `recall` column over the rows of the best-model results.
best_ffn_mean_history["recall"].mean()

0.6278570187722291

In [16]:
# Stack the per-row `f1_scores` entries into one array and average over rows
# (axis=0), giving one mean per position in the score vector.
# NOTE(review): positions presumably correspond to the class labels -- confirm.
np.stack(best_ffn_mean_history["f1_scores"]).mean(axis=0)

array([0.72664072, 0.52318685])

In [17]:
# Stack the per-row `precision_scores` entries into one array and average over
# rows (axis=0), giving one mean per position in the score vector.
# NOTE(review): positions presumably correspond to the class labels -- confirm.
np.stack(best_ffn_mean_history["precision_scores"]).mean(axis=0)

array([0.74403607, 0.50327625])

In [18]:
# Stack the per-row `recall_scores` entries into one array and average over
# rows (axis=0), giving one mean per position in the score vector.
# NOTE(review): positions presumably correspond to the class labels -- confirm.
np.stack(best_ffn_mean_history["recall_scores"]).mean(axis=0)

array([0.71034021, 0.54537382])