In [1]:
import pickle
import numpy as np
import pandas as pd
import re
import os

# NOTE(review): `seed` is not referenced by any other cell visible in this
# notebook; the hyperparameter search below is given its own `seeds` list.
seed = 2023

In [2]:
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.ffn_baseline_functions import (
    histories_baseline_hyperparameter_search,
)

In [4]:
# Directory that receives the CSV results written by the search below.
output_dir = "rumours_output"
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of testing isdir() before calling makedirs().
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Runs the loader script via the IPython %run magic. Later cells use
# `df_rumours`, `sbert_embeddings`, `y_data`, `output_dim` and `split_ids`,
# none of which are defined in the visible cells — presumably this script
# provides them (TODO confirm against load_sbert-embeddings.py).
%run load_sbert-embeddings.py

In [6]:
# Preview the first rows of the dataframe (id, label, datetime, text, timeline_id, set).
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


In [7]:
# Sanity check of the embedding matrix dimensions
# (presumably one row per dataframe row, one column per embedding feature).
sbert_embeddings.shape

(5568, 384)

# Baseline: Averaging history and using an FFN

Here, we average the full history of a path and concatenate it to the current embedding (the total number of features that are passed into the FFN is 2 * sbert_embeddings.shape[1] = 768).

Here, we will run the hyperparameter search to implement the FFN with the same parameters as the standard FFN baseline on the sentence embeddings.

In [8]:
# Hyperparameter grid and training settings; all of these are forwarded to
# histories_baseline_hyperparameter_search through the `kwargs` dict below.
num_epochs = 100
# Candidate hidden-layer size lists (one list per configuration).
hidden_dim_sizes = [[64, 64], [128, 128], [256, 256], [512, 512]]
dropout_rates = [0.5, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]
# The results tables below contain one row per (configuration, seed) pair.
seeds = [1, 12, 123]
loss = "focal"
# Passed as `gamma` alongside loss="focal" — presumably the focal-loss
# focusing parameter (TODO confirm against the library).
gamma = 2
validation_metric = "f1"
# Presumably the early-stopping patience, in epochs — TODO confirm.
patience = 5

In [None]:
# Keyword arguments for the baseline hyperparameter search call below.
# Keys are kept in their original order.
kwargs = dict(
    num_epochs=num_epochs,
    # data: dataframe, its id/label columns, embeddings and targets
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # search grid
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # training / evaluation settings
    loss=loss,
    gamma=gamma,
    device=device,
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    verbose=False,
)

In [9]:
# Run the mean-history FFN baseline search (use_signatures=False) and write
# the results CSV into `output_dir`.
# The first two return values are inspected in the following cells:
#   ffn_mean_history      - results for every configuration/seed tried
#   best_ffn_mean_history - results for the selected best configuration
# The last two return values are not used in this notebook. They are bound to
# descriptive throwaway names rather than `_` / `__`, because in IPython those
# names are the output-history variables and rebinding them shadows them.
(
    ffn_mean_history,
    best_ffn_mean_history,
    _unused_third,
    _unused_fourth,
) = histories_baseline_hyperparameter_search(
    use_signatures=False,
    results_output=f"{output_dir}/ffn_mean_history_focal_{gamma}_kfold.csv",
    **kwargs,
)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
Computing the mean history for each item in the dataframe


  0%|          | 0/5568 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
Computing the mean history for each item in the dataframe


  0%|          | 0/5568 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in rumours_output/ffn_mean_history_focal_2.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/ffn_mean_history_focal_2_best_model.csv


In [10]:
# Full search results: one row per hyperparameter configuration and seed.
ffn_mean_history

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size,model_id,input_dim
0,,0.643430,0.620156,"[0.7141789929777379, 0.5261332672776814]",0.618688,"[0.7457098283931357, 0.49166666666666664]",0.625501,"[0.6852064220183486, 0.5657964837506659]",,0.786090,...,0.5,0.0010,1,focal,2,True,5,64,0.00,768
0,,0.639515,0.614721,"[0.7124591138864108, 0.516983016983017]",0.613282,"[0.7399629400864731, 0.48660084626234135]",0.619169,"[0.6869266055045872, 0.5514118273841236]",,0.797547,...,0.5,0.0010,12,focal,2,True,5,64,0.00,768
0,,0.641752,0.619149,"[0.7119304556354917, 0.5263676688023657]",0.617800,"[0.7459170854271356, 0.4896836313617607]",0.624950,"[0.6809059633027523, 0.5689930740543421]",,0.779620,...,0.5,0.0010,123,focal,2,True,5,64,0.00,768
0,,0.641379,0.616167,"[0.714540059347181, 0.5177944862155388]",0.614673,"[0.7404674046740467, 0.48887837198296263]",0.620357,"[0.6903669724770642, 0.5503462972828982]",,0.774363,...,0.5,0.0001,1,focal,2,True,5,64,0.10,768
0,,0.639143,0.612653,"[0.7139479905437353, 0.5113579000504795]",0.611218,"[0.7365853658536585, 0.4858513189448441]",0.616176,"[0.6926605504587156, 0.5396909962706447]",,0.779081,...,0.5,0.0001,12,focal,2,True,5,64,0.10,768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.643989,0.619961,"[0.7155198093535896, 0.524402390438247]",0.618431,"[0.7445753254804711, 0.49228611500701264]",0.624824,"[0.6886467889908257, 0.5610015982951518]",,0.788920,...,0.1,0.0001,12,focal,2,True,5,64,0.22,768
0,,0.653122,0.622151,"[0.7303289378350964, 0.5139723165317314]",0.621226,"[0.7383533548198066, 0.5040983606557377]",0.623359,"[0.7224770642201835, 0.5242408098028769]",,0.791616,...,0.1,0.0001,123,focal,2,True,5,64,0.22,768
0,,0.652936,0.630680,"[0.721340915893445, 0.5400197628458497]",0.628997,"[0.754539762053851, 0.5034546292031322]",0.636626,"[0.6909403669724771, 0.582312200319659]",,0.790538,...,0.1,0.0005,1,focal,2,True,5,64,0.23,768
0,,0.651258,0.621082,"[0.7280127925570578, 0.5141521682679823]",0.619973,"[0.7384252432910646, 0.5015197568389058]",0.622664,"[0.7178899082568807, 0.527437400106553]",,0.792829,...,0.1,0.0005,12,focal,2,True,5,64,0.23,768


In [11]:
# Results for the best configuration found by the search, one row per seed.
best_ffn_mean_history

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size,input_dim
0,,0.63411,0.601902,"[0.7151356842258019, 0.48866892419901015]",0.601075,"[0.7240669997061416, 0.4780835881753313]",0.603078,"[0.7064220183486238, 0.49973361747469364]",,0.800647,...,"(128, 128)",0.5,0.001,1,focal,2,True,5,64,768
0,,0.631128,0.604174,"[0.7074648928307465, 0.5008827238335434]",0.602908,"[0.7302410741531888, 0.4755747126436782]",0.607551,"[0.6860665137614679, 0.529035695258391]",,0.798221,...,"(128, 128)",0.5,0.001,12,focal,2,True,5,64,768
0,,0.635787,0.608723,"[0.711629279811098, 0.505816894284269]",0.607368,"[0.7332725060827251, 0.4814636494944632]",0.611996,"[0.6912270642201835, 0.5327650506126798]",,0.791077,...,"(128, 128)",0.5,0.001,123,focal,2,True,5,64,768


In [12]:
# F1 of the best configuration, averaged over its rows (one per seed).
best_ffn_mean_history["f1"].mean()

0.6049330665307449

In [13]:
# Precision of the best configuration, averaged over its rows (one per seed).
best_ffn_mean_history["precision"].mean()

0.6037837550425881

In [14]:
# Recall of the best configuration, averaged over its rows (one per seed).
best_ffn_mean_history["recall"].mean()

0.6075416599460066

In [15]:
# Per-class F1 of the best configuration, averaged across its rows (one per seed).
np.mean(np.stack(best_ffn_mean_history["f1_scores"]), axis=0)

array([0.71140995, 0.49845618])

In [16]:
# Per-class precision of the best configuration, averaged across its rows (one per seed).
np.mean(np.stack(best_ffn_mean_history["precision_scores"]), axis=0)

array([0.72919353, 0.47837398])

In [17]:
# Per-class recall of the best configuration, averaged across its rows (one per seed).
np.mean(np.stack(best_ffn_mean_history["recall_scores"]), axis=0)

array([0.69457187, 0.52051145])