In [1]:
import pickle
import numpy as np
import pandas as pd
import re
import os

# Notebook-level seed constant.
# NOTE(review): no visible cell reads `seed` (the searches pass their own `seeds` list) — confirm it is still needed.
seed = 2023

In [3]:
from nlpsig_networks.scripts.lstm_baseline_functions import (
    lstm_hyperparameter_search,
    obtain_path
)

In [4]:
# Directory that receives the CSV files written by the hyperparameter searches.
output_dir = "rumours_output"
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Execute the embedding-loading script from this kernel.
# NOTE(review): later cells use `df_rumours` and `sbert_embeddings`, which no visible cell
# defines — presumably this script provides them; verify.
%run load_sbert-embeddings.py

In [6]:
# Preview the first rows of the rumours dataframe.
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


In [7]:
# Check the dimensions of the embedding array before building model inputs.
sbert_embeddings.shape

(5568, 384)

In [8]:
# Build the model input array from the dataframe and its embeddings via obtain_path.
# The recorded output shape is (5568, 20, 384), so `k` appears to set the length of the
# second (history) axis — TODO confirm against obtain_path.
# NOTE(review): `x_data` is not referenced again in the visible cells.
x_data = obtain_path(
    df=df_rumours, 
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    k=20
)

# Display the resulting array shape as a sanity check.
x_data.shape

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

(5568, 20, 384)

## Baseline: BiLSTM

In [11]:
# Settings shared by every LSTM hyperparameter search in this notebook.

# --- model architecture ---
num_layers = 1
bidirectional = True

# --- grid that is searched over ---
hidden_dim_sizes = [100, 200, 300, 384]
dropout_rates = [0.5, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

# --- training / early stopping ---
num_epochs = 100
patience = 5
validation_metric = "f1"

# --- loss ---
loss = "focal"
gamma = 2

# --- fixed data split ---
# Index labels of the rows whose `set` column is train / dev / test, in that order.
split_indices = tuple(
    df_rumours[df_rumours['set'] == split_name].index
    for split_name in ('train', 'dev', 'test')
)

## history_length = 20

In [12]:
# History length used for this search and embedded in the output file name.
size = 20

# Bidirectional LSTM hyperparameter search at history length `size`.
# The first two return values are kept (full results and best-model results);
# the remaining two are discarded.
# NOTE(review): `y_data` and `output_dim` are not defined in any visible cell
# (execution counts 9-10 are missing) — confirm they exist before this cell runs.
bilstm_history_20, best_bilstm_history_20, _, __ = lstm_hyperparameter_search(
    # data
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # search grid
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # model and training
    num_layers=num_layers,
    bidirectional=bidirectional,
    num_epochs=num_epochs,
    loss=loss,
    gamma=gamma,
    patience=patience,
    validation_metric=validation_metric,
    # splitting: explicit train/dev/test indices, no id-based or k-fold split
    # (author's unused alternative for split_ids: torch.tensor(df_rumours['timeline_id'].astype(int)))
    split_indices=split_indices,
    split_ids=None,
    k_fold=False,
    # output
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_20_focal_2.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_20_focal_2_best_model.csv


In [13]:
# Average the numeric metrics over the seeds for each
# (hidden_dim, dropout_rate, learning_rate) combination.
# numeric_only=True skips non-numeric columns explicitly; relying on the default
# emits a FutureWarning on pandas 1.5 and raises TypeError on pandas >= 2.0.
bilstm_history_20.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(numeric_only=True)

  bilstm_history_20.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,loss,accuracy,f1,precision,recall,valid_loss,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100,0.1,0.0001,0.275409,0.545281,0.541669,0.549905,0.547902,0.252523,0.696323,0.648573,0.673018,0.643554,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,4.0
100,0.1,0.0005,0.423434,0.530982,0.521996,0.530148,0.528171,0.339659,0.727165,0.680084,0.711169,0.672682,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,5.0
100,0.1,0.001,0.409979,0.523991,0.518827,0.524409,0.523902,0.323415,0.717675,0.677468,0.697101,0.671559,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,3.0
100,0.5,0.0001,0.337892,0.533206,0.530322,0.53556,0.53461,0.288194,0.715302,0.665828,0.696906,0.659154,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,1.0
100,0.5,0.0005,0.456228,0.532253,0.529302,0.534429,0.533521,0.387965,0.723606,0.678362,0.704821,0.671295,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,2.0
100,0.5,0.001,0.385305,0.515094,0.505986,0.515897,0.514093,0.292899,0.727165,0.68215,0.709601,0.674791,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,0.0
200,0.1,0.0001,0.5338,0.511916,0.507559,0.511819,0.511265,0.351796,0.730724,0.679613,0.719223,0.671961,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,10.0
200,0.1,0.0005,0.44413,0.526851,0.519304,0.523209,0.5236,0.427234,0.727165,0.678836,0.711738,0.671277,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,11.0
200,0.1,0.001,0.410281,0.534477,0.533793,0.537632,0.537104,0.351665,0.71293,0.675043,0.690043,0.669944,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,9.0
200,0.5,0.0001,0.52759,0.523356,0.521872,0.526075,0.525705,0.346581,0.735469,0.688269,0.722744,0.680606,20.0,1.0,1.0,45.333333,2.0,0.0,64.0,7.0


In [14]:
# Show the best-model results frame returned by the search (one row per seed in the recorded output).
best_bilstm_history_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,0.640989,0.533842,0.531989,"[0.5025432349949135, 0.5614349775784753]",0.539034,"[0.5691244239631337, 0.5089430894308943]",0.537954,"[0.44990892531876137, 0.626]",0.51631,0.747331,...,True,300,0.5,0.0001,1,focal,2,False,,64
0,0.529864,0.530982,0.519953,"[0.44719101123595506, 0.5927152317880794]",0.544614,"[0.5835777126099707, 0.5056497175141242]",0.539239,"[0.36247723132969034, 0.716]",0.359668,0.72242,...,True,300,0.5,0.0001,12,focal,2,False,,64
0,0.602217,0.529075,0.529065,"[0.5268199233716475, 0.5313092979127134]",0.530485,"[0.5555555555555556, 0.5054151624548736]",0.530455,"[0.5009107468123861, 0.56]",0.543472,0.736655,...,True,300,0.5,0.0001,123,focal,2,False,,64


In [15]:
# Mean of the `f1` column across the rows of the best-model frame.
best_bilstm_history_20["f1"].mean()

0.5270022794802974

In [16]:
# Mean of the `precision` column across the rows of the best-model frame.
best_bilstm_history_20["precision"].mean()

0.5380442769214254

In [17]:
# Mean of the `recall` column across the rows of the best-model frame.
best_bilstm_history_20["recall"].mean()

0.535882817243473

In [18]:
# Stack the per-row `f1_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_20["f1_scores"]).mean(axis=0)

array([0.49218472, 0.56181984])

In [19]:
# Stack the per-row `precision_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_20["precision_scores"]).mean(axis=0)

array([0.56941923, 0.50666932])

In [20]:
# Stack the per-row `recall_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_20["recall_scores"]).mean(axis=0)

array([0.43776563, 0.634     ])

## Unidirectional

In [21]:
# History length used for this search and embedded in the output file name.
size = 20

# Unidirectional LSTM hyperparameter search (bidirectional=False) at history length `size`.
# The result variables keep the `bilstm_` prefix even though this run is unidirectional.
# The first two return values are kept (full results and best-model results);
# the remaining two are discarded.
# NOTE(review): `y_data` and `output_dim` are not defined in any visible cell
# (execution counts 9-10 are missing) — confirm they exist before this cell runs.
bilstm_history_20_uni, best_bilstm_history_20_uni, _, __ = lstm_hyperparameter_search(
    # data
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # search grid
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # model and training
    num_layers=num_layers,
    bidirectional=False,
    num_epochs=num_epochs,
    loss=loss,
    gamma=gamma,
    patience=patience,
    validation_metric=validation_metric,
    # splitting: explicit train/dev/test indices, no id-based or k-fold split
    # (author's unused alternative for split_ids: torch.tensor(df_rumours['timeline_id'].astype(int)))
    split_indices=split_indices,
    split_ids=None,
    k_fold=False,
    # output
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_uni.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_20_focal_2_uni.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_20_focal_2_uni_best_model.csv


In [22]:
# Average the numeric metrics over the seeds for each
# (hidden_dim, dropout_rate, learning_rate) combination.
# numeric_only=True skips non-numeric columns explicitly; relying on the default
# emits a FutureWarning on pandas 1.5 and raises TypeError on pandas >= 2.0.
bilstm_history_20_uni.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(numeric_only=True)

  bilstm_history_20_uni.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,loss,accuracy,f1,precision,recall,valid_loss,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100,0.1,0.0001,0.375923,0.529393,0.525727,0.531983,0.530551,0.325903,0.715303,0.662367,0.704402,0.65564,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,4.0
100,0.1,0.0005,0.574045,0.504925,0.503486,0.50726,0.506401,0.469888,0.734282,0.681575,0.726369,0.673349,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,5.0
100,0.1,0.001,0.500438,0.517318,0.515113,0.517462,0.517438,0.424599,0.737841,0.684709,0.73252,0.676142,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,3.0
100,0.5,0.0001,0.434192,0.526851,0.525939,0.526121,0.526218,0.368458,0.736655,0.677465,0.737585,0.669588,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,1.0
100,0.5,0.0005,0.334952,0.545917,0.545779,0.546565,0.546605,0.298152,0.734282,0.678548,0.729421,0.670537,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,2.0
100,0.5,0.001,0.482948,0.508103,0.505418,0.506868,0.506968,0.389846,0.734282,0.682297,0.725583,0.674052,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,0.0
200,0.1,0.0001,0.604255,0.511281,0.507539,0.517084,0.516311,0.438023,0.730724,0.676872,0.721769,0.66915,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,10.0
200,0.1,0.0005,0.526216,0.496981,0.49383,0.496945,0.496997,0.403417,0.735469,0.681872,0.729031,0.673577,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,11.0
200,0.1,0.001,0.359506,0.51986,0.513294,0.518634,0.519301,0.300434,0.734282,0.684435,0.723587,0.67616,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,9.0
200,0.5,0.0001,0.386797,0.540833,0.539747,0.544741,0.543979,0.324551,0.734282,0.672956,0.735845,0.665617,20.0,1.0,0.0,45.333333,2.0,0.0,64.0,7.0


In [23]:
# Show the best-model results frame of the unidirectional search (one row per seed in the recorded output).
best_bilstm_history_20_uni

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,0.556056,0.517636,0.512174,"[0.5637931034482758, 0.4605543710021322]",0.514169,"[0.5351882160392799, 0.4931506849315068]",0.513814,"[0.5956284153005464, 0.432]",0.507605,0.75089,...,False,384,0.5,0.0005,1,focal,2,False,,64
0,0.30597,0.534795,0.534388,"[0.5481481481481482, 0.5206286836935167]",0.534511,"[0.5574387947269304, 0.5115830115830116]",0.534581,"[0.5391621129326047, 0.53]",0.299088,0.736655,...,False,384,0.5,0.0005,12,focal,2,False,,64
0,0.391223,0.521449,0.520032,"[0.4939516129032258, 0.5461121157323688]",0.525699,"[0.5530474040632054, 0.49834983498349833]",0.525133,"[0.44626593806921677, 0.604]",0.329086,0.736655,...,False,384,0.5,0.0005,123,focal,2,False,,64


In [24]:
# Mean of the `f1` column across the rows of the best-model frame.
best_bilstm_history_20_uni["f1"].mean()

0.522198005821278

In [25]:
# Mean of the `precision` column across the rows of the best-model frame.
best_bilstm_history_20_uni["precision"].mean()

0.524792991054572

In [26]:
# Mean of the `recall` column across the rows of the best-model frame.
best_bilstm_history_20_uni["recall"].mean()

0.5245094110503946

In [27]:
# Stack the per-row `f1_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_20_uni["f1_scores"]).mean(axis=0)

array([0.53529762, 0.50909839])

In [28]:
# Stack the per-row `precision_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_20_uni["precision_scores"]).mean(axis=0)

array([0.54855814, 0.50102784])

In [29]:
# Stack the per-row `recall_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_20_uni["recall_scores"]).mean(axis=0)

array([0.52701882, 0.522     ])

## history_length = 35

In [30]:
# History length used for this search and embedded in the output file name.
# This section, its `_35` variables and the recorded output (k = 35, files named
# lstm_history_35_*) all refer to length 35. With size = 50 a re-run would store
# length-50 results under the `_35` names and write to the same CSV paths as the
# length-50 section.
size = 35

# Bidirectional LSTM hyperparameter search at history length `size`.
# The first two return values are kept (full results and best-model results);
# the remaining two are discarded.
# NOTE(review): `y_data` and `output_dim` are not defined in any visible cell
# (execution counts 9-10 are missing) — confirm they exist before this cell runs.
bilstm_history_35, best_bilstm_history_35, _, __ = lstm_hyperparameter_search(
    # data
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # search grid
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # model and training
    num_layers=num_layers,
    bidirectional=bidirectional,
    num_epochs=num_epochs,
    loss=loss,
    gamma=gamma,
    patience=patience,
    validation_metric=validation_metric,
    # splitting: explicit train/dev/test indices, no id-based or k-fold split
    split_indices=split_indices,
    split_ids=None,
    k_fold=False,
    # output
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_35_focal_2.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_35_focal_2_best_model.csv


In [31]:
# Average the numeric metrics over the seeds for each
# (hidden_dim, dropout_rate, learning_rate) combination.
# numeric_only=True skips non-numeric columns explicitly; relying on the default
# emits a FutureWarning on pandas 1.5 and raises TypeError on pandas >= 2.0.
bilstm_history_35.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(numeric_only=True)

  bilstm_history_35.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,loss,accuracy,f1,precision,recall,valid_loss,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100,0.1,0.0001,0.283403,0.557674,0.554032,0.565003,0.562121,0.25269,0.709371,0.664723,0.689198,0.658716,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,4.0
100,0.1,0.0005,0.367627,0.554179,0.549443,0.558247,0.555658,0.256364,0.727165,0.686887,0.707032,0.680414,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,5.0
100,0.1,0.001,0.495221,0.550365,0.538787,0.564022,0.554931,0.308068,0.725979,0.704944,0.712833,0.709707,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,3.0
100,0.5,0.0001,0.293705,0.561487,0.560097,0.566666,0.565199,0.265508,0.717675,0.670783,0.699402,0.663828,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,1.0
100,0.5,0.0005,0.44216,0.537973,0.52994,0.540502,0.538033,0.291822,0.72242,0.687202,0.704967,0.683719,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,2.0
100,0.5,0.001,0.451812,0.554496,0.55036,0.556305,0.554504,0.259507,0.724792,0.693729,0.70398,0.689798,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,0.0
200,0.1,0.0001,0.375932,0.526851,0.522914,0.530473,0.528955,0.291114,0.735469,0.68977,0.722697,0.682012,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,10.0
200,0.1,0.0005,0.334383,0.557674,0.544075,0.575935,0.562419,0.239815,0.727165,0.69812,0.717103,0.700798,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,11.0
200,0.1,0.001,0.429819,0.546552,0.536774,0.563254,0.552121,0.265509,0.728351,0.695085,0.710225,0.691888,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,9.0
200,0.5,0.0001,0.373348,0.527804,0.517106,0.528881,0.527872,0.312871,0.73191,0.678629,0.724956,0.670784,35.0,1.0,1.0,45.333333,2.0,0.0,64.0,7.0


In [32]:
# Show the best-model results frame returned by the search (one row per seed in the recorded output).
best_bilstm_history_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,1.194463,0.566254,0.536447,"[0.41890166028097064, 0.6539923954372623]",0.614231,"[0.7008547008547008, 0.5276073619631901]",0.579362,"[0.2987249544626594, 0.86]",0.470731,0.725979,...,True,384,0.5,0.001,1,focal,2,False,,64
0,0.326094,0.561487,0.537478,"[0.4320987654320988, 0.6428571428571428]",0.597939,"[0.6704980842911877, 0.5253807106598984]",0.573381,"[0.31876138433515483, 0.828]",0.248915,0.718861,...,True,384,0.5,0.001,12,focal,2,False,,64
0,0.370141,0.560534,0.555466,"[0.5080042689434365, 0.602928509905254]",0.571451,"[0.6134020618556701, 0.529500756429652]",0.566758,"[0.4335154826958106, 0.7]",0.231668,0.701068,...,True,384,0.5,0.001,123,focal,2,False,,64


In [33]:
# Mean of the `f1` column across the rows of the best-model frame.
best_bilstm_history_35["f1"].mean()

0.5431304571426941

In [34]:
# Mean of the `precision` column across the rows of the best-model frame.
best_bilstm_history_35["precision"].mean()

0.5945406126757166

In [35]:
# Mean of the `recall` column across the rows of the best-model frame.
best_bilstm_history_35["recall"].mean()

0.5731669702489374

In [36]:
# Stack the per-row `f1_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_35["f1_scores"]).mean(axis=0)

array([0.45300156, 0.63325935])

In [37]:
# Stack the per-row `precision_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_35["precision_scores"]).mean(axis=0)

array([0.66158495, 0.52749628])

In [38]:
# Stack the per-row `recall_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_35["recall_scores"]).mean(axis=0)

array([0.35033394, 0.796     ])

## Unidirectional

In [39]:
# History length used for this search and embedded in the output file name.
# This section, its `_35` variables and the recorded output (k = 35, files named
# lstm_history_35_*_uni) all refer to length 35. With size = 50 a re-run would
# store length-50 results under the `_35` names and write to the same CSV paths
# as the length-50 unidirectional section.
size = 35

# Unidirectional LSTM hyperparameter search (bidirectional=False) at history length `size`.
# The result variables keep the `bilstm_` prefix even though this run is unidirectional.
# The first two return values are kept (full results and best-model results);
# the remaining two are discarded.
# NOTE(review): `y_data` and `output_dim` are not defined in any visible cell
# (execution counts 9-10 are missing) — confirm they exist before this cell runs.
bilstm_history_35_uni, best_bilstm_history_35_uni, _, __ = lstm_hyperparameter_search(
    # data
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # search grid
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # model and training
    num_layers=num_layers,
    bidirectional=False,
    num_epochs=num_epochs,
    loss=loss,
    gamma=gamma,
    patience=patience,
    validation_metric=validation_metric,
    # splitting: explicit train/dev/test indices, no id-based or k-fold split
    split_indices=split_indices,
    split_ids=None,
    k_fold=False,
    # output
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_uni.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_35_focal_2_uni.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_35_focal_2_uni_best_model.csv


In [40]:
# Average the numeric metrics over the seeds for each
# (hidden_dim, dropout_rate, learning_rate) combination.
# numeric_only=True skips non-numeric columns explicitly; relying on the default
# emits a FutureWarning on pandas 1.5 and raises TypeError on pandas >= 2.0.
bilstm_history_35_uni.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(numeric_only=True)

  bilstm_history_35_uni.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,loss,accuracy,f1,precision,recall,valid_loss,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100,0.1,0.0001,0.323338,0.535748,0.529944,0.537406,0.535641,0.287399,0.720047,0.667392,0.712376,0.660067,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,4.0
100,0.1,0.0005,0.397495,0.552908,0.549027,0.560888,0.556883,0.321444,0.733096,0.680516,0.724271,0.672418,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,5.0
100,0.1,0.001,0.479235,0.550365,0.54608,0.550505,0.549159,0.416611,0.736655,0.681401,0.732851,0.673102,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,3.0
100,0.5,0.0001,0.35208,0.571973,0.571238,0.575138,0.574355,0.300836,0.744959,0.688067,0.749428,0.678917,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,1.0
100,0.5,0.0005,0.329795,0.566889,0.564681,0.572451,0.570211,0.287456,0.7414,0.685632,0.741643,0.676827,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,2.0
100,0.5,0.001,0.379032,0.536384,0.535686,0.537952,0.537527,0.339646,0.73191,0.68019,0.721692,0.672189,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,0.0
200,0.1,0.0001,0.382449,0.555132,0.554148,0.559312,0.558294,0.329862,0.744959,0.68729,0.750503,0.678214,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,10.0
200,0.1,0.0005,0.454626,0.560216,0.5556,0.566639,0.563033,0.379929,0.733096,0.67972,0.72522,0.671715,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,11.0
200,0.1,0.001,0.328483,0.545917,0.533762,0.54989,0.546843,0.286131,0.73191,0.680163,0.721909,0.672189,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,9.0
200,0.5,0.0001,0.443744,0.546235,0.545068,0.548956,0.548366,0.36381,0.737841,0.681714,0.736147,0.67333,35.0,1.0,0.0,45.333333,2.0,0.0,64.0,7.0


In [41]:
# Show the best-model results frame of the unidirectional search (one row per seed in the recorded output).
best_bilstm_history_35_uni

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,0.364149,0.542421,0.538952,"[0.4989561586638831, 0.5789473684210525]",0.549989,"[0.5843520782396088, 0.515625]",0.547668,"[0.4353369763205829, 0.66]",0.329813,0.743772,...,False,384,0.5,0.0001,1,focal,2,False,,64
0,0.374256,0.551954,0.550808,"[0.5281124497991968, 0.573502722323049]",0.556642,"[0.5883668903803132, 0.5249169435215947]",0.555526,"[0.4790528233151184, 0.632]",0.327832,0.747331,...,False,384,0.5,0.0001,12,focal,2,False,,64
0,0.289464,0.550048,0.527116,"[0.4229828850855745, 0.63125]",0.580536,"[0.6431226765799256, 0.517948717948718]",0.561559,"[0.3151183970856102, 0.808]",0.230791,0.708185,...,False,384,0.5,0.0001,123,focal,2,False,,64


In [42]:
# Mean of the `f1` column across the rows of the best-model frame.
best_bilstm_history_35_uni["f1"].mean()

0.538958597382126

In [43]:
# Mean of the `precision` column across the rows of the best-model frame.
best_bilstm_history_35_uni["precision"].mean()

0.5623887177783601

In [44]:
# Mean of the `recall` column across the rows of the best-model frame.
best_bilstm_history_35_uni["recall"].mean()

0.5549180327868853

In [45]:
# Stack the per-row `f1_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_35_uni["f1_scores"]).mean(axis=0)

array([0.4833505, 0.5945667])

In [46]:
# Stack the per-row `precision_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_35_uni["precision_scores"]).mean(axis=0)

array([0.60528055, 0.51949689])

In [47]:
# Stack the per-row `recall_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_35_uni["recall_scores"]).mean(axis=0)

array([0.40983607, 0.7       ])

## history_length = 50

In [50]:
# History length used for this search and embedded in the output file name.
size = 50

# Bidirectional LSTM hyperparameter search at history length `size`.
# The first two return values are kept (full results and best-model results);
# the remaining two are discarded.
# NOTE(review): `y_data` and `output_dim` are not defined in any visible cell
# (execution counts 9-10 are missing) — confirm they exist before this cell runs.
bilstm_history_50, best_bilstm_history_50, _, __ = lstm_hyperparameter_search(
    # data
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # search grid
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # model and training
    num_layers=num_layers,
    bidirectional=bidirectional,
    num_epochs=num_epochs,
    loss=loss,
    gamma=gamma,
    patience=patience,
    validation_metric=validation_metric,
    # splitting: explicit train/dev/test indices, no id-based or k-fold split
    # (author's unused alternative for split_ids: torch.tensor(df_rumours['timeline_id'].astype(int)))
    split_indices=split_indices,
    split_ids=None,
    k_fold=False,
    # output
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_50_focal_2.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_50_focal_2_best_model.csv


In [51]:
# Average the numeric metrics over the seeds for each
# (hidden_dim, dropout_rate, learning_rate) combination.
# numeric_only=True skips non-numeric columns explicitly; relying on the default
# emits a FutureWarning on pandas 1.5 and raises TypeError on pandas >= 2.0.
bilstm_history_50.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(numeric_only=True)

  bilstm_history_50.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,loss,accuracy,f1,precision,recall,valid_loss,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100,0.1,0.0001,0.308573,0.559898,0.556181,0.56785,0.564425,0.244076,0.714116,0.670134,0.694292,0.663846,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,4.0
100,0.1,0.0005,0.36752,0.556721,0.547914,0.570537,0.561359,0.27475,0.721234,0.685717,0.703905,0.682787,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,5.0
100,0.1,0.001,0.498473,0.562123,0.543566,0.58681,0.569257,0.311242,0.715302,0.693876,0.704735,0.701327,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,3.0
100,0.5,0.0001,0.303906,0.577375,0.572376,0.582972,0.578266,0.241361,0.718861,0.664513,0.704445,0.658433,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,1.0
100,0.5,0.0005,0.357183,0.557038,0.546301,0.571602,0.560979,0.269153,0.720047,0.682692,0.703149,0.679748,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,2.0
100,0.5,0.001,0.340354,0.557356,0.543603,0.583562,0.565745,0.248501,0.736655,0.702555,0.725009,0.701921,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,0.0
200,0.1,0.0001,0.347706,0.550048,0.544047,0.561264,0.556621,0.251506,0.736655,0.686456,0.727433,0.678022,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,10.0
200,0.1,0.0005,0.369215,0.54401,0.535336,0.556376,0.550674,0.271088,0.714116,0.692303,0.69381,0.69407,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,11.0
200,0.1,0.001,0.361638,0.561169,0.545066,0.589562,0.57031,0.2429,0.711744,0.690711,0.691655,0.693614,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,9.0
200,0.5,0.0001,0.420866,0.555767,0.551771,0.564744,0.561252,0.293142,0.740214,0.688964,0.734463,0.680113,50.0,1.0,1.0,45.333333,2.0,0.0,64.0,7.0


In [52]:
# Show the best-model results frame returned by the search (one row per seed in the recorded output).
best_bilstm_history_50

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,0.522368,0.561487,0.53111,"[0.411764705882353, 0.6504559270516718]",0.607748,"[0.6909871244635193, 0.5245098039215687]",0.57463,"[0.29326047358834245, 0.856]",0.299781,0.715302,...,True,384,0.1,0.001,1,focal,2,False,,64
0,0.276995,0.534795,0.52831,"[0.4730021598272138, 0.5836177474402731]",0.544915,"[0.5809018567639257, 0.5089285714285714]",0.541454,"[0.3989071038251366, 0.684]",0.199421,0.715302,...,True,384,0.1,0.001,12,focal,2,False,,64
0,0.271424,0.530982,0.525074,"[0.4721030042918455, 0.5780445969125214]",0.540209,"[0.5744125326370757, 0.506006006006006]",0.537364,"[0.4007285974499089, 0.674]",0.207214,0.729537,...,True,384,0.1,0.001,123,focal,2,False,,64


In [53]:
# Mean of the `f1` column across the rows of the best-model frame.
best_bilstm_history_50["f1"].mean()

0.5281646902343131

In [54]:
# Mean of the `precision` column across the rows of the best-model frame.
best_bilstm_history_50["precision"].mean()

0.5642909825367778

In [55]:
# Mean of the `recall` column across the rows of the best-model frame.
best_bilstm_history_50["recall"].mean()

0.5511493624772313

In [56]:
# Stack the per-row `f1_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_50["f1_scores"]).mean(axis=0)

array([0.45228996, 0.60403942])

In [57]:
# Stack the per-row `precision_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_50["precision_scores"]).mean(axis=0)

array([0.61543384, 0.51314813])

In [58]:
# Stack the per-row `recall_scores` lists into a 2-D array and average down the rows,
# giving one mean per list position (presumably one per class — confirm).
np.stack(best_bilstm_history_50["recall_scores"]).mean(axis=0)

array([0.36429872, 0.738     ])

## Unidirectional

In [59]:
# History length used for this search and embedded in the output file name.
size = 50

# Unidirectional LSTM hyperparameter search (bidirectional=False) at history length `size`.
# The result variables keep the `bilstm_` prefix even though this run is unidirectional.
# The first two return values are kept (full results and best-model results);
# the remaining two are discarded.
# NOTE(review): `y_data` and `output_dim` are not defined in any visible cell
# (execution counts 9-10 are missing) — confirm they exist before this cell runs.
bilstm_history_50_uni, best_bilstm_history_50_uni, _, __ = lstm_hyperparameter_search(
    # data
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # search grid
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # model and training
    num_layers=num_layers,
    bidirectional=False,
    num_epochs=num_epochs,
    loss=loss,
    gamma=gamma,
    patience=patience,
    validation_metric=validation_metric,
    # splitting: explicit train/dev/test indices, no id-based or k-fold split
    # (author's unused alternative for split_ids: torch.tensor(df_rumours['timeline_id'].astype(int)))
    split_indices=split_indices,
    split_ids=None,
    k_fold=False,
    # output
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_uni.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_50_focal_2_uni.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/lstm_history_50_focal_2_uni_best_model.csv


In [60]:
# Average the numeric metrics over the seeds for every
# (hidden_dim, dropout_rate, learning_rate) combination.
# numeric_only=True is passed explicitly: the results frame appears to also
# carry non-numeric columns (list-valued score columns, the loss-function
# name), and relying on the default to drop them silently is deprecated in
# pandas 1.5 and raises a TypeError from pandas 2.0 onwards.
bilstm_history_50_uni.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(numeric_only=True)

  bilstm_history_50_uni.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,loss,accuracy,f1,precision,recall,valid_loss,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100,0.1,0.0001,0.321718,0.567525,0.564948,0.572111,0.570134,0.287906,0.720047,0.668152,0.711284,0.66077,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,4.0
100,0.1,0.0005,0.366988,0.548459,0.544661,0.556066,0.552217,0.286125,0.735469,0.682609,0.728845,0.67428,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,5.0
100,0.1,0.001,0.528008,0.526533,0.51488,0.521036,0.521362,0.413482,0.733096,0.678997,0.725811,0.671012,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,3.0
100,0.5,0.0001,0.358694,0.580235,0.579976,0.583017,0.582426,0.31008,0.744959,0.689596,0.747377,0.680323,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,1.0
100,0.5,0.0005,0.356399,0.568796,0.565183,0.577669,0.57361,0.285974,0.737841,0.682405,0.735953,0.674033,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,2.0
100,0.5,0.001,0.429581,0.554496,0.550842,0.55285,0.551945,0.423116,0.737841,0.681691,0.735909,0.67333,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,0.0
200,0.1,0.0001,0.399494,0.563711,0.558741,0.574671,0.569466,0.305385,0.740214,0.686836,0.737286,0.678004,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,10.0
200,0.1,0.0005,0.329617,0.558627,0.549408,0.578431,0.564877,0.288045,0.730724,0.682096,0.720244,0.674773,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,11.0
200,0.1,0.001,0.345803,0.563076,0.556696,0.571955,0.566271,0.277891,0.73191,0.681565,0.720639,0.673595,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,9.0
200,0.5,0.0001,0.376615,0.558627,0.554925,0.566785,0.56333,0.286739,0.742586,0.688179,0.742399,0.679163,50.0,1.0,0.0,45.333333,2.0,0.0,64.0,7.0


In [61]:
# Display the results rows of the best configuration found by the
# unidirectional search (one row per seed in the output below).
best_bilstm_history_50_uni

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,0.32747,0.546234,0.545785,"[0.531496062992126, 0.5600739371534197]",0.549389,"[0.5781584582441114, 0.520618556701031]",0.548902,"[0.4918032786885246, 0.606]",0.321424,0.743772,...,False,384,0.1,0.0005,1,focal,2,False,,64
0,0.335829,0.552908,0.551131,"[0.5228891149542217, 0.579372197309417]",0.558685,"[0.5921658986175116, 0.5252032520325203]",0.557062,"[0.4681238615664845, 0.646]",0.283601,0.740214,...,False,384,0.1,0.0005,12,focal,2,False,,64
0,0.473695,0.542421,0.53779,"[0.49152542372881364, 0.5840554592720971]",0.551316,"[0.5873417721518988, 0.5152905198776758]",0.548293,"[0.4225865209471767, 0.674]",0.365368,0.743772,...,False,384,0.1,0.0005,123,focal,2,False,,64


In [62]:
# Mean of the `f1` column over the seed rows of the best unidirectional run.
best_bilstm_history_50_uni.loc[:, "f1"].mean()

0.5449020325683492

In [63]:
# Mean of the `precision` column over the seed rows of the best unidirectional run.
best_bilstm_history_50_uni.loc[:, "precision"].mean()

0.5531297429374581

In [64]:
# Mean of the `recall` column over the seed rows of the best unidirectional run.
best_bilstm_history_50_uni.loc[:, "recall"].mean()

0.5514189435336977

In [65]:
# Stack the two-element `f1_scores` entries (one per seed row) and average down
# the rows, giving one mean per position (presumably one position per class).
np.mean(np.stack(best_bilstm_history_50_uni["f1_scores"]), axis=0)

array([0.51530353, 0.57450053])

In [66]:
# Stack the two-element `precision_scores` entries (one per seed row) and average
# down the rows, giving one mean per position (presumably one position per class).
np.mean(np.stack(best_bilstm_history_50_uni["precision_scores"]), axis=0)

array([0.58588871, 0.52037078])

In [67]:
# Stack the two-element `recall_scores` entries (one per seed row) and average
# down the rows, giving one mean per position (presumably one position per class).
np.mean(np.stack(best_bilstm_history_50_uni["recall_scores"]), axis=0)

array([0.46083789, 0.642     ])