In [1]:
import pickle
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
import re

# Notebook-level seed constant.
# NOTE(review): no visible cell reads `seed`; the hyperparameter searches are
# given the `seeds` list instead — confirm whether this is still needed.
seed = 2023

In [2]:
import torch

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.swnu_network_functions import (
    swnu_network_hyperparameter_search,
    obtain_SWNUNetwork_input,
)

In [4]:
# Directory that receives the CSV files written by the hyperparameter searches
# (it is interpolated into every `results_output` path in this notebook).
output_dir = "rumours_output"
# `exist_ok=True` makes the creation idempotent and removes the check-then-create
# race of a separate `os.path.isdir` test; re-running the cell is a no-op.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Execute the embedding-loading script and pull its variables into this kernel.
# NOTE(review): later cells use `df_rumours`, `sbert_embeddings`, `y_data`,
# `output_dim` and `split_ids` without defining them, so they are presumably
# created by this script — confirm.
%run load_sbert-embeddings.py

In [6]:
# Preview the first rows of the rumours dataframe. The saved output shows the
# columns id, label, datetime, text, timeline_id and set.
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


## SWNU Network

In [7]:
# Extra feature columns handed to the search via `kwargs["features"]`.
features = ["time_encoding", "timeline_index"]
# NOTE(review): presumably paired positionally with `features`
# (z-score for the first, no standardisation for the second) — confirm.
standardise_method = ["z_score", None]
# Both flags are enabled and forwarded unchanged through `kwargs`.
include_features_in_path = include_features_in_input = True

In [8]:
# Training budget and model-selection settings (all forwarded through `kwargs`).
num_epochs, patience = 100, 3
validation_metric = "f1"

# Loss settings; `gamma` is also embedded in the results file names.
loss, gamma = "focal", 2

# Hyperparameter grid explored by each search.
dimensions = [15]
swnu_hidden_dim_sizes_and_sig_depths = [([width], 3) for width in (12, 10)]
ffn_hidden_dim_sizes = [[width, width] for width in (32, 128, 512)]
dropout_rates = [0.1]
learning_rates = [5e-4, 3e-4, 1e-4]

# Seeds passed to the search; the best-model tables show one row per seed.
seeds = [1, 12, 123]

In [9]:
# Keyword arguments shared by every `swnu_network_hyperparameter_search` call in
# this notebook; each call adds only `history_lengths`, `dim_reduce_methods` and
# `results_output`.
kwargs = dict(
    num_epochs=num_epochs,
    # Data and labels (names expected to exist in the kernel already).
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # Model / search grid.
    dimensions=dimensions,
    log_signature=True,
    pooling="signature",
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=True,
    seeds=seeds,
    # Loss and hardware.
    loss=loss,
    gamma=gamma,
    device=device,
    # Additional features and how they are used.
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # Data splitting, early stopping and logging.
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    verbose=False,
)

# w=5

In [10]:
# History window length for the w=5 section. The search cell that follows reads
# it via `history_lengths=[size]` and embeds it in the results file name.
# `size` is rebound in every later section, so re-run this cell before
# re-running the w=5 search.
size = 5

## umap

In [11]:
# Hyperparameter search for the current window (`size`) with UMAP-reduced
# embeddings. The first returned value holds all results and the second the
# best-model results; the last two are discarded. The saved log shows the
# results written to `results_output` plus a sibling `*_best_model.csv`.
# NOTE(review): binding `_` / `__` shadows IPython's output-history variables.
swnu_network_umap_kfold_5, best_swnu_network_umap_kfold_5, _, __ = (
    swnu_network_hyperparameter_search(
        history_lengths=[size],
        dim_reduce_methods=["umap"],
        results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_5_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_5_kfold_best_model.csv


In [12]:
# Average the numeric columns over the rows (one per seed in the saved output)
# that share each hyperparameter combination.
# `numeric_only=True` is explicit because the table also has list-valued columns
# (e.g. `f1_scores`): relying on the default emits a pandas FutureWarning on
# 1.5.x and raises on pandas >= 2.0 instead of silently dropping them.
swnu_network_umap_kfold_5.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

  swnu_network_umap_kfold_5.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,sig_depth,...,embedding_dim,num_features,log_signature,seed,BiLSTM,gamma,k_fold,n_splits,batch_size,model_id
dimensions,swnu_hidden_dim,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
15,"(10,)","(32, 32)",0.1,0.0001,0.695993,0.665693,0.666005,0.665721,0.731006,0.710582,0.714625,0.707925,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,11.0
15,"(10,)","(32, 32)",0.1,0.0003,0.693569,0.664857,0.664295,0.665867,0.735095,0.715443,0.719349,0.713017,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,10.0
15,"(10,)","(32, 32)",0.1,0.0005,0.688785,0.664661,0.662588,0.668667,0.732983,0.716705,0.717054,0.716583,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,9.0
15,"(10,)","(128, 128)",0.1,0.0001,0.689779,0.664521,0.662508,0.667586,0.728625,0.712176,0.712362,0.712021,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,14.0
15,"(10,)","(128, 128)",0.1,0.0003,0.685741,0.665745,0.66327,0.673011,0.727232,0.713879,0.712352,0.716418,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,13.0
15,"(10,)","(128, 128)",0.1,0.0005,0.690401,0.665239,0.663548,0.668474,0.732579,0.715797,0.716724,0.715351,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,12.0
15,"(10,)","(512, 512)",0.1,0.0001,0.695806,0.669261,0.667747,0.671278,0.730512,0.713012,0.714163,0.712029,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,17.0
15,"(10,)","(512, 512)",0.1,0.0003,0.695868,0.670727,0.668809,0.673663,0.730332,0.713928,0.714156,0.71374,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,16.0
15,"(10,)","(512, 512)",0.1,0.0005,0.695868,0.670215,0.668393,0.672802,0.729209,0.712029,0.712872,0.711339,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,15.0
15,"(12,)","(32, 32)",0.1,0.0001,0.695371,0.665128,0.666001,0.66553,0.732938,0.712095,0.71732,0.709305,5.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,2.0


In [13]:
# Show the best-model results table (second value returned by the w=5 search).
# The saved output has one row per seed (1, 12, 123).
best_swnu_network_umap_kfold_5

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,seed,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.694129,0.667738,"[0.7613785080703795, 0.5740981053724371]",0.666108,"[0.7724992623192682, 0.5597165991902834]",0.669906,"[0.7505733944954128, 0.5892381459776239]",,0.730961,...,1,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.694129,0.664676,"[0.7640546369518333, 0.5652980132450331]",0.66427,"[0.7663686184020767, 0.5621707060063225]",0.665107,"[0.7617545871559633, 0.5684603090037293]",,0.737431,...,12,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.692265,0.664944,"[0.7606205596636219, 0.5692668927732847]",0.663602,"[0.7694338515693752, 0.5577709611451943]",0.666627,"[0.752006880733945, 0.5812466702184337]",,0.738509,...,123,True,focal,2,True,5,Conv1d,,concatenation,64


In [14]:
# Mean of the `f1` column across the rows of the w=5 best-model table.
# In the displayed rows `f1` equals the unweighted mean of the two `f1_scores` entries.
best_swnu_network_umap_kfold_5["f1"].mean()

0.6657861193460982

In [15]:
# Mean of the `precision` column across the rows of the w=5 best-model table.
best_swnu_network_umap_kfold_5["precision"].mean()

0.6646599997720867

In [16]:
# Mean of the `recall` column across the rows of the w=5 best-model table.
best_swnu_network_umap_kfold_5["recall"].mean()

0.6672133312641847

In [17]:
# Stack each row's `f1_scores` entry (two values per row in the saved output,
# presumably one per label class) and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_5["f1_scores"]), axis=0)

array([0.7620179 , 0.56955434])

In [18]:
# Stack each row's `precision_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_5["precision_scores"]), axis=0)

array([0.76943391, 0.55988609])

In [19]:
# Stack each row's `recall_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_5["recall_scores"]), axis=0)

array([0.75477829, 0.57964838])

# w=11

In [20]:
# History window length for the w=11 section. The search cell that follows reads
# it via `history_lengths=[size]` and embeds it in the results file name.
# `size` is rebound in every section, so re-run this cell before re-running
# the w=11 search.
size = 11

## umap

In [21]:
# Hyperparameter search for the current window (`size`) with UMAP-reduced
# embeddings. The first returned value holds all results and the second the
# best-model results; the last two are discarded. The saved log shows the
# results written to `results_output` plus a sibling `*_best_model.csv`.
# NOTE(review): binding `_` / `__` shadows IPython's output-history variables.
swnu_network_umap_kfold_11, best_swnu_network_umap_kfold_11, _, __ = (
    swnu_network_hyperparameter_search(
        history_lengths=[size],
        dim_reduce_methods=["umap"],
        results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_11_kfold_best_model.csv


In [22]:
# Average the numeric columns over the rows (one per seed in the saved output)
# that share each hyperparameter combination.
# `numeric_only=True` is explicit because the table also has list-valued columns
# (e.g. `f1_scores`): relying on the default emits a pandas FutureWarning on
# 1.5.x and raises on pandas >= 2.0 instead of silently dropping them.
swnu_network_umap_kfold_11.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

  swnu_network_umap_kfold_11.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,sig_depth,...,embedding_dim,num_features,log_signature,seed,BiLSTM,gamma,k_fold,n_splits,batch_size,model_id
dimensions,swnu_hidden_dim,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
15,"(10,)","(32, 32)",0.1,0.0001,0.694626,0.671392,0.669043,0.675743,0.732758,0.716955,0.71722,0.717126,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,11.0
15,"(10,)","(32, 32)",0.1,0.0003,0.690836,0.670285,0.667651,0.67697,0.74143,0.727156,0.726415,0.728365,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,10.0
15,"(10,)","(32, 32)",0.1,0.0005,0.69009,0.667941,0.665372,0.673239,0.744081,0.729957,0.729107,0.731004,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,9.0
15,"(10,)","(128, 128)",0.1,0.0001,0.695247,0.672524,0.670132,0.677287,0.737027,0.722093,0.721609,0.722861,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,14.0
15,"(10,)","(128, 128)",0.1,0.0003,0.692016,0.671189,0.668682,0.677632,0.736802,0.723299,0.722313,0.725508,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,13.0
15,"(10,)","(128, 128)",0.1,0.0005,0.691208,0.669131,0.667737,0.675083,0.733163,0.718939,0.719446,0.721504,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,12.0
15,"(10,)","(512, 512)",0.1,0.0001,0.693942,0.673833,0.671177,0.680794,0.728759,0.715452,0.713857,0.717946,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,17.0
15,"(10,)","(512, 512)",0.1,0.0003,0.700093,0.677384,0.674833,0.681957,0.735858,0.720196,0.720171,0.720333,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,16.0
15,"(10,)","(512, 512)",0.1,0.0005,0.688413,0.668002,0.665407,0.674819,0.727726,0.714613,0.712831,0.717315,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,15.0
15,"(12,)","(32, 32)",0.1,0.0001,0.699845,0.675856,0.673574,0.679428,0.733881,0.718018,0.717977,0.718079,11.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,2.0


In [23]:
# Show the best-model results table (second value returned by the w=11 search).
# The saved output has one row per seed (1, 12, 123).
best_swnu_network_umap_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,seed,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.689469,0.665035,"[0.7555033754035808, 0.5745658835546477]",0.662822,"[0.7739025856885148, 0.5517410495340853]",0.66866,"[0.7379587155963303, 0.5993606819392648]",,0.749966,...,1,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.691706,0.669005,"[0.7556868537666174, 0.5823232323232324]",0.666466,"[0.7794028031687995, 0.5535285645703313]",0.673825,"[0.7333715596330275, 0.6142781033564199]",,0.761019,...,12,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.702516,0.679525,"[0.76536312849162, 0.5936863543788188]",0.676979,"[0.7854556427278213, 0.568503169185763]",0.683738,"[0.7462729357798165, 0.6212040490143846]",,0.748214,...,123,True,focal,2,True,5,Conv1d,,concatenation,64


In [24]:
# Mean of the `f1` column across the rows of the w=11 best-model table.
best_swnu_network_umap_kfold_11["f1"].mean()

0.6711881379864195

In [25]:
# Mean of the `precision` column across the rows of the w=11 best-model table.
best_swnu_network_umap_kfold_11["precision"].mean()

0.6687556358125525

In [26]:
# Mean of the `recall` column across the rows of the w=11 best-model table.
best_swnu_network_umap_kfold_11["recall"].mean()

0.6754076742198739

In [27]:
# Stack each row's `f1_scores` entry (two values per row in the saved output,
# presumably one per label class) and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_11["f1_scores"]), axis=0)

array([0.75885112, 0.58352516])

In [28]:
# Stack each row's `precision_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_11["precision_scores"]), axis=0)

array([0.77958701, 0.55792426])

In [29]:
# Stack each row's `recall_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_11["recall_scores"]), axis=0)

array([0.73920107, 0.61161428])

# w=20

In [30]:
# History window length for the w=20 section. The search cell that follows reads
# it via `history_lengths=[size]` and embeds it in the results file name.
# `size` is rebound in every section, so re-run this cell before re-running
# the w=20 search.
size = 20

## umap

In [31]:
# Hyperparameter search for the current window (`size`) with UMAP-reduced
# embeddings. The first returned value holds all results and the second the
# best-model results; the last two are discarded. The saved log shows the
# results written to `results_output` plus a sibling `*_best_model.csv`.
# NOTE(review): binding `_` / `__` shadows IPython's output-history variables.
swnu_network_umap_kfold_20, best_swnu_network_umap_kfold_20, _, __ = (
    swnu_network_hyperparameter_search(
        history_lengths=[size],
        dim_reduce_methods=["umap"],
        results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_20_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_20_kfold_best_model.csv


In [32]:
# Average the numeric columns over the rows (one per seed in the saved output)
# that share each hyperparameter combination.
# `numeric_only=True` is explicit because the table also has list-valued columns
# (e.g. `f1_scores`): relying on the default emits a pandas FutureWarning on
# 1.5.x and raises on pandas >= 2.0 instead of silently dropping them.
swnu_network_umap_kfold_20.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

  swnu_network_umap_kfold_20.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,sig_depth,...,embedding_dim,num_features,log_signature,seed,BiLSTM,gamma,k_fold,n_splits,batch_size,model_id
dimensions,swnu_hidden_dim,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
15,"(10,)","(32, 32)",0.1,0.0001,0.68543,0.669056,0.667229,0.679949,0.787483,0.77872,0.775681,0.784563,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,11.0
15,"(10,)","(32, 32)",0.1,0.0003,0.693756,0.675167,0.672952,0.683891,0.802579,0.792578,0.790623,0.795584,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,10.0
15,"(10,)","(32, 32)",0.1,0.0005,0.689966,0.671905,0.669647,0.681017,0.804466,0.794915,0.792424,0.798376,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,9.0
15,"(10,)","(128, 128)",0.1,0.0001,0.693321,0.675734,0.67359,0.685402,0.775082,0.764842,0.76255,0.769565,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,14.0
15,"(10,)","(128, 128)",0.1,0.0003,0.694688,0.677597,0.675277,0.687643,0.773105,0.763204,0.760405,0.767899,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,13.0
15,"(10,)","(128, 128)",0.1,0.0005,0.693383,0.676329,0.673936,0.686352,0.782271,0.772493,0.769698,0.77707,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,12.0
15,"(10,)","(512, 512)",0.1,0.0001,0.689966,0.673155,0.67109,0.68356,0.758728,0.748299,0.745702,0.752956,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,17.0
15,"(10,)","(512, 512)",0.1,0.0003,0.688661,0.670589,0.66851,0.679768,0.76349,0.752919,0.75054,0.757618,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,16.0
15,"(10,)","(512, 512)",0.1,0.0005,0.689717,0.671713,0.670349,0.681523,0.7634,0.751668,0.750595,0.755418,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,15.0
15,"(12,)","(32, 32)",0.1,0.0001,0.68338,0.665099,0.662874,0.673984,0.786449,0.776599,0.773885,0.78065,20.0,3.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,2.0


In [33]:
# Show the best-model results table (second value returned by the w=20 search).
# The saved output has one row per seed (1, 12, 123).
best_swnu_network_umap_kfold_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,seed,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.676608,0.656369,"[0.7397630118494075, 0.5729756337681516]",0.654098,"[0.7757156338471217, 0.5324794144556267]",0.663567,"[0.7069954128440367, 0.6201385189131593]",,0.815204,...,1,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.688164,0.665587,"[0.7524781772451546, 0.5786955426844623]",0.663074,"[0.7774380923265056, 0.5487106017191977]",0.670609,"[0.7290711009174312, 0.6121470431539691]",,0.834075,...,12,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.686114,0.663353,"[0.7508875739644971, 0.5758186397984887]",0.660889,"[0.7756723716381418, 0.5461060678451983]",0.668294,"[0.7276376146788991, 0.608950452850293]",,0.798356,...,123,True,focal,2,True,5,Conv1d,,concatenation,64


In [34]:
# Mean of the `f1` column across the rows of the w=20 best-model table.
best_swnu_network_umap_kfold_20["f1"].mean()

0.6617697632183602

In [35]:
# Mean of the `precision` column across the rows of the w=20 best-model table.
best_swnu_network_umap_kfold_20["precision"].mean()

0.6593536969719653

In [36]:
# Mean of the `recall` column across the rows of the w=20 best-model table.
best_swnu_network_umap_kfold_20["recall"].mean()

0.6674900238929647

In [37]:
# Stack each row's `f1_scores` entry (two values per row in the saved output,
# presumably one per label class) and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_20["f1_scores"]), axis=0)

array([0.74770959, 0.57582994])

In [38]:
# Stack each row's `precision_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_20["precision_scores"]), axis=0)

array([0.77627537, 0.54243203])

In [39]:
# Stack each row's `recall_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_20["recall_scores"]), axis=0)

array([0.72123471, 0.61374534])

# w=35

In [40]:
# History window length for the w=35 section. The search cell that follows reads
# it via `history_lengths=[size]` and embeds it in the results file name.
# `size` is rebound in every section, so re-run this cell before re-running
# the w=35 search.
size = 35

## umap

In [None]:
# Hyperparameter search for the current window (`size`) with UMAP-reduced
# embeddings. The first returned value is kept as the full results table and
# the second as the best-model table; the last two are discarded.
# This cell has no saved output (it has not been executed in this notebook).
# NOTE(review): binding `_` / `__` shadows IPython's output-history variables.
swnu_network_umap_kfold_35, best_swnu_network_umap_kfold_35, _, __ = (
    swnu_network_hyperparameter_search(
        history_lengths=[size],
        dim_reduce_methods=["umap"],
        results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

In [None]:
# Average the numeric columns over the rows that share each hyperparameter
# combination.
# `numeric_only=True` is explicit so that non-numeric columns (e.g. list-valued
# per-class scores) are skipped: relying on the default emits a pandas
# FutureWarning on 1.5.x and raises on pandas >= 2.0.
swnu_network_umap_kfold_35.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

In [None]:
# Show the best-model results table (second value returned by the w=35 search).
best_swnu_network_umap_kfold_35

In [None]:
# Mean of the `f1` column across the rows of the w=35 best-model table.
best_swnu_network_umap_kfold_35["f1"].mean()

In [None]:
# Mean of the `precision` column across the rows of the w=35 best-model table.
best_swnu_network_umap_kfold_35["precision"].mean()

In [None]:
# Mean of the `recall` column across the rows of the w=35 best-model table.
best_swnu_network_umap_kfold_35["recall"].mean()

In [None]:
# Stack each row's `f1_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_35["f1_scores"]), axis=0)

In [None]:
# Stack each row's `precision_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_35["precision_scores"]), axis=0)

In [None]:
# Stack each row's `recall_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_35["recall_scores"]), axis=0)

# w=80

In [None]:
# History window length for the w=80 section. The search cell that follows reads
# it via `history_lengths=[size]` and embeds it in the results file name.
# `size` is rebound in every section, so re-run this cell before re-running
# the w=80 search.
size = 80

## umap

In [None]:
# Hyperparameter search for the current window (`size`) with UMAP-reduced
# embeddings. The first returned value is kept as the full results table and
# the second as the best-model table; the last two are discarded.
# This cell has no saved output (it has not been executed in this notebook).
# NOTE(review): binding `_` / `__` shadows IPython's output-history variables.
swnu_network_umap_kfold_80, best_swnu_network_umap_kfold_80, _, __ = (
    swnu_network_hyperparameter_search(
        history_lengths=[size],
        dim_reduce_methods=["umap"],
        results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
        **kwargs,
    )
)

In [None]:
# Average the numeric columns over the rows that share each hyperparameter
# combination.
# `numeric_only=True` is explicit so that non-numeric columns (e.g. list-valued
# per-class scores) are skipped: relying on the default emits a pandas
# FutureWarning on 1.5.x and raises on pandas >= 2.0.
swnu_network_umap_kfold_80.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

In [None]:
# Show the best-model results table (second value returned by the w=80 search).
best_swnu_network_umap_kfold_80

In [None]:
# Mean of the `f1` column across the rows of the w=80 best-model table.
best_swnu_network_umap_kfold_80["f1"].mean()

In [None]:
# Mean of the `precision` column across the rows of the w=80 best-model table.
best_swnu_network_umap_kfold_80["precision"].mean()

In [None]:
# Mean of the `recall` column across the rows of the w=80 best-model table.
best_swnu_network_umap_kfold_80["recall"].mean()

In [None]:
# Stack each row's `f1_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_80["f1_scores"]), axis=0)

In [None]:
# Stack each row's `precision_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_80["precision_scores"]), axis=0)

In [None]:
# Stack each row's `recall_scores` entry and average element-wise across rows.
np.mean(np.stack(best_swnu_network_umap_kfold_80["recall_scores"]), axis=0)