In [1]:
import pickle
import numpy as np
import pandas as pd
import os
from tqdm.notebook import tqdm
import re

# NOTE(review): `seed` is not referenced by any later cell shown here (the searches
# receive the separate `seeds` list) — confirm it is still needed.
seed = 2023

In [2]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.swnu_network_functions import (
    swnu_network_hyperparameter_search,
    obtain_SWNUNetwork_input,
)

In [4]:
# Directory that receives the CSV files written by the hyperparameter searches
# (each search builds its `results_output` path from `output_dir`).
output_dir = "rumours_output"
# exist_ok=True keeps the cell idempotent on re-runs and removes the
# check-then-create race of a separate `isdir` test.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Executes the loader script inside this kernel.
# NOTE(review): later cells use `df_rumours`, `sbert_embeddings`, `y_data`,
# `output_dim` and `split_ids`, none of which is assigned in the cells shown here —
# presumably this script defines all of them; confirm.
%run load_sbert-embeddings.py

In [6]:
# Preview the first rows of the rumours dataframe.
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


In [7]:
# Build a single SWNU-network input as a quick shape check before the searches.
# The recorded output for this call is torch.Size([5568, 5, 30]) with k=5, dimension=30.
x_data = obtain_SWNUNetwork_input(
    # data
    df=df_rumours,
    embeddings=sbert_embeddings,
    id_column="timeline_id",
    label_column="label",
    # dimension reduction and history length
    method="umap",
    dimension=30,
    k=5,
    # extra features
    features="time_encoding",
    standardise_method=None,
    include_features_in_path=False,
)

x_data["x_data"]["path"].shape

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


torch.Size([5568, 5, 30])

## SWNU Network

In [8]:
# Extra features passed to every search, and the standardisation applied to them.
# NOTE(review): the two lists have equal length and look positionally paired
# ("time_encoding" -> "z_score", "timeline_index" -> None) — confirm against the library.
features = ["time_encoding", "timeline_index"]
standardise_method = ["z_score", None]
# Both switches are on: the features are included in the path and in the input.
include_features_in_path = include_features_in_input = True

In [9]:
# --- training schedule ---
num_epochs, patience = 100, 5
validation_metric = "f1"

# --- loss (gamma is also embedded in the results file names) ---
loss, gamma = "focal", 2

# --- search grid ---
dimensions = [15]
# each entry pairs the SWNU hidden-dimension sizes with a signature depth
swnu_hidden_dim_sizes_and_sig_depths = [([12], 3), ([10], 4)]
ffn_hidden_dim_sizes = [[256, 256], [512, 512]]
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]

# --- seeds handed to the search ---
seeds = [1, 12, 123]

In [10]:
# Arguments shared by every `swnu_network_hyperparameter_search` call in this notebook;
# each call adds only its own history length, dimension-reduction method and results path.
kwargs = dict(
    num_epochs=num_epochs,
    # data and labels
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # model / search grid
    dimensions=dimensions,
    log_signature=True,
    pooling="signature",
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=True,
    seeds=seeds,
    # loss and device
    loss=loss,
    gamma=gamma,
    device=device,
    # extra features
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # evaluation protocol
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    verbose=False,
)

# w=5

In [11]:
# History length for this section: passed to the search as `history_lengths=[size]`
# and embedded in the results file name.
size = 5

## umap

In [12]:
# Hyperparameter search with UMAP-reduced embeddings for the current `size`.
# Only the first two returned values are kept (the full results frame and the
# best-model results frame). The remaining ones are collected into `_unused_outputs`
# instead of `_`/`__`: assigning to those names overrides IPython's output-history
# variables, after which IPython stops refreshing `_`, `__` and `___`.
(
    swnu_network_umap_kfold_5,
    best_swnu_network_umap_kfold_5,
    *_unused_outputs,
) = swnu_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_5_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_5_kfold_best_model.csv


In [13]:
# Average the numeric columns over all rows that share one hyperparameter configuration.
# numeric_only=True drops non-numeric columns explicitly: relying on the implicit
# drop is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
swnu_network_umap_kfold_5.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

  swnu_network_umap_kfold_5.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,sig_depth,...,embedding_dim,num_features,log_signature,seed,BiLSTM,gamma,k_fold,n_splits,batch_size,model_id
dimensions,swnu_hidden_dim,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
15,"(10,)","(256, 256)",0.1,0.0001,0.692575,0.665617,0.664209,0.667522,0.735364,0.717352,0.719397,0.715814,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,13.0
15,"(10,)","(256, 256)",0.1,0.0005,0.692016,0.66561,0.664529,0.668199,0.732489,0.714652,0.716813,0.713559,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,14.0
15,"(10,)","(256, 256)",0.1,0.001,0.686362,0.663096,0.661021,0.667911,0.728175,0.712817,0.712623,0.713695,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,12.0
15,"(10,)","(256, 256)",0.2,0.0001,0.686984,0.66326,0.661238,0.667692,0.732534,0.716351,0.716841,0.716401,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,16.0
15,"(10,)","(256, 256)",0.2,0.0005,0.693569,0.667578,0.666444,0.670419,0.732983,0.715593,0.717359,0.714886,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,17.0
15,"(10,)","(256, 256)",0.2,0.001,0.689282,0.665382,0.663104,0.669378,0.726378,0.710855,0.710375,0.711519,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,15.0
15,"(10,)","(512, 512)",0.1,0.0001,0.688972,0.664952,0.662726,0.668893,0.731141,0.715097,0.71521,0.715163,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,19.0
15,"(10,)","(512, 512)",0.1,0.0005,0.689034,0.664354,0.662216,0.667833,0.732129,0.715747,0.716036,0.715486,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,20.0
15,"(10,)","(512, 512)",0.1,0.001,0.6927,0.666706,0.665266,0.669422,0.726783,0.709633,0.71059,0.709175,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,18.0
15,"(10,)","(512, 512)",0.2,0.0001,0.691457,0.666502,0.664523,0.669779,0.733881,0.717388,0.717881,0.71697,5.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,22.0


In [14]:
# Show the best-model results frame returned by the w=5 search.
best_swnu_network_umap_kfold_5

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,seed,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.694501,0.67057,"[0.7593598590515342, 0.581781066598622]",0.668244,"[0.7782124586217274, 0.5582761998041136]",0.674376,"[0.7413990825688074, 0.607352157698455]",,0.736487,...,1,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.685927,0.661471,"[0.7524607022183045, 0.5704817741524343]",0.659269,"[0.7716179572160289, 0.5469208211143695]",0.665198,"[0.7342316513761468, 0.5961640916355887]",,0.735679,...,12,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.691333,0.664278,"[0.7595818815331009, 0.5689744924518481]",0.662823,"[0.7694117647058824, 0.556234096692112]",0.666156,"[0.75, 0.582312200319659]",,0.7315,...,123,True,focal,2,True,5,Conv1d,,concatenation,64


In [15]:
# Mean of the `f1` column over the best-model rows.
best_swnu_network_umap_kfold_5.loc[:, "f1"].mean()

0.6654399626676407

In [16]:
# Mean of the `precision` column over the best-model rows.
best_swnu_network_umap_kfold_5.loc[:, "precision"].mean()

0.6634455496923723

In [17]:
# Mean of the `recall` column over the best-model rows.
best_swnu_network_umap_kfold_5.loc[:, "recall"].mean()

0.6685765305997761

In [18]:
# Per-class F1: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_5["f1_scores"]), axis=0)

array([0.75713415, 0.57374578])

In [19]:
# Per-class precision: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_5["precision_scores"]), axis=0)

array([0.77308073, 0.55381037])

In [20]:
# Per-class recall: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_5["recall_scores"]), axis=0)

array([0.74187691, 0.59527615])

# w=11

In [21]:
# History length for this section: passed to the search as `history_lengths=[size]`
# and embedded in the results file name.
size = 11

## umap

In [22]:
# Hyperparameter search with UMAP-reduced embeddings for the current `size`.
# Only the first two returned values are kept (the full results frame and the
# best-model results frame). The remaining ones are collected into `_unused_outputs`
# instead of `_`/`__`: assigning to those names overrides IPython's output-history
# variables, after which IPython stops refreshing `_`, `__` and `___`.
(
    swnu_network_umap_kfold_11,
    best_swnu_network_umap_kfold_11,
    *_unused_outputs,
) = swnu_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_11_kfold_best_model.csv


In [23]:
# Average the numeric columns over all rows that share one hyperparameter configuration.
# numeric_only=True drops non-numeric columns explicitly: relying on the implicit
# drop is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
swnu_network_umap_kfold_11.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

  swnu_network_umap_kfold_11.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,sig_depth,...,embedding_dim,num_features,log_signature,seed,BiLSTM,gamma,k_fold,n_splits,batch_size,model_id
dimensions,swnu_hidden_dim,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
15,"(10,)","(256, 256)",0.1,0.0001,0.685306,0.664227,0.661929,0.670708,0.73132,0.718312,0.716586,0.721057,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,13.0
15,"(10,)","(256, 256)",0.1,0.0005,0.68543,0.66402,0.661489,0.670065,0.737701,0.722783,0.72221,0.723451,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,14.0
15,"(10,)","(256, 256)",0.1,0.001,0.693135,0.668533,0.66647,0.671971,0.731096,0.71409,0.714856,0.713429,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,12.0
15,"(10,)","(256, 256)",0.2,0.0001,0.683939,0.663859,0.661496,0.671174,0.731186,0.719012,0.716925,0.722577,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,16.0
15,"(10,)","(256, 256)",0.2,0.0005,0.686424,0.666887,0.664441,0.674562,0.733073,0.71972,0.718118,0.722065,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,17.0
15,"(10,)","(256, 256)",0.2,0.001,0.69096,0.667018,0.664841,0.670996,0.727457,0.711792,0.711596,0.712412,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,15.0
15,"(10,)","(512, 512)",0.1,0.0001,0.691519,0.669747,0.667651,0.675609,0.733881,0.719295,0.718906,0.72075,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,19.0
15,"(10,)","(512, 512)",0.1,0.0005,0.679404,0.663382,0.662076,0.674781,0.731276,0.72095,0.71869,0.726656,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,20.0
15,"(10,)","(512, 512)",0.1,0.001,0.690774,0.664446,0.663095,0.666915,0.732399,0.71441,0.716568,0.713169,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,18.0
15,"(10,)","(512, 512)",0.2,0.0001,0.694936,0.671949,0.669727,0.676638,0.735454,0.72073,0.720192,0.721839,11.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,22.0


In [24]:
# Show the best-model results frame returned by the w=11 search.
best_swnu_network_umap_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,seed,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.707176,0.685383,"[0.7681865132064335, 0.6025803187452567]",0.682563,"[0.7914259653390088, 0.5736994219653179]",0.690398,"[0.7462729357798165, 0.6345231752797017]",,0.74754,...,1,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.683131,0.664164,"[0.7439759036144579, 0.5843520782396089]",0.66181,"[0.7836294416243654, 0.5399909624943515]",0.672398,"[0.7081422018348624, 0.6366542354821524]",,0.746192,...,12,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.69767,0.673952,"[0.7618907809747505, 0.5860132720775907]",0.671578,"[0.7806859205776173, 0.5624693777560019]",0.677797,"[0.7439793577981652, 0.6116142781033564]",,0.749427,...,123,True,focal,2,True,5,Conv1d,,concatenation,64


In [25]:
# Mean of the `f1` column over the best-model rows.
best_swnu_network_umap_kfold_11.loc[:, "f1"].mean()

0.6744998111430164

In [26]:
# Mean of the `precision` column over the best-model rows.
best_swnu_network_umap_kfold_11.loc[:, "precision"].mean()

0.6719835149594439

In [27]:
# Mean of the `recall` column over the best-model rows.
best_swnu_network_umap_kfold_11.loc[:, "recall"].mean()

0.6801976973796758

In [28]:
# Per-class F1: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_11["f1_scores"]), axis=0)

array([0.75801773, 0.59098189])

In [29]:
# Per-class precision: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_11["precision_scores"]), axis=0)

array([0.78524711, 0.55871992])

In [30]:
# Per-class recall: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_11["recall_scores"]), axis=0)

array([0.73279817, 0.62759723])

# w=20

In [11]:
# History length for this section: passed to the search as `history_lengths=[size]`
# and embedded in the results file name.
size = 20

## umap

In [12]:
# Hyperparameter search with UMAP-reduced embeddings for the current `size`.
# Only the first two returned values are kept (the full results frame and the
# best-model results frame). The remaining ones are collected into `_unused_outputs`
# instead of `_`/`__`: assigning to those names overrides IPython's output-history
# variables, after which IPython stops refreshing `_`, `__` and `___`.
(
    swnu_network_umap_kfold_20,
    best_swnu_network_umap_kfold_20,
    *_unused_outputs,
) = swnu_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_20_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swnu_network_umap_focal_2_20_kfold_best_model.csv


In [13]:
# Average the numeric columns over all rows that share one hyperparameter configuration.
# numeric_only=True drops non-numeric columns explicitly: relying on the implicit
# drop is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
swnu_network_umap_kfold_20.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

  swnu_network_umap_kfold_20.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,sig_depth,...,embedding_dim,num_features,log_signature,seed,BiLSTM,gamma,k_fold,n_splits,batch_size,model_id
dimensions,swnu_hidden_dim,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
15,"(10,)","(256, 256)",0.1,0.0001,0.684809,0.665335,0.663373,0.67336,0.773465,0.762911,0.760921,0.766968,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,13.0
15,"(10,)","(256, 256)",0.1,0.0005,0.689034,0.668113,0.667514,0.675625,0.787887,0.777396,0.777358,0.780001,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,14.0
15,"(10,)","(256, 256)",0.1,0.001,0.679528,0.665703,0.666131,0.680167,0.768298,0.759593,0.757948,0.766323,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,12.0
15,"(10,)","(256, 256)",0.2,0.0001,0.677043,0.657585,0.655781,0.665706,0.774183,0.76413,0.761997,0.768952,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,16.0
15,"(10,)","(256, 256)",0.2,0.0005,0.685492,0.665343,0.663017,0.672491,0.793728,0.78279,0.781838,0.784427,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,17.0
15,"(10,)","(256, 256)",0.2,0.001,0.679279,0.664253,0.664032,0.677146,0.754369,0.745807,0.74413,0.753282,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,15.0
15,"(10,)","(512, 512)",0.1,0.0001,0.686362,0.666271,0.663943,0.673571,0.770364,0.759596,0.757475,0.763331,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,19.0
15,"(10,)","(512, 512)",0.1,0.0005,0.678037,0.661279,0.659719,0.671843,0.78263,0.77343,0.770393,0.778719,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,20.0
15,"(10,)","(512, 512)",0.1,0.001,0.677664,0.663799,0.664309,0.678241,0.750595,0.742706,0.741218,0.751522,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,18.0
15,"(10,)","(512, 512)",0.2,0.0001,0.684747,0.665427,0.66319,0.673436,0.768837,0.759164,0.756471,0.764496,20.0,4.0,...,384.0,2.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,22.0


In [14]:
# Show the best-model results frame returned by the w=20 search.
best_swnu_network_umap_kfold_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,seed,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.690028,0.675956,"[0.7434829554218726, 0.6084294796326819]",0.674911,"[0.8046744574290484, 0.5451476793248945]",0.689636,"[0.6909403669724771, 0.6883324453915823]",,0.784472,...,1,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.692078,0.672714,"[0.7523238380809595, 0.5931034482758619]",0.670016,"[0.7884978001257071, 0.5515345854328905]",0.680386,"[0.7193233944954128, 0.6414491209376665]",,0.824639,...,12,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.705871,0.685957,"[0.7650387135199523, 0.6068759342301944]",0.682903,"[0.7958488228004956, 0.5699578848853533]",0.692717,"[0.7365252293577982, 0.648907831646244]",,0.795121,...,123,True,focal,2,True,5,Conv1d,,concatenation,64


In [15]:
# Mean of the `f1` column over the best-model rows.
best_swnu_network_umap_kfold_20.loc[:, "f1"].mean()

0.6782090615269204

In [16]:
# Mean of the `precision` column over the best-model rows.
best_swnu_network_umap_kfold_20.loc[:, "precision"].mean()

0.6759435383330649

In [17]:
# Mean of the `recall` column over the best-model rows.
best_swnu_network_umap_kfold_20.loc[:, "recall"].mean()

0.6875797314668635

In [18]:
# Per-class F1: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_20["f1_scores"]), axis=0)

array([0.75361517, 0.60280295])

In [19]:
# Per-class precision: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_20["precision_scores"]), axis=0)

array([0.79634036, 0.55554672])

In [20]:
# Per-class recall: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_20["recall_scores"]), axis=0)

array([0.71559633, 0.65956313])

# w=35

In [21]:
# History length for this section: passed to the search as `history_lengths=[size]`
# and embedded in the results file name.
size = 35

## umap

In [None]:
# Hyperparameter search with UMAP-reduced embeddings for the current `size`.
# Only the first two returned values are kept (the full results frame and the
# best-model results frame). The remaining ones are collected into `_unused_outputs`
# instead of `_`/`__`: assigning to those names overrides IPython's output-history
# variables, after which IPython stops refreshing `_`, `__` and `___`.
(
    swnu_network_umap_kfold_35,
    best_swnu_network_umap_kfold_35,
    *_unused_outputs,
) = swnu_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Average the numeric columns over all rows that share one hyperparameter configuration.
# numeric_only=True drops non-numeric columns explicitly: relying on the implicit
# drop is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
swnu_network_umap_kfold_35.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

In [None]:
# Show the best-model results frame returned by the w=35 search.
best_swnu_network_umap_kfold_35

In [None]:
# Mean of the `f1` column over the best-model rows.
best_swnu_network_umap_kfold_35.loc[:, "f1"].mean()

In [None]:
# Mean of the `precision` column over the best-model rows.
best_swnu_network_umap_kfold_35.loc[:, "precision"].mean()

In [None]:
# Mean of the `recall` column over the best-model rows.
best_swnu_network_umap_kfold_35.loc[:, "recall"].mean()

In [None]:
# Per-class F1: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_35["f1_scores"]), axis=0)

In [None]:
# Per-class precision: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_35["precision_scores"]), axis=0)

In [None]:
# Per-class recall: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_35["recall_scores"]), axis=0)

# w=80

In [None]:
# History length for this section: passed to the search as `history_lengths=[size]`
# and embedded in the results file name.
size = 80

## umap

In [None]:
# Hyperparameter search with UMAP-reduced embeddings for the current `size`.
# Only the first two returned values are kept (the full results frame and the
# best-model results frame). The remaining ones are collected into `_unused_outputs`
# instead of `_`/`__`: assigning to those names overrides IPython's output-history
# variables, after which IPython stops refreshing `_`, `__` and `___`.
(
    swnu_network_umap_kfold_80,
    best_swnu_network_umap_kfold_80,
    *_unused_outputs,
) = swnu_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

In [None]:
# Average the numeric columns over all rows that share one hyperparameter configuration.
# numeric_only=True drops non-numeric columns explicitly: relying on the implicit
# drop is deprecated in pandas 1.5 and raises a TypeError from pandas 2.0.
swnu_network_umap_kfold_80.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

In [None]:
# Show the best-model results frame returned by the w=80 search.
best_swnu_network_umap_kfold_80

In [None]:
# Mean of the `f1` column over the best-model rows.
best_swnu_network_umap_kfold_80.loc[:, "f1"].mean()

In [None]:
# Mean of the `precision` column over the best-model rows.
best_swnu_network_umap_kfold_80.loc[:, "precision"].mean()

In [None]:
# Mean of the `recall` column over the best-model rows.
best_swnu_network_umap_kfold_80.loc[:, "recall"].mean()

In [None]:
# Per-class F1: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_80["f1_scores"]), axis=0)

In [None]:
# Per-class precision: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_80["precision_scores"]), axis=0)

In [None]:
# Per-class recall: stack the per-row score vectors, then average over rows (axis 0).
np.mean(np.stack(best_swnu_network_umap_kfold_80["recall_scores"]), axis=0)