In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any cell visible in this notebook; the
# hyperparameter searches are given the separate `seeds` list instead — confirm
# whether this value is still needed.
seed = 2023

In [2]:
import torch

# Select the compute device: CUDA when PyTorch reports it as available, else the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_functions import seqsignet_hyperparameter_search

In [4]:
# Directory that receives the CSV files written by the hyperparameter searches.
output_dir = "rumours_output"
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Runs the loader script from this notebook's kernel.
# NOTE(review): presumably this is what defines `df_rumours`, `sbert_embeddings`,
# `y_data` and `output_dim`, which later cells use without defining them —
# confirm against load_sbert-embeddings.py.
%run load_sbert-embeddings.py

In [6]:
# Preview the first rows of `df_rumours`.
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


## Seq-Sig-Net

In [7]:
# Path-feature configuration handed to every search in this notebook.
# NOTE(review): `features` and `standardise_method` look index-aligned
# ("time_encoding" -> "z_score", "timeline_index" -> None) — confirm against
# seqsignet_hyperparameter_search.
add_time_in_path = True
features = ["time_encoding", "timeline_index"]
standardise_method = ["z_score", None]
num_features = len(features)

In [8]:
# --- Training schedule and model selection ---
num_epochs = 100
patience = 5
validation_metric = "f1"
seeds = [1, 12, 123]

# --- Loss ---
loss = "focal"
gamma = 2

# --- Embeddings and dimension reduction ---
embedding_dim = 384
dimensions = [15]  # alternative grid: [50, 15]

# --- Network search grid ---
swnu_hidden_dim_sizes_and_sig_depths = [([12], 3), ([10], 4)]
lstm_hidden_dim_sizes = [384]
ffn_hidden_dim_sizes = [[256, 256], [512, 512]]
dropout_rates = [0.5, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]
bidirectional = True
# Index labels of the rows whose "set" column is "train", "dev" and "test",
# collected in that order into a 3-tuple.
split_indices = tuple(
    df_rumours[df_rumours["set"] == subset].index
    for subset in ("train", "dev", "test")
)

In [9]:
# Windowing parameters passed to every search; the library log reports the
# resulting history length for this (shift, window size, n) combination.
shift, window_size, n = 3, 5, 11

In [10]:
# One full window plus (n - 1) further shifts; for integer inputs this equals
# shift * n + (window_size - shift). Used to tag the results file names.
size = window_size + shift * (n - 1)
print(size)

35


## UMAP

In [11]:
# Hyperparameter search using UMAP-reduced embeddings; the LSTM direction is taken
# from `bidirectional`. The search also writes its results to CSV at `results_output`.
# Only the first two return values are used. The remaining two are bound to explicit
# throwaway names instead of `_` / `__`, because assigning to those names in IPython
# stops the kernel from maintaining its output-history shortcuts (`_`, `__`, `___`).
(
    seqsignet_network_umap,
    best_seqsignet_network_umap,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    embedding_dim=embedding_dim,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    add_time_in_path=add_time_in_path,
    # Alternative split key: torch.tensor(df_rumours['timeline_id'].astype(int))
    split_ids=None,
    # Fixed train / dev / test index partition instead of k-fold.
    split_indices=split_indices,
    k_fold=False,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{size}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_35.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_35_best_model.csv


In [12]:
# Display the full results table returned by the UMAP search.
seqsignet_network_umap

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,0.449512,0.610105,0.598277,"[0.5293440736478712, 0.6672091131000815]",0.640582,"[0.71875, 0.5624142661179699]",0.619472,"[0.41894353369763204, 0.82]",0.205911,0.658363,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.536899,0.566254,0.564530,"[0.5371312309257376, 0.5919282511210762]",0.572440,"[0.6082949308755761, 0.5365853658536586]",0.570437,"[0.4808743169398907, 0.66]",0.229629,0.683274,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.580718,0.586273,0.581061,"[0.62778730703259, 0.5343347639484979]",0.584791,"[0.593192868719611, 0.5763888888888888]",0.582333,"[0.6666666666666666, 0.498]",0.256931,0.779359,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.311905,0.598665,0.589349,"[0.5274971941638609, 0.6512013256006628]",0.621502,"[0.6871345029239766, 0.5558698727015559]",0.607026,"[0.42805100182149364, 0.786]",0.230468,0.619217,...,True,focal,2,False,,Conv1d,,concatenation,64,1
0,0.493008,0.629171,0.627316,"[0.6536064113980409, 0.601025641025641]",0.628107,"[0.6393728222996515, 0.6168421052631579]",0.627244,"[0.668488160291439, 0.586]",0.230153,0.701068,...,True,focal,2,False,,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.401599,0.619638,0.619526,"[0.6129970902036858, 0.6260543580131209]",0.622333,"[0.6556016597510373, 0.5890652557319224]",0.621796,"[0.575591985428051, 0.668]",0.221937,0.651246,...,True,focal,2,False,,Conv1d,,concatenation,64,22
0,0.273155,0.608198,0.607919,"[0.6183844011142061, 0.5974534769833497]",0.608047,"[0.6306818181818182, 0.5854126679462572]",0.608279,"[0.6065573770491803, 0.61]",0.229651,0.604982,...,True,focal,2,False,,Conv1d,,concatenation,64,22
0,0.489728,0.592946,0.583740,"[0.5218365061590146, 0.645643153526971]",0.614549,"[0.6773255813953488, 0.5517730496453901]",0.601204,"[0.424408014571949, 0.778]",0.224572,0.633452,...,True,focal,2,False,,Conv1d,,concatenation,64,23
0,0.463490,0.600572,0.597610,"[0.6321334503950834, 0.5630865484880083]",0.599202,"[0.6101694915254238, 0.5882352941176471]",0.597869,"[0.6557377049180327, 0.54]",0.292312,0.690391,...,True,focal,2,False,,Conv1d,,concatenation,64,23


In [13]:
# Mean of the "f1" column over the rows of the best-model results (UMAP search).
best_seqsignet_network_umap["f1"].mean()

0.5812891232959757

In [14]:
# Mean of the "precision" column over the rows of the best-model results (UMAP search).
best_seqsignet_network_umap["precision"].mean()

0.5992710534092841

In [15]:
# Mean of the "recall" column over the rows of the best-model results (UMAP search).
best_seqsignet_network_umap["recall"].mean()

0.5907474195506982

In [16]:
# Stack the per-row "f1_scores" entries and average them position-wise across rows
# (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_umap["f1_scores"]), axis=0)

array([0.5647542 , 0.59782404])

In [17]:
# Stack the per-row "precision_scores" entries and average them position-wise across
# rows (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_umap["precision_scores"]), axis=0)

array([0.64007927, 0.55846284])

In [18]:
# Stack the per-row "recall_scores" entries and average them position-wise across
# rows (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_umap["recall_scores"]), axis=0)

array([0.52216151, 0.65933333])

## UMAP — unidirectional LSTM

In [11]:
# Hyperparameter search using UMAP-reduced embeddings with BiLSTM=False (unidirectional
# LSTM). The search also writes its results to CSV at `results_output` ("_uni" suffix).
# Only the first two return values are used. The remaining two are bound to explicit
# throwaway names instead of `_` / `__`, because assigning to those names in IPython
# stops the kernel from maintaining its output-history shortcuts (`_`, `__`, `___`).
# NOTE(review): the recorded execution count of this cell repeats an earlier one, so
# the saved outputs do not come from a single top-to-bottom run — re-run with
# Restart & Run All before relying on them.
(
    seqsignet_network_umap_uni,
    best_seqsignet_network_umap_uni,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    embedding_dim=embedding_dim,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=False,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    add_time_in_path=add_time_in_path,
    # Alternative split key: torch.tensor(df_rumours['timeline_id'].astype(int))
    split_ids=None,
    # Fixed train / dev / test index partition instead of k-fold.
    split_indices=split_indices,
    k_fold=False,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{size}_uni.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_35_uni.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_35_uni_best_model.csv


In [12]:
# Display the full results table returned by the UMAP search run with BiLSTM=False.
seqsignet_network_umap_uni

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,0.361723,0.594852,0.585448,"[0.5230078563411897, 0.6478873239436619]",0.617164,"[0.6812865497076024, 0.553041018387553]",0.603204,"[0.424408014571949, 0.782]",0.194299,0.715302,...,False,focal,2,False,,Conv1d,,concatenation,64,0
0,0.410663,0.599619,0.597670,"[0.569672131147541, 0.625668449197861]",0.607681,"[0.6510538641686182, 0.5643086816720257]",0.604188,"[0.5063752276867031, 0.702]",0.269319,0.640569,...,False,focal,2,False,,Conv1d,,concatenation,64,0
0,0.442552,0.619638,0.619636,"[0.6189111747851004, 0.620361560418649]",0.621127,"[0.6506024096385542, 0.5916515426497277]",0.621082,"[0.5901639344262295, 0.652]",0.203600,0.704626,...,False,focal,2,False,,Conv1d,,concatenation,64,0
0,0.473086,0.576740,0.574792,"[0.5460122699386503, 0.6035714285714286]",0.583769,"[0.6223776223776224, 0.5451612903225806]",0.581169,"[0.48633879781420764, 0.676]",0.228288,0.718861,...,False,focal,2,False,,Conv1d,,concatenation,64,1
0,0.378303,0.632984,0.632395,"[0.6176762661370406, 0.6471127406049496]",0.638166,"[0.6790393013100436, 0.5972927241962775]",0.636242,"[0.5664845173041895, 0.706]",0.208031,0.697509,...,False,focal,2,False,,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.495069,0.593899,0.593896,"[0.5927342256214149, 0.5950570342205322]",0.595386,"[0.6237424547283702, 0.5670289855072463]",0.595332,"[0.5646630236794171, 0.626]",0.271232,0.633452,...,False,focal,2,False,,Conv1d,,concatenation,64,22
0,0.462951,0.610105,0.605445,"[0.648323301805675, 0.5625668449197861]",0.609302,"[0.6140065146579805, 0.6045977011494252]",0.606352,"[0.6867030965391621, 0.526]",0.295552,0.683274,...,False,focal,2,False,,Conv1d,,concatenation,64,22
0,0.420886,0.599619,0.593852,"[0.5454545454545455, 0.6422487223168655]",0.615674,"[0.672, 0.5593471810089021]",0.606508,"[0.45901639344262296, 0.754]",0.227976,0.622776,...,False,focal,2,False,,Conv1d,,concatenation,64,23
0,0.394248,0.625358,0.625291,"[0.6302916274694261, 0.6202898550724637]",0.625875,"[0.6517509727626459, 0.6]",0.626100,"[0.6102003642987249, 0.642]",0.248324,0.661922,...,False,focal,2,False,,Conv1d,,concatenation,64,23


In [13]:
# Mean of the "f1" column over the rows of the best-model results (UMAP, BiLSTM=False).
best_seqsignet_network_umap_uni["f1"].mean()

0.602249400811865

In [14]:
# Mean of the "precision" column over the rows of the best-model results (UMAP, BiLSTM=False).
best_seqsignet_network_umap_uni["precision"].mean()

0.6158595876729271

In [15]:
# Mean of the "recall" column over the rows of the best-model results (UMAP, BiLSTM=False).
best_seqsignet_network_umap_uni["recall"].mean()

0.6095755919854281

In [16]:
# Stack the per-row "f1_scores" entries and average them position-wise across rows
# (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_umap_uni["f1_scores"]), axis=0)

array([0.59490783, 0.60959097])

In [17]:
# Stack the per-row "precision_scores" entries and average them position-wise across
# rows (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_umap_uni["precision_scores"]), axis=0)

array([0.64842823, 0.58329095])

In [18]:
# Stack the per-row "recall_scores" entries and average them position-wise across
# rows (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_umap_uni["recall_scores"]), axis=0)

array([0.56648452, 0.65266667])

## Gaussian random projection (GRP)

In [19]:
# Hyperparameter search using Gaussian-random-projection-reduced embeddings; the LSTM
# direction is taken from `bidirectional`. The search also writes its results to CSV
# at `results_output`.
# Only the first two return values are used. The remaining two are bound to explicit
# throwaway names instead of `_` / `__`, because assigning to those names in IPython
# stops the kernel from maintaining its output-history shortcuts (`_`, `__`, `___`).
# NOTE(review): the `_20` suffix in the variable names does not match
# `dimensions = [15]`; the names are kept because later cells reference them.
(
    seqsignet_network_grp_20,
    best_seqsignet_network_grp_20,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    embedding_dim=embedding_dim,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    add_time_in_path=add_time_in_path,
    # Alternative split key: torch.tensor(df_rumours['timeline_id'].astype(int))
    split_ids=None,
    # Fixed train / dev / test index partition instead of k-fold.
    split_indices=split_indices,
    k_fold=False,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{size}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_35.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_35_best_model.csv


In [20]:
# Display the full results table returned by the Gaussian random projection search.
seqsignet_network_grp_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,0.486409,0.607245,0.594917,"[0.5242494226327944, 0.6655844155844155]",0.638099,"[0.7160883280757098, 0.5601092896174863]",0.616740,"[0.4134790528233151, 0.82]",0.213213,0.669039,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.309012,0.636797,0.632759,"[0.6712683347713545, 0.5942492012779552]",0.636620,"[0.6377049180327868, 0.6355353075170843]",0.633281,"[0.7085610200364298, 0.558]",0.207509,0.686833,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.312679,0.632984,0.632214,"[0.6490428441203282, 0.6153846153846154]",0.632203,"[0.6496350364963503, 0.6147704590818364]",0.632226,"[0.6484517304189436, 0.616]",0.196478,0.711744,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.300968,0.634890,0.632183,"[0.6637401229148375, 0.6006256517205422]",0.634064,"[0.6406779661016949, 0.6274509803921569]",0.632262,"[0.6885245901639344, 0.576]",0.209348,0.686833,...,True,focal,2,False,,Conv1d,,concatenation,64,1
0,0.351542,0.623451,0.621021,"[0.6513680494263019, 0.5906735751295337]",0.622376,"[0.6318493150684932, 0.6129032258064516]",0.621066,"[0.6721311475409836, 0.57]",0.196893,0.743772,...,True,focal,2,False,,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.353152,0.632031,0.631226,"[0.6484517304189436, 0.614]",0.631226,"[0.6484517304189436, 0.614]",0.631226,"[0.6484517304189436, 0.614]",0.231601,0.637011,...,True,focal,2,False,,Conv1d,,concatenation,64,22
0,0.432256,0.628217,0.628160,"[0.632768361581921, 0.6235521235521236]",0.628791,"[0.6549707602339181, 0.6026119402985075]",0.629011,"[0.6120218579234973, 0.646]",0.227555,0.669039,...,True,focal,2,False,,Conv1d,,concatenation,64,22
0,0.531532,0.580553,0.580186,"[0.5677799607072691, 0.5925925925925927]",0.583964,"[0.6162046908315565, 0.5517241379310345]",0.583206,"[0.5264116575591985, 0.64]",0.259972,0.654804,...,True,focal,2,False,,Conv1d,,concatenation,64,23
0,0.486962,0.621544,0.621495,"[0.6258246936852025, 0.6171648987463839]",0.622170,"[0.6484375, 0.595903165735568]",0.622368,"[0.604735883424408, 0.64]",0.270410,0.615658,...,True,focal,2,False,,Conv1d,,concatenation,64,23


In [21]:
# Mean of the "f1" column over the rows of the best-model results (GRP search).
best_seqsignet_network_grp_20["f1"].mean()

0.617518858542935

In [22]:
# Mean of the "precision" column over the rows of the best-model results (GRP search).
best_seqsignet_network_grp_20["precision"].mean()

0.6215020628660183

In [23]:
# Mean of the "recall" column over the rows of the best-model results (GRP search).
best_seqsignet_network_grp_20["recall"].mean()

0.6194219793564056

In [24]:
# Stack the per-row "f1_scores" entries and average them position-wise across rows
# (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_grp_20["f1_scores"]), axis=0)

array([0.63320618, 0.60183153])

In [25]:
# Stack the per-row "precision_scores" entries and average them position-wise across
# rows (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_grp_20["precision_scores"]), axis=0)

array([0.64110142, 0.6019027 ])

In [26]:
# Stack the per-row "recall_scores" entries and average them position-wise across
# rows (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_grp_20["recall_scores"]), axis=0)

array([0.63084396, 0.608     ])

## GRP — unidirectional LSTM

In [None]:
# Hyperparameter search using Gaussian-random-projection-reduced embeddings with
# BiLSTM=False (unidirectional LSTM). Results are requested at `results_output`
# ("_uni" suffix).
# Only the first two return values are used. The remaining two are bound to explicit
# throwaway names instead of `_` / `__`, because assigning to those names in IPython
# stops the kernel from maintaining its output-history shortcuts (`_`, `__`, `___`).
# NOTE(review): this cell has no recorded execution or output in the saved notebook.
# NOTE(review): the `_20` suffix in the variable names does not match
# `dimensions = [15]`; the names are kept because later cells reference them.
(
    seqsignet_network_grp_20_uni,
    best_seqsignet_network_grp_20_uni,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    embedding_dim=embedding_dim,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=False,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    add_time_in_path=add_time_in_path,
    # Alternative split key: torch.tensor(df_rumours['timeline_id'].astype(int))
    split_ids=None,
    # Fixed train / dev / test index partition instead of k-fold.
    split_indices=split_indices,
    k_fold=False,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{size}_uni.csv",
    verbose=False,
)

In [None]:
# Display the full results table returned by the GRP search run with BiLSTM=False.
seqsignet_network_grp_20_uni

In [None]:
# Mean of the "f1" column over the rows of the best-model results (GRP, BiLSTM=False).
best_seqsignet_network_grp_20_uni["f1"].mean()

In [None]:
# Mean of the "precision" column over the rows of the best-model results (GRP, BiLSTM=False).
best_seqsignet_network_grp_20_uni["precision"].mean()

In [None]:
# Mean of the "recall" column over the rows of the best-model results (GRP, BiLSTM=False).
best_seqsignet_network_grp_20_uni["recall"].mean()

In [None]:
# Stack the per-row "f1_scores" entries and average them position-wise across rows
# (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_grp_20_uni["f1_scores"]), axis=0)

In [None]:
# Stack the per-row "precision_scores" entries and average them position-wise across
# rows (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_grp_20_uni["precision_scores"]), axis=0)

In [None]:
# Stack the per-row "recall_scores" entries and average them position-wise across
# rows (presumably one position per class label — TODO confirm).
np.mean(np.stack(best_seqsignet_network_grp_20_uni["recall_scores"]), axis=0)