In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any cell visible here (the searches
# pass the `seeds` list instead) — confirm it is still needed.
seed = 2023

In [2]:
import torch

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_functions import seqsignet_hyperparameter_search

In [4]:
# Directory used as the prefix of every `results_output` CSV path in this notebook.
output_dir = "rumours_output"
# exist_ok=True keeps the cell safe to re-run and removes the gap between
# checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Executes the loading script in this kernel.
# NOTE(review): `df_rumours`, `sbert_embeddings`, `y_data` and `output_dim` are
# used by later cells but never assigned in the notebook cells shown here, so
# they presumably come from this script — confirm.
%run load_sbert-embeddings.py

In [6]:
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


## Seq-Sig-Net

In [7]:
# Feature names handed to `seqsignet_hyperparameter_search` via `features=`.
features = ["time_encoding", "timeline_index"]
# Passed as `standardise_method=`.
# NOTE(review): presumably one rule per entry of `features`, in the same order
# (z-score for "time_encoding", none for "timeline_index") — confirm.
standardise_method = ["z_score", None]
# Passed as `include_features_in_path=`.
include_features_in_path = True

In [8]:
# Epoch budget, plus the `patience` / `validation_metric` settings passed to the search.
num_epochs = 100
patience = 5
validation_metric = "f1"

# Candidate values handed to the hyperparameter search.
dimensions = [15]  # [50, 15] is the alternative the author left noted here
swnu_hidden_dim_sizes_and_sig_depths = [
    ([12], 3),
    ([8], 4),
    ([10], 4),
]
lstm_hidden_dim_sizes = [384]
ffn_hidden_dim_sizes = [
    [128, 128],
    [256, 256],
    [512, 512],
]
dropout_rates = [0.5, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

# Loss settings and the default LSTM direction (the unidirectional searches
# pass `BiLSTM=False` directly instead of using `bidirectional`).
loss = "focal"
gamma = 2
bidirectional = True
# Index labels of the rows of `df_rumours` whose "set" column is "train", "dev"
# and "test", in that order; passed to the search as `split_indices=`.
split_indices = tuple(
    df_rumours[df_rumours["set"] == split_name].index
    for split_name in ("train", "dev", "test")
)

In [9]:
# Windowing settings forwarded unchanged to `seqsignet_hyperparameter_search`
# as `shift=`, `window_size=` and `n=`.
shift = 3
window_size = 5
n = 11

In [10]:
# Same integer as `shift * n + (window_size - shift)`, written as one full window
# plus (n - 1) further shifts. It matches the "history length" that the search
# logs for these settings and is used to tag the result file names.
size = window_size + shift * (n - 1)
print(size)

35


## UMAP

In [28]:
# Hyperparameter search for SeqSigNet on UMAP-reduced SBERT embeddings, using
# the LSTM direction configured in `bidirectional`.
# - Data are split with the precomputed `split_indices` (`k_fold=False`).
# - The search writes its results to `results_output`; its log also reports a
#   companion "..._best_model.csv" file.
# - Only the first two return values are kept (displayed below as DataFrames).
#   NOTE(review): presumably the all-runs table and the best-model table — confirm.
# The remaining return values are collected in `_unused_outputs` rather than in
# `_` / `__`: those names are IPython's output-history shortcuts, and IPython
# stops updating them once user code assigns to them.
(
    seqsignet_network_umap,
    best_seqsignet_network_umap,
    *_unused_outputs,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    split_ids=None,  # torch.tensor(df_rumours['timeline_id'].astype(int)),
    split_indices=split_indices,
    k_fold=False,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{size}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_35.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_35_best_model.csv


In [29]:
seqsignet_network_umap

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,0.449381,0.595806,0.578424,"[0.4928229665071771, 0.6640253565768621]",0.633819,"[0.7177700348432056, 0.5498687664041995]",0.606614,"[0.37522768670309653, 0.838]",0.178093,0.736655,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.347104,0.613918,0.612967,"[0.5937813440320964, 0.6321525885558582]",0.619875,"[0.6607142857142857, 0.5790349417637272]",0.617581,"[0.5391621129326047, 0.696]",0.201821,0.729537,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.425771,0.538608,0.537970,"[0.5207920792079207, 0.5551470588235293]",0.542052,"[0.5704989154013015, 0.5136054421768708]",0.541526,"[0.4790528233151184, 0.604]",0.211347,0.736655,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.320288,0.609152,0.605919,"[0.570230607966457, 0.6416083916083916]",0.620740,"[0.671604938271605, 0.5698757763975155]",0.614723,"[0.49544626593806923, 0.734]",0.225203,0.587189,...,True,focal,2,False,,Conv1d,,concatenation,64,1
0,0.265662,0.605338,0.590911,"[0.5140845070422535, 0.6677367576243981]",0.640207,"[0.7227722772277227, 0.5576407506702413]",0.615454,"[0.3989071038251366, 0.832]",0.242611,0.622776,...,True,focal,2,False,,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.331718,0.618684,0.618684,"[0.619047619047619, 0.6183206106870229]",0.619972,"[0.6487025948103793, 0.5912408759124088]",0.619993,"[0.5919854280510018, 0.648]",0.228319,0.747331,...,True,focal,2,False,,Conv1d,,concatenation,64,52
0,0.498844,0.610105,0.609479,"[0.5938430983118173, 0.6251145737855179]",0.614913,"[0.6528384279475983, 0.5769881556683587]",0.613313,"[0.5446265938069217, 0.682]",0.188612,0.708185,...,True,focal,2,False,,Conv1d,,concatenation,64,52
0,0.416412,0.586273,0.567777,"[0.47836538461538464, 0.6571879936808848]",0.623131,"[0.7031802120141343, 0.5430809399477807]",0.597239,"[0.36247723132969034, 0.832]",0.234509,0.640569,...,True,focal,2,False,,Conv1d,,concatenation,64,53
0,0.448582,0.575786,0.524670,"[0.6805455850681981, 0.3687943262411348]",0.597879,"[0.5616113744075829, 0.6341463414634146]",0.561694,"[0.8633879781420765, 0.26]",0.290149,0.722420,...,True,focal,2,False,,Conv1d,,concatenation,64,53


0.5812891232959757

In [30]:
best_seqsignet_network_umap["f1"].mean()

0.5495485365663367

In [31]:
best_seqsignet_network_umap["precision"].mean()

0.5643041846337936

In [32]:
best_seqsignet_network_umap["recall"].mean()

0.5581639344262296

In [33]:
np.stack(best_seqsignet_network_umap["f1_scores"]).mean(axis=0)

array([0.54158652, 0.55751055])

In [34]:
np.stack(best_seqsignet_network_umap["precision_scores"]).mean(axis=0)

array([0.59835229, 0.53025608])

In [35]:
np.stack(best_seqsignet_network_umap["recall_scores"]).mean(axis=0)

array([0.5136612 , 0.60266667])

## UMAP: Unidirectional LSTM

In [44]:
# Hyperparameter search for SeqSigNet on UMAP-reduced SBERT embeddings with a
# unidirectional LSTM (`BiLSTM=False`); every other argument matches the
# bidirectional UMAP search.
# - Data are split with the precomputed `split_indices` (`k_fold=False`).
# - The search writes its results to `results_output` (note the `_uni` suffix);
#   its log also reports a companion "..._best_model.csv" file.
# - Only the first two return values are kept (displayed below as DataFrames).
#   NOTE(review): presumably the all-runs table and the best-model table — confirm.
# The remaining return values are collected in `_unused_outputs` rather than in
# `_` / `__`: those names are IPython's output-history shortcuts, and IPython
# stops updating them once user code assigns to them.
(
    seqsignet_network_umap_uni,
    best_seqsignet_network_umap_uni,
    *_unused_outputs,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=False,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    split_ids=None,  # torch.tensor(df_rumours['timeline_id'].astype(int)),
    split_indices=split_indices,
    k_fold=False,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{size}_uni.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_35_uni.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_35_uni_best_model.csv


In [45]:
seqsignet_network_umap_uni

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,0.535877,0.595806,0.582244,"[0.5069767441860465, 0.6575121163166396]",0.626228,"[0.7009646302250804, 0.551490514905149]",0.605543,"[0.3970856102003643, 0.814]",0.212359,0.761566,...,False,focal,2,False,,Conv1d,,concatenation,64,0
0,0.399652,0.583413,0.583261,"[0.5912067352666043, 0.575315840621963]",0.583619,"[0.6076923076923076, 0.5595463137996219]",0.583796,"[0.575591985428051, 0.592]",0.200841,0.768683,...,False,focal,2,False,,Conv1d,,concatenation,64,0
0,0.424965,0.588179,0.587819,"[0.5756385068762278, 0.6]",0.591677,"[0.6247334754797441, 0.5586206896551724]",0.590849,"[0.5336976320582878, 0.648]",0.184980,0.797153,...,False,focal,2,False,,Conv1d,,concatenation,64,0
0,0.344484,0.597712,0.585647,"[0.5149425287356322, 0.6563517915309447]",0.625695,"[0.6978193146417445, 0.5535714285714286]",0.607007,"[0.4080145719489982, 0.806]",0.229887,0.619217,...,False,focal,2,False,,Conv1d,,concatenation,64,1
0,0.260057,0.584366,0.552236,"[0.43229166666666663, 0.6721804511278195]",0.648273,"[0.7579908675799086, 0.5385542168674698]",0.598184,"[0.302367941712204, 0.894]",0.239729,0.633452,...,False,focal,2,False,,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.321641,0.566254,0.558157,"[0.617968094038623, 0.49834619625137816]",0.564246,"[0.573208722741433, 0.5552825552825553]",0.561155,"[0.6703096539162113, 0.452]",0.228572,0.690391,...,False,focal,2,False,,Conv1d,,concatenation,64,52
0,0.335902,0.610105,0.606383,"[0.5681098204857444, 0.6446568201563858]",0.622886,"[0.6758793969849246, 0.5698924731182796]",0.615991,"[0.4899817850637523, 0.742]",0.232631,0.587189,...,False,focal,2,False,,Conv1d,,concatenation,64,52
0,0.462198,0.566254,0.547049,"[0.6403162055335969, 0.453781512605042]",0.566605,"[0.5656424581005587, 0.5675675675675675]",0.557852,"[0.7377049180327869, 0.378]",0.193443,0.761566,...,False,focal,2,False,,Conv1d,,concatenation,64,53
0,0.879203,0.609152,0.605781,"[0.6422338568935427, 0.5693277310924371]",0.607986,"[0.6164154103852596, 0.5995575221238938]",0.606155,"[0.6703096539162113, 0.542]",0.548155,0.708185,...,False,focal,2,False,,Conv1d,,concatenation,64,53


In [46]:
best_seqsignet_network_umap_uni["f1"].mean()

0.5938859384862587

In [47]:
best_seqsignet_network_umap_uni["precision"].mean()

0.6418476160018686

In [48]:
best_seqsignet_network_umap_uni["recall"].mean()

0.6177692774741955

In [49]:
np.stack(best_seqsignet_network_umap_uni["f1_scores"]).mean(axis=0)

array([0.52116028, 0.6666116 ])

In [50]:
np.stack(best_seqsignet_network_umap_uni["precision_scores"]).mean(axis=0)

array([0.72200171, 0.56169352])

In [51]:
np.stack(best_seqsignet_network_umap_uni["recall_scores"]).mean(axis=0)

array([0.41287189, 0.82266667])

## GRP

In [36]:
# Hyperparameter search for SeqSigNet on SBERT embeddings reduced with Gaussian
# random projection, using the LSTM direction configured in `bidirectional`.
# - Data are split with the precomputed `split_indices` (`k_fold=False`).
# - The search writes its results to `results_output`; its log also reports a
#   companion "..._best_model.csv" file.
# - Only the first two return values are kept (displayed below as DataFrames).
#   NOTE(review): presumably the all-runs table and the best-model table — confirm.
# NOTE(review): the `_20` suffix in the variable names does not match the
# dimension the search log reports for this run (15) — consider renaming.
# The remaining return values are collected in `_unused_outputs` rather than in
# `_` / `__`: those names are IPython's output-history shortcuts, and IPython
# stops updating them once user code assigns to them.
(
    seqsignet_network_grp_20,
    best_seqsignet_network_grp_20,
    *_unused_outputs,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    split_ids=None,  # torch.tensor(df_rumours['timeline_id'].astype(int)),
    split_indices=split_indices,
    k_fold=False,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{size}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_35.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_35_best_model.csv


In [37]:
seqsignet_network_grp_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,0.416968,0.631077,0.629520,"[0.6055045871559634, 0.6535362578334826]",0.639536,"[0.6875, 0.5915721231766613]",0.635492,"[0.5409836065573771, 0.73]",0.202117,0.669039,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.257084,0.627264,0.627252,"[0.6251198465963567, 0.6293838862559242]",0.629059,"[0.659919028340081, 0.5981981981981982]",0.628903,"[0.5938069216757741, 0.664]",0.210019,0.651246,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.306409,0.630124,0.630003,"[0.6233009708737863, 0.6367041198501873]",0.632976,"[0.6673596673596673, 0.5985915492957746]",0.632350,"[0.5846994535519126, 0.68]",0.208498,0.669039,...,True,focal,2,False,,Conv1d,,concatenation,64,0
0,0.338561,0.644423,0.644360,"[0.6491063029162747, 0.6396135265700483]",0.644949,"[0.6712062256809338, 0.6186915887850467]",0.645208,"[0.6284153005464481, 0.662]",0.212399,0.640569,...,True,focal,2,False,,Conv1d,,concatenation,64,1
0,0.266109,0.602479,0.586854,"[0.506508875739645, 0.6671987230646449]",0.639043,"[0.722972972972973, 0.5551128818061088]",0.612900,"[0.38979963570127507, 0.836]",0.244153,0.622776,...,True,focal,2,False,,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.298185,0.634890,0.634124,"[0.6508659981768459, 0.6173826173826173]",0.634113,"[0.6514598540145985, 0.6167664670658682]",0.634137,"[0.6502732240437158, 0.618]",0.222504,0.640569,...,True,focal,2,False,,Conv1d,,concatenation,64,52
0,0.430298,0.626311,0.625559,"[0.6423357664233577, 0.6087824351297405]",0.625540,"[0.643510054844607, 0.6075697211155379]",0.625583,"[0.6411657559198543, 0.61]",0.217460,0.697509,...,True,focal,2,False,,Conv1d,,concatenation,64,52
0,0.403756,0.589133,0.571291,"[0.4838323353293412, 0.6587490102929533]",0.625755,"[0.7062937062937062, 0.54521625163827]",0.599971,"[0.3679417122040073, 0.832]",0.232589,0.644128,...,True,focal,2,False,,Conv1d,,concatenation,64,53
0,0.262933,0.634890,0.633064,"[0.6589492430988424, 0.6071794871794872]",0.633879,"[0.6445993031358885, 0.6231578947368421]",0.632976,"[0.6739526411657559, 0.592]",0.211754,0.669039,...,True,focal,2,False,,Conv1d,,concatenation,64,53


0.617518858542935

In [38]:
best_seqsignet_network_grp_20["f1"].mean()

0.61647713201737

In [39]:
best_seqsignet_network_grp_20["precision"].mean()

0.6176564022312184

In [40]:
best_seqsignet_network_grp_20["recall"].mean()

0.6172313296903461

In [41]:
np.stack(best_seqsignet_network_grp_20["f1_scores"]).mean(axis=0)

array([0.62901517, 0.6039391 ])

In [42]:
np.stack(best_seqsignet_network_grp_20["precision_scores"]).mean(axis=0)

array([0.63875257, 0.59656024])

In [43]:
np.stack(best_seqsignet_network_grp_20["recall_scores"]).mean(axis=0)

array([0.62112933, 0.61333333])

## GRP: Unidirectional LSTM

In [11]:
# Hyperparameter search for SeqSigNet on SBERT embeddings reduced with Gaussian
# random projection, with a unidirectional LSTM (`BiLSTM=False`); every other
# argument matches the bidirectional GRP search.
# - Data are split with the precomputed `split_indices` (`k_fold=False`).
# - The search writes its results to `results_output` (note the `_uni` suffix);
#   its log also reports a companion "..._best_model.csv" file.
# - Only the first two return values are kept (displayed below as DataFrames).
#   NOTE(review): presumably the all-runs table and the best-model table — confirm.
# NOTE(review): the `_20` suffix in the variable names does not match the
# dimension the search log reports for this run (15) — consider renaming.
# The remaining return values are collected in `_unused_outputs` rather than in
# `_` / `__`: those names are IPython's output-history shortcuts, and IPython
# stops updating them once user code assigns to them.
(
    seqsignet_network_grp_20_uni,
    best_seqsignet_network_grp_20_uni,
    *_unused_outputs,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=False,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    split_ids=None,  # torch.tensor(df_rumours['timeline_id'].astype(int)),
    split_indices=split_indices,
    k_fold=False,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{size}_uni.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_35_uni.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_35_uni_best_model.csv


In [12]:
seqsignet_network_grp_20_uni

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,0.350964,0.616778,0.615940,"[0.6338797814207651, 0.598]",0.615940,"[0.6338797814207651, 0.598]",0.615940,"[0.6338797814207651, 0.598]",0.215109,0.672598,...,False,focal,2,False,,Conv1d,,concatenation,64,0
0,0.357429,0.632984,0.629481,"[0.5934530095036958, 0.6655082536924414]",0.647178,"[0.7060301507537688, 0.5883256528417818]",0.638920,"[0.51183970856102, 0.766]",0.206303,0.644128,...,False,focal,2,False,,Conv1d,,concatenation,64,0
0,0.383064,0.617731,0.617496,"[0.6080156402737048, 0.6269767441860465]",0.621103,"[0.6561181434599156, 0.5860869565217391]",0.620242,"[0.5664845173041895, 0.674]",0.210598,0.676157,...,False,focal,2,False,,Conv1d,,concatenation,64,0
0,0.324067,0.606292,0.599957,"[0.5496183206106872, 0.6502963590177816]",0.624330,"[0.6847826086956522, 0.5638766519823789]",0.613508,"[0.45901639344262296, 0.768]",0.228191,0.604982,...,False,focal,2,False,,Conv1d,,concatenation,64,1
0,0.260443,0.587226,0.555072,"[0.4354628422425033, 0.6746806912096169]",0.653184,"[0.7660550458715596, 0.5403128760529483]",0.601095,"[0.30418943533697634, 0.898]",0.239959,0.637011,...,False,focal,2,False,,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.315429,0.631077,0.630642,"[0.6433179723502304, 0.6179664363277395]",0.630628,"[0.6511194029850746, 0.6101364522417154]",0.630851,"[0.6357012750455373, 0.626]",0.226526,0.637011,...,False,focal,2,False,,Conv1d,,concatenation,64,52
0,0.464395,0.617731,0.613788,"[0.6528138528138527, 0.5747613997879109]",0.616925,"[0.6221122112211221, 0.6117381489841986]",0.614352,"[0.6867030965391621, 0.542]",0.225429,0.679715,...,False,focal,2,False,,Conv1d,,concatenation,64,52
0,0.402711,0.594852,0.581110,"[0.5052386495925495, 0.6569814366424536]",0.625372,"[0.7, 0.550744248985115]",0.604632,"[0.39526411657559196, 0.814]",0.232981,0.637011,...,False,focal,2,False,,Conv1d,,concatenation,64,53
0,0.348010,0.611058,0.606492,"[0.5641025641025641, 0.648881239242685]",0.625828,"[0.6821705426356589, 0.5694864048338368]",0.617437,"[0.4808743169398907, 0.754]",0.217296,0.622776,...,False,focal,2,False,,Conv1d,,concatenation,64,53


In [13]:
best_seqsignet_network_grp_20_uni["f1"].mean()

0.5951972319238338

In [14]:
best_seqsignet_network_grp_20_uni["precision"].mean()

0.5969041439420218

In [15]:
best_seqsignet_network_grp_20_uni["recall"].mean()

0.5960170006071644

In [16]:
np.stack(best_seqsignet_network_grp_20_uni["f1_scores"]).mean(axis=0)

array([0.61620677, 0.5741877 ])

In [17]:
np.stack(best_seqsignet_network_grp_20_uni["precision_scores"]).mean(axis=0)

array([0.61611315, 0.57769514])

In [18]:
np.stack(best_seqsignet_network_grp_20_uni["recall_scores"]).mean(axis=0)

array([0.61870067, 0.57333333])