In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced anywhere in the visible part of this
# notebook; the searches below are given their own `seeds` list instead.
# Confirm whether this constant is still needed.
seed = 2023

In [2]:
import torch

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_functions import seqsignet_hyperparameter_search

In [4]:
# Directory that receives the CSV files written via `results_output` in the
# hyperparameter searches below.
output_dir = "rumours_output"
# exist_ok=True makes the cell idempotent and removes the window between the
# existence check and the creation in which another process could create it.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Execute the loader script in this notebook's namespace.
# Later cells use `df_rumours`, `sbert_embeddings`, `y_data` and `output_dim`,
# none of which are defined in the notebook itself, so they are presumably
# created by this script - TODO confirm against load_sbert-embeddings.py.
%run load_sbert-embeddings.py

In [6]:
# Peek at the first five rows of the rumours dataframe as a sanity check.
df_rumours.head(n=5)

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


## Seq-Sig-Net

In [7]:
# Time-derived features handed to the search, and the standardisation applied
# to each (presumably matched by position: z-score for "time_encoding", none
# for "timeline_index" - TODO confirm against the library).
features = [
    "time_encoding",
    "timeline_index",
]
standardise_method = [
    "z_score",
    None,
]

# Flags forwarded unchanged to every search call below: the features go into
# the path but are not added to the network input.
include_features_in_path, include_features_in_input = True, False

In [8]:
# Integer timeline id of every row, passed as `split_ids` to the searches below.
# Go through an explicit NumPy array so the conversion does not depend on how
# torch treats a pandas Series (e.g. positional lookups on a non-default index).
split_ids = torch.tensor(df_rumours["timeline_id"].astype(int).to_numpy())

In [9]:
# ---- training schedule ----
num_epochs = 100
patience = 5
validation_metric = "f1"

# ---- loss ----
loss = "focal"
gamma = 2

# ---- search grid ----
# 2 (SWNU) x 1 (LSTM) x 2 (FFN) x 2 (dropout) x 3 (learning rate) = 24
# combinations for the single reduced dimension, matching the model_id range
# 0-23 seen in the results tables below.
dimensions = [15]  # alternative grid noted by the author: [50, 15]
swnu_hidden_dim_sizes_and_sig_depths = [
    ([12], 3),
    ([10], 4),
]
lstm_hidden_dim_sizes = [384]
ffn_hidden_dim_sizes = [
    [256, 256],
    [512, 512],
]
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]

# ---- repetition / architecture switches ----
seeds = [1, 12, 123]
bidirectional = True

# history_length=11

In [10]:
# Windowing parameters for this section. The run log below reports
# "history length = 11" for these values, consistent with
# shift * n + (window_size - shift).
shift, window_size, n = 3, 5, 3

## UMAP

In [11]:
# Seq-Sig-Net hyperparameter search on UMAP-reduced embeddings, using the
# current `shift` / `window_size` / `n` (logged as history length = 11).
# Only the first two returned objects are kept (the results table and the
# best-model results table, both also written to CSV next to `results_output`
# according to the log); the last two return values are discarded.
(
    seqsignet_network_umap_11,
    best_seqsignet_network_umap_11,
    _,
    __,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    # Honour the `bidirectional` flag from the configuration cell instead of a
    # hard-coded True, so changing the flag actually changes the run.
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_3_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_3_kfold_best_model.csv


In [12]:
# Show the results table returned by the UMAP search above (history length 11).
seqsignet_network_umap_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.687232,0.661882,"[0.7544629792215394, 0.5693018480492813]",0.659877,"[0.7704722056186492, 0.5492818226844973]",0.664971,"[0.739105504587156, 0.5908364411294619]",,0.730152,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.687605,0.665574,"[0.7514090774250964, 0.5797392176529589]",0.663017,"[0.7784265519360787, 0.5476077688299384]",0.671040,"[0.726204128440367, 0.6158763985082578]",,0.735139,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.691519,0.665812,"[0.7584999270392528, 0.5731235491359298]",0.663931,"[0.7723625557206538, 0.5555]",0.668514,"[0.7451261467889908, 0.5919019712306872]",,0.734466,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.687605,0.665091,"[0.7519242155121373, 0.5782586814292904]",0.662580,"[0.7772337821297429, 0.547925608011445]",0.670179,"[0.7282110091743119, 0.6121470431539691]",,0.745923,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.688723,0.667043,"[0.752004752004752, 0.582082082082082]",0.664440,"[0.7800369685767098, 0.5488437942425672]",0.672762,"[0.7259174311926605, 0.6196057538625466]",,0.740801,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.667474,0.646494,"[0.7326139088729017, 0.5603745687530803]",0.644454,"[0.7675879396984925, 0.5213204951856947]",0.653221,"[0.7006880733944955, 0.6057538625466169]",,0.723278,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.690587,0.662607,"[0.7597684515195369, 0.5654450261780105]",0.661468,"[0.7670952659263589, 0.5558414822439527]",0.663983,"[0.7525802752293578, 0.5753862546616942]",,0.735274,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.685741,0.661598,"[0.7519858781994704, 0.5712105798575788]",0.659339,"[0.7722054380664652, 0.5464720194647202]",0.665547,"[0.7327981651376146, 0.5982951518380394]",,0.728535,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.677353,0.654493,"[0.7433654558932543, 0.5656210790464241]",0.652178,"[0.7697267424009825, 0.534629981024668]",0.659588,"[0.71875, 0.6004262120404902]",,0.732309,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [13]:
# Mean of the `f1` column over the rows of the best-model results (UMAP, history length 11).
best_seqsignet_network_umap_11.loc[:, "f1"].mean()

0.6676737903632827

In [14]:
# Mean of the `precision` column over the rows of the best-model results (UMAP, history length 11).
best_seqsignet_network_umap_11.loc[:, "precision"].mean()

0.6652308167486963

In [15]:
# Mean of the `recall` column over the rows of the best-model results (UMAP, history length 11).
best_seqsignet_network_umap_11.loc[:, "recall"].mean()

0.6721373603120994

In [16]:
# Per-class F1: stack each row's `f1_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_umap_11["f1_scores"]), axis=0)

array([0.75551811, 0.57982947])

In [17]:
# Per-class precision: stack each row's `precision_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_umap_11["precision_scores"]), axis=0)

array([0.77765941, 0.55280222])

In [18]:
# Per-class recall: stack each row's `recall_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_umap_11["recall_scores"]), axis=0)

array([0.73461391, 0.60966081])

## GRP (Gaussian random projection)

In [19]:
# Seq-Sig-Net hyperparameter search on embeddings reduced with Gaussian random
# projection, using the current `shift` / `window_size` / `n` (logged as
# history length = 11).
# Only the first two returned objects are kept (the results table and the
# best-model results table, both also written to CSV next to `results_output`
# according to the log); the last two return values are discarded.
(
    seqsignet_network_grp_11,
    best_seqsignet_network_grp_11,
    _,
    __,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    # Honour the `bidirectional` flag from the configuration cell instead of a
    # hard-coded True, so changing the flag actually changes the run.
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_3_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_3_kfold_best_model.csv


In [20]:
# Show the results table returned by the GRP search above (history length 11).
seqsignet_network_grp_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.690960,0.664104,"[0.7590816623074689, 0.5691268191268191]",0.662574,"[0.769593400117855, 0.5555555555555556]",0.666115,"[0.7488532110091743, 0.5833777304208844]",,0.735544,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.689842,0.671583,"[0.7490196078431373, 0.5941463414634146]",0.669085,"[0.7902609802673456, 0.5479082321187584]",0.680389,"[0.7118692660550459, 0.648907831646244]",,0.741205,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.691146,0.669669,"[0.7538987078568247, 0.585439079309482]",0.667007,"[0.7821263482280432, 0.5518867924528302]",0.675486,"[0.7276376146788991, 0.6233351092168353]",,0.746057,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.693383,0.671388,"[0.7564045609358804, 0.5863716369122453]",0.668737,"[0.7822358346094946, 0.5552380952380952]",0.676714,"[0.7322247706422018, 0.6212040490143846]",,0.736083,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.704007,0.680439,"[0.7672236880680153, 0.5936540429887411]",0.678045,"[0.7849430113977205, 0.5711472181191531]",0.684147,"[0.7502866972477065, 0.6180074587107086]",,0.741744,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.693010,0.669954,"[0.7571870853604601, 0.5827210539650368]",0.667459,"[0.7793626707132019, 0.5555555555555556]",0.674459,"[0.7362385321100917, 0.6126798082045818]",,0.733657,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.690960,0.664791,"[0.7584498834498835, 0.5711329539575789]",0.663042,"[0.7710308056872038, 0.555052790346908]",0.667223,"[0.7462729357798165, 0.5881726158763985]",,0.735813,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.688537,0.667814,"[0.7507829977628635, 0.5848447204968943]",0.665179,"[0.7824059682934411, 0.547951582867784]",0.674341,"[0.7216169724770642, 0.6270644645711241]",,0.724896,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.690774,0.666551,"[0.7564234326824255, 0.5766777239091605]",0.664291,"[0.7752031297020764, 0.5533790401567091]",0.670278,"[0.7385321100917431, 0.6020245071923281]",,0.729883,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [21]:
# Mean of the `f1` column over the rows of the best-model results (GRP, history length 11).
best_seqsignet_network_grp_11.loc[:, "f1"].mean()

0.6729131608489154

In [22]:
# Mean of the `precision` column over the rows of the best-model results (GRP, history length 11).
best_seqsignet_network_grp_11.loc[:, "precision"].mean()

0.6704798130261965

In [23]:
# Mean of the `recall` column over the rows of the best-model results (GRP, history length 11).
best_seqsignet_network_grp_11.loc[:, "recall"].mean()

0.6808099138696502

In [24]:
# Per-class F1: stack each row's `f1_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_grp_11["f1_scores"]), axis=0)

array([0.75217887, 0.59364745])

In [25]:
# Per-class precision: stack each row's `precision_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_grp_11["precision_scores"]), axis=0)

array([0.78916189, 0.55179773])

In [26]:
# Per-class recall: stack each row's `recall_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_grp_11["recall_scores"]), axis=0)

array([0.71875   , 0.64286983])

# history_length=20

In [11]:
# Windowing parameters for this section. The run log below reports
# "history length = 20" for these values, consistent with
# shift * n + (window_size - shift).
shift, window_size, n = 3, 5, 6

## UMAP

In [28]:
# Seq-Sig-Net hyperparameter search on UMAP-reduced embeddings, using the
# current `shift` / `window_size` / `n` (logged as history length = 20).
# Only the first two returned objects are kept (the results table and the
# best-model results table, both also written to CSV next to `results_output`
# according to the log); the last two return values are discarded.
(
    seqsignet_network_umap_20,
    best_seqsignet_network_umap_20,
    _,
    __,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    # Honour the `bidirectional` flag from the configuration cell instead of a
    # hard-coded True, so changing the flag actually changes the run.
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_6_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_6_kfold_best_model.csv


In [29]:
# Show the results table returned by the UMAP search above (history length 20).
seqsignet_network_umap_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.684436,0.667817,"[0.7421172886519422, 0.5935174069627851]",0.665945,"[0.7916802079948001, 0.5402097902097902]",0.678446,"[0.698394495412844, 0.6584976025572722]",,0.727996,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.675676,0.665336,"[0.7241597970830691, 0.6065128900949798]",0.668422,"[0.8099290780141843, 0.5269155206286837]",0.684627,"[0.6548165137614679, 0.7144379328716036]",,0.743901,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.692824,0.665358,"[0.7612286293827876, 0.5694879832810867]",0.664083,"[0.7694786174575279, 0.5586878523833931]",0.666934,"[0.7531536697247706, 0.580713905167821]",,0.735948,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.696365,0.672934,"[0.760476400529334, 0.5853906846525834]",0.670495,"[0.7805614246906127, 0.5604288499025342]",0.677039,"[0.7413990825688074, 0.6126798082045818]",,0.752932,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.697111,0.671203,"[0.7634987629166059, 0.578906452448821]",0.669460,"[0.7753473248595921, 0.5635721493440968]",0.673553,"[0.752006880733945, 0.5950985615343634]",,0.762636,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.680522,0.659502,"[0.7441027172290235, 0.5749007936507936]",0.657072,"[0.7763239875389408, 0.5378190255220417]",0.665962,"[0.7144495412844036, 0.6174746936600959]",,0.751853,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.692078,0.669266,"[0.7561263655152053, 0.5824064711830131]",0.666740,"[0.7793670115642118, 0.5541125541125541]",0.673988,"[0.7342316513761468, 0.6137453383058071]",,0.738644,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.692637,0.667468,"[0.75895336939044, 0.5759835433273336]",0.665446,"[0.7742320310169997, 0.5566600397614314]",0.670481,"[0.7442660550458715, 0.5966968566862014]",,0.743496,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.673066,0.648993,"[0.7409158050221567, 0.5570707070707072]",0.646846,"[0.7641681901279708, 0.5295247239558329]",0.653338,"[0.7190366972477065, 0.5876398508257858]",,0.766411,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [30]:
# Mean of the `f1` column over the rows of the best-model results (UMAP, history length 20).
best_seqsignet_network_umap_20.loc[:, "f1"].mean()

0.6755659591088605

In [31]:
# Mean of the `precision` column over the rows of the best-model results (UMAP, history length 20).
best_seqsignet_network_umap_20.loc[:, "precision"].mean()

0.6736780140817012

In [32]:
# Mean of the `recall` column over the rows of the best-model results (UMAP, history length 20).
best_seqsignet_network_umap_20.loc[:, "recall"].mean()

0.6862234055335877

In [33]:
# Per-class F1: stack each row's `f1_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_umap_20["f1_scores"]), axis=0)

array([0.74881178, 0.60232014])

In [34]:
# Per-class precision: stack each row's `precision_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_umap_20["precision_scores"]), axis=0)

array([0.7973703 , 0.54998573])

In [35]:
# Per-class recall: stack each row's `recall_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_umap_20["recall_scores"]), axis=0)

array([0.70613532, 0.66631149])

## GRP (Gaussian random projection)

In [12]:
# Seq-Sig-Net hyperparameter search on embeddings reduced with Gaussian random
# projection, using the current `shift` / `window_size` / `n` (logged as
# history length = 20).
# Only the first two returned objects are kept (the results table and the
# best-model results table, both also written to CSV next to `results_output`
# according to the log); the last two return values are discarded.
(
    seqsignet_network_grp_20,
    best_seqsignet_network_grp_20,
    _,
    __,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    # Honour the `bidirectional` flag from the configuration cell instead of a
    # hard-coded True, so changing the flag actually changes the run.
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_6_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_6_kfold_best_model.csv


In [13]:
# Show the results table returned by the GRP search above (history length 20).
seqsignet_network_grp_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.689469,0.673258,"[0.7460365853658536, 0.6004796163069545]",0.671280,"[0.7965494791666666, 0.5460095944177933]",0.684285,"[0.7015481651376146, 0.6670218433670752]",,0.758054,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.683691,0.669863,"[0.7374284388055082, 0.6022966955706586]",0.669333,"[0.8010084033613445, 0.5376569037656904]",0.683901,"[0.6831995412844036, 0.6846030900372936]",,0.765063,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.695806,0.679075,"[0.7523520485584219, 0.6057971014492753]",0.676647,"[0.7991618310767247, 0.554131683605833]",0.689405,"[0.7107224770642202, 0.6680873734683005]",,0.760884,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.695806,0.675919,"[0.7561995817149685, 0.595639246778989]",0.673098,"[0.7894572676232064, 0.556739231125521]",0.683007,"[0.7256307339449541, 0.6403835908364411]",,0.757784,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.698043,0.676946,"[0.7595011876484561, 0.5943915873810716]",0.674130,"[0.7875615763546798, 0.5606991025035427]",0.682882,"[0.7333715596330275, 0.6323921150772509]",,0.760749,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.695993,0.673091,"[0.7596168017686071, 0.5865652724968313]",0.670549,"[0.7816196542311192, 0.559477756286267]",0.677614,"[0.7388188073394495, 0.6164091635588705]",,0.741205,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.703448,0.681912,"[0.7646797810974709, 0.5991433610481229]",0.679075,"[0.7897952948365414, 0.5683556405353728]",0.687285,"[0.7411123853211009, 0.6334576451784762]",,0.744575,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.686486,0.668338,"[0.7459214501510574, 0.5907542579075427]",0.665989,"[0.7883141762452107, 0.5436632333184057]",0.677316,"[0.707855504587156, 0.6467767714437933]",,0.739722,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.684995,0.669963,"[0.740399385560676, 0.5995260663507108]",0.668696,"[0.7974851091992058, 0.539906103286385]",0.682444,"[0.6909403669724771, 0.6739477890250399]",,0.740666,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [14]:
# Mean of the `f1` column over the rows of the best-model results (GRP, history length 20).
best_seqsignet_network_grp_20.loc[:, "f1"].mean()

0.6787307173955389

In [15]:
# Mean of the `precision` column over the rows of the best-model results (GRP, history length 20).
best_seqsignet_network_grp_20.loc[:, "precision"].mean()

0.6777579496533935

In [16]:
# Mean of the `recall` column over the rows of the best-model results (GRP, history length 20).
best_seqsignet_network_grp_20.loc[:, "recall"].mean()

0.6925933184012486

In [17]:
# Per-class F1: stack each row's `f1_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_grp_20["f1_scores"]), axis=0)

array([0.74560519, 0.61185625])

In [18]:
# Per-class precision: stack each row's `precision_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_grp_20["precision_scores"]), axis=0)

array([0.80713446, 0.54838144])

In [19]:
# Per-class recall: stack each row's `recall_scores` vector and average down the rows.
np.mean(np.stack(best_seqsignet_network_grp_20["recall_scores"]), axis=0)

array([0.69294725, 0.69223939])

# history_length=35

In [20]:
# Windowing parameters for this section. The run log below reports
# "history length = 35" for these values, consistent with
# shift * n + (window_size - shift).
shift, window_size, n = 3, 5, 11

## UMAP

In [21]:
# Seq-Sig-Net hyperparameter search on UMAP-reduced embeddings, using the
# current `shift` / `window_size` / `n` (logged as history length = 35).
# Only the first two returned objects are kept (the results table and the
# best-model results table, both also written to CSV next to `results_output`
# according to the log); the last two return values are discarded.
(
    seqsignet_network_umap_35,
    best_seqsignet_network_umap_35,
    _,
    __,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    # Honour the `bidirectional` flag from the configuration cell instead of a
    # hard-coded True, so changing the flag actually changes the run.
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_11_kfold_best_model.csv


In [22]:
# Display the results frame of the UMAP search (history length 35).
seqsignet_network_umap_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.696925,0.676079,"[0.7582515611061553, 0.593906093906094]",0.673263,"[0.7875231624459543, 0.5590032910202163]",0.682268,"[0.7310779816513762, 0.6334576451784762]",,0.764658,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.680708,0.659532,"[0.7444427868118753, 0.5746213061832629]",0.657095,"[0.776049766718507, 0.5381395348837209]",0.665859,"[0.7153096330275229, 0.6164091635588705]",,0.785011,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.689469,0.671249,"[0.7486421243210621, 0.5938566552901025]",0.668772,"[0.7901273885350318, 0.5474157303370787]",0.680102,"[0.711295871559633, 0.648907831646244]",,0.751045,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.705499,0.686369,"[0.7638266068759342, 0.6089108910891088]",0.683297,"[0.7979387882573392, 0.5686546463245492]",0.693906,"[0.7325114678899083, 0.6553010122535962]",,0.774363,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.693756,0.671447,"[0.7570604761200651, 0.5858331232669524]",0.668830,"[0.7816793893129771, 0.5559808612440191]",0.676509,"[0.7339449541284404, 0.619072988811934]",,0.785011,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.687791,0.662225,"[0.7551527554451102, 0.5692980200565698]",0.660277,"[0.7703549060542797, 0.5501988071570576]",0.665155,"[0.7405389908256881, 0.5897709110282365]",,0.773285,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.689469,0.667227,"[0.7532582938388626, 0.5811965811965812]",0.664660,"[0.7791053921568627, 0.5502141837220371]",0.672474,"[0.7290711009174312, 0.6158763985082578]",,0.746462,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.693569,0.672559,"[0.7555026769779892, 0.5896155766350474]",0.669820,"[0.7849196538936959, 0.5547205260685768]",0.678703,"[0.7282110091743119, 0.6291955247735749]",,0.770319,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.677167,0.657390,"[0.7397054403366395, 0.5750736015701668]",0.655146,"[0.777321541377132, 0.5329695316052752]",0.664981,"[0.7055619266055045, 0.6244006393180608]",,0.789864,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [23]:
# Average the "f1" column over the rows of the best-model frame (UMAP, history length 35).
# NOTE(review): in the search table, "f1" equals the unweighted mean of the two "f1_scores" entries — confirm it is macro-F1.
best_seqsignet_network_umap_35["f1"].mean()

0.6650737649808086

In [24]:
# Average the "precision" column over the rows of the best-model frame (UMAP, history length 35).
best_seqsignet_network_umap_35["precision"].mean()

0.663205691131907

In [25]:
# Average the "recall" column over the rows of the best-model frame (UMAP, history length 35).
best_seqsignet_network_umap_35["recall"].mean()

0.6735115316343506

In [26]:
# Per-class F1: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_umap_35["f1_scores"]), axis=0)

array([0.7442269 , 0.58592063])

In [27]:
# Per-class precision: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_umap_35["precision_scores"]), axis=0)

array([0.78462748, 0.5417839 ])

In [28]:
# Per-class recall: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_umap_35["recall_scores"]), axis=0)

array([0.70823777, 0.6387853 ])

## GRP

In [29]:
# Hyperparameter search for SeqSigNet on SBERT embeddings reduced with Gaussian
# random projection, using the shift / window_size / n currently set in the
# notebook (the run log for this cell reports history length = 35).
(
    seqsignet_network_grp_35,  # results frame for the whole search
    best_seqsignet_network_grp_35,  # results frame for the best model
    _,  # the remaining two return values are not used in the cells shown here
    __,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    # Follow the `bidirectional` flag from the config cell instead of a
    # hard-coded True, so changing the config actually changes the model.
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # split_ids holds each row's timeline_id (built in the config section);
    # presumably used to keep a timeline inside one fold — confirm in the library.
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    # Per the run log, a companion "*_best_model.csv" is written next to this file.
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_11_kfold_best_model.csv


In [30]:
# Display the results frame of the GRP (Gaussian random projection) search (history length 35).
seqsignet_network_grp_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.683877,0.665392,"[0.7440386356776337, 0.5867446393762183]",0.663085,"[0.7855321861057999, 0.5406376290974405]",0.674079,"[0.7067087155963303, 0.6414491209376665]",,0.812913,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.696179,0.676634,"[0.7561340514661878, 0.597132970835393]",0.673807,"[0.7906758448060075, 0.5569386814200092]",0.684032,"[0.7244839449541285, 0.6435801811401172]",,0.804691,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.706617,0.693123,"[0.7574730354391371, 0.6287735849056604]",0.691450,"[0.8187874750166556, 0.5641134151502327]",0.707439,"[0.7047018348623854, 0.7101758124667021]",,0.804691,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.697111,0.675450,"[0.759294919271219, 0.591605931138477]",0.672708,"[0.7854734906527735, 0.5599429115128449]",0.680935,"[0.7348050458715596, 0.6270644645711241]",,0.795255,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.712768,0.692606,"[0.7713310580204779, 0.6138812327737408]",0.689458,"[0.799446324207936, 0.5794701986754967]",0.698882,"[0.7451261467889908, 0.6526371870005327]",,0.787033,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.700280,0.678143,"[0.7625516834022444, 0.5937342091965638]",0.675434,"[0.7862362971985384, 0.5646323882748678]",0.683126,"[0.7402522935779816, 0.6259989344698987]",,0.747944,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.714818,0.692579,"[0.7752643948296123, 0.6098929117797043]",0.689860,"[0.7948795180722892, 0.5848410757946211]",0.696891,"[0.7565940366972477, 0.637187000532765]",,0.770993,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.707922,0.688178,"[0.7666418466120626, 0.6097135740971358]",0.685071,"[0.7976448713975829, 0.5724976613657624]",0.695032,"[0.7379587155963303, 0.6521044219499201]",,0.764389,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.703075,0.681907,"[0.7639650318565713, 0.5998492840994725]",0.679023,"[0.7905550444648881, 0.5674904942965779]",0.687613,"[0.739105504587156, 0.6361214704315397]",,0.770993,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [31]:
# Average the "f1" column over the rows of the best-model frame (GRP, history length 35).
# NOTE(review): in the search table, "f1" equals the unweighted mean of the two "f1_scores" entries — confirm it is macro-F1.
best_seqsignet_network_grp_35["f1"].mean()

0.6840207408101057

In [32]:
# Average the "precision" column over the rows of the best-model frame (GRP, history length 35).
best_seqsignet_network_grp_35["precision"].mean()

0.6811249273345

In [33]:
# Average the "recall" column over the rows of the best-model frame (GRP, history length 35).
best_seqsignet_network_grp_35["recall"].mean()

0.6911671637918534

In [34]:
# Per-class F1: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_grp_35["f1_scores"]), axis=0)

array([0.76279742, 0.60524406])

In [35]:
# Per-class precision: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_grp_35["precision_scores"]), axis=0)

array([0.79538072, 0.56686913])

In [36]:
# Per-class recall: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_grp_35["recall_scores"]), axis=0)

array([0.73289373, 0.6494406 ])

# history_length=80

In [10]:
# Windowing parameters for this section; the search log below reports
# "history length = 80" for shift 3, window size 5 and n 26.
shift, window_size, n = 3, 5, 26

## UMAP

In [11]:
# Hyperparameter search for SeqSigNet on UMAP-reduced SBERT embeddings, using
# the shift / window_size / n set just above (the run log for this cell
# reports history length = 80).
(
    seqsignet_network_umap_80,  # results frame for the whole search
    best_seqsignet_network_umap_80,  # results frame for the best model
    _,  # the remaining two return values are not used in the cells shown here
    __,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    # Follow the `bidirectional` flag from the config cell instead of a
    # hard-coded True, so changing the config actually changes the model.
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # split_ids holds each row's timeline_id (built in the config section);
    # presumably used to keep a timeline inside one fold — confirm in the library.
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    # Per the run log, a companion "*_best_model.csv" is written next to this file.
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_26_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_26_kfold_best_model.csv


In [12]:
# Display the results frame of the UMAP search (history length 80).
seqsignet_network_umap_80

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.710904,0.679308,"[0.7799687898992765, 0.5786471067644662]",0.681164,"[0.7719741645605167, 0.5903547671840355]",0.677763,"[0.7881307339449541, 0.567394778902504]",,0.782316,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.674930,0.658376,"[0.7335777574091049, 0.5831739961759083]",0.656990,"[0.7851536952256377, 0.5288253142609449]",0.669167,"[0.6883600917431193, 0.6499733617474693]",,0.805365,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.687978,0.664151,"[0.7536061230497498, 0.5746951219512195]",0.661823,"[0.7743496672716274, 0.5492957746478874]",0.668251,"[0.7339449541284404, 0.6025572722429409]",,0.767354,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.712209,0.690496,"[0.7724727379899795, 0.6085192697768763]",0.687638,"[0.7947240751970891, 0.5805515239477503]",0.695376,"[0.7514334862385321, 0.6393180607352158]",,0.785281,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.679031,0.658249,"[0.7425239234449761, 0.5739732805541812]",0.655867,"[0.7759375, 0.535796766743649]",0.664938,"[0.7118692660550459, 0.6180074587107086]",,0.781507,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.683131,0.654477,"[0.7539797395079595, 0.5549738219895287]",0.653399,"[0.7612507305669199, 0.5455481214616572]",0.655789,"[0.7468463302752294, 0.5647309536494406]",,0.770589,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.684623,0.664137,"[0.747085201793722, 0.5811881188118813]",0.661607,"[0.7804497189256715, 0.5427646786870088]",0.670961,"[0.7164564220183486, 0.6254661694192861]",,0.750640,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.683877,0.665142,"[0.7443473017787158, 0.5859375]",0.662795,"[0.7848061029879212, 0.540784136998648]",0.673587,"[0.707855504587156, 0.6393180607352158]",,0.778676,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.674930,0.650181,"[0.7432273262661956, 0.5571356018283392]",0.648094,"[0.7639225181598063, 0.532265890344493]",0.654034,"[0.7236238532110092, 0.5844432605221097]",,0.785146,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [13]:
# Average the "f1" column over the rows of the best-model frame (UMAP, history length 80).
# NOTE(review): in the search table, "f1" equals the unweighted mean of the two "f1_scores" entries — confirm it is macro-F1.
best_seqsignet_network_umap_80["f1"].mean()

0.6599636979298662

In [14]:
# Average the "precision" column over the rows of the best-model frame (UMAP, history length 80).
best_seqsignet_network_umap_80["precision"].mean()

0.6575780020072118

In [15]:
# Average the "recall" column over the rows of the best-model frame (UMAP, history length 80).
best_seqsignet_network_umap_80["recall"].mean()

0.6659816684832814

In [16]:
# Per-class F1: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_umap_80["f1_scores"]), axis=0)

array([0.74541404, 0.57451336])

In [17]:
# Per-class precision: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_umap_80["precision_scores"]), axis=0)

array([0.77563317, 0.53952284])

In [18]:
# Per-class recall: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_umap_80["recall_scores"]), axis=0)

array([0.71750765, 0.61445569])

## GRP

In [19]:
# Hyperparameter search for SeqSigNet on SBERT embeddings reduced with Gaussian
# random projection, using the shift / window_size / n currently set in the
# notebook (the run log for this cell reports history length = 80).
(
    seqsignet_network_grp_80,  # results frame for the whole search
    best_seqsignet_network_grp_80,  # results frame for the best model
    _,  # the remaining two return values are not used in the cells shown here
    __,
) = seqsignet_hyperparameter_search(
    num_epochs=num_epochs,
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    # Follow the `bidirectional` flag from the config cell instead of a
    # hard-coded True, so changing the config actually changes the model.
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # split_ids holds each row's timeline_id (built in the config section);
    # presumably used to keep a timeline inside one fold — confirm in the library.
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    # Per the run log, a companion "*_best_model.csv" is written next to this file.
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_26_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_26_kfold_best_model.csv


In [20]:
# Display the results frame of the GRP (Gaussian random projection) search (history length 80).
seqsignet_network_grp_80

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.686300,0.666860,"[0.7473352349497072, 0.5863848611452446]",0.664341,"[0.7844311377245509, 0.5442518248175182]",0.674589,"[0.7135894495412844, 0.635588705380927]",,0.831109,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.715005,0.690274,"[0.777793925301555, 0.6027539620680696]",0.688459,"[0.78868258178603, 0.5882352941176471]",0.692605,"[0.7672018348623854, 0.6180074587107086]",,0.818439,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.719478,0.696372,"[0.7801314828341855, 0.6126126126126127]",0.693991,"[0.7953529937444147, 0.5926294820717132]",0.699736,"[0.7654816513761468, 0.633990410229089]",,0.824370,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.694688,0.675799,"[0.7540540540540541, 0.5975429975429976]",0.673053,"[0.7916141235813366, 0.5544915640674875]",0.683870,"[0.7198967889908257, 0.6478423015450187]",,0.802804,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.707549,0.689007,"[0.7649438202247192, 0.6130702836004933]",0.685887,"[0.8010668340131786, 0.5707070707070707]",0.697083,"[0.7319380733944955, 0.662226957911561]",,0.792560,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.695433,0.675202,"[0.7562649164677805, 0.5941381023348237]",0.672392,"[0.7882462686567164, 0.5565379246161005]",0.681982,"[0.7267775229357798, 0.637187000532765]",,0.753606,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.719851,0.696978,"[0.7802310279280597, 0.6137239784117193]",0.694515,"[0.7961802447030737, 0.5928500496524329]",0.700515,"[0.7649082568807339, 0.6361214704315397]",,0.787303,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.698975,0.676979,"[0.7612712490761271, 0.5926860025220682]",0.674260,"[0.7857796765334147, 0.5627394636015326]",0.682122,"[0.7382454128440367, 0.6259989344698987]",,0.790807,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.699907,0.676991,"[0.7630261995878717, 0.5909552845528455]",0.674433,"[0.7840290381125227, 0.5648372996600292]",0.681363,"[0.7431192660550459, 0.6196057538625466]",,0.786764,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [21]:
# Average the "f1" column over the rows of the best-model frame (GRP, history length 80).
# NOTE(review): in the search table, "f1" equals the unweighted mean of the two "f1_scores" entries — confirm it is macro-F1.
best_seqsignet_network_grp_80["f1"].mean()

0.6900728474161283

In [22]:
# Average the "precision" column over the rows of the best-model frame (GRP, history length 80).
best_seqsignet_network_grp_80["precision"].mean()

0.6873359707500803

In [23]:
# Average the "recall" column over the rows of the best-model frame (GRP, history length 80).
best_seqsignet_network_grp_80["recall"].mean()

0.6962660063720004

In [24]:
# Per-class F1: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_grp_80["f1_scores"]), axis=0)

array([0.76964779, 0.61049791])

In [25]:
# Per-class precision: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_grp_80["precision_scores"]), axis=0)

array([0.7974869 , 0.57718505])

In [26]:
# Per-class recall: stack the per-row score vectors into a 2-D array, then average along the row axis.
np.mean(np.stack(best_seqsignet_network_grp_80["recall_scores"]), axis=0)

array([0.74397936, 0.64855265])