In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any cell visible in this notebook
# (the searches are given the `seeds` list instead) — confirm whether it is still needed.
seed = 2023

In [2]:
import torch

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_functions import seqsignet_hyperparameter_search

In [4]:
# Directory that the hyperparameter-search cells write their CSV results into.
output_dir = "rumours_output"
# `exist_ok=True` makes this a no-op when the directory already exists, avoiding a
# separate check-then-create step that can race with another process.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Run the data-loading script; `%run` makes the script's top-level names available
# in the notebook namespace once it finishes.
# NOTE(review): presumably this is where `df_rumours`, `sbert_embeddings`, `y_data`,
# `output_dim` and `split_ids` come from (later cells use them without defining
# them) — confirm against the script.
%run load_sbert-embeddings.py

In [6]:
# Preview the first five rows of the rumours dataframe.
df_rumours.head(n=5)

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


## Seq-Sig-Net

In [7]:
# Names of the additional (non-embedding) features requested from the search.
features = ["time_encoding", "timeline_index"]
# NOTE(review): presumably one standardisation entry per feature, in the same order
# (z-score for the first, none for the second) — confirm against the library.
standardise_method = ["z_score", None]
# Both inclusion flags are switched on.
include_features_in_path = include_features_in_input = True

In [8]:
# --- Training budget and model selection ---
num_epochs = 100
patience = 5
validation_metric = "f1"

# --- Loss function and its `gamma` setting ---
loss = "focal"
gamma = 2

# --- Values handed to the hyperparameter search ---
dimensions = [15]
swnu_hidden_dim_sizes_and_sig_depths = [
    ([12], 3),
    ([10], 4),
]
lstm_hidden_dim_sizes = [384]
ffn_hidden_dim_sizes = [
    [256, 256],
    [512, 512],
]
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

# --- Architecture switch ---
bidirectional = True

In [9]:
# Keyword arguments shared by every `seqsignet_hyperparameter_search` call in this
# notebook; each call adds its own `shift`, `window_size`, `n`,
# `dim_reduce_methods` and `results_output`.
# NOTE(review): `df_rumours`, `sbert_embeddings`, `y_data`, `output_dim` and
# `split_ids` are not defined in the visible cells — presumably they come from the
# `%run load_sbert-embeddings.py` cell; confirm.
kwargs = {
    "num_epochs": num_epochs,
    "df": df_rumours,
    "id_column": "timeline_id",
    "label_column": "label",
    "embeddings": sbert_embeddings,
    "y_data": y_data,
    "output_dim": output_dim,
    "dimensions": dimensions,
    "log_signature": True,
    "pooling": "signature",
    "swnu_hidden_dim_sizes_and_sig_depths": swnu_hidden_dim_sizes_and_sig_depths,
    "lstm_hidden_dim_sizes": lstm_hidden_dim_sizes,
    "ffn_hidden_dim_sizes": ffn_hidden_dim_sizes,
    "dropout_rates": dropout_rates,
    "learning_rates": learning_rates,
    # Read the `bidirectional` setting from the configuration cell instead of a
    # hard-coded literal, so that changing the setting actually takes effect.
    "BiLSTM": bidirectional,
    "seeds": seeds,
    "loss": loss,
    "gamma": gamma,
    "device": device,
    "features": features,
    "standardise_method": standardise_method,
    "include_features_in_path": include_features_in_path,
    "include_features_in_input": include_features_in_input,
    "split_ids": split_ids,
    "k_fold": True,
    "patience": patience,
    "validation_metric": validation_metric,
    "verbose": False,
}

# history_length=11

In [10]:
# Window settings for this section; the search output reports a history length of
# 11 for them, consistent with shift * n + (window_size - shift).
shift, window_size, n = 3, 5, 3

## GRP

In [11]:
# Hyperparameter search using Gaussian random projection for dimension reduction,
# with the current `shift`, `window_size` and `n`.
# Only the first two returned values are used later in the notebook. The other two
# are bound to descriptive throwaway names rather than `_` / `__`: assigning to
# those names shadows IPython's output-history variables and stops IPython from
# updating them for the rest of the session.
(
    seqsignet_network_grp_kfold_11,
    best_seqsignet_network_grp_kfold_11,
    _unused_grp_11_third,
    _unused_grp_11_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_3_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_3_kfold_best_model.csv


In [12]:
# Show the full results table returned by the history_length=11 search.
seqsignet_network_grp_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.676048,0.663425,"[0.7286071205496565, 0.5982431807674526]",0.664224,"[0.8000685871056241, 0.5283789301755819]",0.679131,"[0.6688646788990825, 0.6893979754928077]",,0.729748,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.677353,0.664756,"[0.7297423887587823, 0.5997687861271677]",0.665493,"[0.8011655810764484, 0.5298202614379085]",0.680504,"[0.6700114678899083, 0.6909962706446457]",,0.732039,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.691519,0.676391,"[0.746360153256705, 0.6064209274673008]",0.674729,"[0.801778070464274, 0.5476804123711341]",0.688692,"[0.6981077981651376, 0.6792754395311668]",,0.749831,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.702144,0.679671,"[0.7645151783082818, 0.5948275862068966]",0.677013,"[0.7865372953305033, 0.5674891146589259]",0.684313,"[0.7436926605504587, 0.6249334043686734]",,0.756841,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.705126,0.683281,"[0.7664599940950694, 0.6001011122345803]",0.680483,"[0.7900182592818016, 0.570947570947571]",0.688329,"[0.7442660550458715, 0.6323921150772509]",,0.748214,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.695247,0.677156,"[0.7535795026375283, 0.6007326007326007]",0.674481,"[0.7944073721004131, 0.5545536519386834]",0.686022,"[0.716743119266055, 0.6553010122535962]",,0.741340,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.697484,0.680585,"[0.7540536444915897, 0.6071169208424111]",0.678044,"[0.7997428479588556, 0.5563442768411713]",0.690695,"[0.713302752293578, 0.6680873734683005]",,0.738374,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.687605,0.665643,"[0.7513353115727003, 0.5799498746867169]",0.663080,"[0.7785977859778598, 0.5475627070515854]",0.671163,"[0.7259174311926605, 0.6164091635588705]",,0.735813,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.705312,0.672163,"[0.7764106915570642, 0.5679147308007653]",0.674585,"[0.7661177783979906, 0.5830527497194163]",0.670263,"[0.7869839449541285, 0.5535428875865743]",,0.740261,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


in path: 0.6690594175374892

in input: 0.6685830625233913

both: 0.6834005713392136

In [13]:
# Mean of the `f1` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_11.loc[:, "f1"].mean()

0.6834560497955615

In [14]:
# Mean of the `precision` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_11.loc[:, "precision"].mean()

0.6806433546132534

In [15]:
# Mean of the `recall` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_11.loc[:, "recall"].mean()

0.6896971466928651

In [16]:
# Element-wise mean of the per-row `f1_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_11["f1_scores"]), axis=0)

array([0.76426463, 0.60264747])

In [17]:
# Element-wise mean of the per-row `precision_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_11["precision_scores"]), axis=0)

array([0.79290385, 0.56838286])

In [18]:
# Element-wise mean of the per-row `recall_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_11["recall_scores"]), axis=0)

array([0.73776758, 0.64162671])

# history_length=20

In [10]:
# Window settings for this section; the search output reports a history length of
# 20 for them, consistent with shift * n + (window_size - shift).
shift, window_size, n = 3, 5, 6

## GRP

In [11]:
# Hyperparameter search using Gaussian random projection for dimension reduction,
# with the current `shift`, `window_size` and `n`.
# Only the first two returned values are used later in the notebook. The other two
# are bound to descriptive throwaway names rather than `_` / `__`: assigning to
# those names shadows IPython's output-history variables and stops IPython from
# updating them for the rest of the session.
(
    seqsignet_network_grp_kfold_20,
    best_seqsignet_network_grp_kfold_20,
    _unused_grp_20_third,
    _unused_grp_20_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_6_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_6_kfold_best_model.csv


In [12]:
# Show the full results table returned by the history_length=20 search.
seqsignet_network_grp_kfold_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.695993,0.686699,"[0.7406582922563205, 0.6327403737896871]",0.689730,"[0.8314887540164227, 0.5479719188767551]",0.708126,"[0.6677178899082569, 0.7485348961108151]",,0.763984,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.681081,0.671548,"[0.7275043796782927, 0.6155920017973489]",0.675295,"[0.8183446793264063, 0.5322455322455323]",0.692352,"[0.6548165137614679, 0.7298881193393714]",,0.789190,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.686300,0.676409,"[0.7329842931937173, 0.619832843912356]",0.679322,"[0.8206039076376554, 0.5380392156862746]",0.696612,"[0.6622706422018348, 0.7309536494405967]",,0.795390,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.703262,0.686373,"[0.7591527987897125, 0.6135922330097088]",0.683591,"[0.8036515054452275, 0.5635309852875613]",0.696369,"[0.7193233944954128, 0.6734150239744273]",,0.772880,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.698229,0.684427,"[0.7504239247726222, 0.6184303558802734]",0.683063,"[0.8116038679559854, 0.5545224006762468]",0.698404,"[0.6978211009174312, 0.6989877464038359]",,0.780563,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.696365,0.678640,"[0.7541132075471698, 0.6031668696711328]",0.675979,"[0.7963021995537137, 0.5556552962298025]",0.687866,"[0.7161697247706422, 0.6595631326584976]",,0.757514,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.719478,0.702770,"[0.7732409221033599, 0.6322990471536769]",0.699398,"[0.8148618609082249, 0.5839350180505415]",0.712532,"[0.7356651376146789, 0.6893979754928077]",,0.774633,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.702889,0.686378,"[0.7583383869011523, 0.6144170295113692]",0.683696,"[0.8046975546975547, 0.562693841382366]",0.696821,"[0.7170298165137615, 0.6766116142781033]",,0.756706,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.705312,0.681393,"[0.7686905632772495, 0.5940949935815149]",0.679111,"[0.7848819838661488, 0.5733399405351833]",0.684781,"[0.7531536697247706, 0.6164091635588705]",,0.767219,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [13]:
# Mean of the `f1` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_20.loc[:, "f1"].mean()

0.6653909745748255

In [14]:
# Mean of the `precision` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_20.loc[:, "precision"].mean()

0.6727026677742289

In [15]:
# Mean of the `recall` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_20.loc[:, "recall"].mean()

0.6897973201673566

In [16]:
# Element-wise mean of the per-row `f1_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_20["f1_scores"]), axis=0)

array([0.71603963, 0.61474232])

In [17]:
# Element-wise mean of the per-row `precision_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_20["precision_scores"]), axis=0)

array([0.82238224, 0.52302309])

In [18]:
# Element-wise mean of the per-row `recall_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_20["recall_scores"]), axis=0)

array([0.63407875, 0.74551589])

# history_length=35

In [19]:
# Window settings for this section; the search output reports a history length of
# 35 for them, consistent with shift * n + (window_size - shift).
shift, window_size, n = 3, 5, 11

## GRP

In [20]:
# Hyperparameter search using Gaussian random projection for dimension reduction,
# with the current `shift`, `window_size` and `n`.
# Only the first two returned values are used later in the notebook. The other two
# are bound to descriptive throwaway names rather than `_` / `__`: assigning to
# those names shadows IPython's output-history variables and stops IPython from
# updating them for the rest of the session.
(
    seqsignet_network_grp_kfold_35,
    best_seqsignet_network_grp_kfold_35,
    _unused_grp_35_third,
    _unused_grp_35_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_11_kfold_best_model.csv


In [21]:
# Show the full results table returned by the history_length=35 search.
seqsignet_network_grp_kfold_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.691892,0.676174,"[0.7475179471513671, 0.604829070045422]",0.674252,"[0.7999346191565871, 0.5485689505637468]",0.687748,"[0.7015481651376146, 0.6739477890250399]",,0.830435,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.697484,0.681664,"[0.7526291723822589, 0.6106980091148956]",0.679430,"[0.8034493979824276, 0.5554101221640488]",0.693033,"[0.707855504587156, 0.6782099094299414]",,0.836231,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.685927,0.670691,"[0.7415247737383034, 0.5998575160294466]",0.669279,"[0.7974265918838667, 0.5411311053984575]",0.682915,"[0.692947247706422, 0.6728822589238146]",,0.836770,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.700652,0.681581,"[0.759508834980533, 0.6036525172754195]",0.678642,"[0.7949843260188088, 0.5622988505747126]",0.689318,"[0.7270642201834863, 0.6515716568993074]",,0.803478,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.713141,0.694237,"[0.7702642185400805, 0.6182088811709253]",0.690973,"[0.8034880099657428, 0.5784586815227484]",0.701752,"[0.7396788990825688, 0.663825253063399]",,0.797547,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.707363,0.691707,"[0.7611804076665653, 0.62223291626564]",0.689056,"[0.8107582631237849, 0.5673541026766126]",0.703091,"[0.7173165137614679, 0.688865210442195]",,0.773285,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.723020,0.705028,"[0.7778774289985052, 0.6321782178217821]",0.701500,"[0.8126171143035603, 0.5903837263060564]",0.713164,"[0.7459862385321101, 0.6803409696323921]",,0.795525,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.706617,0.688346,"[0.7638055222088836, 0.6128873585833744]",0.685266,"[0.8013224181360201, 0.5692096847875743]",0.696735,"[0.729644495412844, 0.663825253063399]",,0.792290,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.694315,0.674966,"[0.7542703026670663, 0.5956607495069034]",0.672201,"[0.7900188323917138, 0.5543827443781552]",0.682599,"[0.7216169724770642, 0.6435801811401172]",,0.805769,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


best_seqsignet_network_grp_kfold_35.loc[:, "f1"].mean()

In [22]:
# Mean of the `precision` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_35.loc[:, "precision"].mean()

0.670192096858707

In [23]:
# Mean of the `recall` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_35.loc[:, "recall"].mean()

0.6850225457778777

In [24]:
# Element-wise mean of the per-row `f1_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_35["f1_scores"]), axis=0)

array([0.73626204, 0.60393224])

In [25]:
# Element-wise mean of the per-row `precision_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_35["precision_scores"]), axis=0)

array([0.80334066, 0.53704353])

In [26]:
# Element-wise mean of the per-row `recall_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_35["recall_scores"]), axis=0)

array([0.67975917, 0.69028592])

# history_length=80

In [27]:
# Window settings for this section; the search output reports a history length of
# 80 for them, consistent with shift * n + (window_size - shift).
shift, window_size, n = 3, 5, 26

## GRP

In [28]:
# Hyperparameter search using Gaussian random projection for dimension reduction,
# with the current `shift`, `window_size` and `n`.
# Only the first two returned values are used later in the notebook. The other two
# are bound to descriptive throwaway names rather than `_` / `__`: assigning to
# those names shadows IPython's output-history variables and stops IPython from
# updating them for the rest of the session.
(
    seqsignet_network_grp_kfold_80,
    best_seqsignet_network_grp_kfold_80,
    _unused_grp_80_third,
    _unused_grp_80_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["gaussian_random_projection"],
    results_output=f"{output_dir}/seqsignet_grp_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_26_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_grp_focal_2_3_5_26_kfold_best_model.csv


In [29]:
# Show the full results table returned by the history_length=80 search.
seqsignet_network_grp_kfold_80

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.708854,0.688702,"[0.7679049034175335, 0.6095]",0.685613,"[0.797038864898211, 0.5741874705605275]",0.695133,"[0.7408256880733946, 0.6494405966968567]",,0.856854,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.725443,0.705990,"[0.7816160118606376, 0.6303638644918445]",0.702580,"[0.8093337427080135, 0.5958254269449715]",0.712443,"[0.7557339449541285, 0.6691529035695258]",,0.852271,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.709413,0.686921,"[0.7708363957077758, 0.6030048382989559]",0.684256,"[0.7909502262443439, 0.577560975609756]",0.691257,"[0.7517201834862385, 0.630793819925413]",,0.839331,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.706990,0.687339,"[0.7657228017883756, 0.608955223880597]",0.684246,"[0.7973308504034761, 0.5711619225384974]",0.694315,"[0.7365252293577982, 0.6521044219499201]",,0.804017,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.722647,0.705261,"[0.7768446310737853, 0.6336779911373708]",0.701741,"[0.8144654088050315, 0.5890160183066362]",0.714107,"[0.742545871559633, 0.6856686201385189]",,0.804421,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.713700,0.692490,"[0.7732506643046944, 0.6117290192113245]",0.689514,"[0.7970176506390748, 0.582010582010582]",0.697753,"[0.7508600917431193, 0.6446457112413426]",,0.785955,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.714632,0.698251,"[0.7685563114134543, 0.6279465370595383]",0.695110,"[0.812919731371922, 0.5773011617515639]",0.708558,"[0.7287844036697247, 0.6883324453915823]",,0.804691,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.702144,0.684843,"[0.7586831772878284, 0.6110029211295035]",0.682030,"[0.801531589023612, 0.5625280143433438]",0.694402,"[0.7201834862385321, 0.6686201385189131]",,0.825179,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.698788,0.669019,"[0.7682821909951247, 0.56975505857295]",0.668977,"[0.7685025817555938, 0.5694518360830229]",0.669060,"[0.7680619266055045, 0.5700586041555674]",,0.826931,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [30]:
# Mean of the `f1` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_80.loc[:, "f1"].mean()

0.6786881036089433

In [31]:
# Mean of the `precision` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_80.loc[:, "precision"].mean()

0.676344916409012

In [32]:
# Mean of the `recall` column over the rows of the best-model results.
best_seqsignet_network_grp_kfold_80.loc[:, "recall"].mean()

0.6823373366064983

In [33]:
# Element-wise mean of the per-row `f1_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_80["f1_scores"]), axis=0)

array([0.76606611, 0.59131009])

In [34]:
# Element-wise mean of the per-row `precision_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_80["precision_scores"]), axis=0)

array([0.78358011, 0.56910972])

In [35]:
# Element-wise mean of the per-row `recall_scores` vectors (presumably one entry per class).
np.mean(np.stack(best_seqsignet_network_grp_kfold_80["recall_scores"]), axis=0)

array([0.74933104, 0.61534363])