In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any later cell visible in this notebook
# (the search is driven by the `seeds` list) — confirm whether it is still needed.
seed = 2023

In [2]:
import torch

# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_attention_encoder_functions import (
    seqsignet_attention_encoder_hyperparameter_search,
)

In [4]:
# Directory that receives the CSV files written by the hyperparameter searches below.
output_dir = "rumours_output"
# exist_ok=True makes creation idempotent and avoids the check-then-create race of
# a separate os.path.isdir() test.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Run the embedding-loading script. Later cells use `df_rumours`, `sbert_embeddings`,
# `y_data`, `output_dim` and `split_ids`, none of which are defined in this notebook,
# so they are presumably created by this script — verify against load_sbert_embeddings.py.
%run load_sbert_embeddings.py

In [6]:
# Preview the first rows of the rumours dataframe as a sanity check on the loaded data.
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


# SeqSigNet with Attention Network

In [7]:
# Feature names and the standardisation setting listed alongside each one
# (the two lists have the same length; presumably matched by position — confirm).
features, standardise_method = ["time_encoding", "timeline_index"], ["z_score", None]
# Both feature-inclusion flags are switched on for every search in this notebook.
include_features_in_path = include_features_in_input = True

In [8]:
# --- training schedule and model-selection settings ---
num_epochs, patience = 100, 3
validation_metric = "f1"

# --- loss ---
loss, gamma = "focal", 2

# --- search grid: each list holds the candidate values handed to the search ---
dimensions = [15]
# each swmhau tuple is (output_channels, sig_depth, num_heads)
swmhau_parameters = [
    (12, 3, 10),
    (10, 3, 5),
]
num_layers = [1]
# two equal-width hidden layers per candidate
ffn_hidden_dim_sizes = [[width] * 2 for width in (32, 128, 512)]
dropout_rates = [0.1]
learning_rates = [0.0005, 0.0003, 0.0001]
seeds = [1, 12, 123]

In [9]:
# Arguments shared by every hyperparameter-search call in this notebook; each call
# adds only its own shift / window_size / n, dimension-reduction method and CSV path.
kwargs = dict(
    num_epochs=num_epochs,
    # data and labels
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    output_dim=output_dim,
    # model configuration and search grid
    dimensions=dimensions,
    log_signature=True,
    pooling="signature",
    transformer_encoder_layers=2,
    swmhau_parameters=swmhau_parameters,
    num_layers=num_layers,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    # optimisation
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    # extra features
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # data splitting and model selection
    split_ids=split_ids,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    verbose=False,
)

# history_length=11

In [10]:
# Path settings for this run; the search log reports a history length of 11 for them.
shift, window_size, n = 3, 5, 3

## UMAP

In [11]:
# Run the hyperparameter search on UMAP-reduced embeddings for the current
# shift / window_size / n and write the results to a CSV under `output_dir`.
# The call returns four values; only the first two (the full results table and the
# best-model table inspected below) are used. The remaining two are bound to explicit
# throwaway names rather than `_` / `__`: assigning to those in IPython shadows the
# output-history variables and stops IPython from updating them.
(
    seqsignet_attention_encoder_umap_11,
    best_seqsignet_attention_encoder_umap_11,
    _unused_third,
    _unused_fourth,
) = seqsignet_attention_encoder_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_attention_encoder_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_attention_encoder_umap_focal_2_3_5_3_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_attention_encoder_umap_focal_2_3_5_3_kfold_best_model.csv


In [12]:
# Average the numeric metrics over all runs that share a hyperparameter configuration.
# numeric_only=True drops the non-numeric columns (e.g. the per-class score lists and
# string settings) explicitly: pandas 1.5 warns about dropping them implicitly and
# pandas 2 raises instead.
seqsignet_attention_encoder_umap_11.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
).mean(numeric_only=True)

  seqsignet_attention_encoder_umap_11.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,shift,...,embedding_dim,num_features,log_signature,transformer_encoder_layers,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,10,3,5,1,"(32, 32)",0.1,0.0001,0.690525,0.659447,0.661522,0.660367,0.717257,0.695889,0.701566,0.694666,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,11.0
15,10,3,5,1,"(32, 32)",0.1,0.0003,0.689966,0.658843,0.664167,0.662603,0.729883,0.708663,0.718838,0.709937,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,10.0
15,10,3,5,1,"(32, 32)",0.1,0.0005,0.697856,0.667311,0.668246,0.667892,0.720672,0.699841,0.703823,0.698104,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,9.0
15,10,3,5,1,"(128, 128)",0.1,0.0001,0.689407,0.665119,0.663777,0.669514,0.729613,0.714507,0.715528,0.71626,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,14.0
15,10,3,5,1,"(128, 128)",0.1,0.0003,0.683691,0.661897,0.659562,0.667825,0.722335,0.709739,0.707908,0.713275,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,13.0
15,10,3,5,1,"(128, 128)",0.1,0.0005,0.688537,0.664256,0.662593,0.668394,0.716224,0.700737,0.700707,0.702499,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,12.0
15,10,3,5,1,"(512, 512)",0.1,0.0001,0.697608,0.666834,0.667548,0.666594,0.729568,0.707798,0.713316,0.704636,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,17.0
15,10,3,5,1,"(512, 512)",0.1,0.0003,0.689904,0.662866,0.662862,0.665385,0.717662,0.700609,0.702124,0.701081,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,16.0
15,10,3,5,1,"(512, 512)",0.1,0.0005,0.699534,0.667152,0.669368,0.666312,0.716179,0.69304,0.699774,0.690446,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,15.0
15,12,3,10,1,"(32, 32)",0.1,0.0001,0.68891,0.656936,0.657923,0.6565,0.730107,0.708908,0.713968,0.706158,11.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,2.0


In [13]:
# Results table for the configuration selected by the search (one row per seed).
best_seqsignet_attention_encoder_umap_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.662815,0.648297,"[0.7197521301316809, 0.5768421052631579]",0.648562,"[0.7829457364341085, 0.5141784820683903]",0.661449,"[0.6659977064220184, 0.6568993074054342]",,0.747136,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.699907,0.681218,"[0.7584033613445377, 0.604033448106247]",0.678321,"[0.7956549118387909, 0.5609867519415258]",0.68936,"[0.7244839449541285, 0.6542354821523708]",,0.734196,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.680149,0.658494,"[0.7444907683144729, 0.5724962630792227]",0.656071,"[0.7744733581164808, 0.5376696303228825]",0.664445,"[0.716743119266055, 0.6121470431539691]",,0.724222,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [14]:
# Show only the hyperparameter columns, i.e. which configuration was selected.
best_seqsignet_attention_encoder_umap_11[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(512, 512)",0.1,0.0001
0,15,12,3,10,1,"(512, 512)",0.1,0.0001
0,15,12,3,10,1,"(512, 512)",0.1,0.0001


in path: 0.658

in input: 0.6430702845314854

both: 0.6596275491973381

In [15]:
# Mean of the "f1" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_11["f1"].mean()

0.66266967937322

In [16]:
# Mean of the "precision" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_11["precision"].mean()

0.6609848117870298

In [17]:
# Mean of the "recall" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_11["recall"].mean()

0.6717511005589959

In [18]:
# Stack the per-row "f1_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_11["f1_scores"]).mean(axis=0)

array([0.74088209, 0.58445727])

In [19]:
# Stack the per-row "precision_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_11["precision_scores"]).mean(axis=0)

array([0.784358  , 0.53761162])

In [20]:
# Stack the per-row "recall_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_11["recall_scores"]).mean(axis=0)

array([0.70240826, 0.64109394])

# history_length=20

In [21]:
# Path settings for this run; the search log reports a history length of 20 for them.
shift, window_size, n = 3, 5, 6

## UMAP

In [22]:
# Run the hyperparameter search on UMAP-reduced embeddings for the current
# shift / window_size / n and write the results to a CSV under `output_dir`.
# The call returns four values; only the first two (the full results table and the
# best-model table inspected below) are used. The remaining two are bound to explicit
# throwaway names rather than `_` / `__`: assigning to those in IPython shadows the
# output-history variables and stops IPython from updating them.
(
    seqsignet_attention_encoder_umap_20,
    best_seqsignet_attention_encoder_umap_20,
    _unused_third,
    _unused_fourth,
) = seqsignet_attention_encoder_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_attention_encoder_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_attention_encoder_umap_focal_2_3_5_6_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_attention_encoder_umap_focal_2_3_5_6_kfold_best_model.csv


In [23]:
# Average the numeric metrics over all runs that share a hyperparameter configuration.
# numeric_only=True drops the non-numeric columns (e.g. the per-class score lists and
# string settings) explicitly: pandas 1.5 warns about dropping them implicitly and
# pandas 2 raises instead.
seqsignet_attention_encoder_umap_20.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
).mean(numeric_only=True)

  seqsignet_attention_encoder_umap_20.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,shift,...,embedding_dim,num_features,log_signature,transformer_encoder_layers,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,10,3,5,1,"(32, 32)",0.1,0.0001,0.67319,0.653496,0.651915,0.661554,0.821899,0.813123,0.81096,0.816603,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,11.0
15,10,3,5,1,"(32, 32)",0.1,0.0003,0.687667,0.666058,0.664981,0.672441,0.817271,0.807296,0.805931,0.809198,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,10.0
15,10,3,5,1,"(32, 32)",0.1,0.0005,0.690587,0.667368,0.66514,0.671939,0.796379,0.785853,0.783957,0.788311,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,9.0
15,10,3,5,1,"(128, 128)",0.1,0.0001,0.680646,0.659645,0.657495,0.66614,0.832682,0.824234,0.82185,0.827285,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,14.0
15,10,3,5,1,"(128, 128)",0.1,0.0003,0.687418,0.666111,0.663804,0.672291,0.823516,0.815143,0.812341,0.8192,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,13.0
15,10,3,5,1,"(128, 128)",0.1,0.0005,0.680522,0.656904,0.65596,0.662312,0.769646,0.758553,0.757291,0.762524,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,12.0
15,10,3,5,1,"(512, 512)",0.1,0.0001,0.685741,0.657841,0.657968,0.659723,0.773689,0.756689,0.760267,0.755177,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,17.0
15,10,3,5,1,"(512, 512)",0.1,0.0003,0.698975,0.671071,0.671044,0.672526,0.738779,0.719709,0.722547,0.718324,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,16.0
15,10,3,5,1,"(512, 512)",0.1,0.0005,0.696303,0.669939,0.669375,0.67248,0.72229,0.705362,0.705899,0.705839,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,15.0
15,12,3,10,1,"(32, 32)",0.1,0.0001,0.674433,0.655705,0.653791,0.664314,0.847374,0.839086,0.837524,0.840944,20.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,2.0


In [24]:
# Results table for the configuration selected by the search (one row per seed).
best_seqsignet_attention_encoder_umap_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.679217,0.658945,"[0.7420950097407463, 0.5757949223564209]",0.656587,"[0.7773940345368917, 0.5357798165137615]",0.666066,"[0.7098623853211009, 0.62226957911561]",,0.850923,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.679776,0.66198,"[0.7395391146149182, 0.5844218674407353]",0.659986,"[0.7847490347490348, 0.5352237483385024]",0.671417,"[0.6992545871559633, 0.6435801811401172]",,0.845667,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.664306,0.646189,"[0.72625018999848, 0.5661286437003132]",0.6448,"[0.7728890326755096, 0.5167106420404574]",0.655459,"[0.6849197247706422, 0.6259989344698987]",,0.845532,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [25]:
# Show only the hyperparameter columns, i.e. which configuration was selected.
best_seqsignet_attention_encoder_umap_20[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(32, 32)",0.1,0.0001
0,15,12,3,10,1,"(32, 32)",0.1,0.0001
0,15,12,3,10,1,"(32, 32)",0.1,0.0001


In [26]:
# Mean of the "f1" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_20["f1"].mean()

0.6557049579752691

In [27]:
# Mean of the "precision" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_20["precision"].mean()

0.6537910514756929

In [28]:
# Mean of the "recall" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_20["recall"].mean()

0.6643142319955554

In [29]:
# Stack the per-row "f1_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_20["f1_scores"]).mean(axis=0)

array([0.73596144, 0.57544848])

In [30]:
# Stack the per-row "precision_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_20["precision_scores"]).mean(axis=0)

array([0.77834403, 0.52923807])

In [31]:
# Stack the per-row "recall_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_20["recall_scores"]).mean(axis=0)

array([0.69801223, 0.63061623])

# history_length=35

In [32]:
# Path settings for this run; the search log reports a history length of 35 for them.
shift, window_size, n = 3, 5, 11

## UMAP

In [33]:
# Run the hyperparameter search on UMAP-reduced embeddings for the current
# shift / window_size / n and write the results to a CSV under `output_dir`.
# The call returns four values; only the first two (the full results table and the
# best-model table inspected below) are used. The remaining two are bound to explicit
# throwaway names rather than `_` / `__`: assigning to those in IPython shadows the
# output-history variables and stops IPython from updating them.
(
    seqsignet_attention_encoder_umap_35,
    best_seqsignet_attention_encoder_umap_35,
    _unused_third,
    _unused_fourth,
) = seqsignet_attention_encoder_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_attention_encoder_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_attention_encoder_umap_focal_2_3_5_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_attention_encoder_umap_focal_2_3_5_11_kfold_best_model.csv


In [34]:
# Average the numeric metrics over all runs that share a hyperparameter configuration.
# numeric_only=True drops the non-numeric columns (e.g. the per-class score lists and
# string settings) explicitly: pandas 1.5 warns about dropping them implicitly and
# pandas 2 raises instead.
seqsignet_attention_encoder_umap_35.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
).mean(numeric_only=True)

  seqsignet_attention_encoder_umap_35.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,shift,...,embedding_dim,num_features,log_signature,transformer_encoder_layers,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,10,3,5,1,"(32, 32)",0.1,0.0001,0.675924,0.654903,0.653928,0.662631,0.845262,0.837892,0.835225,0.842089,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,11.0
15,10,3,5,1,"(32, 32)",0.1,0.0003,0.690214,0.667412,0.664936,0.672227,0.859235,0.852157,0.84945,0.855552,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,10.0
15,10,3,5,1,"(32, 32)",0.1,0.0005,0.691022,0.664743,0.664088,0.667927,0.828593,0.817448,0.818548,0.816988,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,9.0
15,10,3,5,1,"(128, 128)",0.1,0.0001,0.6758,0.654019,0.652265,0.660895,0.852586,0.844946,0.842725,0.84778,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,14.0
15,10,3,5,1,"(128, 128)",0.1,0.0003,0.689593,0.66267,0.661562,0.664859,0.803478,0.791445,0.790662,0.79287,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,13.0
15,10,3,5,1,"(128, 128)",0.1,0.0005,0.696179,0.669252,0.668972,0.671319,0.779755,0.765318,0.766085,0.766143,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,12.0
15,10,3,5,1,"(512, 512)",0.1,0.0001,0.688475,0.661595,0.660282,0.663712,0.802444,0.789779,0.791064,0.789116,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,17.0
15,10,3,5,1,"(512, 512)",0.1,0.0003,0.690711,0.662669,0.661747,0.664038,0.765916,0.750641,0.751687,0.749779,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,16.0
15,10,3,5,1,"(512, 512)",0.1,0.0005,0.692451,0.666703,0.664964,0.669354,0.708811,0.691231,0.691482,0.691214,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,15.0
15,12,3,10,1,"(32, 32)",0.1,0.0001,0.682634,0.657887,0.655862,0.661722,0.866918,0.858813,0.859229,0.858523,35.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,2.0


In [35]:
# Results table for the configuration selected by the search (one row per seed).
best_seqsignet_attention_encoder_umap_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.704567,0.680445,"[0.7682409709021786, 0.5926497044461577]",0.678219,"[0.7839450910176067, 0.5724925521350546]",0.683716,"[0.7531536697247706, 0.6142781033564199]",,0.864402,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.685368,0.664334,"[0.7483601669648181, 0.580308304326206]",0.66178,"[0.7795031055900621, 0.544055944055944]",0.670673,"[0.7196100917431193, 0.6217368140649974]",,0.86885,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.657968,0.62888,"[0.7327799621377603, 0.5249805850375356]",0.627587,"[0.7445989937851435, 0.5105740181268882]",0.630777,"[0.7213302752293578, 0.5402237613212574]",,0.867502,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [36]:
# Show only the hyperparameter columns, i.e. which configuration was selected.
best_seqsignet_attention_encoder_umap_35[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(32, 32)",0.1,0.0001
0,15,12,3,10,1,"(32, 32)",0.1,0.0001
0,15,12,3,10,1,"(32, 32)",0.1,0.0001


In [37]:
# Mean of the "f1" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_35["f1"].mean()

0.657886615635776

In [38]:
# Mean of the "precision" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_35["precision"].mean()

0.6558616174517832

In [39]:
# Mean of the "recall" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_35["recall"].mean()

0.661722119239987

In [40]:
# Stack the per-row "f1_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_35["f1_scores"]).mean(axis=0)

array([0.7497937 , 0.56597953])

In [41]:
# Stack the per-row "precision_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_35["precision_scores"]).mean(axis=0)

array([0.76934906, 0.54237417])

In [42]:
# Stack the per-row "recall_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_35["recall_scores"]).mean(axis=0)

array([0.73136468, 0.59207956])

# history_length=80

In [43]:
# Path settings for this run; the search log reports a history length of 80 for them.
shift, window_size, n = 3, 5, 26

## UMAP

In [44]:
# Run the hyperparameter search on UMAP-reduced embeddings for the current
# shift / window_size / n and write the results to a CSV under `output_dir`.
# The call returns four values; only the first two (the full results table and the
# best-model table inspected below) are used. The remaining two are bound to explicit
# throwaway names rather than `_` / `__`: assigning to those in IPython shadows the
# output-history variables and stops IPython from updating them.
(
    seqsignet_attention_encoder_umap_80,
    best_seqsignet_attention_encoder_umap_80,
    _unused_third,
    _unused_fourth,
) = seqsignet_attention_encoder_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_attention_encoder_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_attention_encoder_umap_focal_2_3_5_26_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_attention_encoder_umap_focal_2_3_5_26_kfold_best_model.csv


In [45]:
# Average the numeric metrics over all runs that share a hyperparameter configuration.
# numeric_only=True drops the non-numeric columns (e.g. the per-class score lists and
# string settings) explicitly: pandas 1.5 warns about dropping them implicitly and
# pandas 2 raises instead.
seqsignet_attention_encoder_umap_80.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
).mean(numeric_only=True)

  seqsignet_attention_encoder_umap_80.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,shift,...,embedding_dim,num_features,log_signature,transformer_encoder_layers,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,10,3,5,1,"(32, 32)",0.1,0.0001,0.672693,0.649794,0.647876,0.655225,0.823022,0.814342,0.81187,0.81776,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,11.0
15,10,3,5,1,"(32, 32)",0.1,0.0003,0.680522,0.651819,0.65242,0.654274,0.859056,0.850455,0.850946,0.850156,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,10.0
15,10,3,5,1,"(32, 32)",0.1,0.0005,0.690463,0.660822,0.660838,0.661468,0.796244,0.782949,0.78441,0.781684,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,9.0
15,10,3,5,1,"(128, 128)",0.1,0.0001,0.667599,0.635514,0.635828,0.635723,0.804825,0.792698,0.795141,0.792398,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,14.0
15,10,3,5,1,"(128, 128)",0.1,0.0003,0.695993,0.666559,0.666282,0.667033,0.749652,0.734102,0.734733,0.733537,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,13.0
15,10,3,5,1,"(128, 128)",0.1,0.0005,0.694315,0.671038,0.668973,0.675422,0.711102,0.694876,0.694669,0.695758,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,12.0
15,10,3,5,1,"(512, 512)",0.1,0.0001,0.68543,0.659562,0.657875,0.662355,0.733342,0.716161,0.717169,0.715312,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,17.0
15,10,3,5,1,"(512, 512)",0.1,0.0003,0.694998,0.667325,0.666535,0.668852,0.706744,0.686594,0.688799,0.685493,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,16.0
15,10,3,5,1,"(512, 512)",0.1,0.0005,0.69096,0.666701,0.66566,0.670955,0.707014,0.689783,0.690976,0.690621,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,15.0
15,12,3,10,1,"(32, 32)",0.1,0.0001,0.674495,0.645259,0.645591,0.648122,0.848767,0.838516,0.842106,0.83657,80.0,3.0,...,384.0,2.0,1.0,2.0,45.333333,2.0,1.0,5.0,64.0,2.0


In [46]:
# Results table for the configuration selected by the search (one row per seed).
best_seqsignet_attention_encoder_umap_80

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.703821,0.680342,"[0.7669746297111013, 0.5937100485809256]",0.677923,"[0.785049534674272, 0.5707964601769911]",0.684127,"[0.7497133027522935, 0.6185402237613212]",,0.881251,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.677167,0.63376,"[0.759844703272324, 0.5076748152359295]",0.639974,"[0.7357679914070892, 0.5441803778184034]",0.630655,"[0.7855504587155964, 0.4757591901971231]",,0.873298,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.690028,0.652741,"[0.7665309560578407, 0.5389520377044635]",0.656441,"[0.7510316368638239, 0.561849710982659]",0.650266,"[0.7826834862385321, 0.5178476291955247]",,0.877072,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [47]:
# Show only the hyperparameter columns, i.e. which configuration was selected.
best_seqsignet_attention_encoder_umap_80[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(128, 128)",0.1,0.0001
0,15,12,3,10,1,"(128, 128)",0.1,0.0001
0,15,12,3,10,1,"(128, 128)",0.1,0.0001


In [48]:
# Mean of the "f1" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_80["f1"].mean()

0.6556145317604307

In [49]:
# Mean of the "precision" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_80["precision"].mean()

0.658112618653873

In [50]:
# Mean of the "recall" column over the rows (one per seed) of the best-model table.
best_seqsignet_attention_encoder_umap_80["recall"].mean()

0.6550157151433985

In [51]:
# Stack the per-row "f1_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_80["f1_scores"]).mean(axis=0)

array([0.7644501 , 0.54677897])

In [52]:
# Stack the per-row "precision_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_80["precision_scores"]).mean(axis=0)

array([0.75728305, 0.55894218])

In [53]:
# Stack the per-row "recall_scores" lists and average them element-wise across rows
# (each list has two entries — presumably one per class; confirm).
np.stack(best_seqsignet_attention_encoder_umap_80["recall_scores"]).mean(axis=0)

array([0.77264908, 0.53738235])