In [1]:
import numpy as np
import pickle
import os

seed = 2023

In [2]:
import torch

# set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.swmhau_network_functions import (
    swmhau_network_hyperparameter_search,
)

In [4]:
output_dir = "rumours_output"
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

## Rumours

In [5]:
%run load_sbert-embeddings.py

In [6]:
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


# swmhau Network

In [7]:
features = ["time_encoding", "timeline_index"]
standardise_method = ["z_score", None]
include_features_in_path = True
include_features_in_input = False

In [9]:
num_epochs = 100
dimensions = [15]
# define swmhau parameters: (output_channels, sig_depth, num_heads)
swmhau_parameters = [(12, 3, 10), (8, 4, 6)]
num_layers = [1]
ffn_hidden_dim_sizes = [[256, 256], [512, 512]]
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]
loss = "focal"
gamma = 2
validation_metric = "f1"
patience = 5

In [None]:
kwargs = {
    "num_epochs": num_epochs,
    "df": df_rumours,
    "id_column": "timeline_id",
    "label_column": "label",
    "embeddings": sbert_embeddings,
    "y_data": y_data,
    "output_dim": output_dim,
    "dimensions": dimensions,
    "log_signature": True,
    "swmhau_parameters": swmhau_parameters,
    "num_layers": num_layers,
    "ffn_hidden_dim_sizes": ffn_hidden_dim_sizes,
    "dropout_rates": dropout_rates,
    "learning_rates": learning_rates,
    "seeds": seeds,
    "loss": loss,
    "gamma": gamma,
    "device": device,
    "features": features,
    "standardise_method": standardise_method,
    "include_features_in_path": include_features_in_path,
    "include_features_in_input": include_features_in_input,
    "split_ids": split_ids,
    "k_fold": True,
    "patience": patience,
    "validation_metric": validation_metric,
    "verbose": False,
}

# w=5

In [10]:
size = 5

## UMAP

In [None]:
(
    swmhau_network_umap_kfold_5,
    best_swmhau_network_umap_kfold_5,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swmhau_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


In [41]:
swmhau_network_umap_kfold_5.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "learning_rate",
    ]
).mean()

  swmhau_network_umap_kfold_5.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,embedding_dim,num_features,log_signature,dropout_rate,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,learning_rate,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
15,8,4,6,1,"(256, 256)",0.0001,0.688506,0.660915,0.659834,0.662731,0.730939,0.712657,0.714866,0.711311,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,14.5
15,8,4,6,1,"(256, 256)",0.0005,0.687884,0.661194,0.659645,0.663484,0.729546,0.711837,0.713169,0.71083,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,15.5
15,8,4,6,1,"(256, 256)",0.001,0.685586,0.659476,0.65795,0.662351,0.72813,0.711319,0.712071,0.711135,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,13.5
15,8,4,6,1,"(512, 512)",0.0001,0.686611,0.659636,0.658465,0.662012,0.728265,0.710142,0.711841,0.709037,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,20.5
15,8,4,6,1,"(512, 512)",0.0005,0.688723,0.659824,0.65926,0.66095,0.729995,0.71088,0.713851,0.709179,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,21.5
15,8,4,6,1,"(512, 512)",0.001,0.685958,0.657766,0.656936,0.65948,0.725817,0.707171,0.709195,0.705871,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,19.5
15,8,4,12,1,"(256, 256)",0.0001,0.688319,0.659784,0.658873,0.661009,0.730647,0.711275,0.714223,0.709185,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,26.5
15,8,4,12,1,"(256, 256)",0.0005,0.688071,0.660033,0.6589,0.661515,0.729231,0.710447,0.712692,0.708754,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,27.5
15,8,4,12,1,"(256, 256)",0.001,0.686207,0.659638,0.658052,0.662111,0.727794,0.710171,0.711352,0.709358,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,25.5
15,8,4,12,1,"(512, 512)",0.0001,0.689158,0.659291,0.659196,0.660013,0.731073,0.711057,0.714939,0.708726,5.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,32.5


In [33]:
best_swmhau_network_umap_kfold_5

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.688164,0.663149,"[0.7549436062692252, 0.571355367665898]",0.661067,"[0.7717879604672058, 0.5503455083909181]",0.666426,"[0.7388188073394495, 0.594033031433138]",,0.7346,...,0.0005,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.693197,0.665764,"[0.7615184004636337, 0.5700104493207941]",0.664486,"[0.7697715289982425, 0.5592004100461302]",0.667344,"[0.7534403669724771, 0.5812466702184337]",,0.733657,...,0.0005,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.695433,0.662713,"[0.7677657760090961, 0.5576610720086627]",0.664071,"[0.7612739571589628, 0.5668684645019263]",0.661559,"[0.7743692660550459, 0.5487480021310602]",,0.739857,...,0.0005,123,focal,2,True,5,Conv1d,,concatenation,64


In [34]:
best_swmhau_network_umap_kfold_5[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(256, 256)",0.2,0.0005
0,15,12,3,10,1,"(256, 256)",0.2,0.0005
0,15,12,3,10,1,"(256, 256)",0.2,0.0005


In [35]:
best_swmhau_network_umap_kfold_5["f1"].mean()

0.6638757786228849

In [36]:
best_swmhau_network_umap_kfold_5["precision"].mean()

0.6632079715938975

In [37]:
best_swmhau_network_umap_kfold_5["recall"].mean()

0.6651093573582675

In [38]:
np.stack(best_swmhau_network_umap_kfold_5["f1_scores"]).mean(axis=0)

array([0.76140926, 0.5663423 ])

In [39]:
np.stack(best_swmhau_network_umap_kfold_5["precision_scores"]).mean(axis=0)

array([0.76761115, 0.55880479])

In [40]:
np.stack(best_swmhau_network_umap_kfold_5["recall_scores"]).mean(axis=0)

array([0.75554281, 0.5746759 ])

## Random Projections

In [None]:
(
    swmhau_network_grp_kfold_5,
    best_swmhau_network_grp_kfold_5,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["gaussian_random_projection"],
    results_output=f"{output_dir}/swmhau_network_grp_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_5_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_5_kfold_best_model.csv


In [None]:
swmhau_network_grp_kfold_5.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
).mean()

  swmhau_network_grp_kfold_5.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,include_features_in_input,embedding_dim,num_features,log_signature,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,8,4,6,1,"(256, 256)",0.1,0.0001,0.70028,0.668747,0.669814,0.667869,0.723727,0.702427,0.706708,0.699668,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,13.0
15,8,4,6,1,"(256, 256)",0.1,0.0005,0.695247,0.671043,0.668999,0.674703,0.730826,0.714933,0.71488,0.71518,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,14.0
15,8,4,6,1,"(256, 256)",0.1,0.001,0.693321,0.666047,0.664903,0.667849,0.729658,0.711711,0.713237,0.710502,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,12.0
15,8,4,6,1,"(256, 256)",0.2,0.0001,0.698975,0.667751,0.668526,0.667071,0.723233,0.701337,0.70619,0.69834,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,16.0
15,8,4,6,1,"(256, 256)",0.2,0.0005,0.691395,0.667471,0.665156,0.671412,0.728355,0.712566,0.712378,0.712957,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,17.0
15,8,4,6,1,"(256, 256)",0.2,0.001,0.688537,0.664382,0.662638,0.668517,0.726558,0.711547,0.710899,0.712705,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,15.0
15,8,4,6,1,"(512, 512)",0.1,0.0001,0.695433,0.669219,0.669079,0.672304,0.726828,0.709303,0.711367,0.709166,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,19.0
15,8,4,6,1,"(512, 512)",0.1,0.0005,0.695185,0.669072,0.667678,0.671497,0.729613,0.712409,0.713383,0.711779,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,20.0
15,8,4,6,1,"(512, 512)",0.1,0.001,0.689531,0.664015,0.662277,0.667026,0.726468,0.710767,0.710398,0.71132,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,18.0
15,8,4,6,1,"(512, 512)",0.2,0.0001,0.701274,0.67015,0.671589,0.6697,0.729433,0.708598,0.713457,0.705976,5.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,22.0


In [None]:
best_swmhau_network_grp_kfold_5

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.70233,0.674991,"[0.7692529981216587, 0.5807298503544237]",0.673939,"[0.775415088843577, 0.572463768115942]",0.676213,"[0.7631880733944955, 0.5892381459776239]",,0.729883,...,0.0005,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.692265,0.670258,"[0.7554436379795586, 0.5850716260366926]",0.667624,"[0.7814894269077536, 0.5537583254043768]",0.675608,"[0.7310779816513762, 0.6201385189131593]",,0.730826,...,0.0005,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.691146,0.66788,"[0.7557848194546793, 0.5799746514575412]",0.665434,"[0.7776766757658478, 0.5531914893617021]",0.672287,"[0.7350917431192661, 0.6094832179009056]",,0.73177,...,0.0005,123,focal,2,True,5,Conv1d,,concatenation,64


In [None]:
best_swmhau_network_grp_kfold_5.columns

Index(['loss', 'accuracy', 'f1', 'f1_scores', 'precision', 'precision_scores',
       'recall', 'recall_scores', 'valid_loss', 'valid_accuracy', 'valid_f1',
       'valid_f1_scores', 'valid_precision', 'valid_precision_scores',
       'valid_recall', 'valid_recall_scores', 'k', 'dimensions', 'sig_depth',
       'method', 'input_channels', 'output_channels', 'features',
       'standardise_method', 'include_features_in_path',
       'include_features_in_input', 'embedding_dim', 'num_features',
       'log_signature', 'num_heads', 'num_layers', 'ffn_hidden_dim',
       'dropout_rate', 'learning_rate', 'seed', 'loss_function', 'gamma',
       'k_fold', 'n_splits', 'augmentation_type', 'hidden_dim_aug',
       'comb_method', 'batch_size'],
      dtype='object')

In [None]:
best_swmhau_network_grp_kfold_5[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,8,4,6,1,"(256, 256)",0.1,0.0005
0,15,8,4,6,1,"(256, 256)",0.1,0.0005
0,15,8,4,6,1,"(256, 256)",0.1,0.0005


In [None]:
best_swmhau_network_grp_kfold_5["f1"].mean()

0.6710429305674257

In [None]:
best_swmhau_network_grp_kfold_5["precision"].mean()

0.6689991290665332

In [None]:
best_swmhau_network_grp_kfold_5["recall"].mean()

0.6747029468261377

In [None]:
np.stack(best_swmhau_network_grp_kfold_5["f1_scores"]).mean(axis=0)

array([0.76016049, 0.58192538])

In [None]:
np.stack(best_swmhau_network_grp_kfold_5["precision_scores"]).mean(axis=0)

array([0.77819373, 0.55980453])

In [None]:
np.stack(best_swmhau_network_grp_kfold_5["recall_scores"]).mean(axis=0)

array([0.74311927, 0.60628663])

# w=11

In [10]:
size = 11

## UMAP

In [44]:
(
    swmhau_network_umap_kfold_11,
    best_swmhau_network_umap_kfold_11,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swmhau_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_umap_focal_2_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_umap_focal_2_11_kfold_best_model.csv


In [45]:
swmhau_network_umap_kfold_11.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "learning_rate",
    ]
).mean()

  swmhau_network_umap_kfold_11.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,embedding_dim,num_features,log_signature,dropout_rate,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,learning_rate,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
15,8,4,6,1,"(256, 256)",0.0001,0.678689,0.654235,0.652523,0.658544,0.741856,0.726353,0.726648,0.726469,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,14.5
15,8,4,6,1,"(256, 256)",0.0005,0.688972,0.663341,0.661549,0.666268,0.732781,0.715373,0.716673,0.71444,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,15.5
15,8,4,6,1,"(256, 256)",0.001,0.691643,0.665971,0.664306,0.668815,0.727996,0.710679,0.711819,0.710132,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,13.5
15,8,4,6,1,"(512, 512)",0.0001,0.683877,0.658866,0.656896,0.662432,0.741497,0.725124,0.725958,0.724459,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,20.5
15,8,4,6,1,"(512, 512)",0.0005,0.690059,0.66397,0.662401,0.666632,0.733769,0.715881,0.717859,0.714661,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,21.5
15,8,4,6,1,"(512, 512)",0.001,0.69096,0.660148,0.660487,0.660251,0.72822,0.707899,0.712049,0.705527,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,19.5
15,8,4,12,1,"(256, 256)",0.0001,0.679807,0.653296,0.651836,0.656185,0.744328,0.726948,0.729202,0.725478,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,26.5
15,8,4,12,1,"(256, 256)",0.0005,0.687201,0.661309,0.65968,0.664209,0.734578,0.717983,0.718721,0.717499,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,27.5
15,8,4,12,1,"(256, 256)",0.001,0.690618,0.663567,0.662257,0.665566,0.730377,0.711904,0.714235,0.710506,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,25.5
15,8,4,12,1,"(512, 512)",0.0001,0.685026,0.659036,0.657408,0.662065,0.742328,0.725248,0.726903,0.724078,11.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,32.5


In [46]:
best_swmhau_network_umap_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.676794,0.652483,"[0.7443985849056604, 0.560567663456665]",0.650319,"[0.7660800970873787, 0.5345577573707105]",0.656574,"[0.7239105504587156, 0.5892381459776239]",,0.760749,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.685741,0.666742,"[0.7463135720734276, 0.5871694417238003]",0.664289,"[0.7853071564281191, 0.543271409152696]",0.674897,"[0.7110091743119266, 0.6387852956846031]",,0.754954,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.665797,0.646548,"[0.729031283058788, 0.5640651592511549]",0.644818,"[0.7708533077660594, 0.518783542039356]",0.654761,"[0.6915137614678899, 0.6180074587107086]",,0.750371,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [47]:
best_swmhau_network_umap_kfold_11[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(512, 512)",0.1,0.0001
0,15,12,3,10,1,"(512, 512)",0.1,0.0001
0,15,12,3,10,1,"(512, 512)",0.1,0.0001


In [48]:
best_swmhau_network_umap_kfold_11["f1"].mean()

0.6552576174115826

In [49]:
best_swmhau_network_umap_kfold_11["precision"].mean()

0.65314221164072

In [50]:
best_swmhau_network_umap_kfold_11["recall"].mean()

0.662077397768578

In [51]:
np.stack(best_swmhau_network_umap_kfold_11["f1_scores"]).mean(axis=0)

array([0.73991448, 0.57060075])

In [52]:
np.stack(best_swmhau_network_umap_kfold_11["precision_scores"]).mean(axis=0)

array([0.77408019, 0.53220424])

In [53]:
np.stack(best_swmhau_network_umap_kfold_11["recall_scores"]).mean(axis=0)

array([0.70881116, 0.61534363])

## Random Projections

In [11]:
(
    swmhau_network_grp_kfold_11,
    best_swmhau_network_grp_kfold_11,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["gaussian_random_projection"],
    results_output=f"{output_dir}/swmhau_network_grp_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_11_kfold_best_model.csv


In [12]:
swmhau_network_grp_kfold_11.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
).mean()

  swmhau_network_grp_kfold_11.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,include_features_in_input,embedding_dim,num_features,log_signature,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,8,4,6,1,"(256, 256)",0.1,0.0001,0.676918,0.657446,0.655669,0.665528,0.738959,0.728594,0.726175,0.734063,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,13.0
15,8,4,6,1,"(256, 256)",0.1,0.0005,0.696676,0.672628,0.670449,0.676376,0.734106,0.718789,0.718655,0.719392,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,14.0
15,8,4,6,1,"(256, 256)",0.1,0.001,0.692389,0.667902,0.665934,0.671439,0.737071,0.721173,0.721586,0.721222,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,12.0
15,8,4,6,1,"(256, 256)",0.2,0.0001,0.679528,0.65969,0.658694,0.668273,0.738419,0.727629,0.725682,0.732858,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,16.0
15,8,4,6,1,"(256, 256)",0.2,0.0005,0.694315,0.665075,0.664743,0.665702,0.735724,0.716116,0.719897,0.713593,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,17.0
15,8,4,6,1,"(256, 256)",0.2,0.001,0.695185,0.669797,0.667904,0.672564,0.7346,0.718232,0.71863,0.717868,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,15.0
15,8,4,6,1,"(512, 512)",0.1,0.0001,0.68456,0.663382,0.66119,0.669806,0.742822,0.730735,0.729203,0.734107,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,19.0
15,8,4,6,1,"(512, 512)",0.1,0.0005,0.689966,0.661984,0.661303,0.663752,0.73469,0.716969,0.71883,0.715768,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,20.0
15,8,4,6,1,"(512, 512)",0.1,0.001,0.694439,0.667307,0.666121,0.669037,0.73478,0.717737,0.718819,0.716904,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,18.0
15,8,4,6,1,"(512, 512)",0.2,0.0001,0.688164,0.666545,0.664264,0.672455,0.742867,0.730201,0.728931,0.733012,11.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,22.0


In [13]:
best_swmhau_network_grp_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.681267,0.660027,"[0.7450044736057262, 0.5750497017892645]",0.657575,"[0.776258545680547, 0.5388914764788076]",0.666289,"[0.7161697247706422, 0.6164091635588705]",,0.759132,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.6863,0.666796,"[0.7474110760918505, 0.5861814605360216]",0.664272,"[0.784251968503937, 0.5442922374429223]",0.674466,"[0.7138761467889908, 0.6350559403303143]",,0.764389,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.672321,0.652575,"[0.7354003612281758, 0.5697503671071953]",0.650508,"[0.7740811153358682, 0.526935264825713]",0.66027,"[0.700401376146789, 0.6201385189131593]",,0.762502,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [14]:
best_swmhau_network_grp_kfold_11.columns

Index(['loss', 'accuracy', 'f1', 'f1_scores', 'precision', 'precision_scores',
       'recall', 'recall_scores', 'valid_loss', 'valid_accuracy', 'valid_f1',
       'valid_f1_scores', 'valid_precision', 'valid_precision_scores',
       'valid_recall', 'valid_recall_scores', 'k', 'dimensions', 'sig_depth',
       'method', 'input_channels', 'output_channels', 'features',
       'standardise_method', 'include_features_in_path',
       'include_features_in_input', 'embedding_dim', 'num_features',
       'log_signature', 'num_heads', 'num_layers', 'ffn_hidden_dim',
       'dropout_rate', 'learning_rate', 'seed', 'loss_function', 'gamma',
       'k_fold', 'n_splits', 'augmentation_type', 'hidden_dim_aug',
       'comb_method', 'batch_size'],
      dtype='object')

In [15]:
best_swmhau_network_grp_kfold_11[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(256, 256)",0.2,0.0001
0,15,12,3,10,1,"(256, 256)",0.2,0.0001
0,15,12,3,10,1,"(256, 256)",0.2,0.0001


In [16]:
best_swmhau_network_grp_kfold_11["f1"].mean()

0.659799573393039

In [17]:
best_swmhau_network_grp_kfold_11["precision"].mean()

0.6574517680446325

In [18]:
best_swmhau_network_grp_kfold_11["recall"].mean()

0.6670084784181277

In [19]:
np.stack(best_swmhau_network_grp_kfold_11["f1_scores"]).mean(axis=0)

array([0.7426053 , 0.57699384])

In [20]:
np.stack(best_swmhau_network_grp_kfold_11["precision_scores"]).mean(axis=0)

array([0.77819721, 0.53670633])

In [21]:
np.stack(best_swmhau_network_grp_kfold_11["recall_scores"]).mean(axis=0)

array([0.71014908, 0.62386787])

# w=20

In [10]:
size = 20

## UMAP

In [11]:
(
    swmhau_network_umap_kfold_20,
    best_swmhau_network_umap_kfold_20,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swmhau_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_umap_focal_2_20_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_umap_focal_2_20_kfold_best_model.csv


In [12]:
swmhau_network_umap_kfold_20.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "learning_rate",
    ]
).mean()

  swmhau_network_umap_kfold_20.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,embedding_dim,num_features,log_signature,dropout_rate,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,learning_rate,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
15,8,4,6,1,"(256, 256)",0.0001,0.677446,0.652879,0.651459,0.657507,0.812733,0.802105,0.801385,0.803211,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,14.5
15,8,4,6,1,"(256, 256)",0.0005,0.686518,0.659628,0.658396,0.66192,0.758144,0.742776,0.743988,0.742118,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,15.5
15,8,4,6,1,"(256, 256)",0.001,0.685275,0.660886,0.659157,0.664942,0.732219,0.716691,0.716855,0.71729,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,13.5
15,8,4,6,1,"(512, 512)",0.0001,0.678316,0.651984,0.651001,0.655469,0.805275,0.793115,0.793934,0.792681,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,20.5
15,8,4,6,1,"(512, 512)",0.0005,0.690587,0.662969,0.661888,0.664639,0.749405,0.732087,0.734575,0.730361,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,21.5
15,8,4,6,1,"(512, 512)",0.001,0.687481,0.659178,0.658594,0.660774,0.737319,0.71891,0.721675,0.717111,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,19.5
15,8,4,12,1,"(256, 256)",0.0001,0.676608,0.652518,0.65102,0.65721,0.812711,0.802095,0.801458,0.803148,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,26.5
15,8,4,12,1,"(256, 256)",0.0005,0.682976,0.656174,0.654675,0.658643,0.756391,0.740829,0.742145,0.740069,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,27.5
15,8,4,12,1,"(256, 256)",0.001,0.686052,0.659958,0.65855,0.662833,0.733926,0.717337,0.718113,0.716882,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,25.5
15,8,4,12,1,"(512, 512)",0.0001,0.677229,0.652732,0.651721,0.657524,0.802804,0.791833,0.790974,0.793367,20.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,32.5


In [13]:
best_swmhau_network_umap_kfold_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.66822,0.645328,"[0.7354340071343638, 0.5552223888055972]",0.643202,"[0.7635802469135803, 0.5228235294117647]",0.650595,"[0.7092889908256881, 0.5919019712306872]",,0.84351,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.673625,0.650501,"[0.7404002965159378, 0.5606022584692596]",0.648271,"[0.7666564322996623, 0.5298861480075902]",0.655491,"[0.7158830275229358, 0.5950985615343634]",,0.855371,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.685368,0.660092,"[0.7527826596367897, 0.5674013326499231]",0.658064,"[0.7694610778443114, 0.5466666666666666]",0.663291,"[0.7368119266055045, 0.5897709110282365]",,0.831918,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [14]:
best_swmhau_network_umap_kfold_20[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(256, 256)",0.2,0.0001
0,15,12,3,10,1,"(256, 256)",0.2,0.0001
0,15,12,3,10,1,"(256, 256)",0.2,0.0001


In [15]:
best_swmhau_network_umap_kfold_20["f1"].mean()

0.6519738238686452

In [16]:
best_swmhau_network_umap_kfold_20["precision"].mean()

0.6498456835239292

In [17]:
best_swmhau_network_umap_kfold_20["recall"].mean()

0.6564592314579026

In [18]:
np.stack(best_swmhau_network_umap_kfold_20["f1_scores"]).mean(axis=0)

array([0.74287232, 0.56107533])

In [19]:
np.stack(best_swmhau_network_umap_kfold_20["precision_scores"]).mean(axis=0)

array([0.76656592, 0.53312545])

In [20]:
np.stack(best_swmhau_network_umap_kfold_20["recall_scores"]).mean(axis=0)

array([0.72066131, 0.59225715])

## Random Projections

In [11]:
(
    swmhau_network_grp_kfold_20,
    best_swmhau_network_grp_kfold_20,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["gaussian_random_projection"],
    results_output=f"{output_dir}/swmhau_network_grp_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_20_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_20_kfold_best_model.csv


In [12]:
swmhau_network_grp_kfold_20.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
).mean()

  swmhau_network_grp_kfold_20.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,include_features_in_input,embedding_dim,num_features,log_signature,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,8,4,6,1,"(256, 256)",0.1,0.0001,0.676235,0.652267,0.650157,0.6568,0.83075,0.82274,0.819755,0.826923,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,13.0
15,8,4,6,1,"(256, 256)",0.1,0.0005,0.691768,0.669813,0.667315,0.675185,0.757919,0.745412,0.743842,0.747436,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,14.0
15,8,4,6,1,"(256, 256)",0.1,0.001,0.688288,0.666046,0.664734,0.672181,0.74143,0.727441,0.727458,0.729542,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,12.0
15,8,4,6,1,"(256, 256)",0.2,0.0001,0.68077,0.657414,0.6551,0.662052,0.825134,0.817213,0.813948,0.822114,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,16.0
15,8,4,6,1,"(256, 256)",0.2,0.0005,0.692886,0.673768,0.671538,0.681951,0.74763,0.736357,0.734411,0.740571,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,17.0
15,8,4,6,1,"(256, 256)",0.2,0.001,0.699596,0.676041,0.673934,0.680139,0.749068,0.734785,0.734338,0.735555,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,15.0
15,8,4,6,1,"(512, 512)",0.1,0.0001,0.671699,0.647433,0.646222,0.652328,0.827829,0.820085,0.817005,0.825424,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,19.0
15,8,4,6,1,"(512, 512)",0.1,0.0005,0.695309,0.670705,0.668674,0.674054,0.746102,0.731761,0.731245,0.732661,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,20.0
15,8,4,6,1,"(512, 512)",0.1,0.001,0.69593,0.672352,0.670188,0.676705,0.743541,0.728971,0.728669,0.729867,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,18.0
15,8,4,6,1,"(512, 512)",0.2,0.0001,0.667785,0.646184,0.644106,0.652435,0.830435,0.82342,0.819651,0.829996,20.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,22.0


In [13]:
best_swmhau_network_grp_kfold_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.685741,0.661671,"[0.751912889935256, 0.5714285714285714]",0.659398,"[0.7723700120918985, 0.5464268351968887]",0.66567,"[0.7325114678899083, 0.5988279168886521]",,0.838118,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.71575,0.69261,"[0.7769489542196869, 0.6082712561006935]",0.690187,"[0.7930725589728277, 0.5873015873015873]",0.696131,"[0.7614678899082569, 0.630793819925413]",,0.847823,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.674744,0.649794,"[0.7432690893041047, 0.5563183320620392]",0.647737,"[0.7633726201269265, 0.5321011673151751]",0.653521,"[0.724197247706422, 0.5828449653702718]",,0.84971,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [14]:
best_swmhau_network_grp_kfold_20.columns

Index(['loss', 'accuracy', 'f1', 'f1_scores', 'precision', 'precision_scores',
       'recall', 'recall_scores', 'valid_loss', 'valid_accuracy', 'valid_f1',
       'valid_f1_scores', 'valid_precision', 'valid_precision_scores',
       'valid_recall', 'valid_recall_scores', 'k', 'dimensions', 'sig_depth',
       'method', 'input_channels', 'output_channels', 'features',
       'standardise_method', 'include_features_in_path',
       'include_features_in_input', 'embedding_dim', 'num_features',
       'log_signature', 'num_heads', 'num_layers', 'ffn_hidden_dim',
       'dropout_rate', 'learning_rate', 'seed', 'loss_function', 'gamma',
       'k_fold', 'n_splits', 'augmentation_type', 'hidden_dim_aug',
       'comb_method', 'batch_size'],
      dtype='object')

In [15]:
best_swmhau_network_grp_kfold_20[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(256, 256)",0.1,0.0001
0,15,12,3,10,1,"(256, 256)",0.1,0.0001
0,15,12,3,10,1,"(256, 256)",0.1,0.0001


In [16]:
best_swmhau_network_grp_kfold_20["f1"].mean()

0.6680248488417253

In [17]:
best_swmhau_network_grp_kfold_20["precision"].mean()

0.6657741301675506

In [18]:
best_swmhau_network_grp_kfold_20["recall"].mean()

0.6717738846148208

In [19]:
np.stack(best_swmhau_network_grp_kfold_20["f1_scores"]).mean(axis=0)

array([0.75737698, 0.57867272])

In [20]:
np.stack(best_swmhau_network_grp_kfold_20["precision_scores"]).mean(axis=0)

array([0.77627173, 0.55527653])

In [22]:
np.stack(best_swmhau_network_grp_kfold_20["recall_scores"]).mean(axis=0)

array([0.7393922 , 0.60415557])

# w=35

In [10]:
size = 35

## UMAP

In [11]:
(
    swmhau_network_umap_kfold_35,
    best_swmhau_network_umap_kfold_35,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swmhau_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_umap_focal_2_35_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_umap_focal_2_35_kfold_best_model.csv


In [12]:
swmhau_network_umap_kfold_35.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "learning_rate",
    ]
).mean()

  swmhau_network_umap_kfold_35.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,embedding_dim,num_features,log_signature,dropout_rate,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,learning_rate,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
15,8,4,6,1,"(256, 256)",0.0001,0.686518,0.658949,0.657954,0.66112,0.855753,0.846203,0.848703,0.844399,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,14.5
15,8,4,6,1,"(256, 256)",0.0005,0.690898,0.665038,0.663663,0.667872,0.781709,0.767779,0.769241,0.766998,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,15.5
15,8,4,6,1,"(256, 256)",0.001,0.691364,0.66398,0.663245,0.665872,0.754751,0.737689,0.740766,0.735971,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,13.5
15,8,4,6,1,"(512, 512)",0.0001,0.685586,0.657291,0.656238,0.658701,0.8529,0.843296,0.845195,0.841675,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,20.5
15,8,4,6,1,"(512, 512)",0.0005,0.688102,0.663713,0.662432,0.667711,0.768814,0.755685,0.755658,0.756511,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,21.5
15,8,4,6,1,"(512, 512)",0.001,0.692948,0.666037,0.665671,0.668444,0.746642,0.730169,0.73237,0.729521,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,19.5
15,8,4,12,1,"(256, 256)",0.0001,0.686828,0.660438,0.65897,0.663163,0.856315,0.847011,0.848753,0.845531,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,26.5
15,8,4,12,1,"(256, 256)",0.0005,0.685772,0.662744,0.660569,0.667703,0.784764,0.771784,0.77207,0.771696,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,27.5
15,8,4,12,1,"(256, 256)",0.001,0.6927,0.667392,0.665796,0.670509,0.753044,0.73749,0.738548,0.736957,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,25.5
15,8,4,12,1,"(512, 512)",0.0001,0.682324,0.655525,0.654062,0.65812,0.853484,0.844012,0.845633,0.842599,35.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,32.5


In [13]:
best_swmhau_network_umap_kfold_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.68835,0.666982,"[0.7513384889946462, 0.5826260609086372]",0.664369,"[0.7805933250927071, 0.5481446688586191]",0.672967,"[0.724197247706422, 0.6217368140649974]",,0.865211,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.67698,0.650838,"[0.7463778720913214, 0.5552989479086476]",0.648988,"[0.7623318385650224, 0.5356435643564357]",0.653765,"[0.7310779816513762, 0.5764517847629196]",,0.885429,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.690587,0.66197,"[0.7603234190008663, 0.5636172450052577]",0.661079,"[0.7658522396742292, 0.5563051375194603]",0.662999,"[0.7548738532110092, 0.5711241342567928]",,0.877342,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [14]:
best_swmhau_network_umap_kfold_35[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(512, 512)",0.2,0.0001
0,15,12,3,10,1,"(512, 512)",0.2,0.0001
0,15,12,3,10,1,"(512, 512)",0.2,0.0001


In [15]:
best_swmhau_network_umap_kfold_35["f1"].mean()

0.6599303389848962

In [16]:
best_swmhau_network_umap_kfold_35["precision"].mean()

0.6581451290110789

In [17]:
best_swmhau_network_umap_kfold_35["recall"].mean()

0.6632436359422528

In [18]:
np.stack(best_swmhau_network_umap_kfold_35["f1_scores"]).mean(axis=0)

array([0.75267993, 0.56718075])

In [19]:
np.stack(best_swmhau_network_umap_kfold_35["precision_scores"]).mean(axis=0)

array([0.76959247, 0.54669779])

In [20]:
np.stack(best_swmhau_network_umap_kfold_35["recall_scores"]).mean(axis=0)

array([0.73671636, 0.58977091])

## Random Projections

In [11]:
(
    swmhau_network_grp_kfold_35,
    best_swmhau_network_grp_kfold_35,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["gaussian_random_projection"],
    results_output=f"{output_dir}/swmhau_network_grp_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_35_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_35_kfold_best_model.csv


In [12]:
swmhau_network_grp_kfold_35.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
).mean()

  swmhau_network_grp_kfold_35.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,include_features_in_input,embedding_dim,num_features,log_signature,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,8,4,6,1,"(256, 256)",0.1,0.0001,0.674495,0.646773,0.645426,0.648942,0.867547,0.860472,0.858558,0.862856,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,13.0
15,8,4,6,1,"(256, 256)",0.1,0.0005,0.703138,0.67651,0.675382,0.678188,0.792829,0.780706,0.780655,0.781075,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,14.0
15,8,4,6,1,"(256, 256)",0.1,0.001,0.701709,0.677931,0.675892,0.681969,0.7891,0.777031,0.776644,0.777791,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,12.0
15,8,4,6,1,"(256, 256)",0.2,0.0001,0.681889,0.654541,0.653097,0.65672,0.86912,0.861874,0.860452,0.863448,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,16.0
15,8,4,6,1,"(256, 256)",0.2,0.0005,0.705064,0.681719,0.679375,0.685657,0.79903,0.787274,0.786875,0.787828,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,17.0
15,8,4,6,1,"(256, 256)",0.2,0.001,0.710966,0.682575,0.682778,0.682937,0.777553,0.762699,0.764923,0.761105,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,15.0
15,8,4,6,1,"(512, 512)",0.1,0.0001,0.675116,0.646289,0.645214,0.647861,0.874422,0.867521,0.865997,0.869361,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,19.0
15,8,4,6,1,"(512, 512)",0.1,0.0005,0.709289,0.682467,0.681632,0.683861,0.784338,0.76996,0.772042,0.768533,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,20.0
15,8,4,6,1,"(512, 512)",0.1,0.001,0.70525,0.67852,0.677847,0.680386,0.774678,0.760907,0.761473,0.760955,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,18.0
15,8,4,6,1,"(512, 512)",0.2,0.0001,0.680957,0.651754,0.650825,0.652886,0.872849,0.865819,0.864454,0.867502,35.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,22.0


In [13]:
best_swmhau_network_grp_kfold_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.676794,0.647401,"[0.7492045125831646, 0.5455974842767296]",0.646429,"[0.7559836544074723, 0.5368746776689015]",0.648577,"[0.742545871559633, 0.5546084176877997]",,0.880442,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.711463,0.690412,"[0.7711413364872856, 0.6096822995461422]",0.687419,"[0.796092796092796, 0.578745811393011]",0.69591,"[0.7477064220183486, 0.6441129461907299]",,0.892843,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.67698,0.650061,"[0.747118050488837, 0.5530049006964148]",0.648386,"[0.7607726597325408, 0.536]",0.652535,"[0.7339449541284404, 0.5711241342567928]",,0.886912,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [14]:
best_swmhau_network_grp_kfold_35.columns

Index(['loss', 'accuracy', 'f1', 'f1_scores', 'precision', 'precision_scores',
       'recall', 'recall_scores', 'valid_loss', 'valid_accuracy', 'valid_f1',
       'valid_f1_scores', 'valid_precision', 'valid_precision_scores',
       'valid_recall', 'valid_recall_scores', 'k', 'dimensions', 'sig_depth',
       'method', 'input_channels', 'output_channels', 'features',
       'standardise_method', 'include_features_in_path',
       'include_features_in_input', 'embedding_dim', 'num_features',
       'log_signature', 'num_heads', 'num_layers', 'ffn_hidden_dim',
       'dropout_rate', 'learning_rate', 'seed', 'loss_function', 'gamma',
       'k_fold', 'n_splits', 'augmentation_type', 'hidden_dim_aug',
       'comb_method', 'batch_size'],
      dtype='object')

In [15]:
best_swmhau_network_grp_kfold_35[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(512, 512)",0.2,0.0001
0,15,12,3,10,1,"(512, 512)",0.2,0.0001
0,15,12,3,10,1,"(512, 512)",0.2,0.0001


In [16]:
best_swmhau_network_grp_kfold_35["f1"].mean()

0.6626247640130957

In [17]:
best_swmhau_network_grp_kfold_35["precision"].mean()

0.660744933215787

In [18]:
best_swmhau_network_grp_kfold_35["recall"].mean()

0.665673790973624

In [19]:
np.stack(best_swmhau_network_grp_kfold_35["f1_scores"]).mean(axis=0)

array([0.7558213 , 0.56942823])

In [20]:
np.stack(best_swmhau_network_grp_kfold_35["precision_scores"]).mean(axis=0)

array([0.7709497 , 0.55054016])

In [21]:
np.stack(best_swmhau_network_grp_kfold_35["recall_scores"]).mean(axis=0)

array([0.74139908, 0.5899485 ])

# w=80

In [10]:
size = 80

## UMAP

In [23]:
(
    swmhau_network_umap_kfold_80,
    best_swmhau_network_umap_kfold_80,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/swmhau_network_umap_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_umap_focal_2_80_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_umap_focal_2_80_kfold_best_model.csv


In [24]:
swmhau_network_umap_kfold_80.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "learning_rate",
    ]
).mean()

  swmhau_network_umap_kfold_80.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,embedding_dim,num_features,log_signature,dropout_rate,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,learning_rate,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
15,8,4,6,1,"(256, 256)",0.0001,0.690805,0.65936,0.660287,0.659373,0.87085,0.861707,0.865952,0.858317,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,14.5
15,8,4,6,1,"(256, 256)",0.0005,0.689313,0.664536,0.663797,0.668581,0.793054,0.780699,0.781094,0.781064,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,15.5
15,8,4,6,1,"(256, 256)",0.001,0.684871,0.658288,0.657677,0.661064,0.764703,0.749009,0.751407,0.747735,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,13.5
15,8,4,6,1,"(512, 512)",0.0001,0.688164,0.658755,0.658388,0.659946,0.869884,0.861023,0.864235,0.858339,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,20.5
15,8,4,6,1,"(512, 512)",0.0005,0.688226,0.658465,0.659242,0.65942,0.786382,0.770979,0.7754,0.76825,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,21.5
15,8,4,6,1,"(512, 512)",0.001,0.694843,0.669584,0.669262,0.672895,0.756841,0.74154,0.743247,0.741507,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,19.5
15,12,3,10,1,"(256, 256)",0.0001,0.671451,0.642925,0.641695,0.64498,0.882891,0.874812,0.87848,0.87195,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,2.5
15,12,3,10,1,"(256, 256)",0.0005,0.67667,0.654308,0.652188,0.660047,0.870063,0.862264,0.862415,0.862389,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,3.5
15,12,3,10,1,"(256, 256)",0.001,0.674153,0.652502,0.651175,0.659342,0.804781,0.794298,0.79321,0.796243,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,1.5
15,12,3,10,1,"(512, 512)",0.0001,0.67201,0.647339,0.645349,0.651296,0.88507,0.877427,0.879961,0.875487,80.0,17.0,...,384.0,0.0,1.0,0.15,45.333333,2.0,1.0,5.0,64.0,8.5


In [25]:
best_swmhau_network_umap_kfold_80

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.701212,0.678154,"[0.7642993677400383, 0.5920081445660472]",0.675625,"[0.7844853607002716, 0.5667641325536062]",0.682366,"[0.7451261467889908, 0.6196057538625466]",,0.874511,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.650326,0.624184,"[0.7233038348082595, 0.5250632911392404]",0.622539,"[0.7448359659781288, 0.5002411963338157]",0.62773,"[0.7029816513761468, 0.552477357485349]",,0.896886,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.664865,0.642602,"[0.7318019093078758, 0.5534028812717338]",0.640565,"[0.7627487562189055, 0.5183806421591438]",0.648384,"[0.7032683486238532, 0.5935002663825253]",,0.886238,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [26]:
best_swmhau_network_umap_kfold_80[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(512, 512)",0.1,0.0001
0,15,12,3,10,1,"(512, 512)",0.1,0.0001
0,15,12,3,10,1,"(512, 512)",0.1,0.0001


In [27]:
best_swmhau_network_umap_kfold_80["f1"].mean()

0.6483132381388658

In [28]:
best_swmhau_network_umap_kfold_80["precision"].mean()

0.646242675657312

In [29]:
best_swmhau_network_umap_kfold_80["recall"].mean()

0.6528265874199018

In [30]:
np.stack(best_swmhau_network_umap_kfold_80["f1_scores"]).mean(axis=0)

array([0.7398017 , 0.55682477])

In [31]:
np.stack(best_swmhau_network_umap_kfold_80["precision_scores"]).mean(axis=0)

array([0.76402336, 0.52846199])

In [32]:
np.stack(best_swmhau_network_umap_kfold_80["recall_scores"]).mean(axis=0)

array([0.71712538, 0.58852779])

## Random Projections

In [11]:
(
    swmhau_network_grp_kfold_80,
    best_swmhau_network_grp_kfold_80,
    _,
    __,
) = swmhau_network_hyperparameter_search(
    history_lengths=[size],
    dim_reduce_methods=["gaussian_random_projection"],
    results_output=f"{output_dir}/swmhau_network_grp_focal_{gamma}_{size}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_80_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_80_kfold_best_model.csv


In [12]:
swmhau_network_grp_kfold_80.groupby(
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
).mean()

  swmhau_network_grp_kfold_80.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,input_channels,...,include_features_in_input,embedding_dim,num_features,log_signature,seed,gamma,k_fold,n_splits,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,8,4,6,1,"(256, 256)",0.1,0.0001,0.690276,0.655793,0.657777,0.654353,0.893517,0.88644,0.889041,0.884202,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,13.0
15,8,4,6,1,"(256, 256)",0.1,0.0005,0.720099,0.69551,0.69383,0.697753,0.801815,0.790766,0.789801,0.792025,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,14.0
15,8,4,6,1,"(256, 256)",0.1,0.001,0.728363,0.700402,0.701156,0.699843,0.794447,0.781235,0.782487,0.780187,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,12.0
15,8,4,6,1,"(256, 256)",0.2,0.0001,0.703448,0.671153,0.673084,0.670101,0.889653,0.882446,0.884579,0.880536,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,16.0
15,8,4,6,1,"(256, 256)",0.2,0.0005,0.70991,0.684042,0.682632,0.685857,0.795031,0.782333,0.782911,0.781813,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,17.0
15,8,4,6,1,"(256, 256)",0.2,0.001,0.721839,0.696682,0.69565,0.69864,0.790403,0.777732,0.778112,0.777961,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,15.0
15,8,4,6,1,"(512, 512)",0.1,0.0001,0.68338,0.65052,0.651206,0.650279,0.890956,0.883969,0.885576,0.882494,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,19.0
15,8,4,6,1,"(512, 512)",0.1,0.0005,0.729046,0.700421,0.703006,0.699261,0.792605,0.779054,0.781253,0.777952,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,20.0
15,8,4,6,1,"(512, 512)",0.1,0.001,0.716682,0.68609,0.688147,0.684749,0.78892,0.774614,0.777395,0.772621,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,18.0
15,8,4,6,1,"(512, 512)",0.2,0.0001,0.70233,0.670613,0.671908,0.669815,0.886777,0.87968,0.881146,0.878756,80.0,17.0,...,0.0,384.0,0.0,1.0,45.333333,2.0,1.0,5.0,64.0,22.0


In [13]:
best_swmhau_network_grp_kfold_80

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.695061,0.669314,"[0.7615855435733022, 0.5770423991726992]",0.667487,"[0.7744516893894487, 0.5605223505775992]",0.671853,"[0.7491399082568807, 0.5945657964837506]",,0.896752,...,0.0001,1,focal,2,True,5,Conv1d,,concatenation,64
0,,0.715005,0.689989,"[0.7780519669037596, 0.6019265816193698]",0.688299,"[0.7880035283740077, 0.5885947046843177]",0.692113,"[0.768348623853211, 0.6158763985082578]",,0.902143,...,0.0001,12,focal,2,True,5,Conv1d,,concatenation,64
0,,0.70028,0.66951,"[0.7703513281919452, 0.5686695278969958]",0.670082,"[0.767501422879909, 0.5726634251755808]",0.668977,"[0.7732224770642202, 0.5647309536494406]",,0.898504,...,0.0001,123,focal,2,True,5,Conv1d,,concatenation,64


In [14]:
best_swmhau_network_grp_kfold_80.columns

Index(['loss', 'accuracy', 'f1', 'f1_scores', 'precision', 'precision_scores',
       'recall', 'recall_scores', 'valid_loss', 'valid_accuracy', 'valid_f1',
       'valid_f1_scores', 'valid_precision', 'valid_precision_scores',
       'valid_recall', 'valid_recall_scores', 'k', 'dimensions', 'sig_depth',
       'method', 'input_channels', 'output_channels', 'features',
       'standardise_method', 'include_features_in_path',
       'include_features_in_input', 'embedding_dim', 'num_features',
       'log_signature', 'num_heads', 'num_layers', 'ffn_hidden_dim',
       'dropout_rate', 'learning_rate', 'seed', 'loss_function', 'gamma',
       'k_fold', 'n_splits', 'augmentation_type', 'hidden_dim_aug',
       'comb_method', 'batch_size'],
      dtype='object')

In [15]:
best_swmhau_network_grp_kfold_80[
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ]
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(512, 512)",0.2,0.0001
0,15,12,3,10,1,"(512, 512)",0.2,0.0001
0,15,12,3,10,1,"(512, 512)",0.2,0.0001


In [16]:
best_swmhau_network_grp_kfold_80["f1"].mean()

0.6762712245596787

In [17]:
best_swmhau_network_grp_kfold_80["precision"].mean()

0.6752895201801438

In [18]:
best_swmhau_network_grp_kfold_80["recall"].mean()

0.6776473596359601

In [19]:
np.stack(best_swmhau_network_grp_kfold_80["f1_scores"]).mean(axis=0)

array([0.76999628, 0.58254617])

In [20]:
np.stack(best_swmhau_network_grp_kfold_80["precision_scores"]).mean(axis=0)

array([0.77665221, 0.57392683])

In [21]:
np.stack(best_swmhau_network_grp_kfold_80["recall_scores"]).mean(axis=0)

array([0.76357034, 0.59172438])