In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced anywhere else in the visible notebook;
# the hyperparameter searches are given the `seeds` list instead — confirm whether it is still needed.
seed = 2023

In [2]:
import torch

# prefer the GPU when torch reports one as available, otherwise run on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
from nlpsig_networks.scripts.swmhau_network_functions import (
    swmhau_network_hyperparameter_search
)

In [4]:
# Directory that the hyperparameter-search cells write their results CSV files into.
output_dir = "rumours_output"
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Execute the loader script so that the names it defines become available to later cells.
# Later cells use `df_rumours`, `sbert_embeddings`, `y_data` and `output_dim`, none of which
# are defined in the visible part of this notebook, so they are presumably created by this
# script — confirm against load_sbert-embeddings.py.
%run load_sbert-embeddings.py

In [6]:
# Preview the first rows of the rumours dataframe.
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


# swmhau Network

In [7]:
# Path-construction settings forwarded to the hyperparameter searches.
add_time_in_path = True

# Feature names and the standardisation requested for them.
# NOTE(review): the two lists look positionally paired (equal length) — confirm.
features = ["time_encoding", "timeline_index"]
standardise_method = ["z_score", None]

# number of entries in `features`
num_features = len(features)

In [8]:
# --- run-level settings ---
num_epochs = 100
patience = 5
embedding_dim = 384

# --- hyperparameter grid ---
dimensions = [15]  # alternative grid: [50, 15]
# each swmhau entry is (output_channels, sig_depth, num_heads)
swmhau_parameters = [
    (12, 3, 10),
    (8, 4, 6),
    (8, 4, 12),
]
num_layers = [1]
ffn_hidden_dim_sizes = [[256, 256], [512, 512]]
dropout_rates = [0.5, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

# --- loss and validation metric ---
loss = "focal"
gamma = 2
validation_metric = "f1"
# Index labels of the rows whose `set` column is 'train', 'dev' and 'test',
# collected in that order into a 3-tuple.
split_indices = tuple(
    df_rumours[df_rumours['set'] == split_name].index
    for split_name in ('train', 'dev', 'test')
)

## UMAP

In [10]:
# used as the single entry of `history_lengths` and embedded in the results filename
size = 35

# Hyperparameter search with UMAP as the dimension-reduction method.
# The first two returned values are kept; the other two are bound to the throwaway names `_` and `__`.
swmhau_network_umap, best_swmhau_network_umap, _, __ = swmhau_network_hyperparameter_search(
    # data and splits
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    embedding_dim=embedding_dim,
    output_dim=output_dim,
    split_indices=split_indices,
    k_fold=False,
    # path construction
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    features=features,
    standardise_method=standardise_method,
    add_time_in_path=add_time_in_path,
    # model grid
    swmhau_parameters=swmhau_parameters,
    num_layers=num_layers,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # training
    num_epochs=num_epochs,
    loss=loss,
    gamma=gamma,
    patience=patience,
    validation_metric=validation_metric,
    # output
    results_output=f"{output_dir}/swmhau_network_umap_focal_{gamma}_{size}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Average the numeric result columns over all rows that share one hyperparameter
# configuration. `dropout_rate` is part of the key, matching the random-projection
# summary, so that runs trained with different dropout rates are not pooled together.
# numeric_only=True restricts the mean to numeric columns: with the default, older
# pandas emits a deprecation warning and pandas >= 2.0 raises a TypeError as soon as
# the frame holds non-numeric columns (the results frames in this notebook carry
# list- and string-valued columns such as `f1_scores` and `loss_function`).
swmhau_network_umap.groupby(["dimensions",
                             "output_channels",
                             "sig_depth",
                             "num_heads",
                             "num_layers",
                             "ffn_hidden_dim",
                             "dropout_rate",
                             "learning_rate"]).mean(numeric_only=True)

In [None]:
# Display the frame returned as the second value of the UMAP search.
best_swmhau_network_umap

In [None]:
# Show only the hyperparameter columns of the UMAP best-model frame.
best_swmhau_network_umap.loc[
    :,
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ],
]

In [None]:
# mean of the `f1` column over the rows of the UMAP best-model frame
best_swmhau_network_umap.loc[:, "f1"].mean()

In [None]:
# mean of the `precision` column over the rows of the UMAP best-model frame
best_swmhau_network_umap.loc[:, "precision"].mean()

In [None]:
# mean of the `recall` column over the rows of the UMAP best-model frame
best_swmhau_network_umap.loc[:, "recall"].mean()

In [None]:
# stack the per-row `f1_scores` entries and average down the rows, one mean per position
# (presumably one position per class — confirm)
np.mean(np.stack(best_swmhau_network_umap["f1_scores"]), axis=0)

In [None]:
# stack the per-row `precision_scores` entries and average down the rows, one mean per position
# (presumably one position per class — confirm)
np.mean(np.stack(best_swmhau_network_umap["precision_scores"]), axis=0)

In [None]:
# stack the per-row `recall_scores` entries and average down the rows, one mean per position
# (presumably one position per class — confirm)
np.mean(np.stack(best_swmhau_network_umap["recall_scores"]), axis=0)

## Random Projections

In [18]:
# used as the single entry of `history_lengths` and embedded in the results filename
size = 35

# Hyperparameter search with Gaussian random projection as the dimension-reduction method.
# The first two returned values are kept; the other two are bound to the throwaway names `_` and `__`.
swmhau_network_grp, best_swmhau_network_grp, _, __ = swmhau_network_hyperparameter_search(
    # data and splits
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    embedding_dim=embedding_dim,
    output_dim=output_dim,
    split_indices=split_indices,
    k_fold=False,
    # path construction
    history_lengths=[size],
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    features=features,
    standardise_method=standardise_method,
    add_time_in_path=add_time_in_path,
    # model grid
    swmhau_parameters=swmhau_parameters,
    num_layers=num_layers,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # training
    num_epochs=num_epochs,
    loss=loss,
    gamma=gamma,
    patience=patience,
    validation_metric=validation_metric,
    # output
    results_output=f"{output_dir}/swmhau_network_grp_focal_{gamma}_{size}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: gaussian_random_projection
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_35_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/swmhau_network_grp_focal_2_35_kfold_best_model.csv


In [19]:
# Average the numeric result columns over all rows that share one hyperparameter
# configuration (the displayed `seed` mean of 45.33 is the mean of 1, 12 and 123,
# i.e. each group pools the three seeds).
# numeric_only=True restricts the mean to numeric columns: the frame also holds
# list- and string-valued columns (e.g. `f1_scores`, `loss_function`), for which the
# default emits a deprecation warning on older pandas and raises a TypeError on
# pandas >= 2.0.
swmhau_network_grp.groupby(["dimensions",
                            "output_channels",
                            "sig_depth",
                            "num_heads",
                            "num_layers",
                            "ffn_hidden_dim",
                            "dropout_rate",
                            "learning_rate"]).mean(numeric_only=True)

  swmhau_network_grp_kfold_20.groupby(["dimensions",


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,loss,accuracy,f1,precision,recall,valid_loss,valid_accuracy,valid_f1,valid_precision,valid_recall,...,input_channels,add_time_in_path,num_features,embedding_dim,log_signature,seed,gamma,k_fold,batch_size,model_id
dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
15,8,4,6,1,"(256, 256)",0.1,0.0001,0.437324,0.632031,0.631136,0.635017,0.633487,0.286125,0.667853,0.600958,0.63985,0.602229,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,16.0
15,8,4,6,1,"(256, 256)",0.1,0.0005,0.497362,0.636479,0.63387,0.645095,0.639611,0.203662,0.714116,0.676161,0.693083,0.670172,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,17.0
15,8,4,6,1,"(256, 256)",0.1,0.001,0.479793,0.615825,0.608849,0.630976,0.620355,0.222472,0.685646,0.649962,0.659568,0.647123,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,15.0
15,8,4,6,1,"(256, 256)",0.5,0.0001,0.400435,0.620909,0.620536,0.620989,0.621106,0.205582,0.718861,0.666193,0.704705,0.665462,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,13.0
15,8,4,6,1,"(256, 256)",0.5,0.0005,0.503324,0.629171,0.627057,0.634685,0.63135,0.214372,0.715302,0.68614,0.691222,0.683052,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,14.0
15,8,4,6,1,"(256, 256)",0.5,0.001,0.421962,0.628853,0.627584,0.628607,0.628041,0.206338,0.704626,0.640897,0.693559,0.640934,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,12.0
15,8,4,6,1,"(512, 512)",0.1,0.0001,0.397865,0.632348,0.628817,0.645633,0.63748,0.219558,0.735469,0.711003,0.717649,0.708721,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,22.0
15,8,4,6,1,"(512, 512)",0.1,0.0005,0.438234,0.630442,0.630335,0.631578,0.631582,0.207406,0.696323,0.652814,0.679568,0.650582,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,23.0
15,8,4,6,1,"(512, 512)",0.1,0.001,0.856663,0.603114,0.597925,0.616178,0.607765,0.220356,0.686833,0.656084,0.66234,0.65438,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,21.0
15,8,4,6,1,"(512, 512)",0.5,0.0001,0.397297,0.622815,0.62036,0.633245,0.627093,0.225158,0.718861,0.67942,0.703125,0.673193,...,17.0,1.0,2.0,384.0,1.0,45.333333,2.0,0.0,64.0,19.0


In [20]:
# Display the frame returned as the second value of the random-projection search
# (the rendered output has one row per seed: 1, 12, 123).
best_swmhau_network_grp

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,0.388765,0.63489,0.63335,"[0.657117278424351, 0.6095820591233435]",0.633874,"[0.6461267605633803, 0.6216216216216216]",0.633244,"[0.668488160291439, 0.598]",0.187115,0.747331,...,0.001,1,focal,2,False,,Conv1d,,concatenation,64
0,0.79681,0.601525,0.599367,"[0.6287744227353464, 0.5699588477366254]",0.600191,"[0.6135181975736569, 0.586864406779661]",0.599404,"[0.644808743169399, 0.554]",0.164889,0.786477,...,0.001,12,focal,2,False,,Conv1d,,concatenation,64
0,0.325039,0.655863,0.652833,"[0.6203995793901157, 0.6852659110723627]",0.670625,"[0.7338308457711443, 0.60741885625966]",0.66167,"[0.5373406193078324, 0.786]",0.202457,0.69395,...,0.001,123,focal,2,False,,Conv1d,,concatenation,64


In [None]:
# List every column name of the best-model frame (the table display above truncates them).
best_swmhau_network_grp.columns

Index(['loss', 'accuracy', 'f1', 'f1_scores', 'precision', 'precision_scores',
       'recall', 'recall_scores', 'valid_loss', 'valid_accuracy', 'valid_f1',
       'valid_f1_scores', 'valid_precision', 'valid_precision_scores',
       'valid_recall', 'valid_recall_scores', 'k', 'dimensions', 'sig_depth',
       'method', 'input_channels', 'output_channels', 'features',
       'standardise_method', 'add_time_in_path', 'num_features',
       'embedding_dim', 'log_signature', 'num_heads', 'num_layers',
       'ffn_hidden_dim', 'dropout_rate', 'learning_rate', 'seed',
       'loss_function', 'gamma', 'k_fold', 'n_splits', 'augmentation_type',
       'hidden_dim_aug', 'comb_method', 'batch_size'],
      dtype='object')

In [35]:
# Show only the hyperparameter columns of the random-projection best-model frame.
best_swmhau_network_grp.loc[
    :,
    [
        "dimensions",
        "output_channels",
        "sig_depth",
        "num_heads",
        "num_layers",
        "ffn_hidden_dim",
        "dropout_rate",
        "learning_rate",
    ],
]

Unnamed: 0,dimensions,output_channels,sig_depth,num_heads,num_layers,ffn_hidden_dim,dropout_rate,learning_rate
0,15,12,3,10,1,"(512, 512)",0.1,0.001
0,15,12,3,10,1,"(512, 512)",0.1,0.001
0,15,12,3,10,1,"(512, 512)",0.1,0.001


In [21]:
# mean of the `f1` column over the rows of the random-projection best-model frame
best_swmhau_network_grp.loc[:, "f1"].mean()

0.6285163497470242

In [22]:
# mean of the `precision` column over the rows of the random-projection best-model frame
best_swmhau_network_grp.loc[:, "precision"].mean()

0.6348967814281874

In [23]:
# mean of the `recall` column over the rows of the random-projection best-model frame
best_swmhau_network_grp.loc[:, "recall"].mean()

0.6314395871281118

In [24]:
# stack the per-row `f1_scores` lists (two entries each in the displayed frame) and
# average down the rows, giving one mean per position (presumably per class — confirm)
np.mean(np.stack(best_swmhau_network_grp["f1_scores"]), axis=0)

array([0.63543043, 0.62160227])

In [25]:
# stack the per-row `precision_scores` lists (two entries each in the displayed frame) and
# average down the rows, giving one mean per position (presumably per class — confirm)
np.mean(np.stack(best_swmhau_network_grp["precision_scores"]), axis=0)

array([0.66449193, 0.60530163])

In [26]:
# stack the per-row `recall_scores` lists (two entries each in the displayed frame) and
# average down the rows, giving one mean per position (presumably per class — confirm)
np.mean(np.stack(best_swmhau_network_grp["recall_scores"]), axis=0)

array([0.61687917, 0.646     ])