In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any other visible cell (the searches
# below are driven by the `seeds` list instead) - confirm whether it is still needed.
seed = 2023

In [2]:
import torch

# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Bare expression so the notebook shows which device was selected.
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_functions import seqsignet_hyperparameter_search

In [4]:
# Directory that the hyperparameter-search cells below write their CSV results into.
output_dir = "rumours_output"
# exist_ok=True makes this a no-op when the directory already exists, which replaces
# the separate isdir() check and removes the gap between checking and creating.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Execute the loader script from this notebook. The cells below use `df_rumours`,
# `sbert_embeddings`, `y_data`, `output_dim` and `split_ids`, none of which is defined
# in a visible cell - presumably this script creates them (TODO confirm).
%run load_sbert-embeddings.py

In [6]:
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


## Seq-Sig-Net

In [7]:
# Both switches are on: the extra features are passed to the search for use in the
# path and in the network input (semantics defined by nlpsig_networks).
include_features_in_path = include_features_in_input = True

# Extra features and their standardisation. The two lists have the same length;
# presumably standardise_method[i] applies to features[i] (TODO confirm).
standardise_method = ["z_score", None]
features = ["time_encoding", "timeline_index"]

In [8]:
# --- Training schedule ---
num_epochs = 100
patience = 3  # presumably the early-stopping patience, in epochs (TODO confirm)
validation_metric = "f1"

# --- Loss ---
loss = "focal"
gamma = 2  # forwarded with the focal loss; also used in the results file names

# --- Search grid ---
# 2 SWNU settings x 2 LSTM sizes x 3 FFN shapes x 1 dropout rate x 3 learning rates
# gives 36 combinations, each evaluated with every entry of `seeds`.
dimensions = [15]
swnu_hidden_dim_sizes_and_sig_depths = [([hidden], 3) for hidden in (12, 10)]
lstm_hidden_dim_sizes = [300, 400]
ffn_hidden_dim_sizes = [[width, width] for width in (32, 128, 512)]
dropout_rates = [0.1]
learning_rates = [5e-4, 3e-4, 1e-4]
seeds = [1, 12, 123]

In [9]:
# Keyword arguments shared by every seqsignet_hyperparameter_search call below.
# Each call adds shift, window_size, n, dim_reduce_methods and results_output on top.
# The dict is filled group by group; keys, values and insertion order are unchanged.
kwargs = {}

# Epoch budget and input data.
kwargs.update(
    {
        "num_epochs": num_epochs,
        "df": df_rumours,
        "id_column": "timeline_id",
        "label_column": "label",
        "embeddings": sbert_embeddings,
        "y_data": y_data,
        "output_dim": output_dim,
    }
)

# Model architecture and the hyperparameter grid.
kwargs.update(
    {
        "dimensions": dimensions,
        "log_signature": True,
        "pooling": "signature",
        "swnu_hidden_dim_sizes_and_sig_depths": swnu_hidden_dim_sizes_and_sig_depths,
        "lstm_hidden_dim_sizes": lstm_hidden_dim_sizes,
        "ffn_hidden_dim_sizes": ffn_hidden_dim_sizes,
        "dropout_rates": dropout_rates,
        "learning_rates": learning_rates,
        "BiLSTM": True,
        "seeds": seeds,
    }
)

# Loss configuration and compute device.
kwargs.update(
    {
        "loss": loss,
        "gamma": gamma,
        "device": device,
    }
)

# Extra features and where they are injected.
kwargs.update(
    {
        "features": features,
        "standardise_method": standardise_method,
        "include_features_in_path": include_features_in_path,
        "include_features_in_input": include_features_in_input,
    }
)

# Data splitting, early stopping and logging.
kwargs.update(
    {
        "split_ids": split_ids,
        "k_fold": True,
        "patience": patience,
        "validation_metric": validation_metric,
        "verbose": False,
    }
)

# history_length=11

In [10]:
# Window settings for this section. The search log reports "history length = 11"
# for these values, consistent with shift * n + (window_size - shift); the formula
# is inferred from the logged values - confirm against nlpsig_networks.
shift, window_size, n = 3, 5, 3

## umap

In [11]:
# Run the SeqSigNet hyperparameter search with UMAP as the dimension-reduction method
# for the current shift / window_size / n. The first two return values are kept for the
# summary cells below; the saved log shows results are also written to `results_output`
# and to a companion "*_best_model.csv" file.
#
# The last two return values are not used by any visible cell. They are bound to
# explicit throwaway names rather than `_` / `__`, because IPython uses those names for
# its output history and stops refreshing them once user code assigns to them.
(
    seqsignet_network_umap_kfold_11,
    best_seqsignet_network_umap_kfold_11,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_3_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_3_kfold_best_model.csv


In [12]:
seqsignet_network_umap_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.694315,0.669393,"[0.7601637905820415, 0.5786228160328879]",0.667315,"[0.7758208955223881, 0.5588089330024814]",0.672510,"[0.7451261467889908, 0.5998934469898775]",,0.748214,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.682945,0.664807,"[0.742779373960381, 0.5868350740830702]",0.662603,"[0.78592, 0.5392857142857143]",0.673854,"[0.7041284403669725, 0.6435801811401172]",,0.731500,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.674930,0.656112,"[0.7365558912386706, 0.575669099756691]",0.654098,"[0.7784163473818646, 0.5297805642633229]",0.664614,"[0.6989678899082569, 0.6302610548748002]",,0.725569,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.691706,0.660902,"[0.7631051274706387, 0.5586979722518677]",0.661022,"[0.7624499141385231, 0.5595938001068947]",0.660783,"[0.7637614678899083, 0.5578050079914758]",,0.754684,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.694874,0.671044,"[0.7595829049787046, 0.5825044631471564]",0.668695,"[0.7786811201445348, 0.5587084148727984]",0.674908,"[0.7413990825688074, 0.6084176877996803]",,0.735274,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.695247,0.669179,"[0.7620433706884004, 0.5763151075408137]",0.667459,"[0.7738693467336684, 0.5610494450050454]",0.671504,"[0.7505733944954128, 0.5924347362813]",,0.734466,...,True,focal,2,True,5,Conv1d,,concatenation,64,34
0,,0.690401,0.670322,"[0.7516818657497384, 0.588963127938629]",0.667644,"[0.7853795688847235, 0.5499075785582255]",0.677374,"[0.720756880733945, 0.633990410229089]",,0.730557,...,True,focal,2,True,5,Conv1d,,concatenation,64,34
0,,0.697484,0.676447,"[0.758948462795188, 0.5939454590943207]",0.673635,"[0.7873651771956857, 0.5599056603773584]",0.682452,"[0.7325114678899083, 0.6323921150772509]",,0.738374,...,True,focal,2,True,5,Conv1d,,concatenation,64,35
0,,0.685555,0.666132,"[0.7466586574560744, 0.5856055023335789]",0.663640,"[0.7839798170923998, 0.5432999088422972]",0.673893,"[0.7127293577981652, 0.6350559403303143]",,0.734196,...,True,focal,2,True,5,Conv1d,,concatenation,64,35


In [13]:
best_seqsignet_network_umap_kfold_11["f1"].mean()

0.671870897876091

In [14]:
best_seqsignet_network_umap_kfold_11["precision"].mean()

0.6696546183742624

In [15]:
best_seqsignet_network_umap_kfold_11["recall"].mean()

0.6760081550118202

In [16]:
# Each row's `f1_scores` entry is a two-element list (presumably one score per class),
# so stacking the rows and averaging over axis 0 gives one mean F1 per position.
np.stack(best_seqsignet_network_umap_kfold_11["f1_scores"]).mean(axis=0)

array([0.7599095, 0.5838323])

In [17]:
# Each row's `precision_scores` entry is a two-element list (presumably one score per
# class); stack the rows and average over axis 0 for one mean precision per position.
np.stack(best_seqsignet_network_umap_kfold_11["precision_scores"]).mean(axis=0)

array([0.77988898, 0.55942025])

In [18]:
# Each row's `recall_scores` entry is a two-element list (presumably one score per
# class); stack the rows and average over axis 0 for one mean recall per position.
np.stack(best_seqsignet_network_umap_kfold_11["recall_scores"]).mean(axis=0)

array([0.74111239, 0.61090392])

# history_length=20

In [19]:
# Window settings for this section. The search log reports "history length = 20"
# for these values, consistent with shift * n + (window_size - shift); the formula
# is inferred from the logged values - confirm against nlpsig_networks.
shift, window_size, n = 3, 5, 6

## umap

In [None]:
# Run the SeqSigNet hyperparameter search with UMAP as the dimension-reduction method
# for the current shift / window_size / n. The first two return values are kept for the
# summary cells below; the search is also given a CSV path via `results_output`.
#
# The last two return values are not used by any visible cell. They are bound to
# explicit throwaway names rather than `_` / `__`, because IPython uses those names for
# its output history and stops refreshing them once user code assigns to them.
(
    seqsignet_network_umap_kfold_20,
    best_seqsignet_network_umap_kfold_20,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
seqsignet_network_umap_kfold_20

In [None]:
best_seqsignet_network_umap_kfold_20["f1"].mean()

In [None]:
best_seqsignet_network_umap_kfold_20["precision"].mean()

In [None]:
best_seqsignet_network_umap_kfold_20["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_umap_kfold_20["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_20["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_20["recall_scores"]).mean(axis=0)

# history_length=35

In [None]:
# Window settings for the "history_length=35" section: with these values
# shift * n + (window_size - shift) = 35 (formula inferred from the logged history
# lengths of earlier runs - confirm against nlpsig_networks).
shift, window_size, n = 3, 5, 11

## umap

In [None]:
# Run the SeqSigNet hyperparameter search with UMAP as the dimension-reduction method
# for the current shift / window_size / n. The first two return values are kept for the
# summary cells below; the search is also given a CSV path via `results_output`.
#
# The last two return values are not used by any visible cell. They are bound to
# explicit throwaway names rather than `_` / `__`, because IPython uses those names for
# its output history and stops refreshing them once user code assigns to them.
(
    seqsignet_network_umap_kfold_35,
    best_seqsignet_network_umap_kfold_35,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

In [None]:
# As exported, these two expressions share one cell, and a notebook cell only renders
# its last expression. The explicit display() call makes the full results table
# appear as it does in the other history-length sections.
# (`display` is made available by the IPython kernel without an import.)
display(seqsignet_network_umap_kfold_35)

# Mean of the `f1` column over the rows of the best-model results.
best_seqsignet_network_umap_kfold_35["f1"].mean()

In [None]:
best_seqsignet_network_umap_kfold_35["precision"].mean()

In [None]:
best_seqsignet_network_umap_kfold_35["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_umap_kfold_35["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_35["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_35["recall_scores"]).mean(axis=0)

# history_length=80

In [None]:
# Window settings for the "history_length=80" section: with these values
# shift * n + (window_size - shift) = 80 (formula inferred from the logged history
# lengths of earlier runs - confirm against nlpsig_networks).
shift, window_size, n = 3, 5, 26

## umap

In [None]:
# Run the SeqSigNet hyperparameter search with UMAP as the dimension-reduction method
# for the current shift / window_size / n. The first two return values are kept for the
# summary cells below; the search is also given a CSV path via `results_output`.
#
# The last two return values are not used by any visible cell. They are bound to
# explicit throwaway names rather than `_` / `__`, because IPython uses those names for
# its output history and stops refreshing them once user code assigns to them.
(
    seqsignet_network_umap_kfold_80,
    best_seqsignet_network_umap_kfold_80,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

In [None]:
seqsignet_network_umap_kfold_80

In [None]:
best_seqsignet_network_umap_kfold_80["f1"].mean()

In [None]:
best_seqsignet_network_umap_kfold_80["precision"].mean()

In [None]:
best_seqsignet_network_umap_kfold_80["recall"].mean()

In [None]:
np.stack(best_seqsignet_network_umap_kfold_80["f1_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_80["precision_scores"]).mean(axis=0)

In [None]:
np.stack(best_seqsignet_network_umap_kfold_80["recall_scores"]).mean(axis=0)