In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any other cell visible in this notebook
# (the searches below are driven by the `seeds` list instead) — confirm it is still needed.
seed = 2023

In [2]:
import torch

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the selected device.
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_functions import seqsignet_hyperparameter_search

In [4]:
# Directory used as the prefix of every `results_output` CSV path passed to the
# hyperparameter searches below.
output_dir = "rumours_output"
# exist_ok=True makes this a no-op when the directory already exists, so no separate
# isdir() check is needed and there is no window between the check and the creation.
os.makedirs(output_dir, exist_ok=True)

## Rumours

In [5]:
# Execute the embedding-loading script. Later cells use `df_rumours`, `sbert_embeddings`,
# `y_data`, `output_dim` and `split_ids`, none of which are defined in the cells visible
# here, so they are presumably created by this script — TODO confirm.
%run load_sbert-embeddings.py

In [6]:
# Preview the first rows of the rumours dataframe.
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


## Seq-Sig-Net

In [7]:
# Feature configuration forwarded unchanged to every search through `kwargs`.
# NOTE(review): the two lists look position-aligned ("time_encoding" -> "z_score",
# "timeline_index" -> None) — confirm against the nlpsig_networks documentation.
features = ["time_encoding", "timeline_index"]
standardise_method = ["z_score", None]
# Both switches are enabled.
include_features_in_path = include_features_in_input = True

In [8]:
# --- Training schedule and model selection ---
num_epochs, patience = 100, 5
validation_metric = "f1"

# --- Loss ---
loss, gamma = "focal", 2

# --- Search grid ---
# The list lengths multiply to 2 * 1 * 2 * 2 * 3 = 24 combinations, which is consistent
# with the model_id range 0-23 in the recorded result tables.
dimensions = [15]
# Each entry pairs hidden dimension sizes with a signature depth (per the variable name).
swnu_hidden_dim_sizes_and_sig_depths = [([12], 3), ([10], 4)]
lstm_hidden_dim_sizes = [384]
ffn_hidden_dim_sizes = [[256] * 2, [512] * 2]
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]

# --- Repetitions and architecture switch ---
seeds = [1, 12, 123]
# NOTE(review): presumably meant to drive the "BiLSTM" search argument — confirm.
bidirectional = True

In [9]:
# Arguments shared by every `seqsignet_hyperparameter_search` call in this notebook.
# Each call adds only `shift`, `window_size`, `n`, `dim_reduce_methods` and
# `results_output` on top of these.
kwargs = {
    "num_epochs": num_epochs,
    # Data inputs; these names are not defined in the visible cells and presumably
    # come from the `%run load_sbert-embeddings.py` cell — TODO confirm.
    "df": df_rumours,
    "id_column": "timeline_id",
    "label_column": "label",
    "embeddings": sbert_embeddings,
    "y_data": y_data,
    "output_dim": output_dim,
    # Search grid defined in the hyperparameter cell.
    "dimensions": dimensions,
    "log_signature": True,
    "pooling": "signature",
    "swnu_hidden_dim_sizes_and_sig_depths": swnu_hidden_dim_sizes_and_sig_depths,
    "lstm_hidden_dim_sizes": lstm_hidden_dim_sizes,
    "ffn_hidden_dim_sizes": ffn_hidden_dim_sizes,
    "dropout_rates": dropout_rates,
    "learning_rates": learning_rates,
    # Wired to the `bidirectional` config variable (previously a hard-coded True that
    # silently ignored it); the value passed is unchanged while `bidirectional` is True.
    "BiLSTM": bidirectional,
    "seeds": seeds,
    "loss": loss,
    "gamma": gamma,
    "device": device,
    # Feature handling defined in the feature-configuration cell.
    "features": features,
    "standardise_method": standardise_method,
    "include_features_in_path": include_features_in_path,
    "include_features_in_input": include_features_in_input,
    # Evaluation protocol.
    "split_ids": split_ids,
    "k_fold": True,
    "patience": patience,
    "validation_metric": validation_metric,
    "verbose": False,
}

# history_length=11

In [10]:
# Window settings for this run. The search log reports "history length = 11" for them
# (consistent with shift * n + (window_size - shift) — TODO confirm the formula).
shift, window_size, n = 3, 5, 3

## umap

In [11]:
# Run the hyperparameter search with UMAP dimension reduction for the current
# (shift, window_size, n). The first two return values are kept (the full results and
# the best-model results); the last two are bound to the throwaway names `_` and `__`.
# Per the recorded log, results are also saved to the CSV path given in `results_output`.
seqsignet_network_umap_kfold_11, best_seqsignet_network_umap_kfold_11, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["umap"],
        n=n,
        window_size=window_size,
        shift=shift,
        results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
        **kwargs,
    )
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_3_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_3_kfold_best_model.csv


In [12]:
# Display the full results dataframe returned by the history-length-11 UMAP search.
seqsignet_network_umap_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.693010,0.669248,"[0.7579009260620315, 0.5805958747135218]",0.666887,"[0.7776772247360483, 0.5560975609756098]",0.673229,"[0.739105504587156, 0.607352157698455]",,0.728130,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.688723,0.664593,"[0.7545561434450324, 0.5746306673458992]",0.662319,"[0.7741254523522316, 0.5505124450951684]",0.668455,"[0.7359518348623854, 0.6009589770911028]",,0.727726,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.691333,0.661979,"[0.7615894039735099, 0.5623678646934461]",0.661419,"[0.764893001735107, 0.5579444153120083]",0.662588,"[0.7583142201834863, 0.5668620138518913]",,0.726513,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.691706,0.670501,"[0.7540886113589058, 0.586913086913087]",0.667810,"[0.7831995058678196, 0.5524212505876822]",0.676532,"[0.7270642201834863, 0.6259989344698987]",,0.746192,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.685741,0.665849,"[0.7473778843272398, 0.5843195266272189]",0.663314,"[0.7827997489014438, 0.5438274437815511]",0.673175,"[0.7150229357798165, 0.6313265849760256]",,0.732039,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.689655,0.664907,"[0.7559724461380626, 0.5738418223701048]",0.662765,"[0.7733133433283358, 0.5522167487684729]",0.668311,"[0.7393922018348624, 0.597229621736814]",,0.737431,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.691333,0.668326,"[0.7556801416347004, 0.5809716599190284]",0.665836,"[0.778419452887538, 0.5532530120481928]",0.672923,"[0.7342316513761468, 0.6116142781033564]",,0.735813,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.704753,0.675172,"[0.7731958762886597, 0.577148958889482]",0.675348,"[0.7723112128146453, 0.5783841626538255]",0.675001,"[0.7740825688073395, 0.5759190197123069]",,0.740801,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.695247,0.673047,"[0.7582433831140026, 0.5878497605243257]",0.670398,"[0.7829007633587787, 0.5578947368421052]",0.678148,"[0.7350917431192661, 0.6212040490143846]",,0.731770,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [13]:
# Mean of the `f1` column over the best-model rows (history length 11).
best_seqsignet_network_umap_kfold_11["f1"].mean()

0.6714679850544085

In [14]:
# Mean of the `precision` column over the best-model rows (history length 11).
best_seqsignet_network_umap_kfold_11["precision"].mean()

0.6692648588911293

In [15]:
# Mean of the `recall` column over the best-model rows (history length 11).
best_seqsignet_network_umap_kfold_11["recall"].mean()

0.6751879290021326

In [16]:
# Stack each row's `f1_scores` list into a 2-D array and average over rows (axis 0),
# giving one mean F1 per list position (presumably one per class).
np.stack(best_seqsignet_network_umap_kfold_11["f1_scores"]).mean(axis=0)

array([0.76037593, 0.58256004])

In [17]:
# Stack each row's `precision_scores` list and average over rows (axis 0), giving one
# mean precision per list position (presumably one per class).
np.stack(best_seqsignet_network_umap_kfold_11["precision_scores"]).mean(axis=0)

array([0.77865031, 0.55987941])

In [18]:
# Stack each row's `recall_scores` list and average over rows (axis 0), giving one
# mean recall per list position (presumably one per class).
np.stack(best_seqsignet_network_umap_kfold_11["recall_scores"]).mean(axis=0)

array([0.7430237 , 0.60735216])

# history_length=20

In [19]:
# Window settings for this run. The search log reports "history length = 20" for them
# (consistent with shift * n + (window_size - shift) — TODO confirm the formula).
shift, window_size, n = 3, 5, 6

## umap

In [20]:
# Run the hyperparameter search with UMAP dimension reduction for the current
# (shift, window_size, n). The first two return values are kept (the full results and
# the best-model results); the last two are bound to the throwaway names `_` and `__`.
# Per the recorded log, results are also saved to the CSV path given in `results_output`.
seqsignet_network_umap_kfold_20, best_seqsignet_network_umap_kfold_20, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["umap"],
        n=n,
        window_size=window_size,
        shift=shift,
        results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
        **kwargs,
    )
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_6_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_6_kfold_best_model.csv


In [21]:
# Display the full results dataframe returned by the history-length-20 UMAP search.
seqsignet_network_umap_kfold_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.707735,0.675243,"[0.7779665817048995, 0.5725190839694657]",0.677436,"[0.7686066032456631, 0.5862646566164154]",0.673480,"[0.7875573394495413, 0.5594033031433138]",,0.748079,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.687232,0.671865,"[0.7428746552252529, 0.6008563273073264]",0.670326,"[0.7978933508887426, 0.5427589170605931]",0.683918,"[0.694954128440367, 0.6728822589238146]",,0.736083,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.684063,0.660335,"[0.750110570544007, 0.5705599189257665]",0.658020,"[0.7720789074355083, 0.5439613526570048]",0.664626,"[0.7293577981651376, 0.5998934469898775]",,0.728400,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.698043,0.681491,"[0.7540983606557377, 0.6088845968131338]",0.679012,"[0.8012903225806451, 0.5567328918322296]",0.691986,"[0.7121559633027523, 0.6718167288225892]",,0.751045,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.697111,0.680134,"[0.7538251780033328, 0.6064422378299831]",0.677590,"[0.799229039511725, 0.5559502664298401]",0.690162,"[0.713302752293578, 0.6670218433670752]",,0.742957,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.684995,0.662434,"[0.7497037914691943, 0.5751633986928104]",0.659966,"[0.7754289215686274, 0.5445026178010471]",0.667557,"[0.7256307339449541, 0.6094832179009056]",,0.756301,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.682386,0.656263,"[0.7510227936879018, 0.561502830674215]",0.654424,"[0.765792610250298, 0.5430562468889994]",0.659029,"[0.7368119266055045, 0.5812466702184337]",,0.748214,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.693383,0.665064,"[0.7624548736462095, 0.5676741130091985]",0.664139,"[0.7681117253418679, 0.5601659751037344]",0.666133,"[0.7568807339449541, 0.5753862546616942]",,0.743227,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.707176,0.687631,"[0.7657671089906068, 0.6094954014417103]",0.684529,"[0.7977632805219013, 0.5712954333643989]",0.694704,"[0.7362385321100917, 0.6531699520511455]",,0.770454,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [22]:
# Mean of the `f1` column over the best-model rows (history length 20).
best_seqsignet_network_umap_kfold_20["f1"].mean()

0.6721964985765204

In [23]:
# Mean of the `precision` column over the best-model rows (history length 20).
best_seqsignet_network_umap_kfold_20["precision"].mean()

0.6709536050688579

In [24]:
# Mean of the `recall` column over the best-model rows (history length 20).
best_seqsignet_network_umap_kfold_20["recall"].mean()

0.6831987775730353

In [25]:
# Stack each row's `f1_scores` list into a 2-D array and average over rows (axis 0),
# giving one mean F1 per list position (presumably one per class).
np.stack(best_seqsignet_network_umap_kfold_20["f1_scores"]).mean(axis=0)

array([0.7456488 , 0.59874419])

In [26]:
# Stack each row's `precision_scores` list and average over rows (axis 0), giving one
# mean precision per list position (presumably one per class).
np.stack(best_seqsignet_network_umap_kfold_20["precision_scores"]).mean(axis=0)

array([0.79577584, 0.54613137])

In [27]:
# Stack each row's `recall_scores` list and average over rows (axis 0), giving one
# mean recall per list position (presumably one per class).
np.stack(best_seqsignet_network_umap_kfold_20["recall_scores"]).mean(axis=0)

array([0.70221713, 0.66418043])

# history_length=35

In [28]:
# Window settings for this run. The search log reports "history length = 35" for them
# (consistent with shift * n + (window_size - shift) — TODO confirm the formula).
shift, window_size, n = 3, 5, 11

## umap

In [29]:
# Run the hyperparameter search with UMAP dimension reduction for the current
# (shift, window_size, n). The first two return values are kept (the full results and
# the best-model results); the last two are bound to the throwaway names `_` and `__`.
# Per the recorded log, results are also saved to the CSV path given in `results_output`.
seqsignet_network_umap_kfold_35, best_seqsignet_network_umap_kfold_35, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["umap"],
        n=n,
        window_size=window_size,
        shift=shift,
        results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
        **kwargs,
    )
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in rumours_output/seqsignet_umap_focal_2_3_5_11_kfold_best_model.csv


In [30]:
# Display the full results dataframe returned by the history-length-35 UMAP search.
seqsignet_network_umap_kfold_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.694501,0.670784,"[0.759147685525349, 0.5824203821656052]",0.668411,"[0.7787157069641242, 0.55810546875]",0.674745,"[0.7405389908256881, 0.608950452850293]",,0.771802,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.693010,0.676385,"[0.7497340829661147, 0.6030368763557484]",0.674112,"[0.7976075008082768, 0.5506161971830986]",0.686886,"[0.7072821100917431, 0.6664890783164624]",,0.761289,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.671202,0.656359,"[0.7277777777777779, 0.5849411764705882]",0.655956,"[0.7881016042780749, 0.5238095238095238]",0.669130,"[0.6760321100917431, 0.662226957911561]",,0.757380,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.708108,0.690867,"[0.7638721351025332, 0.6178623718887263]",0.687838,"[0.8056615776081425, 0.570013507429086]",0.700342,"[0.726204128440367, 0.6744805540756527]",,0.784877,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.701584,0.682787,"[0.7600059961025334, 0.6055678738605567]",0.679832,"[0.7964184731385485, 0.5632447296058661]",0.690773,"[0.7267775229357798, 0.6547682472029834]",,0.776655,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.693383,0.675363,"[0.7518479408658924, 0.5988783223604]",0.672768,"[0.7933779051257561, 0.552158273381295]",0.684343,"[0.7144495412844036, 0.6542354821523708]",,0.756301,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.708481,0.686085,"[0.7699323330391292, 0.6022380467955238]",0.683395,"[0.7906344410876133, 0.5761557177615572]",0.690540,"[0.7502866972477065, 0.630793819925413]",,0.772746,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.690587,0.672736,"[0.7491689332124509, 0.5963035019455253]",0.670279,"[0.7920127795527157, 0.5485458612975391]",0.681946,"[0.7107224770642202, 0.6531699520511455]",,0.825718,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.693010,0.672715,"[0.7542157886882556, 0.5912137006701416]",0.669966,"[0.7864923747276689, 0.5534386617100372]",0.679504,"[0.7244839449541285, 0.6345231752797017]",,0.801186,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [None]:
best_seqsignet_network_umap_kfold_35["f1"].mean()

In [31]:
# Mean of the `precision` column over the best-model rows (history length 35).
best_seqsignet_network_umap_kfold_35["precision"].mean()

0.6649962653253055

In [32]:
# Mean of the `recall` column over the best-model rows (history length 35).
best_seqsignet_network_umap_kfold_35["recall"].mean()

0.6777165264288939

In [33]:
# Stack each row's `f1_scores` list into a 2-D array and average over rows (axis 0),
# giving one mean F1 per list position (presumably one per class).
np.stack(best_seqsignet_network_umap_kfold_35["f1_scores"]).mean(axis=0)

array([0.73984525, 0.59290705])

In [34]:
# Stack each row's `precision_scores` list and average over rows (axis 0), giving one
# mean precision per list position (presumably one per class).
np.stack(best_seqsignet_network_umap_kfold_35["precision_scores"]).mean(axis=0)

array([0.79237296, 0.53761957])

In [35]:
# Stack each row's `recall_scores` list and average over rows (axis 0), giving one
# mean recall per list position (presumably one per class).
np.stack(best_seqsignet_network_umap_kfold_35["recall_scores"]).mean(axis=0)

array([0.69409404, 0.66133902])

# history_length=80

In [36]:
# Window settings for this run. The search log reports "history length = 80" for them
# (consistent with shift * n + (window_size - shift) — TODO confirm the formula).
shift, window_size, n = 3, 5, 26

## umap

In [37]:
# Run the hyperparameter search with UMAP dimension reduction for the current
# (shift, window_size, n). The first two return values are kept (the full results and
# the best-model results); the last two are bound to the throwaway names `_` and `__`.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt, so none of the
# target names were assigned in that session.
seqsignet_network_umap_kfold_80, best_seqsignet_network_umap_kfold_80, _, __ = (
    seqsignet_hyperparameter_search(
        dim_reduce_methods=["umap"],
        n=n,
        window_size=window_size,
        shift=shift,
        results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
        **kwargs,
    )
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Display the full results dataframe of the history-length-80 UMAP search.
# No output is recorded: the search cell was interrupted (KeyboardInterrupt), so this
# name exists only once that search has run to completion.
seqsignet_network_umap_kfold_80

In [None]:
# Mean of the `f1` column over the best-model rows (history length 80).
# Not executed in the recorded session; requires the interrupted search to complete.
best_seqsignet_network_umap_kfold_80["f1"].mean()

In [None]:
# Mean of the `precision` column over the best-model rows (history length 80).
# Not executed in the recorded session; requires the interrupted search to complete.
best_seqsignet_network_umap_kfold_80["precision"].mean()

In [None]:
# Mean of the `recall` column over the best-model rows (history length 80).
# Not executed in the recorded session; requires the interrupted search to complete.
best_seqsignet_network_umap_kfold_80["recall"].mean()

In [None]:
# Stack each row's `f1_scores` list into a 2-D array and average over rows (axis 0),
# giving one mean F1 per list position (presumably one per class).
# Not executed in the recorded session; requires the interrupted search to complete.
np.stack(best_seqsignet_network_umap_kfold_80["f1_scores"]).mean(axis=0)

In [None]:
# Stack each row's `precision_scores` list and average over rows (axis 0), giving one
# mean precision per list position (presumably one per class).
# Not executed in the recorded session; requires the interrupted search to complete.
np.stack(best_seqsignet_network_umap_kfold_80["precision_scores"]).mean(axis=0)

In [None]:
# Stack each row's `recall_scores` list and average over rows (axis 0), giving one
# mean recall per list position (presumably one per class).
# Not executed in the recorded session; requires the interrupted search to complete.
np.stack(best_seqsignet_network_umap_kfold_80["recall_scores"]).mean(axis=0)