In [10]:
import numpy as np
import pickle
import os
from tqdm.notebook import tqdm

# NOTE(review): `seed` is not referenced by any visible cell; the hyperparameter
# search further down is given the separate `seeds` list instead. Confirm whether
# this value is still needed (e.g. by one of the `%run` loader scripts).
seed = 2023

In [11]:
import torch

# Select the compute device, kept as a plain string (the form previously
# hard-coded here and passed on as `device=` to the search).
# The original run pinned the second GPU ("cuda:1") unconditionally, which made
# the preceding CUDA/CPU fallback dead code and fails on hosts with fewer than
# two GPUs. Keep the "cuda:1" preference, but degrade gracefully.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda"
else:
    device = "cpu"

In [12]:
from nlpsig_networks.scripts.swnu_network_functions import (
    swnu_network_hyperparameter_search,
)

In [13]:
# Directory that receives the CSV files written by the hyperparameter search
# (it is interpolated into `results_output` further down).
output_dir = "talklife_moc_output"
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.isdir() test.
os.makedirs(output_dir, exist_ok=True)

Talklife MoC

In [14]:
# Execute the TalkLife MoC loader script via IPython's `%run`.
# NOTE(review): `df`, `y_data` and `output_dim` are used by later cells but are
# not defined in any visible cell — presumably this script (or the next one)
# defines them; confirm.
%run load_talklifemoc.py

In [15]:
# Execute the SBERT embedding loader script via IPython's `%run`.
# NOTE(review): `sbert_embeddings` is used by later cells but is not defined in
# any visible cell — presumably this script defines it; confirm.
%run load_sbert-embeddings.py

In [16]:
# Quick shape check of the loaded embeddings; the saved output of this cell
# shows torch.Size([18604, 384]).
sbert_embeddings.shape

torch.Size([18604, 384])

SWNU Network

In [17]:
# Extra (non-embedding) features handed to the search as `features=`, together
# with the standardisation applied to each one.
# NOTE(review): the two lists have equal length and are presumably matched by
# position ("time_encoding" -> "z_score", "timeline_index" -> None) — confirm.
features = ["time_encoding", "timeline_index"]
standardise_method = ["z_score", None]

# Where the features are injected: into the path, not into the input.
include_features_in_path, include_features_in_input = True, False

num_features = len(features)

In [18]:
# Build the k-fold split indices: for every dataframe column whose name contains
# "fold", collect the row indices labelled "train", "dev" and "test" (in that
# order). The result is a tuple with one (train, dev, test) triple per fold column.
fold_col_names = [column for column in df.columns if "fold" in column]
fold_list = tuple(
    tuple(
        df.loc[df[fold_column] == split_name].index
        for split_name in ("train", "dev", "test")
    )
    for fold_column in fold_col_names
)

In [19]:
# --- training schedule / model selection ---
num_epochs = 100
patience = 5
validation_metric = "f1"

# --- loss ---
loss = "focal"
gamma = 2

# --- embeddings and dimensionality reduction ---
# 384 equals the second axis of `sbert_embeddings` in the saved shape output.
embedding_dim = 384
dimensions = [15]
dimreduction_method = ["umap"]

# --- search grid ---
# Each entry pairs a list of SWNU hidden sizes with a signature depth.
swnu_hidden_dim_sizes_and_sig_depths = [([12], 3), ([10], 4)]
ffn_hidden_dim_sizes = [[256, 256], [512, 512]]
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]

# --- seeds each configuration is run with ---
seeds = [1, 12, 123]

In [20]:
# Hyperparameter search for the SWNU network with a single history length
# (`size`), using the pre-computed k-fold split indices in `fold_list`.
#
# The call returns four values; only the first two are used in this notebook:
#   * swnu_network_umap_11      - results table for the search (grouped and
#                                 averaged in the next cell)
#   * best_swnu_network_umap_11 - results table for the selected best model
# Both tables are also written to CSV under `output_dir` (see the log output).
#
# The two unused return values are bound to explicit throwaway names rather
# than `_` / `__`: in IPython those names hold the previous cell outputs, and
# assigning to them shadows that output history for the rest of the session.
size = 11
(
    swnu_network_umap_11,
    best_swnu_network_umap_11,
    _unused_result_3,
    _unused_result_4,
) = swnu_network_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings.numpy(),
    y_data=y_data,
    output_dim=output_dim,
    history_lengths=[size],
    dim_reduce_methods=dimreduction_method,
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=True,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # Splits are supplied as explicit index triples, so no id-based split is given.
    split_ids=None,
    split_indices=fold_list,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in talklife_moc_output/swnu_network_umap_focal_2_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in talklife_moc_output/swnu_network_umap_focal_2_11_kfold_best_model.csv


In [21]:
# Average every numeric column over the runs that share a hyperparameter
# configuration.
# numeric_only=True is passed explicitly: the results table presumably also holds
# non-numeric columns (the best-model table shows list-valued `*_scores` and a
# string `loss_function`). Relying on pandas to drop them silently is deprecated
# (the truncated warning below this cell) and raises on pandas >= 2.0.
swnu_network_umap_11.groupby(
    ["dimensions", "swnu_hidden_dim", "ffn_hidden_dim", "dropout_rate", "learning_rate"]
).mean(numeric_only=True)

  swnu_network_umap_11.groupby(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,sig_depth,...,embedding_dim,num_features,log_signature,seed,BiLSTM,gamma,k_fold,n_splits,batch_size,model_id
dimensions,swnu_hidden_dim,ffn_hidden_dim,dropout_rate,learning_rate,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
15,"(10,)","(256, 256)",0.1,0.0001,0.810238,0.538563,0.53297,0.544721,0.805524,0.534934,0.529311,0.54173,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,13.0
15,"(10,)","(256, 256)",0.1,0.0005,0.797911,0.535687,0.521806,0.555531,0.793131,0.534312,0.519806,0.556036,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,14.0
15,"(10,)","(256, 256)",0.1,0.001,0.80592,0.541316,0.531164,0.554563,0.801078,0.534846,0.524557,0.547683,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,12.0
15,"(10,)","(256, 256)",0.2,0.0001,0.809539,0.538196,0.532486,0.545019,0.80551,0.536153,0.53066,0.543752,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,16.0
15,"(10,)","(256, 256)",0.2,0.0005,0.800294,0.539831,0.526172,0.55881,0.794729,0.537297,0.523205,0.558147,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,17.0
15,"(10,)","(256, 256)",0.2,0.001,0.809414,0.54185,0.535119,0.550303,0.806261,0.537913,0.53091,0.546431,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,15.0
15,"(10,)","(512, 512)",0.1,0.0001,0.803662,0.535693,0.525928,0.547505,0.798716,0.533738,0.523222,0.5472,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,19.0
15,"(10,)","(512, 512)",0.1,0.0005,0.797768,0.539451,0.524992,0.560275,0.792325,0.533578,0.519996,0.554406,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,20.0
15,"(10,)","(512, 512)",0.1,0.001,0.80153,0.534859,0.524149,0.549131,0.798272,0.53373,0.5227,0.548787,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,18.0
15,"(10,)","(512, 512)",0.2,0.0001,0.802193,0.536138,0.524972,0.550217,0.797646,0.534326,0.522918,0.549366,11.0,4.0,...,384.0,0.0,1.0,45.333333,1.0,2.0,1.0,5.0,64.0,22.0


In [22]:
# Display the full best-model results table; the saved output has three rows,
# one per seed (1, 12, 123).
best_swnu_network_umap_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,seed,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,,0.816276,0.540623,"[0.8943354076910802, 0.44810307802433785, 0.27...",0.540878,"[0.8964237732710639, 0.4317241379310345, 0.294...",0.541289,"[0.8922567498726439, 0.46577380952380953, 0.26...",,0.815056,...,1,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.801172,0.541213,"[0.8843616986710856, 0.45161290322580644, 0.28...",0.526136,"[0.9028726862602003, 0.40717131474103585, 0.26...",0.561165,"[0.8665944982170147, 0.5069444444444444, 0.309...",,0.794673,...,12,True,focal,2,True,5,Conv1d,,concatenation,64
0,,0.803806,0.541192,"[0.8855969207879161, 0.4464285714285714, 0.291...",0.530174,"[0.8998882534674292, 0.40584415584415584, 0.28...",0.555476,"[0.8717524197656648, 0.49603174603174605, 0.29...",,0.799883,...,123,True,focal,2,True,5,Conv1d,,concatenation,64


In [30]:
# Narrow the best-model table to the headline metrics, their `valid_`-prefixed
# counterparts, and the hyperparameters that identify the configuration.
# Column order is kept exactly as listed.
metric_cols = ["f1", "f1_scores", "precision", "recall"]
valid_metric_cols = [
    "valid_f1",
    "valid_f1_scores",
    "valid_precision",
    "valid_recall",
]
setup_cols = [
    "dimensions",
    "swnu_hidden_dim",
    "sig_depth",
    "ffn_hidden_dim",
    "dropout_rate",
    "learning_rate",
    "seed",
    "loss_function",
    "k_fold",
    "batch_size",
]
best_swnu_network_umap_11[metric_cols + valid_metric_cols + setup_cols]

Unnamed: 0,f1,f1_scores,precision,recall,valid_f1,valid_f1_scores,valid_precision,valid_recall,dimensions,swnu_hidden_dim,sig_depth,ffn_hidden_dim,dropout_rate,learning_rate,seed,loss_function,k_fold,batch_size
0,0.540623,"[0.8943354076910802, 0.44810307802433785, 0.27...",0.540878,0.541289,0.541262,"[0.8932130520142152, 0.46561037090385304, 0.26...",0.542319,0.542925,15,"(12,)",3,"(256, 256)",0.2,0.0001,1,focal,True,64
0,0.541213,"[0.8843616986710856, 0.45161290322580644, 0.28...",0.526136,0.561165,0.539522,"[0.8793685439455851, 0.4577755742852421, 0.281...",0.523542,0.562405,15,"(12,)",3,"(256, 256)",0.2,0.0001,12,focal,True,64
0,0.541192,"[0.8855969207879161, 0.4464285714285714, 0.291...",0.530174,0.555476,0.539421,"[0.8827474910484643, 0.45564924114671157, 0.27...",0.52838,0.555175,15,"(12,)",3,"(256, 256)",0.2,0.0001,123,focal,True,64


In [23]:
# Mean of the overall F1 column across the rows of the best-model table.
# NOTE(review): in the saved output `f1` looks like the unweighted mean of the
# per-class `f1_scores` (i.e. macro F1) — confirm.
overall_f1 = best_swnu_network_umap_11["f1"]
overall_f1.mean()

0.5410092779610705

In [24]:
# Mean of the overall precision column across the rows of the best-model table.
overall_precision = best_swnu_network_umap_11["precision"]
overall_precision.mean()

0.5323960588604976

In [25]:
# Mean of the overall recall column across the rows of the best-model table.
overall_recall = best_swnu_network_umap_11["recall"]
overall_recall.mean()

0.5526431174439549

In [26]:
# Stack the per-row F1 score vectors (three entries each in the saved output)
# into one array, then average over the rows to get one value per position.
stacked_f1_scores = np.stack(best_swnu_network_umap_11["f1_scores"])
stacked_f1_scores.mean(axis=0)

array([0.88809801, 0.44871485, 0.28621497])

In [27]:
# Stack the per-row precision score vectors (three entries each in the saved
# output) into one array, then average over the rows.
stacked_precision_scores = np.stack(best_swnu_network_umap_11["precision_scores"])
stacked_precision_scores.mean(axis=0)

array([0.89972824, 0.4149132 , 0.28254674])

In [28]:
# Stack the per-row recall score vectors (three entries each in the saved
# output) into one array, then average over the rows.
stacked_recall_scores = np.stack(best_swnu_network_umap_11["recall_scores"])
stacked_recall_scores.mean(axis=0)

array([0.87686789, 0.48958333, 0.29147813])