In [1]:
import os
import pickle
import re

import numpy as np
import pandas as pd

# NOTE(review): `seed` is not referenced by any cell visible in this notebook
# (the hyperparameter search below is given the `seeds` list instead) — confirm
# whether it is still needed.
seed = 2023

In [2]:
import torch

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the chosen device.
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.swmhau_network_functions import (
    swmhau_network_hyperparameter_search
)

In [4]:
# Directory that receives the CSV files written by the hyperparameter search
# (it is interpolated into `results_output` further down).
output_dir = "talklife_moc_output"
# `exist_ok=True` keeps this cell idempotent on re-runs and removes the
# check-then-create race of a separate `os.path.isdir` test.
# Requires `os` to be imported in the notebook's import cell.
os.makedirs(output_dir, exist_ok=True)

Talklife MoC

In [5]:
# Execute load_talklifemoc.py through IPython's %run magic.
# NOTE(review): `df`, `y_data` and `output_dim` are used by later cells but are
# not defined in any visible cell, so they presumably come from this script — confirm.
%run load_talklifemoc.py

In [6]:
# Execute load_sbert-embeddings.py through IPython's %run magic.
# NOTE(review): presumably this is what defines `sbert_embeddings`, which the
# following cells use without defining it — confirm.
%run load_sbert-embeddings.py

In [7]:
# Display the shape of the embedding tensor as a sanity check. The recorded
# output shows 384 columns, the same value assigned to `embedding_dim` below.
sbert_embeddings.shape

torch.Size([18604, 384])

swmhau Network

In [8]:
# Feature names handed to the search as `features`.
# Commented-out alternative in the original cell: ["time_encoding", "timeline_index"]
features = [
    "time_encoding",
]
# Standardisation method per feature, handed to the search as `standardise_method`.
# Commented-out alternative in the original cell: ["z_score", None]
standardise_method = [
    "z_score",
]
num_features = len(features)
# Forwarded to the search as `include_features_in_path`.
add_time_in_path = False

In [9]:
# --- training budget and model selection ---
num_epochs = 70
patience = 2
validation_metric = "f1"

# --- loss (both values are forwarded to the search as `loss` / `gamma`) ---
loss = "focal"
gamma = 2

# --- embedding size and reduced dimensions ---
embedding_dim = 384
dimensions = [15]

# --- search grid ---
# Each swmhau entry is (output_channels, sig_depth, num_heads).
# Commented-out alternative in the original cell: [(12, 3, 10), (8, 4, 12)]
swmhau_parameters = [
    (12, 3, 10),
    (10, 3, 11),
]
num_layers = [1]
# Commented-out alternative in the original cell: [[256,256],[512,512]]
ffn_hidden_dim_sizes = [
    [32, 32],
    [64, 64],
]
# Commented-out alternative in the original cell: [0.2, 0.1]
dropout_rates = [0.1]
# Commented-out alternative in the original cell: [1e-3, 1e-4, 5e-4]
learning_rates = [0.0001, 0.0003]
seeds = [1, 12, 123]

In [10]:
# Build the k-fold split indices: for every column whose name contains 'fold',
# collect the row index of the rows tagged 'train', 'dev' and 'test' (in that order).
fold_col_names = [name for name in df.columns if 'fold' in name]
fold_list = tuple(
    tuple(
        df[df[fold_col] == split_name].index
        for split_name in ('train', 'dev', 'test')
    )
    for fold_col in fold_col_names
)

In [11]:
# History length for this run: it is the single entry of `history_lengths` and
# is also embedded in the results file name.
size = 5
# Run the SWMHAU hyperparameter search over the grid configured above, using the
# precomputed k-fold indices. The call returns four values, of which only the
# first two are used later in the notebook.
# The discarded values get explicit throwaway names instead of `_` / `__`:
# rebinding those in an IPython kernel stops IPython from updating its
# output-history shortcuts (`_`, `__`, `___`) for the rest of the session.
(
    swmhau_network_umap,
    best_swmhau_network_umap,
    _unused_third,
    _unused_fourth,
) = swmhau_network_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings.numpy(),
    y_data=y_data,
    output_dim=output_dim,
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swmhau_parameters=swmhau_parameters,
    num_layers=num_layers,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    split_indices=fold_list,
    k_fold=True,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=add_time_in_path,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/swmhau_network_umap_focal_{gamma}_{size}.csv",
    verbose=False
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in talklife_moc_output/swmhau_network_umap_focal_2_5.csv
saving the best model results dataframe to CSV for this hyperparameter search in talklife_moc_output/swmhau_network_umap_focal_2_5_best_model.csv


In [27]:
# Columns to display for the best configuration: the unprefixed metrics, their
# `valid_` counterparts, then the hyperparameter and setup columns.
summary_columns = [
    'f1',
    'f1_scores',
    'precision',
    'recall',
    'valid_f1',
    'valid_f1_scores',
    'valid_precision',
    'valid_recall',
    'sig_depth',
    'num_heads',
    'ffn_hidden_dim',
    'dropout_rate',
    'learning_rate',
    'seed',
    'loss_function',
    'k_fold',
    'augmentation_type',
    'hidden_dim_aug',
    'comb_method',
    'batch_size',
]
best_swmhau_network_umap[summary_columns]

Unnamed: 0,f1,f1_scores,precision,recall,valid_f1,valid_f1_scores,valid_precision,valid_recall,sig_depth,num_heads,ffn_hidden_dim,dropout_rate,learning_rate,seed,loss_function,k_fold,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,0.545723,"[0.8905721192586623, 0.4845070422535211, 0.262...",0.534712,0.558895,0.54934,"[0.8867838968106011, 0.4871438498957609, 0.274...",0.535714,0.566628,3,10,"(64, 64)",0.1,0.0003,1,focal,True,Conv1d,,concatenation,64
0,0.560228,"[0.8860452938866343, 0.49248747913188645, 0.30...",0.541481,0.5884,0.550077,"[0.8773703523132587, 0.48975188781014023, 0.28...",0.529561,0.585013,3,10,"(64, 64)",0.1,0.0003,12,focal,True,Conv1d,,concatenation,64
0,0.558103,"[0.8887949260042283, 0.48171666301729804, 0.30...",0.541391,0.580487,0.556339,"[0.8842746541439269, 0.49204004595437384, 0.29...",0.539329,0.581047,3,10,"(64, 64)",0.1,0.0003,123,focal,True,Conv1d,,concatenation,64


In [14]:
# Mean of the `f1` column over the rows of the best-model frame (one row per
# seed in the table displayed above).
best_swmhau_network_umap["f1"].mean()

0.5546846681289349

In [15]:
# Mean of the `precision` column over the rows of the best-model frame (one row
# per seed in the table displayed above).
best_swmhau_network_umap["precision"].mean()

0.5391945884646884

In [16]:
# Mean of the `recall` column over the rows of the best-model frame (one row per
# seed in the table displayed above).
best_swmhau_network_umap["recall"].mean()

0.5759272379354187

In [17]:
# Stack the per-row `f1_scores` arrays into one 2-D array and average them
# element-wise across rows.
np.mean(np.stack(best_swmhau_network_umap["f1_scores"]), axis=0)

array([0.88847078, 0.48623706, 0.28934616])

In [18]:
# Stack the per-row `precision_scores` arrays into one 2-D array and average
# them element-wise across rows.
np.mean(np.stack(best_swmhau_network_umap["precision_scores"]), axis=0)

array([0.90718234, 0.43872285, 0.27167857])

In [19]:
# Stack the per-row `recall_scores` arrays into one 2-D array and average them
# element-wise across rows.
np.mean(np.stack(best_swmhau_network_umap["recall_scores"]), axis=0)

array([0.87058499, 0.54761905, 0.30957768])