In [19]:
import os
import pickle
import re

import numpy as np
import pandas as pd

# Notebook-level random seed.
# NOTE(review): no cell visible in this notebook reads `seed` (the hyperparameter
# search is given the separate `seeds` list) — presumably it is consumed by the
# `%run` loader scripts; confirm or remove.
seed = 2023

In [20]:
import torch

# Train on the GPU when torch can see a CUDA device; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [21]:
from nlpsig_networks.scripts.swmhau_network_functions import (
    swmhau_network_hyperparameter_search,
)

In [22]:
# Directory that receives the results CSVs written by the hyperparameter search.
output_dir = "talklife_moc_output"
# exist_ok=True keeps the cell safe to re-run and removes the check-then-create
# race of testing os.path.isdir() before creating the directory.
os.makedirs(output_dir, exist_ok=True)

TalkLife MoC

In [5]:
# Run the TalkLife MoC loader script; `%run` copies the script's top-level names
# into the notebook namespace once it finishes.
# NOTE(review): presumably this is what provides `df`, `y_data` and `output_dim`
# used by later cells — confirm against load_talklifemoc.py.
%run load_talklifemoc.py

In [6]:
# Run the SBERT embedding loader script; `%run` copies the script's top-level
# names into the notebook namespace once it finishes.
# NOTE(review): presumably this is what provides `sbert_embeddings` — confirm
# against load_sbert-embeddings.py.
%run load_sbert-embeddings.py

In [7]:
# Quick sanity check on the size of the loaded embeddings before modelling.
sbert_embeddings.shape

torch.Size([18604, 384])

swmhau Network

In [8]:
# Extra (non-embedding) features handed to the hyperparameter search.
features = [
    "time_encoding",
    "timeline_index",
]
# One standardisation entry per feature above.
# NOTE(review): presumably matched to `features` by position — confirm.
standardise_method = [
    "z_score",
    None,
]

# Where the features are injected: into the path, but not into the FFN input.
include_features_in_input = False
include_features_in_path = True

In [9]:
# Build the k-fold splits: every column whose name contains "fold" labels each
# row as "train", "dev" or "test"; collect one (train, dev, test) triple of
# row-index labels per such column.
fold_col_names = [name for name in df.columns if "fold" in name]
fold_list = tuple(
    tuple(df.loc[df[fold_col] == split].index for split in ("train", "dev", "test"))
    for fold_col in fold_col_names
)

In [10]:
# --- training schedule and model selection ---
num_epochs = 100
patience = 5
validation_metric = "f1"

# --- embeddings and dimension reduction ---
embedding_dim = 384
dimreduction_method = ["umap"]
dimensions = [15]

# --- swmhau architecture grid ---
# each tuple is (output_channels, sig_depth, num_heads)
swmhau_parameters = [
    (12, 3, 10),
    (8, 4, 6),
]
num_layers = [1]
ffn_hidden_dim_sizes = [
    [256, 256],
    [512, 512],
]

# --- regularisation / optimisation grid ---
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

# --- loss ---
loss = "focal"
gamma = 2

In [11]:
# Hyperparameter search for the swmhau network over the grid defined in the
# configuration cells, using a single history length (`size`) and the
# pre-computed k-fold splits in `fold_list`.
# The search writes its results to `results_output`; its log also reports a
# companion "<name>_best_model.csv" file next to it.
size = 11
# The last two return values are not used in this notebook. They are bound to
# explicit throwaway names rather than `_` / `__`: in IPython those names are the
# output-history shortcuts, and assigning to them stops IPython from updating
# `_`, `__` and `___` for the rest of the session.
(
    swmhau_network_umap_11,
    best_swmhau_network_umap_11,
    _unused_third_output,
    _unused_fourth_output,
) = swmhau_network_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings.numpy(),
    y_data=y_data,
    output_dim=output_dim,
    history_lengths=[size],
    dim_reduce_methods=dimreduction_method,
    dimensions=dimensions,
    log_signature=True,
    swmhau_parameters=swmhau_parameters,
    num_layers=num_layers,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # splits are supplied as explicit index triples, not as id lists
    split_ids=None,
    split_indices=fold_list,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/swmhau_network_umap_focal_{gamma}_{size}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in talklife_moc_output/swmhau_network_umap_focal_2_11.csv
saving the best model results dataframe to CSV for this hyperparameter search in talklife_moc_output/swmhau_network_umap_focal_2_11_best_model.csv


In [69]:
# Reload the best-model results from the CSV reported in the search log. This
# replaces the in-memory frame returned by the search, so the analysis cells
# below work from the file on disk.
best_swmhau_network_umap_11 = pd.read_csv(
    filepath_or_buffer="talklife_moc_output/swmhau_network_umap_focal_2_11_best_model.csv"
)

In [70]:
# Show the headline metrics next to the hyperparameters of the selected model.
best_swmhau_network_umap_11.loc[
    :,
    [
        # metrics without the "valid_" prefix (presumably test-set — confirm)
        "f1", "f1_scores", "precision", "recall",
        # validation metrics
        "valid_f1", "valid_f1_scores", "valid_precision", "valid_recall",
        # model / training hyperparameters
        "sig_depth", "num_heads", "ffn_hidden_dim", "dropout_rate",
        "learning_rate", "seed", "loss_function", "k_fold",
        "augmentation_type", "hidden_dim_aug", "comb_method", "batch_size",
    ],
]

Unnamed: 0,f1,f1_scores,precision,recall,valid_f1,valid_f1_scores,valid_precision,valid_recall,sig_depth,num_heads,ffn_hidden_dim,dropout_rate,learning_rate,seed,loss_function,k_fold,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,0.545105,[0.88791905 0.46014493 0.28725121],0.532982,0.560165,0.548318,[0.88744153 0.47681209 0.28070175],0.537791,0.562515,3,10,"(512, 512)",0.2,0.0005,1,focal,True,Conv1d,,concatenation,64
1,0.543427,[0.88621169 0.44322092 0.30084746],0.531451,0.558011,0.544859,[0.88468618 0.46633374 0.28355838],0.531369,0.562318,3,10,"(512, 512)",0.2,0.0005,12,focal,True,Conv1d,,concatenation,64
2,0.533057,[0.88289222 0.43884729 0.27743271],0.51958,0.55019,0.534192,[0.88031922 0.45371801 0.2685402 ],0.520164,0.552608,3,10,"(512, 512)",0.2,0.0005,123,focal,True,Conv1d,,concatenation,64


In [71]:
# Average F1 over the rows of the best-model table.
best_swmhau_network_umap_11.loc[:, "f1"].mean()

0.5405297196480728

In [72]:
# Average precision over the rows of the best-model table.
best_swmhau_network_umap_11.loc[:, "precision"].mean()

0.5280045411230511

In [73]:
# Average recall over the rows of the best-model table.
best_swmhau_network_umap_11.loc[:, "recall"].mean()

0.5561220632193558

In [74]:
def _parse_score_array(value):
    """Turn one per-class score cell into a list of floats.

    After the CSV round-trip these cells hold the text form of an array, e.g.
    ``"[0.88 0.46 0.28]"``. Brackets and commas are treated as separators and
    the remainder is split on whitespace, so values such as ``1.``, ``1e-05``
    or ``nan`` parse correctly and arrays wrapped over several lines are
    handled too.

    Parameters
    ----------
    value : str or iterable of numbers
        Raw cell text, or an already-parsed sequence (when this cell is re-run).

    Returns
    -------
    list of float

    Raises
    ------
    ValueError
        If a token in the text is not a valid float literal.
    """
    if not isinstance(value, str):
        # Already parsed on a previous run of this cell: keep it idempotent.
        return [float(score) for score in value]
    tokens = value.replace("[", " ").replace("]", " ").replace(",", " ").split()
    return [float(token) for token in tokens]


# Parse every per-class score column in place with the same helper.
for _score_column in ("f1_scores", "precision_scores", "recall_scores"):
    best_swmhau_network_umap_11[_score_column] = best_swmhau_network_umap_11[
        _score_column
    ].map(_parse_score_array)

In [75]:
# Per-class F1 averaged over the rows of the best-model table.
# The column must already hold parsed lists of floats, not the raw CSV text.
np.mean(np.stack(best_swmhau_network_umap_11["f1_scores"]), axis=0)

array([0.88567432, 0.44740438, 0.28851046])

In [76]:
# Per-class precision averaged over the rows of the best-model table.
# The column must already hold parsed lists of floats, not the raw CSV text.
np.mean(np.stack(best_swmhau_network_umap_11["precision_scores"]), axis=0)

array([0.89984748, 0.41335285, 0.27081329])

In [77]:
# Per-class recall averaged over the rows of the best-model table.
# The column must already hold parsed lists of floats, not the raw CSV text.
np.mean(np.stack(best_swmhau_network_umap_11["recall_scores"]), axis=0)

array([0.87194345, 0.48759921, 0.30882353])