In [1]:
import os
import pickle
import re

import numpy as np
import pandas as pd

# Notebook-level random seed.
# NOTE(review): no cell visible here reads `seed` (the search below is driven
# by the `seeds` list instead) — confirm whether it is still needed.
seed = 2023

In [2]:
import torch

# Choose the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.swmhau_network_functions import (
    swmhau_network_hyperparameter_search,
)

In [4]:
# Directory that receives the CSV results written by the hyperparameter search.
output_dir = "reddit_moc_output"
# `exist_ok=True` makes this cell safe to re-run and removes the separate
# check-then-create step (which could race with another process).
os.makedirs(output_dir, exist_ok=True)

Reddit MoC

In [5]:
# Execute the Reddit MoC loader script from this notebook.
# NOTE(review): presumably this is where `df`, `y_data` and `output_dim`
# (read by later cells) come from — confirm against the script.
%run load_redditmoc.py

In [6]:
# Execute the SBERT embeddings loader script from this notebook.
# NOTE(review): presumably this defines `sbert_embeddings`, which the next
# cell inspects — confirm against the script.
%run load_sbert-embeddings.py

In [7]:
# Sanity check: show the dimensions of the loaded embeddings tensor.
sbert_embeddings.shape

torch.Size([6195, 384])

swmhau Network

In [8]:
# Feature configuration forwarded to the hyperparameter search below.
add_time_in_path = False  # passed to the search as `include_features_in_path`
features = [
    "time_encoding",
]  # alternative: ["time_encoding", "timeline_index"]
standardise_method = [
    "z_score",
]  # alternative: ["z_score", None]
num_features = len(features)

In [9]:
# --- training loop settings ---
num_epochs = 100
patience = 5
validation_metric = "f1"

# --- loss settings ---
loss = "focal"
gamma = 2

# --- model / search grid ---
embedding_dim = 384  # equals the second dimension of `sbert_embeddings` shown above
dimensions = [15]  # alternative: [50, 15]
# each swmhau entry is (output_channels, sig_depth, num_heads)
swmhau_parameters = [
    (12, 3, 10),
    (8, 4, 6),
    (8, 4, 12),
]
num_layers = [1]
ffn_hidden_dim_sizes = [[256, 256], [512, 512]]
dropout_rates = [0.5, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

# Row indices of `df` for each value of its "set" column, kept in
# (train, dev, test) order.
split_indices = tuple(
    df[df["set"] == split_name].index for split_name in ("train", "dev", "test")
)

In [10]:
# History length for this run; passed as the only entry of `history_lengths`
# and embedded in the results file name.
size = 20
# The search call is unpacked into four values; only the first two are used in
# this notebook. The discarded ones get explicit throwaway names instead of
# `_` / `__`, because IPython uses those names for its output history and
# assigning to them shadows that mechanism.
(
    swmhau_network_umap,
    best_swmhau_network_umap,
    _unused_third,
    _unused_fourth,
) = swmhau_network_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings.numpy(),
    y_data=y_data,
    embedding_dim=embedding_dim,
    output_dim=output_dim,
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    swmhau_parameters=swmhau_parameters,
    num_layers=num_layers,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    split_indices=split_indices,
    k_fold=False,
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=add_time_in_path,
    patience=patience,
    validation_metric=validation_metric,
    # Results CSV is written under `output_dir`, tagged with gamma and history length.
    results_output=f"{output_dir}/swmhau_network_umap_focal_{gamma}_{size}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/6195 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/6195 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in reddit_moc_output/swmhau_network_umap_focal_2_20.csv
saving the best model results dataframe to CSV for this hyperparameter search in reddit_moc_output/swmhau_network_umap_focal_2_20_best_model.csv


In [11]:
# Display the first value returned by the hyperparameter search (the results table).
swmhau_network_umap

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,0.490804,0.790875,0.597089,"[0.9028871391076115, 0.5707317073170731, 0.317...",0.596304,"[0.9028871391076115, 0.5792079207920792, 0.306...",0.598218,"[0.9028871391076115, 0.5625, 0.32926829268292684]",0.291540,0.887033,...,1,focal,2,False,,Conv1d,,concatenation,64,0
0,0.471263,0.799430,0.620780,"[0.9081433224755701, 0.576086956521739, 0.3781...",0.627836,"[0.9016817593790427, 0.6625, 0.31932773109243695]",0.629243,"[0.9146981627296588, 0.5096153846153846, 0.463...",0.298658,0.879873,...,12,focal,2,False,,Conv1d,,concatenation,64,0
0,0.487448,0.795627,0.580725,"[0.8978246539222148, 0.6029723991507431, 0.241...",0.617892,"[0.9019867549668874, 0.5399239543726235, 0.411...",0.582375,"[0.8937007874015748, 0.6826923076923077, 0.170...",0.304794,0.881464,...,123,focal,2,False,,Conv1d,,concatenation,64,0
0,0.579060,0.768061,0.556837,"[0.8815533980582524, 0.5257985257985257, 0.263...",0.564378,"[0.8697318007662835, 0.5376884422110553, 0.285...",0.550675,"[0.8937007874015748, 0.5144230769230769, 0.243...",0.329063,0.863962,...,1,focal,2,False,,Conv1d,,concatenation,64,1
0,0.604418,0.787072,0.607118,"[0.8859934853420196, 0.5896805896805896, 0.345...",0.610902,"[0.8796895213454075, 0.6030150753768844, 0.35]",0.603592,"[0.8923884514435696, 0.5769230769230769, 0.341...",0.378108,0.857597,...,12,focal,2,False,,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.536365,0.769962,0.604913,"[0.873144399460189, 0.5813449023861172, 0.3602...",0.598448,"[0.8986111111111111, 0.5296442687747036, 0.367...",0.615657,"[0.8490813648293963, 0.6442307692307693, 0.353...",0.356749,0.863962,...,12,focal,2,False,,Conv1d,,concatenation,64,34
0,0.491411,0.798479,0.625574,"[0.8937007874015748, 0.6027397260273972, 0.380...",0.639205,"[0.8937007874015748, 0.5739130434782609, 0.45]",0.619195,"[0.8937007874015748, 0.6346153846153846, 0.329...",0.301468,0.871122,...,123,focal,2,False,,Conv1d,,concatenation,64,34
0,0.466332,0.782319,0.547712,"[0.9033977348434377, 0.5968819599109132, 0.142...",0.542083,"[0.9174560216508796, 0.5560165975103735, 0.152...",0.556047,"[0.889763779527559, 0.6442307692307693, 0.1341...",0.313288,0.870326,...,1,focal,2,False,,Conv1d,,concatenation,64,35
0,0.463273,0.783270,0.592786,"[0.9020116807268007, 0.5244956772334294, 0.351...",0.610143,"[0.8921694480102695, 0.6546762589928058, 0.283...",0.604329,"[0.9120734908136483, 0.4375, 0.4634146341463415]",0.277096,0.898170,...,12,focal,2,False,,Conv1d,,concatenation,64,35


In [14]:
# Display the best-model results (the output below has one row per seed).
best_swmhau_network_umap

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,learning_rate,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,0.479101,0.792776,0.632396,"[0.8902116402116402, 0.6000000000000001, 0.406...",0.626854,"[0.8973333333333333, 0.5943396226415094, 0.388...",0.6386,"[0.8832020997375328, 0.6057692307692307, 0.426...",0.307332,0.882259,...,0.0001,1,focal,2,False,,Conv1d,,concatenation,64
0,0.531378,0.784221,0.613496,"[0.8858267716535433, 0.5714285714285715, 0.383...",0.612636,"[0.8858267716535433, 0.5756097560975609, 0.376...",0.614459,"[0.8858267716535433, 0.5673076923076923, 0.390...",0.327268,0.878282,...,0.0001,12,focal,2,False,,Conv1d,,concatenation,64
0,0.500984,0.797529,0.598491,"[0.8955797565663036, 0.5856079404466501, 0.314...",0.619761,"[0.8748435544430538, 0.6051282051282051, 0.379...",0.584308,"[0.9173228346456693, 0.5673076923076923, 0.268...",0.296446,0.883055,...,0.0001,123,focal,2,False,,Conv1d,,concatenation,64


In [15]:
# Average the "f1" column over the rows of the best-model results.
best_swmhau_network_umap["f1"].mean()

0.6147945190791779

In [16]:
# Stack the "f1_scores" entries into a 2-D array (one row per best-model run)
# and average down the rows, giving one mean score per position
# (presumably one per class — confirm).
np.mean(np.stack(best_swmhau_network_umap["f1_scores"]), axis=0)

array([0.89053939, 0.58567884, 0.36816533])