In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced again in the cells visible here; the
# hyperparameter search is driven by the separate `seeds` list. Confirm
# whether this value is still needed (e.g. by the scripts loaded via %run).
seed = 2023

In [2]:
import torch

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.lstm_baseline_functions import (
    lstm_hyperparameter_search,
    obtain_path,
)

In [4]:
# Directory that receives the CSV files written by the hyperparameter search.
output_dir = "talklife_moc_output"
# exist_ok=True makes this a no-op when the directory is already present, so
# no separate (race-prone) existence check is needed.
os.makedirs(output_dir, exist_ok=True)

Talklife MoC

In [5]:
# Execute the data-loading script inside this notebook's namespace.
# NOTE(review): later cells use `df`, `y_data` and `output_dim` without
# defining them, so they presumably come from this script (or the next
# %run) — confirm.
%run load_talklifemoc.py

In [6]:
# Execute the embedding-loading script inside this notebook's namespace.
# NOTE(review): presumably this is what defines `sbert_embeddings`, which the
# next cell inspects — confirm.
%run load_sbert-embeddings.py

In [7]:
# Sanity check: show the dimensions of the loaded embedding tensor.
sbert_embeddings.shape

torch.Size([18604, 384])

Baseline: BiLSTM classification (history length = 5)

In [8]:
# Settings forwarded to `lstm_hyperparameter_search` in the cells below.

# Fixed for every run.
num_epochs = 100
num_layers = 1
bidirectional = True

# Training objective and the metric passed as `validation_metric`.
# NOTE(review): `gamma` is presumably the focal-loss focusing parameter — confirm.
loss = "focal"
gamma = 2
validation_metric = "f1"

# Values explored by the search (list order is kept as-is).
hidden_dim_sizes = [100, 200, 300, 384]
dropout_rates = [0.1, 0.2]
learning_rates = [0.001, 0.0001, 0.0005]
seeds = [1, 12, 123]

In [9]:
# Build the k-fold index triples: for every column whose name contains
# "fold", collect the row indices labelled "train", "dev" and "test".
fold_col_names = [col for col in df.columns if "fold" in col]
fold_list = tuple(
    tuple(df[df[fold_col] == split].index for split in ("train", "dev", "test"))
    for fold_col in fold_col_names
)

In [10]:
# Run the hyperparameter search for a single history length and keep the
# full results frame plus the results of the best configuration.
size = 5
(
    bilstm_history_5,
    best_bilstm_history_5,
    # The last two return values are not used in this notebook. They get
    # explicit throwaway names because assigning to `_` / `__` shadows
    # IPython's output-history variables and stops them from updating.
    _unused_third,
    _unused_fourth,
) = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    # detach().cpu() changes nothing for a CPU tensor without grad, but keeps
    # .numpy() from raising if the embeddings live on the GPU or track grad.
    embeddings=sbert_embeddings.detach().cpu().numpy(),
    y_data=y_data,
    output_dim=output_dim,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    # Splits are given as explicit per-fold index triples, not as ids.
    split_ids=None,
    split_indices=fold_list,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in talklife_moc_output/lstm_history_5_focal_2_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in talklife_moc_output/lstm_history_5_focal_2_kfold_best_model.csv


In [11]:
# Average the numeric columns over all rows that share a hyperparameter
# configuration. numeric_only=True is explicit: the results frame appears to
# carry non-numeric columns (e.g. `loss_function`, `f1_scores`), and relying
# on pandas to drop them silently is deprecated and raises on pandas >= 2.0.
bilstm_history_5.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(
    numeric_only=True
)

  bilstm_history_5.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,n_splits,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
100,0.1,0.0001,0.810614,0.538552,0.531588,0.54965,0.805747,0.542128,0.535826,0.558397,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,1.0
100,0.1,0.0005,0.813481,0.542101,0.536854,0.550499,0.808164,0.545448,0.538749,0.559148,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,2.0
100,0.1,0.001,0.807891,0.543174,0.531517,0.559762,0.803551,0.547516,0.537205,0.567231,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,0.0
100,0.2,0.0001,0.811815,0.538035,0.532444,0.547523,0.806955,0.540005,0.535505,0.554175,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,4.0
100,0.2,0.0005,0.813212,0.541283,0.536448,0.54993,0.808678,0.545193,0.539431,0.558502,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,5.0
100,0.2,0.001,0.813158,0.544141,0.538121,0.554781,0.80715,0.54755,0.542087,0.563518,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,3.0
200,0.1,0.0001,0.815183,0.53793,0.536342,0.542911,0.811929,0.542261,0.539972,0.552826,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,7.0
200,0.1,0.0005,0.812012,0.543288,0.53643,0.554524,0.806886,0.545536,0.538762,0.561205,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,8.0
200,0.1,0.001,0.818802,0.541824,0.543402,0.544737,0.813666,0.545326,0.547106,0.552457,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,6.0
200,0.2,0.0001,0.815201,0.537533,0.535322,0.542859,0.811596,0.541383,0.538759,0.552168,5.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,10.0


In [12]:
# Display the results frame returned for the best configuration
# (the output shows one row per seed).
best_bilstm_history_5

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,,0.80988,0.541505,"[0.8893408225192071, 0.47470470247381324, 0.26...",0.532373,"[0.9018593688621186, 0.4309995953055443, 0.264...",0.554075,"[0.8771650534895568, 0.5282738095238095, 0.256...",,0.806511,...,True,100,0.2,0.001,1,focal,2,True,5,64
0,,0.814449,0.542913,"[0.8925683729021036, 0.4840792696504119, 0.252...",0.536489,"[0.9031879522785058, 0.4391919191919192, 0.267...",0.553357,"[0.8821956189505858, 0.5391865079365079, 0.238...",,0.806427,...,True,100,0.2,0.001,12,focal,2,True,5,64
0,,0.815147,0.548004,"[0.8926397165405057, 0.47796535847401883, 0.27...",0.545502,"[0.9032005736262304, 0.42829076620825146, 0.30...",0.556912,"[0.8823229750382068, 0.5406746031746031, 0.247...",,0.808511,...,True,100,0.2,0.001,123,focal,2,True,5,64


In [19]:
# Narrow the best-configuration frame to the headline metrics, their
# `valid_`-prefixed counterparts, and the settings that identify each row.
summary_columns = [
    "f1",
    "f1_scores",
    "precision",
    "recall",
    "valid_f1",
    "valid_f1_scores",
    "valid_precision",
    "valid_recall",
    "hidden_dim",
    "dropout_rate",
    "learning_rate",
    "seed",
    "loss_function",
    "k_fold",
    "batch_size",
]
best_bilstm_history_5[summary_columns]

Unnamed: 0,f1,f1_scores,precision,recall,valid_f1,valid_f1_scores,valid_precision,valid_recall,hidden_dim,dropout_rate,learning_rate,seed,loss_function,k_fold,batch_size
0,0.541505,"[0.8893408225192071, 0.47470470247381324, 0.26...",0.532373,0.554075,0.550613,"[0.8861661277283751, 0.48752649600521775, 0.27...",0.540949,0.568598,100,0.2,0.001,1,focal,True,64
0,0.542913,"[0.8925683729021036, 0.4840792696504119, 0.252...",0.536489,0.553357,0.548353,"[0.8858396811784292, 0.4816974408811144, 0.277...",0.543274,0.563741,100,0.2,0.001,12,focal,True,64
0,0.548004,"[0.8926397165405057, 0.47796535847401883, 0.27...",0.545502,0.556912,0.543686,"[0.8875960206523108, 0.48670427074939565, 0.25...",0.542038,0.558214,100,0.2,0.001,123,focal,True,64


In [13]:
# Mean of the `f1` column over the rows of the best-configuration frame.
best_bilstm_history_5["f1"].mean()

0.5441408595533188

In [14]:
# Mean of the `precision` column over the rows of the best-configuration frame.
best_bilstm_history_5["precision"].mean()

0.5381214976643758

In [15]:
# Mean of the `recall` column over the rows of the best-configuration frame.
best_bilstm_history_5["recall"].mean()

0.5547812486440588

In [16]:
# Stack the per-row `f1_scores` sequences into one 2-D array, then average
# position-wise across rows.
# NOTE(review): each position is presumably one class — confirm.
stacked_f1_scores = np.stack(best_bilstm_history_5["f1_scores"])
stacked_f1_scores.mean(axis=0)

array([0.8915163 , 0.47891644, 0.26198983])

In [17]:
# Stack the per-row `precision_scores` sequences into one 2-D array, then
# average position-wise across rows.
# NOTE(review): each position is presumably one class — confirm.
stacked_precision_scores = np.stack(best_bilstm_history_5["precision_scores"])
stacked_precision_scores.mean(axis=0)

array([0.9027493 , 0.43282743, 0.27878777])

In [18]:
# Stack the per-row `recall_scores` sequences into one 2-D array, then
# average position-wise across rows.
# NOTE(review): each position is presumably one class — confirm.
stacked_recall_scores = np.stack(best_bilstm_history_5["recall_scores"])
stacked_recall_scores.mean(axis=0)

array([0.88056122, 0.53604497, 0.24773756])