In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced anywhere else in the visible notebook;
# the hyperparameter search is driven by the separate `seeds` list instead.
# Confirm whether the %run scripts rely on it before removing.
seed = 2023

In [2]:
import torch

# Select the compute device: the first GPU when CUDA is usable, otherwise the
# CPU. The value is kept as a plain string ("cuda:0" / "cpu") so it is identical
# to the previously hard-coded value on GPU machines, while no longer forcing
# "cuda:0" on hosts that have no GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [3]:
from nlpsig_networks.scripts.lstm_baseline_functions import (
    lstm_hyperparameter_search,
)

In [4]:
# Directory that receives the CSV results written by the hyperparameter search.
output_dir = "talklife_moc_output"
# Create it (including parents) on first run; a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

Talklife MoC

In [5]:
# Execute the data-loading script in this notebook's namespace.
# NOTE(review): later cells use `df`, `y_data` and `output_dim` without defining
# them, so they are presumably created by this script (or the next %run) — confirm.
%run load_talklifemoc.py

In [6]:
# Execute the embedding-loading script in this notebook's namespace.
# NOTE(review): presumably this is what defines `sbert_embeddings`, which the
# following cells read — confirm against the script.
%run load_sbert-embeddings.py

In [7]:
# Sanity check: display the shape of the loaded embedding tensor.
sbert_embeddings.shape

torch.Size([18604, 384])

Baseline: LSTM classification (window=35)

In [8]:
# --- Settings forwarded to `lstm_hyperparameter_search` below ---

# Training budget.
num_epochs = 100

# Fixed architecture choices.
num_layers = 1
bidirectional = True

# Grid that is searched over.
hidden_dim_sizes = [100, 200, 300, 384]
dropout_rates = [0.1, 0.2]
learning_rates = [0.001, 0.0001, 0.0005]

# Random seeds passed to the search.
seeds = [1, 12, 123]

# Loss configuration; `gamma` is presumably the focal-loss focusing
# parameter — TODO confirm against the library.
loss = "focal"
gamma = 2

# Metric name handed to the search as `validation_metric`.
validation_metric = "f1"

In [9]:
# Build the k-fold split indices.
# Every column of `df` whose name contains "fold" is treated as one fold; for
# each of them we collect the row indices labelled "train", "dev" and "test"
# (in that order), giving a tuple of (train, dev, test) triples.
fold_col_names = [col for col in df.columns if "fold" in col]
fold_list = tuple(
    tuple(
        df.loc[df[fold_col] == split_name].index
        for split_name in ("train", "dev", "test")
    )
    for fold_col in fold_col_names
)

In [10]:
# Run the BiLSTM hyperparameter search with a history length of 35, using the
# pre-computed k-fold indices in `fold_list`.
#
# The call returns four values. The first two are the results tables used in
# the cells below; the last two are not used in this notebook. They are bound
# to explicit throwaway names rather than `_` / `__`, because in IPython those
# names hold the most recent cell outputs and assigning to them stops IPython
# from updating them for the rest of the session.
#
# The search also writes its results to the CSV path given in `results_output`.
size = 35
(
    bilstm_history_35,
    best_bilstm_history_35,
    _unused_search_output_3,
    _unused_search_output_4,
) = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    # The search is given a NumPy array rather than the torch tensor.
    embeddings=sbert_embeddings.numpy(),
    y_data=y_data,
    output_dim=output_dim,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    # Splits are supplied as explicit index triples, not as ids.
    split_ids=None,
    split_indices=fold_list,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in talklife_moc_output/lstm_history_35_focal_2_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in talklife_moc_output/lstm_history_35_focal_2_kfold_best_model.csv


In [11]:
# Average the metrics for each (hidden_dim, dropout_rate, learning_rate)
# combination, i.e. across the rows that differ only by seed.
# `numeric_only=True` is passed explicitly: the results table presumably also
# holds non-numeric columns (e.g. the per-class score lists and the loss name
# seen in `best_bilstm_history_35`). Older pandas dropped those silently with a
# FutureWarning, while pandas >= 2.0 raises a TypeError without this argument.
bilstm_history_35.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(
    numeric_only=True
)

  bilstm_history_35.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,n_splits,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
100,0.1,0.0001,0.818946,0.498027,0.520303,0.490676,0.812499,0.507456,0.531851,0.503599,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,1.0
100,0.1,0.0005,0.824733,0.518313,0.542055,0.508437,0.817918,0.519223,0.544144,0.514592,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,2.0
100,0.1,0.001,0.81968,0.532139,0.539198,0.529788,0.813666,0.538348,0.54375,0.540704,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,0.0
100,0.2,0.0001,0.820415,0.498476,0.522189,0.490052,0.813819,0.506752,0.534429,0.501566,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,4.0
100,0.2,0.0005,0.824285,0.518513,0.542349,0.509028,0.817084,0.51802,0.543184,0.514034,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,5.0
100,0.2,0.001,0.819304,0.532915,0.538626,0.53095,0.813249,0.539555,0.543242,0.542064,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,3.0
200,0.1,0.0001,0.814986,0.491347,0.518271,0.488429,0.809303,0.495983,0.525937,0.497313,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,7.0
200,0.1,0.0005,0.82287,0.521616,0.53951,0.514406,0.816584,0.52429,0.545359,0.520473,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,8.0
200,0.1,0.001,0.813356,0.526302,0.527994,0.53013,0.80929,0.540085,0.542551,0.547634,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,6.0
200,0.2,0.0001,0.813947,0.491634,0.517739,0.489527,0.809831,0.494876,0.526679,0.496321,35.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,10.0


In [12]:
# Display the results table for the best configuration returned by the search.
best_bilstm_history_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,,0.817512,0.527153,"[0.8954121430163328, 0.45246971109040074, 0.23...",0.534838,"[0.8953836357847819, 0.4266256590509666, 0.282...",0.525394,"[0.8954406520631686, 0.4816468253968254, 0.199...",,0.815556,...,True,200,0.2,0.001,1,focal,2,True,5,64
0,,0.815577,0.532344,"[0.8937909018355945, 0.4617577197149644, 0.241...",0.531737,"[0.8961654183470968, 0.4430264357338195, 0.256...",0.534026,"[0.8914289353031075, 0.48214285714285715, 0.22...",,0.807678,...,True,200,0.2,0.001,12,focal,2,True,5,64
0,,0.806224,0.526252,"[0.8872433306386419, 0.46814814814814815, 0.22...",0.522583,"[0.9013205439852835, 0.4082687338501292, 0.258...",0.539681,"[0.8735990830361692, 0.5486111111111112, 0.196...",,0.80401,...,True,200,0.2,0.001,123,focal,2,True,5,64


In [13]:
# Show only the headline columns of the best-model table: test metrics,
# validation metrics, and the hyperparameters / run settings.
summary_columns = [
    "f1", "f1_scores", "precision", "recall",
    "valid_f1", "valid_f1_scores", "valid_precision", "valid_recall",
    "hidden_dim", "dropout_rate", "learning_rate",
    "seed", "loss_function", "k_fold", "batch_size",
]
best_bilstm_history_35.loc[:, summary_columns]

Unnamed: 0,f1,f1_scores,precision,recall,valid_f1,valid_f1_scores,valid_precision,valid_recall,hidden_dim,dropout_rate,learning_rate,seed,loss_function,k_fold,batch_size
0,0.527153,"[0.8954121430163328, 0.45246971109040074, 0.23...",0.534838,0.525394,0.545745,"[0.8925405836072104, 0.47024423804609566, 0.27...",0.554597,0.547434,200,0.2,0.001,1,focal,True,64
0,0.532344,"[0.8937909018355945, 0.4617577197149644, 0.241...",0.531737,0.534026,0.537627,"[0.8874314751307918, 0.4691780821917808, 0.256...",0.534178,0.546221,200,0.2,0.001,12,focal,True,64
0,0.526252,"[0.8872433306386419, 0.46814814814814815, 0.22...",0.522583,0.539681,0.537453,"[0.88465219362313, 0.4695484493009803, 0.25815...",0.537675,0.55071,200,0.2,0.001,123,focal,True,64


In [14]:
# Mean of the `f1` column over the rows of the best-model table.
best_bilstm_history_35.loc[:, "f1"].mean()

0.5285826944499633

In [15]:
# Mean of the `precision` column over the rows of the best-model table.
best_bilstm_history_35.loc[:, "precision"].mean()

0.5297194395350956

In [16]:
# Mean of the `recall` column over the rows of the best-model table.
best_bilstm_history_35.loc[:, "recall"].mean()

0.5330337614659456

In [17]:
# Stack the `f1_scores` entries (presumably one value per class — TODO confirm)
# into a 2-D array and average element-wise over the rows.
np.mean(np.stack(best_bilstm_history_35["f1_scores"]), axis=0)

array([0.89214879, 0.46079186, 0.23280743])

In [18]:
# Stack the `precision_scores` entries (presumably one value per class — TODO
# confirm) into a 2-D array and average element-wise over the rows.
np.mean(np.stack(best_bilstm_history_35["precision_scores"]), axis=0)

array([0.8976232 , 0.42597361, 0.26556151])

In [19]:
# Stack the `recall_scores` entries (presumably one value per class — TODO
# confirm) into a 2-D array and average element-wise over the rows.
np.mean(np.stack(best_bilstm_history_35["recall_scores"]), axis=0)

array([0.88682289, 0.5041336 , 0.2081448 ])