In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any cell visible in this notebook
# (the hyperparameter search is given the `seeds` list instead) — confirm it is
# still needed.
seed = 2023

In [2]:
import torch

# Set the device used for training.
# The previous version auto-detected a device and then unconditionally
# overwrote it with "cuda:1", which fails on hosts with fewer than two GPUs.
# Keep preferring the second GPU when it exists, otherwise fall back to the
# default GPU and finally to the CPU. `device` stays a plain string, exactly as
# it was after the original override.
if torch.cuda.device_count() > 1:
    device = "cuda:1"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
from nlpsig_networks.scripts.lstm_baseline_functions import (
    lstm_hyperparameter_search,
)

In [4]:
# Directory that the hyperparameter-search result CSVs are written into.
output_dir = "talklife_moc_output"
# exist_ok=True creates the directory only when it is missing, replacing the
# separate isdir() check (which could race with another process creating it).
os.makedirs(output_dir, exist_ok=True)

Talklife MoC

In [5]:
# NOTE(review): `df`, `y_data` and `output_dim` are used by later cells but are
# never assigned in the visible notebook — presumably this script (or the
# embeddings loader run in the next cell) defines them; confirm.
%run load_talklifemoc.py

In [6]:
# NOTE(review): `sbert_embeddings` is used below without being assigned in the
# notebook itself — presumably this script defines it; confirm.
%run load_sbert-embeddings.py

In [7]:
# Display the shape of the loaded embeddings as a quick sanity check.
sbert_embeddings.shape

torch.Size([18604, 384])

Baseline: LSTM classification (window=20)

In [8]:
# --- training schedule ---
num_epochs = 100
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

# --- network architecture ---
hidden_dim_sizes = [100, 200, 300, 384]
num_layers = 1
bidirectional = True
dropout_rates = [0.1, 0.2]

# --- objective and model selection ---
loss = "focal"
gamma = 2
validation_metric = "f1"

In [9]:
# Build the k-fold splits: for every column of `df` whose name contains
# "fold", collect a (train, dev, test) triple of row indices.
fold_col_names = [name for name in df.columns if "fold" in name]
fold_list = tuple(
    tuple(df[df[fold_col] == split].index for split in ("train", "dev", "test"))
    for fold_col in fold_col_names
)

In [10]:
size = 20
# Grid-search the LSTM baseline for a single history length (`size`) using the
# pre-computed k-fold splits in `fold_list`; results are also written to CSV
# under `output_dir`.
# Only the first two of the four returned values are used by the visible cells.
# The remaining two are bound to explicitly named throwaway variables rather
# than `_` / `__`: assigning to those names makes IPython stop updating its
# `_`, `__`, `___` output-history shortcuts for the rest of the session.
(
    bilstm_history_20,
    best_bilstm_history_20,
    _unused_third_output,
    _unused_fourth_output,
) = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings.numpy(),
    y_data=y_data,
    output_dim=output_dim,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    split_ids=None,
    split_indices=fold_list,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in talklife_moc_output/lstm_history_20_focal_2_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in talklife_moc_output/lstm_history_20_focal_2_kfold_best_model.csv


In [11]:
# Average every numeric metric over seeds for each hyperparameter combination.
# numeric_only=True is passed explicitly: the frame also holds non-numeric
# columns (e.g. `loss_function`, the `*_scores` lists), and relying on pandas to
# drop them silently is deprecated and raises a TypeError on pandas >= 2.0.
bilstm_history_20.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(
    numeric_only=True
)

  bilstm_history_20.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,n_splits,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
100,0.1,0.0001,0.811653,0.50424,0.518992,0.503343,0.805385,0.506174,0.52405,0.508128,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,1.0
100,0.1,0.0005,0.813875,0.523137,0.530253,0.523667,0.80954,0.529253,0.538151,0.532854,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,2.0
100,0.1,0.001,0.818372,0.536526,0.542656,0.537171,0.811777,0.535854,0.542895,0.539267,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,0.0
100,0.2,0.0001,0.812639,0.503141,0.51873,0.501783,0.805747,0.503892,0.523123,0.505311,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,4.0
100,0.2,0.0005,0.814269,0.522828,0.531625,0.524581,0.808664,0.524767,0.536996,0.528902,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,5.0
100,0.2,0.001,0.819268,0.536402,0.543293,0.536064,0.812193,0.534395,0.542472,0.536531,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,3.0
200,0.1,0.0001,0.813302,0.497192,0.51188,0.494726,0.807997,0.497223,0.517547,0.495937,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,7.0
200,0.1,0.0005,0.812406,0.52545,0.527271,0.528161,0.806747,0.531878,0.533887,0.537837,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,8.0
200,0.1,0.001,0.812603,0.533006,0.532568,0.540165,0.807469,0.538102,0.539065,0.546247,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,6.0
200,0.2,0.0001,0.813481,0.498118,0.513166,0.495585,0.808331,0.495499,0.516494,0.494334,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,10.0


In [12]:
# Display the results dataframe for the best model found by the search.
best_bilstm_history_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,,0.823425,0.535746,"[0.8985516432668844, 0.470326409495549, 0.2383...",0.542492,"[0.8944412896712727, 0.46893491124260356, 0.26...",0.53054,"[0.9026999490575649, 0.47172619047619047, 0.21...",,0.81864,...,True,300,0.1,0.001,1,focal,2,True,5,64
0,,0.80945,0.532076,"[0.8885941644562334, 0.47839569981393426, 0.22...",0.53487,"[0.9030243261012492, 0.41013824884792627, 0.29...",0.545814,"[0.8746179317371371, 0.5739087301587301, 0.188...",,0.802718,...,True,300,0.1,0.001,12,focal,2,True,5,64
0,,0.803913,0.535218,"[0.885061944606603, 0.4672897196261682, 0.2533...",0.526763,"[0.9018506278916061, 0.4086181277860327, 0.269...",0.55107,"[0.8688869077941925, 0.5456349206349206, 0.238...",,0.799216,...,True,300,0.1,0.001,123,focal,2,True,5,64


In [13]:
# Narrow the best-model results down to the headline metrics (plain and
# `valid_`-prefixed) and the hyperparameters recorded alongside them.
summary_columns = [
    "f1", "f1_scores", "precision", "recall",
    "valid_f1", "valid_f1_scores", "valid_precision", "valid_recall",
    "hidden_dim", "dropout_rate", "learning_rate", "seed",
    "loss_function", "k_fold", "batch_size",
]
best_bilstm_history_20[summary_columns]

Unnamed: 0,f1,f1_scores,precision,recall,valid_f1,valid_f1_scores,valid_precision,valid_recall,hidden_dim,dropout_rate,learning_rate,seed,loss_function,k_fold,batch_size
0,0.535746,"[0.8985516432668844, 0.470326409495549, 0.2383...",0.542492,0.53054,0.547567,"[0.894514977187066, 0.472851492268968, 0.27533...",0.553971,0.546272,300,0.1,0.001,1,focal,True,64
0,0.532076,"[0.8885941644562334, 0.47839569981393426, 0.22...",0.53487,0.545814,0.534559,"[0.8833741881681029, 0.47677827495756825, 0.24...",0.540213,0.55169,300,0.1,0.001,12,focal,True,64
0,0.535218,"[0.885061944606603, 0.4672897196261682, 0.2533...",0.526763,0.55107,0.535082,"[0.8817487223599656, 0.4677265500794912, 0.255...",0.528678,0.553072,300,0.1,0.001,123,focal,True,64


In [14]:
# Average of the `f1` column over the rows of the best-model results.
best_bilstm_history_20["f1"].mean()

0.5343467031879768

In [15]:
# Average of the `precision` column over the rows of the best-model results.
best_bilstm_history_20["precision"].mean()

0.5347084507917814

In [16]:
# Average of the `recall` column over the rows of the best-model results.
best_bilstm_history_20["recall"].mean()

0.542474556661026

In [17]:
# Stack the per-row `f1_scores` lists into one array and average element-wise
# across rows (axis 0) — presumably one value per class; confirm.
np.stack(best_bilstm_history_20["f1_scores"]).mean(axis=0)

array([0.89073592, 0.47200394, 0.24030025])

In [18]:
# Stack the per-row `precision_scores` lists into one array and average
# element-wise across rows (axis 0) — presumably one value per class; confirm.
np.stack(best_bilstm_history_20["precision_scores"]).mean(axis=0)

array([0.89977208, 0.42923043, 0.27512284])

In [19]:
# Stack the per-row `recall_scores` lists into one array and average
# element-wise across rows (axis 0) — presumably one value per class; confirm.
np.stack(best_bilstm_history_20["recall_scores"]).mean(axis=0)

array([0.88206826, 0.53042328, 0.21493213])