In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any visible cell; the hyperparameter
# search takes its seeds from the separate `seeds` list — confirm whether this
# value is still needed.
seed = 2023

In [2]:
import torch

# Select the compute device as a device string.
# The recorded run used the second GPU ("cuda:1"). Fall back to the first GPU
# and then to the CPU so the notebook does not request a device that does not
# exist on machines with fewer GPUs.
if torch.cuda.is_available():
    # Prefer GPU index 1 when at least two GPUs are visible.
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"

In [3]:
from nlpsig_networks.scripts.lstm_baseline_functions import (
    lstm_hyperparameter_search,
)

In [4]:
# Directory used as the prefix of the results CSV path passed to the
# hyperparameter search.
output_dir = "talklife_moc_output"
# exist_ok=True keeps the cell idempotent on re-run and avoids the separate
# isdir() check, which could race with another process creating the directory.
os.makedirs(output_dir, exist_ok=True)

Talklife MoC

In [5]:
# Runs the data-loading script. Presumably it defines `df`, `y_data` and
# `output_dim`, which later cells use without defining them — verify against
# load_talklifemoc.py.
%run load_talklifemoc.py

In [6]:
# Runs the embedding-loading script. Presumably it defines `sbert_embeddings`,
# which the following cell inspects — verify against load_sbert-embeddings.py.
%run load_sbert-embeddings.py

In [7]:
# Sanity check: display the dimensions of the embedding tensor (the recorded
# output shows 18604 rows by 384 columns).
sbert_embeddings.shape

torch.Size([18604, 384])

Baseline: bidirectional LSTM (BiLSTM) classification (window=11)

In [8]:
# Search grid and fixed settings for the hyperparameter search.

# Model architecture
hidden_dim_sizes = [100, 200, 300, 384]
num_layers = 1
bidirectional = True
dropout_rates = [0.1, 0.2]

# Optimisation
num_epochs = 100
learning_rates = [0.001, 0.0001, 0.0005]
loss = "focal"
gamma = 2  # presumably the focal-loss focusing parameter — confirm in the library

# Repetition and model selection
seeds = [1, 12, 123]
validation_metric = "f1"

In [9]:
# Build the k-fold split indices: for every column whose name contains "fold",
# collect one (train, dev, test) triple of row indices.
fold_col_names = [col for col in df.columns if "fold" in col]
fold_list = tuple(
    tuple(
        df.loc[df[fold_name] == split_label].index
        for split_label in ("train", "dev", "test")
    )
    for fold_name in fold_col_names
)

In [10]:
# History length (window size) evaluated in this search.
size = 11

# Run the k-fold hyperparameter search over the grid defined in the config cell.
# The call returns four values; only the first two are used by later cells:
#   - bilstm_history_11: results for every run (grouped and averaged below)
#   - best_bilstm_history_11: results of the selected configuration (one row
#     per seed in the recorded output)
# The two unused values are bound to ordinary names instead of `_` / `__`,
# because IPython reserves those names for its output cache and stops updating
# them once user code assigns to them.
(
    bilstm_history_11,
    best_bilstm_history_11,
    _unused_third,
    _unused_fourth,
) = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings.numpy(),
    y_data=y_data,
    output_dim=output_dim,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    split_ids=None,
    split_indices=fold_list,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/18604 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in talklife_moc_output/lstm_history_11_focal_2_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in talklife_moc_output/lstm_history_11_focal_2_kfold_best_model.csv


In [11]:
# Average the metrics of each (hidden_dim, dropout_rate, learning_rate)
# combination over its runs. numeric_only=True is passed explicitly because the
# frame presumably also holds non-numeric columns (e.g. per-class score lists,
# the loss-function name). Without it older pandas warns and pandas >= 2.0
# raises.
bilstm_history_11.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(
    numeric_only=True
)

  bilstm_history_11.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,n_splits,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
100,0.1,0.0001,0.813535,0.514977,0.527467,0.516116,0.804052,0.515119,0.526936,0.523212,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,1.0
100,0.1,0.0005,0.812388,0.535597,0.534205,0.541667,0.806288,0.53937,0.535665,0.550213,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,2.0
100,0.1,0.001,0.816133,0.541505,0.541896,0.547982,0.808623,0.543205,0.543793,0.553336,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,0.0
100,0.2,0.0001,0.816796,0.513378,0.530145,0.512138,0.806566,0.513243,0.528705,0.518692,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,4.0
100,0.2,0.0005,0.817154,0.538161,0.540599,0.540065,0.80765,0.537065,0.536634,0.544646,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,5.0
100,0.2,0.001,0.818444,0.541154,0.544025,0.544509,0.81111,0.544315,0.546686,0.552089,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,3.0
200,0.1,0.0001,0.80239,0.505304,0.513572,0.515826,0.793812,0.507149,0.513151,0.524406,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,7.0
200,0.1,0.0005,0.811815,0.529236,0.531496,0.535978,0.802301,0.529318,0.530848,0.542561,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,8.0
200,0.1,0.001,0.812173,0.539065,0.534552,0.547938,0.806024,0.544335,0.538999,0.557295,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,6.0
200,0.2,0.0001,0.805669,0.506111,0.515557,0.514174,0.796924,0.507358,0.513732,0.522596,11.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,10.0


In [12]:
# Display the results of the selected configuration (the recorded output shows
# one row per seed: 1, 12 and 123).
best_bilstm_history_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,,0.823479,0.537396,"[0.8987053994316387, 0.45545598313125985, 0.25...",0.5463,"[0.8913315796066642, 0.4859392575928009, 0.261...",0.529766,"[0.9062022414671421, 0.42857142857142855, 0.25...",,0.818015,...,True,384,0.1,0.001,1,focal,2,True,5,64
0,,0.808106,0.543496,"[0.8882212315048135, 0.47182295231576543, 0.27...",0.531538,"[0.9014426229508197, 0.4368398817068019, 0.256...",0.558159,"[0.8753820682628629, 0.5128968253968254, 0.286...",,0.801634,...,True,384,0.1,0.001,12,focal,2,True,5,64
0,,0.79628,0.529359,"[0.8796417364016736, 0.47830599958480374, 0.23...",0.513753,"[0.9037479849543256, 0.4112816851124598, 0.226...",0.554127,"[0.8567880794701986, 0.5714285714285714, 0.234...",,0.794381,...,True,384,0.1,0.001,123,focal,2,True,5,64


In [19]:
# Show only the headline metrics and the hyperparameters of the best model.
best_bilstm_history_11.loc[
    :,
    [
        # metrics stored without a `valid_` prefix
        "f1", "f1_scores", "precision", "recall",
        # metrics stored with a `valid_` prefix
        "valid_f1", "valid_f1_scores", "valid_precision", "valid_recall",
        # configuration of the run
        "hidden_dim", "dropout_rate", "learning_rate", "seed",
        "loss_function", "k_fold", "batch_size",
    ],
]

Unnamed: 0,f1,f1_scores,precision,recall,valid_f1,valid_f1_scores,valid_precision,valid_recall,hidden_dim,dropout_rate,learning_rate,seed,loss_function,k_fold,batch_size
0,0.537396,"[0.8987053994316387, 0.45545598313125985, 0.25...",0.5463,0.529766,0.546168,"[0.8944596899799748, 0.4642514855280812, 0.279...",0.548662,0.543877,384,0.1,0.001,1,focal,True,64
0,0.543496,"[0.8882212315048135, 0.47182295231576543, 0.27...",0.531538,0.558159,0.545854,"[0.883057788563628, 0.47765690376569037, 0.276...",0.531956,0.565384,384,0.1,0.001,12,focal,True,64
0,0.529359,"[0.8796417364016736, 0.47830599958480374, 0.23...",0.513753,0.554127,0.542007,"[0.8774556137437732, 0.4855865577308463, 0.262...",0.526527,0.571132,384,0.1,0.001,123,focal,True,64


In [13]:
# Mean of the `f1` column over the rows of the best-model results.
best_bilstm_history_11.loc[:, "f1"].mean()

0.5367502541279756

In [14]:
# Mean of the `precision` column over the rows of the best-model results.
best_bilstm_history_11.loc[:, "precision"].mean()

0.5305303052510495

In [15]:
# Mean of the `recall` column over the rows of the best-model results.
best_bilstm_history_11.loc[:, "recall"].mean()

0.5473506769361203

In [16]:
# Stack the per-row `f1_scores` vectors and average them element-wise across
# rows (axis 0).
np.mean(np.stack(best_bilstm_history_11["f1_scores"]), axis=0)

array([0.88885612, 0.46852831, 0.25286633])

In [17]:
# Stack the per-row `precision_scores` vectors and average them element-wise
# across rows (axis 0).
np.mean(np.stack(best_bilstm_history_11["precision_scores"]), axis=0)

array([0.89884073, 0.44468694, 0.24806325])

In [18]:
# Stack the per-row `recall_scores` vectors and average them element-wise
# across rows (axis 0).
np.mean(np.stack(best_bilstm_history_11["recall_scores"]), axis=0)

array([0.87945746, 0.50429894, 0.25829563])