In [1]:
import os
import pickle
import re

import numpy as np
import pandas as pd

# NOTE(review): `seed` is not referenced by any cell visible in this part of the
# notebook; the hyperparameter search below is given its own `seeds` list.
seed = 2023

In [2]:
import torch

# Run on the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.lstm_baseline_functions import lstm_hyperparameter_search

In [4]:
# Directory that receives the CSV files written by the hyperparameter searches.
output_dir = "reddit_moc_output"
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate isdir() test.
os.makedirs(output_dir, exist_ok=True)

Reddit MoC

In [5]:
# Executes the loading script; the variables it defines become available to later cells.
# NOTE(review): `df`, `y_data` and `output_dim` are used further down but are never
# assigned in the notebook cells themselves — presumably this script defines them; confirm.
%run load_redditmoc.py

In [6]:
# Executes the embedding-loading script; the variables it defines become available to later cells.
# NOTE(review): presumably this is where `sbert_embeddings` comes from — it is used
# below but never assigned in the notebook cells themselves; confirm.
%run load_sbert-embeddings.py

In [7]:
# Shape check on the loaded embeddings (the recorded run shows 6195 rows of 384 values).
sbert_embeddings.shape

torch.Size([6195, 384])

Baseline: BiLSTM

In [8]:
# --- Search grid --------------------------------------------------------------
# 4 hidden sizes x 2 dropout rates x 3 learning rates = 24 configurations;
# the recorded results show each configuration once per entry in `seeds`.
hidden_dim_sizes = [100, 200, 300, 384]
dropout_rates = [0.5, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

# --- Fixed architecture settings ----------------------------------------------
num_layers, bidirectional = 1, True

# --- Training and model-selection settings (passed straight to the search) ----
num_epochs = 100
patience = 5
validation_metric = "f1"
loss, gamma = "focal", 2
# Index labels of the rows of `df` belonging to each partition, in
# (train, dev, test) order, selected on the "set" column.
split_indices = tuple(
    df.loc[df["set"] == split_name].index
    for split_name in ("train", "dev", "test")
)

In [9]:
# BiLSTM hyperparameter search with a single history length (`size`).
size = 20
# The search returns four objects; only the full results log and the
# best-configuration results are used here. The rest are collected in `_unused`
# instead of being bound to `_` / `__`, which IPython uses for its output
# history (assigning to them stops IPython from updating them).
bilstm_history_20, best_bilstm_history_20, *_unused = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings,
    y_data=y_data,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    output_dim=output_dim,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    path_indices=None,
    split_ids=None,
    # Fixed train/dev/test partition rather than k-fold evaluation.
    split_indices=split_indices,
    k_fold=False,
    patience=patience,
    validation_metric=validation_metric,
    # The file name is built from `loss` so it stays accurate if the loss
    # setting is changed; with loss="focal" the path is the same as before.
    results_output=f"{output_dir}/lstm_history_{size}_{loss}_{gamma}.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/6195 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in reddit_moc_output/lstm_history_20_focal_2.csv
saving the best model results dataframe to CSV for this hyperparameter search in reddit_moc_output/lstm_history_20_focal_2_best_model.csv


In [10]:
# Full results log of the search; in the recorded output each `model_id` appears once per seed.
bilstm_history_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size,model_id
0,0.495664,0.796578,0.621688,"[0.8935361216730039, 0.5403899721448469, 0.431...",0.643295,"[0.8639705882352942, 0.6423841059602649, 0.423...",0.610189,"[0.9251968503937008, 0.46634615384615385, 0.43...",0.285237,0.892601,...,100,0.5,0.0010,1,focal,2,False,,64,0
0,0.444055,0.799430,0.613302,"[0.8947024198822761, 0.6067415730337079, 0.338...",0.639913,"[0.8917861799217731, 0.569620253164557, 0.4583...",0.604990,"[0.8976377952755905, 0.6490384615384616, 0.268...",0.286182,0.873508,...,100,0.5,0.0010,12,focal,2,False,,64,0
0,0.539311,0.797529,0.628515,"[0.8928800513149455, 0.5684210526315789, 0.424...",0.640956,"[0.8732747804265998, 0.627906976744186, 0.4216...",0.619815,"[0.9133858267716536, 0.5192307692307693, 0.426...",0.294716,0.881464,...,100,0.5,0.0010,123,focal,2,False,,64,0
0,0.537754,0.788023,0.596051,"[0.8880407124681933, 0.5517241379310345, 0.348...",0.615659,"[0.8617283950617284, 0.6153846153846154, 0.369...",0.581760,"[0.916010498687664, 0.5, 0.32926829268292684]",0.291374,0.888624,...,100,0.5,0.0001,1,focal,2,False,,64,1
0,0.525584,0.791825,0.611774,"[0.8884615384615385, 0.5677749360613811, 0.379...",0.627810,"[0.868421052631579, 0.6065573770491803, 0.4084...",0.598920,"[0.9094488188976378, 0.5336538461538461, 0.353...",0.300413,0.889419,...,100,0.5,0.0001,12,focal,2,False,,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.509773,0.788023,0.607569,"[0.8813341885824246, 0.5837563451776651, 0.357...",0.623855,"[0.8619824341279799, 0.6182795698924731, 0.391...",0.594576,"[0.9015748031496063, 0.5528846153846154, 0.329...",0.308067,0.876691,...,384,0.1,0.0001,12,focal,2,False,,64,22
0,0.577531,0.787072,0.611055,"[0.8837508028259473, 0.5729442970822282, 0.376...",0.622699,"[0.8654088050314466, 0.6390532544378699, 0.363...",0.604121,"[0.9028871391076115, 0.5192307692307693, 0.390...",0.412106,0.873508,...,384,0.1,0.0001,123,focal,2,False,,64,22
0,0.535387,0.799430,0.614936,"[0.8991060025542784, 0.5653333333333334, 0.380...",0.631023,"[0.8756218905472637, 0.6347305389221557, 0.382...",0.603850,"[0.9238845144356955, 0.5096153846153846, 0.378...",0.308646,0.887033,...,384,0.1,0.0005,1,focal,2,False,,64,23
0,0.489781,0.801331,0.616318,"[0.8972081218274112, 0.5676392572944297, 0.384...",0.640659,"[0.8685503685503686, 0.6331360946745562, 0.420...",0.598634,"[0.9278215223097113, 0.5144230769230769, 0.353...",0.287183,0.891806,...,384,0.1,0.0005,12,focal,2,False,,64,23


In [11]:
# Results for the configuration selected by the search (one row per seed in the recorded output).
best_bilstm_history_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,0.537754,0.788023,0.596051,"[0.8880407124681933, 0.5517241379310345, 0.348...",0.615659,"[0.8617283950617284, 0.6153846153846154, 0.369...",0.58176,"[0.916010498687664, 0.5, 0.32926829268292684]",0.291374,0.888624,...,True,100,0.5,0.0001,1,focal,2,False,,64
0,0.525584,0.791825,0.611774,"[0.8884615384615385, 0.5677749360613811, 0.379...",0.62781,"[0.868421052631579, 0.6065573770491803, 0.4084...",0.59892,"[0.9094488188976378, 0.5336538461538461, 0.353...",0.300413,0.889419,...,True,100,0.5,0.0001,12,focal,2,False,,64
0,0.541034,0.788973,0.615309,"[0.8864516129032258, 0.5692307692307692, 0.390...",0.623987,"[0.8718274111675127, 0.6098901098901099, 0.390...",0.608491,"[0.9015748031496063, 0.5336538461538461, 0.390...",0.294212,0.888624,...,True,100,0.5,0.0001,123,focal,2,False,,64


In [12]:
# Average of the `f1` column over the rows of the selected configuration.
best_bilstm_history_20.loc[:, "f1"].mean()

0.6077110748432913

In [14]:
# Stack the per-row `f1_scores` vectors into one 2-D array and average down
# the rows, giving one mean score per position in the vector.
np.mean(np.stack(best_bilstm_history_20["f1_scores"]), axis=0)

array([0.88765129, 0.56290995, 0.37257199])