In [1]:
import pickle
import numpy as np
import pandas as pd
import re
import os

# NOTE(review): `seed` is not referenced by any cell visible in this notebook (the search
# below uses the separate `seeds` list); presumably it is read by the %run scripts — confirm.
seed = 2023

In [2]:
import torch

# choose the compute device: use CUDA when torch reports it as available, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [3]:
from nlpsig_networks.scripts.ffn_baseline_functions import (
    histories_baseline_hyperparameter_search,
)

In [4]:
# directory used as the prefix of the `results_output` CSV path passed to the search below
output_dir = "talklife_moc_output"
# exist_ok=True creates the directory only when it is missing, so the cell can be
# re-run safely and there is no gap between checking for the directory and creating it
os.makedirs(output_dir, exist_ok=True)

Talklife MoC

In [5]:
# Runs the loader script and brings its top-level variables into the notebook namespace.
# NOTE(review): presumably this is where `df`, `y_data` and `output_dim` come from — confirm.
%run load_talklifemoc.py

In [6]:
# Runs the embedding loader script and brings its top-level variables into the notebook namespace.
# NOTE(review): presumably this defines `sbert_embeddings` — confirm.
%run load_sbert-embeddings.py

In [7]:
# sanity-check the dimensions of the loaded embedding tensor
sbert_embeddings.shape

torch.Size([18604, 384])

Baseline: average the history and use an FFN

In [8]:
# --- grid explored by the hyperparameter search ---
# each entry is a pair of equal hidden dimensions
hidden_dim_sizes = [[width, width] for width in (64, 128, 256, 512)]
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

# --- training and model-selection settings ---
num_epochs = 100
patience = 5
validation_metric = "f1"

# --- loss configuration (both values are passed straight to the search) ---
loss = "focal"
gamma = 2

In [9]:
# Build the k-fold split indices: every dataframe column whose name contains "fold"
# describes one fold; for each, collect the row indices marked "train", "dev" and "test".
fold_col_names = [column for column in df.columns if "fold" in column]
fold_list = tuple(
    tuple(
        df.loc[df[fold_column] == split_name].index
        for split_name in ("train", "dev", "test")
    )
    for fold_column in fold_col_names
)

In [11]:
# Run the hyperparameter search for the mean-history FFN baseline over the grid of
# hidden sizes, dropout rates, learning rates and seeds, using the per-fold
# train/dev/test indices built above.
# The first two return values are kept: the table of all runs and the table for the
# best configuration. The last two are discarded.
# NOTE(review): assigning to `_` and `__` shadows IPython's output-history variables of
# the same names — consider descriptive throwaway names once their meaning is confirmed.
(
    ffn_mean_history,
    best_ffn_mean_history,
    _,
    __,
) = histories_baseline_hyperparameter_search(
    num_epochs=num_epochs,
    df=df,
    id_column="timeline_id",
    label_column="label",
    # NOTE(review): Tensor.numpy() only works on CPU tensors — confirm the loader never
    # leaves `sbert_embeddings` on the GPU.
    embeddings=sbert_embeddings.numpy(),
    y_data=y_data,
    output_dim=output_dim,
    hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    use_signatures=False,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    device=device,
    # splits are given explicitly as index tuples rather than by id
    split_ids=None,
    split_indices=fold_list,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    # per the saved-output messages, a companion `*_best_model.csv` is written next to this file
    results_output=f"{output_dir}/ffn_mean_history_focal_{gamma}.csv",
    verbose=False,
)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
Computing the mean history for each item in the dataframe


  0%|          | 0/18604 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
Computing the mean history for each item in the dataframe


  0%|          | 0/18604 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in talklife_moc_output/ffn_mean_history_focal_2.csv
saving the best model results dataframe to CSV for this hyperparameter search in talklife_moc_output/ffn_mean_history_focal_2_best_model.csv


In [12]:
# full search results; as displayed, one row per hyperparameter configuration and seed
ffn_mean_history

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size,model_id,input_dim
0,,0.797140,0.532641,"[0.8804259601960975, 0.44210977701543736, 0.27...",0.522290,"[0.8981254553884878, 0.38935045317220546, 0.27...",0.548771,"[0.8634105960264901, 0.5114087301587301, 0.271...",,0.792547,...,0.1,0.0010,1,focal,2,True,5,64,0.00,768
0,,0.768921,0.531432,"[0.8605556301167628, 0.45436741088915, 0.27937...",0.503975,"[0.9095035460992907, 0.37540453074433655, 0.22...",0.585042,"[0.8166072338257768, 0.5753968253968254, 0.363...",,0.761160,...,0.1,0.0010,12,focal,2,True,5,64,0.00,768
0,,0.792948,0.534831,"[0.8771918367346939, 0.450148493848112, 0.2771...",0.518802,"[0.9002077608739361, 0.3932542624166049, 0.262...",0.558200,"[0.8553234844625573, 0.5262896825396826, 0.292...",,0.786628,...,0.1,0.0010,123,focal,2,True,5,64,0.00,768
0,,0.791980,0.534202,"[0.8770084911822339, 0.4472829616800173, 0.278...",0.516520,"[0.9001743094663449, 0.39684978870534, 0.25253...",0.559120,"[0.8550050942435048, 0.5124007936507936, 0.309...",,0.785420,...,0.1,0.0001,1,focal,2,True,5,64,0.10,768
0,,0.791120,0.537177,"[0.8764022894521668, 0.4498239436619718, 0.285...",0.517508,"[0.9009481541254791, 0.40427215189873417, 0.24...",0.565736,"[0.8531584309730005, 0.5069444444444444, 0.337...",,0.783169,...,0.1,0.0001,12,focal,2,True,5,64,0.10,768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.800849,0.541813,"[0.8831328816416507, 0.4524081259986304, 0.289...",0.526793,"[0.8994915142309978, 0.41902748414376323, 0.26...",0.561196,"[0.8673586347427407, 0.4915674603174603, 0.324...",,0.791047,...,0.2,0.0001,12,focal,2,True,5,64,0.22,768
0,,0.788110,0.537351,"[0.8738546421885776, 0.4515036001694197, 0.286...",0.516488,"[0.9022719565954561, 0.3939393939393939, 0.253...",0.568753,"[0.8471726948548141, 0.5287698412698413, 0.330...",,0.783669,...,0.2,0.0001,123,focal,2,True,5,64,0.22,768
0,,0.795635,0.538739,"[0.8790175071857852, 0.45501766050280484, 0.28...",0.524766,"[0.9023605150214592, 0.39149088308902397, 0.28...",0.561314,"[0.8568517575140092, 0.5431547619047619, 0.283...",,0.791088,...,0.2,0.0005,1,focal,2,True,5,64,0.23,768
0,,0.769512,0.524960,"[0.8610527018457328, 0.44475357710651825, 0.26...",0.499887,"[0.904748544574595, 0.3710212201591512, 0.2238...",0.571182,"[0.8213830871115639, 0.5550595238095238, 0.337...",,0.770372,...,0.2,0.0005,12,focal,2,True,5,64,0.23,768


In [13]:
# results for the selected best configuration; as displayed, one row per seed
best_ffn_mean_history

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size,input_dim
0,,0.793217,0.536909,"[0.8778720626631855, 0.4479964966060871, 0.284...",0.51887,"[0.9004418853776112, 0.4010192081536652, 0.255...",0.562082,"[0.8564060112073357, 0.5074404761904762, 0.322...",,0.787962,...,"(256, 256)",0.2,0.0001,1,focal,2,True,5,64,768
0,,0.791443,0.537304,"[0.8763051945926483, 0.4541541110632802, 0.281...",0.517949,"[0.9015962820771873, 0.4011406844106464, 0.251...",0.565281,"[0.8523942944472745, 0.5233134920634921, 0.320...",,0.783585,...,"(256, 256)",0.2,0.0001,12,focal,2,True,5,64,768
0,,0.79499,0.537401,"[0.8788471570161711, 0.4517810599478715, 0.281...",0.520617,"[0.9004543025120256, 0.401854714064915, 0.2595...",0.560606,"[0.85825267447784, 0.5158730158730159, 0.30769...",,0.788171,...,"(256, 256)",0.2,0.0001,123,focal,2,True,5,64,768


In [20]:
# Compact view of the best configuration: unprefixed metrics, their `valid_`
# counterparts, then the hyperparameters and run settings that produced them.
best_ffn_mean_history.loc[
    :,
    ["f1", "f1_scores", "precision", "recall"]
    + ["valid_f1", "valid_f1_scores", "valid_precision", "valid_recall"]
    + ["hidden_dim", "dropout_rate", "learning_rate", "seed"]
    + ["loss_function", "k_fold", "batch_size"],
]

Unnamed: 0,f1,f1_scores,precision,recall,valid_f1,valid_f1_scores,valid_precision,valid_recall,hidden_dim,dropout_rate,learning_rate,seed,loss_function,k_fold,batch_size
0,0.536909,"[0.8778720626631855, 0.4479964966060871, 0.284...",0.51887,0.562082,0.536031,"[0.8731316635208896, 0.4655228229200389, 0.269...",0.517419,0.564078,"(256, 256)",0.2,0.0001,1,focal,True,64
0,0.537304,"[0.8763051945926483, 0.4541541110632802, 0.281...",0.517949,0.565281,0.535844,"[0.8701867650449621, 0.46232602783554627, 0.27...",0.514893,0.56863,"(256, 256)",0.2,0.0001,12,focal,True,64
0,0.537401,"[0.8788471570161711, 0.4517810599478715, 0.281...",0.520617,0.560606,0.535158,"[0.8733622227886821, 0.462729912875121, 0.2693...",0.517408,0.562142,"(256, 256)",0.2,0.0001,123,focal,True,64


In [14]:
# mean of the `f1` column over the rows (one per seed) of the best configuration
best_ffn_mean_history["f1"].mean()

0.5372043517768734

In [15]:
# mean of the `precision` column over the rows (one per seed) of the best configuration
best_ffn_mean_history["precision"].mean()

0.5191451019706829

In [16]:
# mean of the `recall` column over the rows (one per seed) of the best configuration
best_ffn_mean_history["recall"].mean()

0.5626562454003695

In [17]:
# element-wise mean of the `f1_scores` vectors across rows
# (presumably one entry per class — confirm)
np.stack(best_ffn_mean_history["f1_scores"]).mean(axis=0)

array([0.8776748 , 0.45131056, 0.28262769])

In [18]:
# element-wise mean of the `precision_scores` vectors across rows
# (presumably one entry per class — confirm)
np.stack(best_ffn_mean_history["precision_scores"]).mean(axis=0)

array([0.90083082, 0.4013382 , 0.25526628])

In [19]:
# element-wise mean of the `recall_scores` vectors across rows
# (presumably one entry per class — confirm)
np.stack(best_ffn_mean_history["recall_scores"]).mean(axis=0)

array([0.85568433, 0.51554233, 0.31674208])