In [1]:
import numpy as np
import pickle
import os

seed = 2023

In [2]:
from nlpsig_networks.scripts.lstm_baseline_functions import (
    lstm_hyperparameter_search,
    obtain_path,
)

In [3]:
# Directory that collects every results CSV written by the searches below.
output_dir = "therapist_talk_type_output"
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(output_dir, exist_ok=True)

## AnnoMI

In [4]:
# Executes the shared loader script inside this notebook's namespace.
# Later cells use `anno_mi`, `y_data_therapist`, `output_dim_therapist`,
# `therapist_index` and `therapist_transcript_id` without defining them, so they
# are presumably created by this script — TODO confirm against ../load_anno_mi.py.
%run ../load_anno_mi.py

In [5]:
# Preview the first rows of the loaded AnnoMI dataframe.
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-08-22 00:00:13
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-08-22 00:00:24
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-08-22 00:00:25
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-08-22 00:00:34
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-08-22 00:00:34


In [6]:
# Load the precomputed embeddings (presumably SBERT sentence embeddings of the
# AnnoMI utterances, going by the file name — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code; only point this at a
# file that was produced by a trusted source.
with open("../anno_mi_sbert.pkl", "rb") as f:
    sbert_embeddings = pickle.load(f)

# Show the array shape as a quick sanity check.
sbert_embeddings.shape

(9699, 384)

# Baseline: LSTM classification

In [7]:
# Shared settings for every LSTM hyperparameter search in this notebook.

# -- Training setup --
num_epochs = 100
seeds = [1, 12, 123]
validation_metric = "f1"  # forwarded as `validation_metric` to the search

# -- Loss --
loss = "focal"
gamma = 2  # forwarded together with `loss`; presumably the focal-loss gamma

# -- Fixed architecture choices --
num_layers = 1
bidirectional = True

# -- Grid that is searched --
hidden_dim_sizes = [100, 200, 300, 384]
dropout_rates = [0.5, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]

## `history_length=20`

In [8]:
# History length searched in this section; it is passed as the only entry of
# `history_lengths` and is also embedded in the results CSV file name.
size = 20
# Only the first two returned objects are used below. The remaining ones are
# collected into a named throwaway instead of `_` / `__`, because assigning to
# those names overrides IPython's output-history variables for the session.
(
    bilstm_history_20_kfold,
    best_bilstm_history_20_kfold,
    *_unused_search_outputs_20,
) = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=anno_mi,
    id_column="transcript_id",
    label_column="main_therapist_behaviour",
    embeddings=sbert_embeddings,
    y_data=y_data_therapist,
    output_dim=output_dim_therapist,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    path_indices=therapist_index,
    split_ids=therapist_transcript_id,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in therapist_talk_type_output/lstm_history_20_focal_2_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in therapist_talk_type_output/lstm_history_20_focal_2_kfold_best_model.csv


In [9]:
# Average the numeric metrics for each hyperparameter combination.
# numeric_only=True is explicit: the results frame appears to carry non-numeric
# columns (e.g. the per-class score lists shown in the best-model frame), which
# older pandas dropped with a FutureWarning and pandas >= 2.0 refuses to average.
bilstm_history_20_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(
    numeric_only=True
)

  bilstm_history_20_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,n_splits,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
100,0.1,0.0001,0.637318,0.597015,0.59963,0.596691,0.682911,0.651049,0.650485,0.653225,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,4.0
100,0.1,0.0005,0.688425,0.650007,0.650828,0.650461,0.724473,0.693701,0.692281,0.696601,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,5.0
100,0.1,0.001,0.701139,0.665687,0.667653,0.666581,0.732331,0.704217,0.703232,0.708813,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,3.0
100,0.5,0.0001,0.636939,0.597718,0.600033,0.598455,0.687922,0.657178,0.656719,0.66043,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,1.0
100,0.5,0.0005,0.684124,0.644626,0.647388,0.643834,0.725158,0.694015,0.693945,0.695655,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,2.0
100,0.5,0.001,0.696711,0.659085,0.660946,0.659065,0.733175,0.703367,0.702404,0.706243,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,0.0
200,0.1,0.0001,0.644845,0.60711,0.607992,0.608809,0.687025,0.654888,0.653751,0.657396,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,10.0
200,0.1,0.0005,0.696015,0.657824,0.658825,0.658162,0.728903,0.698673,0.697825,0.701502,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,11.0
200,0.1,0.001,0.694687,0.65789,0.659592,0.658517,0.73365,0.70473,0.704284,0.708196,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,9.0
200,0.5,0.0001,0.644023,0.603433,0.605458,0.603947,0.684705,0.651817,0.652259,0.653091,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,7.0


In [10]:
# Display the best-model results frame returned by the history-length-20 search.
best_bilstm_history_20_kfold

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,,0.696395,0.657426,"[0.7137521222410866, 0.44778362133734034, 0.62...",0.659346,"[0.725828729281768, 0.4508320726172466, 0.5977...",0.656555,"[0.7020708082832331, 0.44477611940298506, 0.65...",,0.73307,...,True,384,0.1,0.001,1,focal,2,True,5,64
0,,0.702657,0.662972,"[0.7219578518014955, 0.45351473922902497, 0.62...",0.664699,"[0.7349480968858132, 0.45941807044410415, 0.60...",0.662002,"[0.7094188376753507, 0.44776119402985076, 0.65...",,0.739241,...,True,384,0.1,0.001,12,focal,2,True,5,64
0,,0.696015,0.658061,"[0.704483695652174, 0.45349730976172176, 0.626...",0.662512,"[0.7166551485832757, 0.4675118858954041, 0.585...",0.656077,"[0.6927187708750835, 0.44029850746268656, 0.67...",,0.735127,...,True,384,0.1,0.001,123,focal,2,True,5,64


In [11]:
# Mean of the `f1` column over the rows of the best-model frame.
best_bilstm_history_20_kfold["f1"].mean()

0.6594865308186346

In [12]:
# Mean of the `precision` column over the rows of the best-model frame.
best_bilstm_history_20_kfold["precision"].mean()

0.6621856676013539

In [13]:
# Mean of the `recall` column over the rows of the best-model frame.
best_bilstm_history_20_kfold["recall"].mean()

0.6582113701318494

In [14]:
# Stack the per-row `f1_scores` sequences into one array and average down the
# rows, giving one mean value per position (presumably one per class).
np.stack(best_bilstm_history_20_kfold["f1_scores"]).mean(axis=0)

array([0.71339789, 0.45159856, 0.62710085, 0.84584883])

In [15]:
# Stack the per-row `precision_scores` sequences into one array and average
# down the rows, giving one mean value per position (presumably one per class).
np.stack(best_bilstm_history_20_kfold["precision_scores"]).mean(axis=0)

array([0.72581066, 0.45925401, 0.59578063, 0.86789737])

In [16]:
# Stack the per-row `recall_scores` sequences into one array and average down
# the rows, giving one mean value per position (presumably one per class).
np.stack(best_bilstm_history_20_kfold["recall_scores"]).mean(axis=0)

array([0.70140281, 0.44427861, 0.66212989, 0.82503417])

## `history_length=50`

In [None]:
# History length searched in this section; it is passed as the only entry of
# `history_lengths` and is also embedded in the results CSV file name.
size = 50
# Only the first two returned objects are used below. The remaining ones are
# collected into a named throwaway instead of `_` / `__`, because assigning to
# those names overrides IPython's output-history variables for the session.
(
    bilstm_history_50_kfold,
    best_bilstm_history_50_kfold,
    *_unused_search_outputs_50,
) = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=anno_mi,
    id_column="transcript_id",
    label_column="main_therapist_behaviour",
    embeddings=sbert_embeddings,
    y_data=y_data_therapist,
    output_dim=output_dim_therapist,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    path_indices=therapist_index,
    split_ids=therapist_transcript_id,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Average the numeric metrics for each hyperparameter combination.
# numeric_only=True is explicit: the results frame appears to carry non-numeric
# columns (e.g. per-class score lists), which older pandas dropped with a
# FutureWarning and pandas >= 2.0 refuses to average.
bilstm_history_50_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(
    numeric_only=True
)

In [None]:
# Display the best-model results frame returned by the history-length-50 search.
best_bilstm_history_50_kfold

In [None]:
# Mean of the `f1` column over the rows of the best-model frame.
best_bilstm_history_50_kfold["f1"].mean()

In [None]:
# Mean of the `precision` column over the rows of the best-model frame.
best_bilstm_history_50_kfold["precision"].mean()

In [None]:
# Mean of the `recall` column over the rows of the best-model frame.
best_bilstm_history_50_kfold["recall"].mean()

In [None]:
# Stack the per-row `f1_scores` sequences into one array and average down the
# rows, giving one mean value per position (presumably one per class).
np.stack(best_bilstm_history_50_kfold["f1_scores"]).mean(axis=0)

In [None]:
# Stack the per-row `precision_scores` sequences into one array and average
# down the rows, giving one mean value per position (presumably one per class).
np.stack(best_bilstm_history_50_kfold["precision_scores"]).mean(axis=0)

In [None]:
# Stack the per-row `recall_scores` sequences into one array and average down
# the rows, giving one mean value per position (presumably one per class).
np.stack(best_bilstm_history_50_kfold["recall_scores"]).mean(axis=0)