In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any of the cells visible here; the
# hyperparameter searches below take the `seeds` list instead. Confirm whether
# this value is still needed elsewhere.
seed = 2023

In [2]:
from nlpsig_networks.scripts.lstm_baseline_functions import (
    lstm_hyperparameter_search,
    obtain_path,
)

In [3]:
# Directory that receives the CSV files written by the hyperparameter searches
# (it is interpolated into every `results_output` path below).
output_dir = "client_talk_type_output"

# `exist_ok=True` makes this cell idempotent and removes the check-then-create
# race of a separate `os.path.isdir` test.
os.makedirs(output_dir, exist_ok=True)

## AnnoMI

In [4]:
# Executes the shared loading script in this kernel. The cells below use
# `anno_mi`, `y_data_client`, `output_dim_client`, `client_index` and
# `client_transcript_id` without defining them, so they are presumably
# created by this script -- confirm against ../load_anno_mi.py.
%run ../load_anno_mi.py

In [5]:
# Preview the first rows of the AnnoMI dataframe to check that it loaded.
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-08-22 00:00:13
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-08-22 00:00:24
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-08-22 00:00:25
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-08-22 00:00:34
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-08-22 00:00:34


In [6]:
# Load the precomputed SBERT embeddings from disk.
# NOTE(review): `pickle.load` can execute arbitrary code while deserialising,
# so this file must come from a trusted source.
with open("../anno_mi_sbert.pkl", "rb") as f:
    sbert_embeddings = pickle.load(f)

# Display the shape as a sanity check; presumably one row per utterance in
# `anno_mi` -- TODO confirm the two stay aligned.
sbert_embeddings.shape

(9699, 384)

# Baseline: LSTM classification

In [7]:
# --- Grid searched over by `lstm_hyperparameter_search` ----------------------
# 384 matches the width of the SBERT embeddings loaded above.
hidden_dim_sizes = [100, 200, 300, 384]
dropout_rates = [0.5, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]
# Every configuration is repeated once per seed.
seeds = [1, 12, 123]

# --- Fixed architecture: a single bidirectional LSTM layer -------------------
num_layers = 1
bidirectional = True

# --- Training and model selection --------------------------------------------
num_epochs = 100
loss = "focal"
# Passed alongside loss="focal"; presumably the focal-loss focusing
# parameter -- confirm in nlpsig_networks.
gamma = 2
validation_metric = "f1"

## `history_length=20`

In [8]:
# K-fold hyperparameter search for the BiLSTM baseline with history_length=20.
#
# The call returns four values. The first is the full results frame (also
# written to `results_output`); the second holds the rows of the best
# configuration (written to the matching `*_best_model.csv`). The last two are
# not used in this notebook.
#
# They are bound to explicit throwaway names rather than `_` / `__`: in
# IPython those names hold the most recent cell outputs, and assigning to them
# makes IPython stop updating them for the rest of the session.
size = 20
(
    bilstm_history_20_kfold,
    best_bilstm_history_20_kfold,
    _unused_search_output_a,
    _unused_search_output_b,
) = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    output_dim=output_dim_client,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    path_indices=client_index,
    split_ids=client_transcript_id,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in client_talk_type_output/lstm_history_20_focal_2_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in client_talk_type_output/lstm_history_20_focal_2_kfold_best_model.csv


In [9]:
# Average the metrics of each hyperparameter combination over its repeated runs
# (the `seed` column averages to 45.33 = mean of 1, 12, 123).
# `numeric_only=True` is explicit because the results frame presumably also
# carries non-numeric columns (list-valued `*_scores`, `loss_function`):
# without it pandas 1.5 emits a FutureWarning and pandas >= 2.0 raises.
bilstm_history_20_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(
    numeric_only=True
)

  bilstm_history_20_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,precision,recall,valid_accuracy,valid_f1,valid_precision,valid_recall,k,num_layers,bidirectional,seed,gamma,k_fold,n_splits,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
100,0.1,0.0001,0.623499,0.498372,0.492719,0.507305,0.690484,0.597729,0.590598,0.608157,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,4.0
100,0.1,0.0005,0.624074,0.510865,0.502274,0.525589,0.697977,0.612504,0.601428,0.629671,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,5.0
100,0.1,0.001,0.63212,0.520844,0.512226,0.536112,0.705095,0.617691,0.608909,0.630678,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,3.0
100,0.5,0.0001,0.62599,0.500738,0.495681,0.510106,0.690537,0.598539,0.591833,0.609659,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,1.0
100,0.5,0.0005,0.630651,0.510996,0.504654,0.521998,0.697923,0.609338,0.600683,0.622852,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,2.0
100,0.5,0.001,0.628608,0.511775,0.504322,0.524911,0.705738,0.617899,0.60936,0.631657,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,0.0
200,0.1,0.0001,0.619029,0.497316,0.49097,0.510467,0.683954,0.596391,0.586357,0.613731,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,10.0
200,0.1,0.0005,0.633269,0.513856,0.508516,0.52177,0.703061,0.609532,0.604679,0.616152,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,11.0
200,0.1,0.001,0.634227,0.518882,0.511307,0.53332,0.699261,0.609502,0.601913,0.621359,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,9.0
200,0.5,0.0001,0.621967,0.496306,0.491251,0.507546,0.684168,0.594199,0.585669,0.609403,20.0,1.0,1.0,45.333333,2.0,1.0,5.0,64.0,7.0


In [10]:
# Rows of the selected (best) configuration for history_length=20 -- one per
# seed in the output shown.
best_bilstm_history_20_kfold

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,loss_function,gamma,k_fold,n_splits,batch_size
0,,0.617433,0.502614,"[0.7439190210001512, 0.43656866693515906, 0.32...",0.494407,"[0.7788674470104398, 0.42643587726199844, 0.27...",0.519116,"[0.7119722382880278, 0.4471947194719472, 0.398...",,0.701509,...,True,100,0.5,0.001,1,focal,2,True,5,64
0,,0.641762,0.524595,"[0.7620607504466945, 0.46864686468646866, 0.34...",0.517144,"[0.785451197053407, 0.46864686468646866, 0.297...",0.538042,"[0.7400231347599768, 0.46864686468646866, 0.40...",,0.704078,...,True,100,0.5,0.001,12,focal,2,True,5,64
0,,0.626628,0.508117,"[0.7493325422723228, 0.4329563812600969, 0.342...",0.501415,"[0.769183922046285, 0.4240506329113924, 0.3110...",0.517575,"[0.7304800462695199, 0.44224422442244227, 0.38]",,0.711625,...,True,100,0.5,0.001,123,focal,2,True,5,64


In [11]:
# Mean of the `f1` column over the per-seed rows of the best configuration.
best_bilstm_history_20_kfold.loc[:, "f1"].mean()

0.5117752892103917

In [12]:
# Mean of the `precision` column over the per-seed rows of the best configuration.
best_bilstm_history_20_kfold.loc[:, "precision"].mean()

0.5043222178656798

In [13]:
# Mean of the `recall` column over the per-seed rows of the best configuration.
best_bilstm_history_20_kfold.loc[:, "recall"].mean()

0.5249108435038607

In [14]:
# Stack the per-row `f1_scores` lists and average them position-wise across
# rows (three positions in the output shown; presumably one per class -- confirm).
np.mean(np.stack(best_bilstm_history_20_kfold["f1_scores"]), axis=0)

array([0.75177077, 0.4460573 , 0.33749779])

In [15]:
# Stack the per-row `precision_scores` lists and average them position-wise
# across rows (presumably one position per class -- confirm).
np.mean(np.stack(best_bilstm_history_20_kfold["precision_scores"]), axis=0)

array([0.77783419, 0.43971112, 0.29542134])

In [16]:
# Stack the per-row `recall_scores` lists and average them position-wise
# across rows (presumably one position per class -- confirm).
np.mean(np.stack(best_bilstm_history_20_kfold["recall_scores"]), axis=0)

array([0.72749181, 0.45269527, 0.39454545])

## `history_length=50`

In [None]:
# K-fold hyperparameter search for the BiLSTM baseline with history_length=50.
#
# The call returns four values; only the first two are used afterwards (the
# full results frame and the best-configuration frame).
#
# The other two are bound to explicit throwaway names rather than `_` / `__`:
# in IPython those names hold the most recent cell outputs, and assigning to
# them makes IPython stop updating them for the rest of the session.
size = 50
(
    bilstm_history_50_kfold,
    best_bilstm_history_50_kfold,
    _unused_search_output_a,
    _unused_search_output_b,
) = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    output_dim=output_dim_client,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    path_indices=client_index,
    split_ids=client_transcript_id,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Average the metrics of each hyperparameter combination over its repeated runs.
# `numeric_only=True` is explicit because the results frame presumably also
# carries non-numeric columns (list-valued `*_scores`, `loss_function`):
# without it pandas 1.5 emits a FutureWarning and pandas >= 2.0 raises.
bilstm_history_50_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(
    numeric_only=True
)

In [None]:
# Rows of the selected (best) configuration for history_length=50.
best_bilstm_history_50_kfold

In [None]:
# Mean of the `f1` column over the rows of the best-configuration frame.
best_bilstm_history_50_kfold.loc[:, "f1"].mean()

In [None]:
# Mean of the `precision` column over the rows of the best-configuration frame.
best_bilstm_history_50_kfold.loc[:, "precision"].mean()

In [None]:
# Mean of the `recall` column over the rows of the best-configuration frame.
best_bilstm_history_50_kfold.loc[:, "recall"].mean()

In [None]:
# Stack the per-row `f1_scores` lists and average them position-wise across
# rows (presumably one position per class -- confirm).
np.mean(np.stack(best_bilstm_history_50_kfold["f1_scores"]), axis=0)

In [None]:
# Stack the per-row `precision_scores` lists and average them position-wise
# across rows (presumably one position per class -- confirm).
np.mean(np.stack(best_bilstm_history_50_kfold["precision_scores"]), axis=0)

In [None]:
# Stack the per-row `recall_scores` lists and average them position-wise
# across rows (presumably one position per class -- confirm).
np.mean(np.stack(best_bilstm_history_50_kfold["recall_scores"]), axis=0)