In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any visible cell (the searches
# below take their own `seeds` list) -- confirm whether it is still needed.
seed = 2023

In [2]:
from nlpsig_networks.scripts.lstm_baseline_functions import (
    lstm_hyperparameter_search,
    obtain_path
)

In [3]:
# Directory that receives the CSV results written by the searches below.
output_dir = "client_talk_type_output"
# `exist_ok=True` makes the cell idempotent and avoids the check-then-create
# race of testing `os.path.isdir` first; it still raises if the path exists
# but is not a directory.
os.makedirs(output_dir, exist_ok=True)

## AnnoMI

In [4]:
# Execute the shared AnnoMI loading script from the parent directory.
# NOTE(review): later cells use `anno_mi`, `y_data_client`, `output_dim_client`
# and `client_index`, none of which is assigned in a visible cell -- presumably
# this script defines them; confirm before relying on Restart & Run All.
%run ../load_anno_mi.py

In [5]:
# Preview the first rows of the AnnoMI dataframe.
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-07-05 00:00:13
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-07-05 00:00:24
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-07-05 00:00:25
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-07-05 00:00:34
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-07-05 00:00:34


In [6]:
# Load the precomputed embeddings from the pickle file in the parent directory.
# NOTE(review): `pickle.load` can execute arbitrary code while unpickling --
# only load this file if it comes from a trusted source.
with open("../anno_mi_sbert.pkl", "rb") as f:
    sbert_embeddings = pickle.load(f)
    
# Display the shape as a sanity check (the saved output shows (13551, 384)).
sbert_embeddings.shape

(13551, 384)

In [7]:
# Build the input array for inspection, using the same dataframe, id/label
# columns and embeddings that are passed to the searches below.
# NOTE(review): `k=20` appears to be the history length (the saved output has
# shape (13551, 20, 384)) -- confirm against `obtain_path`.
x_data = obtain_path(
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    k=20
)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

In [8]:
# Shape check of the array returned by `obtain_path`.
x_data.shape

(13551, 20, 384)

In [9]:
# Inspect the first sample. In the saved output only its first row is
# non-zero -- presumably the remaining rows are padding; TODO confirm.
x_data[0]

array([[0.00154884, 0.01095446, 0.04541774, ..., 0.02605489, 0.05162514,
        0.0810306 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

# Baseline: LSTM classification

In [11]:
# Settings shared by every `lstm_hyperparameter_search` call below.
# NOTE(review): the saved outputs show a single (100, 0.5, 0.001) combination
# and seeds 0, 1 and 12 only, so they appear to come from a run with a smaller
# grid than the one defined here -- re-run to confirm.
num_epochs = 100
hidden_dim_sizes = [100, 200, 300]  # passed as `hidden_dim_sizes`
num_layers = 1
bidirectional = True
dropout_rates = [0.5, 0.2, 0.1]  # passed as `dropout_rates`
learning_rates = [1e-3, 1e-4, 5e-4]  # passed as `learning_rates`
seeds = [0, 1, 12, 123, 1234]  # passed as `seeds`
loss = "focal"
gamma = 2  # passed alongside loss="focal"; presumably the focal-loss parameter -- confirm
validation_metric = "f1"

## `history_length=20`

In [13]:
# BiLSTM hyperparameter search with a history window of `size`, without k-fold
# evaluation (k_fold=False). Results are also written to CSV under `output_dir`.
# NOTE(review): `y_data_client`, `output_dim_client` and `client_index` are not
# assigned in any visible cell -- presumably `%run ../load_anno_mi.py` defines
# them; confirm.
size = 20
# Only the first two returned objects are used afterwards (presumably the full
# results frame and the best-model results frame). The rest is collected in
# `_unused_search_outputs` instead of `_`/`__`: assigning to those names stops
# IPython from updating its output-history shortcuts for the session.
bilstm_history_20, best_bilstm_history_20, *_unused_search_outputs = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    output_dim=output_dim_client,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    path_indices=client_index,
    k_fold=False,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}.csv",
    verbose=False
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in client_talk_type_output/lstm_history_20_focal_2.csv
saving the best model results dataframe to CSV for this hyperparameter search in client_talk_type_output/lstm_history_20_focal_2_best_model.csv


In [21]:
# Mean of the `f1` column over the rows of the best-model results (no k-fold).
best_bilstm_history_20["f1"].mean()

0.6410598381806171

In [16]:
# BiLSTM hyperparameter search with a history window of `size`, this time with
# k_fold=True. Results are also written to CSV under `output_dir`.
# NOTE(review): `y_data_client`, `output_dim_client` and `client_index` are not
# assigned in any visible cell -- presumably `%run ../load_anno_mi.py` defines
# them; confirm.
size = 20
# Only the first two returned objects are used afterwards (presumably the full
# results frame and the best-model results frame). The rest is collected in
# `_unused_search_outputs` instead of `_`/`__`: assigning to those names stops
# IPython from updating its output-history shortcuts for the session.
bilstm_history_20_kfold, best_bilstm_history_20_kfold, *_unused_search_outputs = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    output_dim=output_dim_client,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    path_indices=client_index,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in client_talk_type_output/lstm_history_20_focal_2_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in client_talk_type_output/lstm_history_20_focal_2_kfold_best_model.csv


In [17]:
# Average the numeric metrics over all runs sharing a hyperparameter
# combination. `numeric_only=True` is explicit because the results frame
# presumably also holds non-numeric columns such as `loss` and `f1_scores`
# (present in the best-model frame, absent from the grouped output): older
# pandas dropped them with a FutureWarning, pandas >= 2.0 raises instead.
bilstm_history_20_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(numeric_only=True)

  bilstm_history_20_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,valid_accuracy,valid_f1,k,num_layers,bidirectional,seed,gamma,k_fold,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
100,0.5,0.001,0.700173,0.626416,0.700173,0.626416,20.0,1.0,1.0,4.333333,2.0,1.0,0.0


In [18]:
# Show the best-model results (one row per seed in the saved output).
best_bilstm_history_20_kfold

Unnamed: 0,loss,accuracy,f1,f1_scores,valid_loss,valid_accuracy,valid_f1,valid_f1_scores,num_layers,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,gamma,k_fold
0,focal,0.703048,0.627384,"[0.7916567342073898, 0.5830009205277693, 0.507...",,0.703048,0.627384,"[0.7916567342073898, 0.5830009205277693, 0.507...",1,True,100,0.5,0.001,0,2,True
0,focal,0.704684,0.627052,"[0.79618820726623, 0.5792079207920792, 0.50575...",,0.704684,0.627052,"[0.79618820726623, 0.5792079207920792, 0.50575...",1,True,100,0.5,0.001,1,2,True
0,focal,0.692788,0.624811,"[0.7808657156910969, 0.5881310894596987, 0.505...",,0.692788,0.624811,"[0.7808657156910969, 0.5881310894596987, 0.505...",1,True,100,0.5,0.001,12,2,True


In [19]:
# Mean of the `f1` column over the rows of the best-model results (k-fold).
best_bilstm_history_20_kfold["f1"].mean()

0.626415973043203

In [20]:
# Stack the per-row `f1_scores` entries and average them position by position
# (three values in the saved output; presumably one per class -- confirm).
np.mean(np.stack(best_bilstm_history_20_kfold["f1_scores"]), axis=0)

array([0.78957022, 0.58344664, 0.50623106])

## `history_length=50`

In [12]:
# BiLSTM hyperparameter search with a history window of `size` (50 here) and
# k_fold=True. Results are also written to CSV under `output_dir`.
# NOTE(review): `y_data_client`, `output_dim_client` and `client_index` are not
# assigned in any visible cell -- presumably `%run ../load_anno_mi.py` defines
# them; confirm.
size = 50
# Only the first two returned objects are used afterwards (presumably the full
# results frame and the best-model results frame). The rest is collected in
# `_unused_search_outputs` instead of `_`/`__`: assigning to those names stops
# IPython from updating its output-history shortcuts for the session.
bilstm_history_50_kfold, best_bilstm_history_50_kfold, *_unused_search_outputs = lstm_hyperparameter_search(
    num_epochs=num_epochs,
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    history_lengths=[size],
    hidden_dim_sizes=hidden_dim_sizes,
    num_layers=num_layers,
    bidirectional=bidirectional,
    output_dim=output_dim_client,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    path_indices=client_index,
    k_fold=True,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/lstm_history_{size}_focal_{gamma}_kfold.csv",
    verbose=False
)

  0%|          | 0/1 [00:00<?, ?it/s]

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

saving results dataframe to CSV for this hyperparameter search in client_talk_type_output/lstm_history_50_focal_2_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in client_talk_type_output/lstm_history_50_focal_2_kfold_best_model.csv


In [13]:
# Average the numeric metrics over all runs sharing a hyperparameter
# combination. `numeric_only=True` is explicit because the results frame
# presumably also holds non-numeric columns such as `loss` and `f1_scores`
# (present in the best-model frame, absent from the grouped output): older
# pandas dropped them with a FutureWarning, pandas >= 2.0 raises instead.
bilstm_history_50_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean(numeric_only=True)

  bilstm_history_50_kfold.groupby(["hidden_dim", "dropout_rate", "learning_rate"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,accuracy,f1,valid_accuracy,valid_f1,k,num_layers,bidirectional,seed,gamma,k_fold,batch_size,model_id
hidden_dim,dropout_rate,learning_rate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100,0.5,0.001,0.710384,0.641526,0.710384,0.641526,50.0,1.0,1.0,4.333333,2.0,1.0,64.0,0.0


In [14]:
# Show the best-model results (one row per seed in the saved output).
best_bilstm_history_50_kfold

Unnamed: 0,loss,accuracy,f1,f1_scores,valid_loss,valid_accuracy,valid_f1,valid_f1_scores,num_layers,bidirectional,hidden_dim,dropout_rate,learning_rate,seed,gamma,k_fold,batch_size
0,focal,0.721041,0.644441,"[0.807415279253749, 0.6033940917661847, 0.5225...",,0.721041,0.644441,"[0.807415279253749, 0.6033940917661847, 0.5225...",1,True,100,0.5,0.001,0,2,True,64
0,focal,0.708996,0.641054,"[0.7953488372093023, 0.6081203007518797, 0.519...",,0.708996,0.641054,"[0.7953488372093023, 0.6081203007518797, 0.519...",1,True,100,0.5,0.001,1,2,True,64
0,focal,0.701115,0.639083,"[0.7845866998135487, 0.6056009334889149, 0.527...",,0.701115,0.639083,"[0.7845866998135487, 0.6056009334889149, 0.527...",1,True,100,0.5,0.001,12,2,True,64


In [15]:
# Mean of the `f1` column over the rows of the best-model results (k-fold).
best_bilstm_history_50_kfold["f1"].mean()

0.6415258830783609

In [16]:
# Stack the per-row `f1_scores` entries and average them position by position
# (three values in the saved output; presumably one per class -- confirm).
np.mean(np.stack(best_bilstm_history_50_kfold["f1_scores"]), axis=0)

array([0.79578361, 0.60570511, 0.52308894])