In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced again in the visible notebook; the
# hyperparameter search is given the `seeds` list instead. Confirm whether this
# value is still needed.
seed = 2023

In [2]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_functions import seqsignet_hyperparameter_search

In [4]:
# Directory used as the prefix of every `results_output` CSV path in this notebook.
output_dir = "client_talk_type_output"
# exist_ok=True keeps the cell idempotent on re-runs and removes the
# check-then-create race of a separate isdir() test.
os.makedirs(output_dir, exist_ok=True)

## AnnoMI

In [5]:
# Execute the AnnoMI loading script from the parent directory.
# NOTE(review): later cells use `anno_mi`, `y_data_client`, `output_dim_client`,
# `client_index` and `client_transcript_id` without defining them, so they are
# presumably created by this script. Confirm against ../load_anno_mi.py.
%run ../load_anno_mi.py

In [6]:
# Preview the first rows of `anno_mi` as a quick sanity check of the loaded data.
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime,speaker
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-11-09 00:00:13,-1
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-11-09 00:00:24,1
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-11-09 00:00:25,-1
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-11-09 00:00:34,1
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-11-09 00:00:34,-1


In [7]:
# Load the embeddings that are passed to the search as `embeddings`
# (presumably pre-computed SBERT sentence embeddings, one row per utterance; confirm).
# NOTE: pickle.load can execute arbitrary code, so only point this at a trusted,
# locally generated file.
with open("../anno_mi_sbert.pkl", "rb") as f:
    sbert_embeddings = pickle.load(f)

# The search is given `anno_mi` and these embeddings together, so fail fast here
# if they are not aligned row-for-row rather than deep inside a long-running search.
assert sbert_embeddings.shape[0] == len(anno_mi), (
    f"expected {len(anno_mi)} embedding rows, got {sbert_embeddings.shape[0]}"
)

sbert_embeddings.shape

(9699, 384)

In [8]:
# Extra feature columns handed to the search, with the standardisation requested
# for each (presumably matched by position: z-score for the first, none for the
# second; confirm against the library).
features = ["time_encoding", "timeline_index"]
standardise_method = ["z_score", None]
# Both "include features" switches passed to the search are enabled.
include_features_in_path = include_features_in_input = True

In [9]:
# Hyperparameter grid and training settings forwarded to every search via `kwargs`.
num_epochs = 100
dimensions = [15]
# Each entry pairs a list of hidden sizes with a signature depth (depth 3 in both cases).
swnu_hidden_dim_sizes_and_sig_depths = [([units], 3) for units in (12, 10)]
lstm_hidden_dim_sizes = [300, 400]
# Two equally wide hidden layers per candidate.
ffn_hidden_dim_sizes = [[width, width] for width in (32, 128, 512)]
dropout_rates = [0.1]
learning_rates = [0.0005, 0.0003, 0.0001]
seeds = [1, 12, 123]
# Loss selection and its gamma setting.
loss, gamma = "focal", 2
# Metric passed as `validation_metric`, and the `patience` value.
validation_metric, patience = "f1", 3

In [10]:
# Keyword arguments shared by every `seqsignet_hyperparameter_search` call in this
# notebook; each call adds only shift / window_size / n, the dimension-reduction
# method and the output path.
kwargs = dict(
    num_epochs=num_epochs,
    # data, labels and embeddings
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    output_dim=output_dim_client,
    # model configuration and search grid
    dimensions=dimensions,
    log_signature=True,
    pooling="signature",
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=True,
    seeds=seeds,
    # loss and hardware
    loss=loss,
    gamma=gamma,
    device=device,
    # extra features
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # sample selection, splitting and validation
    path_indices=client_index,
    split_ids=client_transcript_id,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    verbose=False,
)

# history_length=11

In [11]:
# Window settings for this section; the search logs them as history length = 11.
shift, window_size, n = 3, 5, 3

## umap

In [12]:
# Run the hyperparameter search with UMAP dimension reduction for the current
# shift / window_size / n; the results are also written to the CSV named below.
# The third and fourth return values are not used in this notebook. They are bound
# to ordinary names rather than `_` / `__`, because assigning those in IPython
# shadows the automatic "previous output" variables and stops them updating.
(
    seqsignet_network_umap_kfold_11,
    best_seqsignet_network_umap_kfold_11,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_umap_focal_2_3_5_3_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_umap_focal_2_3_5_3_kfold_best_model.csv


In [13]:
# Show the results table returned as the first value of the search.
seqsignet_network_umap_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.641188,0.514702,"[0.7688457008244994, 0.4452084228620541, 0.330...",0.510156,"[0.7831433713257349, 0.46457399103139013, 0.28...",0.526272,"[0.7550607287449392, 0.4273927392739274, 0.396...",,0.660244,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.631609,0.497234,"[0.7655682972855277, 0.390893470790378, 0.3352...",0.493631,"[0.7685805887496356, 0.40770609318996415, 0.30...",0.503573,"[0.7625795257374205, 0.37541254125412543, 0.37...",,0.660726,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.630077,0.504490,"[0.7588642047962336, 0.41891891891891897, 0.33...",0.498875,"[0.772386942198263, 0.4290657439446367, 0.2951...",0.514713,"[0.7458068247541931, 0.40924092409240925, 0.38...",,0.660565,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.645785,0.521809,"[0.7694834278889221, 0.4661016949152542, 0.329...",0.516541,"[0.7953703703703704, 0.43713872832369943, 0.31...",0.529347,"[0.7452284557547716, 0.4991749174917492, 0.343...",,0.651734,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.631609,0.477131,"[0.7695826186392224, 0.3886743886743887, 0.273...",0.478563,"[0.7608818541548897, 0.40482573726541554, 0.26...",0.476204,"[0.7784846732215154, 0.37376237623762376, 0.27...",,0.657514,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.617816,0.509339,"[0.7378458652493355, 0.4679399727148704, 0.322...",0.506968,"[0.8029942157196325, 0.3988372093023256, 0.319...",0.524646,"[0.6824754193175245, 0.566006600660066, 0.3254...",,0.643866,...,True,focal,2,True,5,Conv1d,,concatenation,64,34
0,,0.632759,0.514483,"[0.7537718768859385, 0.45602605863192186, 0.33...",0.514968,"[0.78801261829653, 0.40618955512572535, 0.3507...",0.520122,"[0.7223828802776171, 0.5198019801980198, 0.318...",,0.653340,...,True,focal,2,True,5,Conv1d,,concatenation,64,34
0,,0.629310,0.498171,"[0.7587235311661199, 0.4346527259148618, 0.301...",0.498160,"[0.7832512315270936, 0.3969986357435198, 0.314...",0.501658,"[0.7356853672643147, 0.4801980198019802, 0.289...",,0.658157,...,True,focal,2,True,5,Conv1d,,concatenation,64,35
0,,0.633333,0.466458,"[0.768097818535501, 0.3596187175043328, 0.2716...",0.482585,"[0.7392350895961487, 0.37864963503649635, 0.32...",0.457541,"[0.7993059572006941, 0.3424092409240924, 0.230...",,0.666667,...,True,focal,2,True,5,Conv1d,,concatenation,64,35


In [14]:
# Mean of the "f1" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_11["f1"].mean()

0.5133232318677906

In [15]:
# Mean of the "precision" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_11["precision"].mean()

0.510238753566838

In [16]:
# Mean of the "recall" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_11["recall"].mean()

0.5195560766795785

In [17]:
# Element-wise average of the "f1_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_11["f1_scores"]), axis=0)

array([0.7642087 , 0.43654488, 0.33921611])

In [18]:
# Element-wise average of the "precision_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_11["precision_scores"]), axis=0)

array([0.77701875, 0.43462865, 0.31906887])

In [19]:
# Element-wise average of the "recall_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_11["recall_scores"]), axis=0)

array([0.75284365, 0.44279428, 0.3630303 ])

# history_length=20

In [20]:
# Window settings for this section; the search logs them as history length = 20.
shift, window_size, n = 3, 5, 6

## umap

In [None]:
# Run the hyperparameter search with UMAP dimension reduction for the current
# shift / window_size / n; the output CSV path is derived from those settings.
# The third and fourth return values are not used in this notebook. They are bound
# to ordinary names rather than `_` / `__`, because assigning those in IPython
# shadows the automatic "previous output" variables and stops them updating.
(
    seqsignet_network_umap_kfold_20,
    best_seqsignet_network_umap_kfold_20,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Show the results table returned as the first value of the search.
seqsignet_network_umap_kfold_20

In [None]:
# Mean of the "f1" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_20["f1"].mean()

In [None]:
# Mean of the "precision" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_20["precision"].mean()

In [None]:
# Mean of the "recall" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_20["recall"].mean()

In [None]:
# Element-wise average of the "f1_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_20["f1_scores"]), axis=0)

In [None]:
# Element-wise average of the "precision_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_20["precision_scores"]), axis=0)

In [None]:
# Element-wise average of the "recall_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_20["recall_scores"]), axis=0)

# history_length=35

In [None]:
# Window settings for the section headed history_length=35.
shift, window_size, n = 3, 5, 11

## umap

In [None]:
# Run the hyperparameter search with UMAP dimension reduction for the current
# shift / window_size / n; the output CSV path is derived from those settings.
# The third and fourth return values are not used in this notebook. They are bound
# to ordinary names rather than `_` / `__`, because assigning those in IPython
# shadows the automatic "previous output" variables and stops them updating.
(
    seqsignet_network_umap_kfold_35,
    best_seqsignet_network_umap_kfold_35,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

In [None]:
# This cell holds two expressions, but a cell only auto-displays its final one,
# so the results table is shown explicitly instead of being silently dropped.
# `display` is provided by the IPython/Jupyter kernel.
display(seqsignet_network_umap_kfold_35)

# Mean of the "f1" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_35["f1"].mean()

In [None]:
# Mean of the "precision" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_35["precision"].mean()

In [None]:
# Mean of the "recall" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_35["recall"].mean()

In [None]:
# Element-wise average of the "f1_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_35["f1_scores"]), axis=0)

In [None]:
# Element-wise average of the "precision_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_35["precision_scores"]), axis=0)

In [None]:
# Element-wise average of the "recall_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_35["recall_scores"]), axis=0)

# history_length=80

In [None]:
# Window settings for the section headed history_length=80.
shift, window_size, n = 3, 5, 26

## umap

In [None]:
# Run the hyperparameter search with UMAP dimension reduction for the current
# shift / window_size / n; the output CSV path is derived from those settings.
# The third and fourth return values are not used in this notebook. They are bound
# to ordinary names rather than `_` / `__`, because assigning those in IPython
# shadows the automatic "previous output" variables and stops them updating.
(
    seqsignet_network_umap_kfold_80,
    best_seqsignet_network_umap_kfold_80,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

In [None]:
# Show the results table returned as the first value of the search.
seqsignet_network_umap_kfold_80

In [None]:
# Mean of the "f1" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_80["f1"].mean()

In [None]:
# Mean of the "precision" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_80["precision"].mean()

In [None]:
# Mean of the "recall" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_80["recall"].mean()

In [None]:
# Element-wise average of the "f1_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_80["f1_scores"]), axis=0)

In [None]:
# Element-wise average of the "precision_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_80["precision_scores"]), axis=0)

In [None]:
# Element-wise average of the "recall_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_80["recall_scores"]), axis=0)

# history_length=110

In [None]:
# Window settings for the section headed history_length=110.
shift, window_size, n = 3, 5, 36

## umap

In [None]:
# Run the hyperparameter search with UMAP dimension reduction for the current
# shift / window_size / n; the output CSV path is derived from those settings.
# The third and fourth return values are not used in this notebook. They are bound
# to ordinary names rather than `_` / `__`, because assigning those in IPython
# shadows the automatic "previous output" variables and stops them updating.
(
    seqsignet_network_umap_kfold_110,
    best_seqsignet_network_umap_kfold_110,
    _unused_third,
    _unused_fourth,
) = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

In [None]:
# Show the results table returned as the first value of the search.
seqsignet_network_umap_kfold_110

In [None]:
# Mean of the "f1" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_110["f1"].mean()

In [None]:
# Mean of the "precision" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_110["precision"].mean()

In [None]:
# Mean of the "recall" column over all rows of the best-model results.
best_seqsignet_network_umap_kfold_110["recall"].mean()

In [None]:
# Element-wise average of the "f1_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_110["f1_scores"]), axis=0)

In [None]:
# Element-wise average of the "precision_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_110["precision_scores"]), axis=0)

In [None]:
# Element-wise average of the "recall_scores" arrays across rows
# (presumably one entry per class; confirm).
np.mean(np.stack(best_seqsignet_network_umap_kfold_110["recall_scores"]), axis=0)