In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any other visible cell (the search
# is given the `seeds` list instead) — confirm whether it is still needed.
seed = 2023

In [2]:
import torch

# Pick the compute device: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Bare expression so the notebook displays the selected device.
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_attention_encoder_functions import (
    seqsignet_attention_encoder_hyperparameter_search,
)

In [4]:
# Directory used as the prefix of every `results_output` CSV path in this notebook.
output_dir = "client_talk_type_output"
# exist_ok=True makes the cell idempotent on re-run and avoids the
# check-then-create race of testing os.path.isdir() first.
os.makedirs(output_dir, exist_ok=True)

## AnnoMI

In [5]:
# Execute the AnnoMI loading script from this kernel. Later cells use `anno_mi`,
# `y_data_client`, `output_dim_client`, `client_index` and `client_transcript_id`,
# none of which are defined in the visible cells — presumably this script
# creates them (TODO confirm).
%run ../load_anno_mi.py

In [6]:
# Preview the first rows of `anno_mi` (not defined in the visible cells;
# presumably created by the `%run` of the loading script — TODO confirm).
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime,speaker
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-11-10 00:00:13,-1
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-11-10 00:00:24,1
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-11-10 00:00:25,-1
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-11-10 00:00:34,1
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-11-10 00:00:34,-1


In [7]:
# Load the pickled embeddings that are later passed to the search as `embeddings`.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open("../anno_mi_sbert.pkl", "rb") as f:
    sbert_embeddings = pickle.load(f)

# Display the shape as a quick sanity check
# (presumably one row per row of `anno_mi` — TODO confirm).
sbert_embeddings.shape

(9699, 384)

In [8]:
# Feature configuration forwarded to the search through `kwargs`.
# NOTE(review): `standardise_method` presumably pairs up with `features`
# entry-by-entry (z-score for the first, none for the second) — confirm
# against the nlpsig documentation.
features = [
    "time_encoding",
    "timeline_index",
]
standardise_method = [
    "z_score",
    None,
]
# Both flags are switched on for every search in this notebook.
include_features_in_path = include_features_in_input = True

In [9]:
# --- training length ---
num_epochs = 100

# --- model grid ---
dimensions = [15]
# swmhau parameters are given as (output_channels, sig_depth, num_heads) tuples
swmhau_parameters = [
    (12, 3, 10),
    (10, 3, 5),
]
num_layers = [1]
# each candidate is a pair of equally sized hidden widths
ffn_hidden_dim_sizes = [[width, width] for width in (32, 128, 512)]
dropout_rates = [0.1]

# --- optimisation grid ---
learning_rates = [5e-4, 3e-4, 1e-4]
seeds = [1, 12, 123]

# --- loss and model selection ---
# `gamma` is passed alongside the loss name (presumably the focal-loss
# focusing parameter — confirm against the search function's docs).
loss, gamma = "focal", 2
validation_metric = "f1"
patience = 3

In [10]:
# Keyword arguments shared by every hyperparameter-search call in this notebook
# (each call unpacks them with `**kwargs`); only shift / window_size / n,
# dim_reduce_methods and results_output are supplied per call.
kwargs = dict(
    # training length
    num_epochs=num_epochs,
    # data, labels and embeddings
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    output_dim=output_dim_client,
    # model configuration and search grid
    dimensions=dimensions,
    log_signature=True,
    pooling="signature",
    transformer_encoder_layers=2,
    swmhau_parameters=swmhau_parameters,
    num_layers=num_layers,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    seeds=seeds,
    # loss
    loss=loss,
    gamma=gamma,
    # hardware
    device=device,
    # extra features
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # which rows to use and how to split them
    path_indices=client_index,
    split_ids=client_transcript_id,
    k_fold=True,
    # model selection
    patience=patience,
    validation_metric=validation_metric,
    verbose=False,
)

# history_length=11

In [11]:
# Window settings for this section; the search log reports them as
# "history length = 11" (consistent with shift * n + (window_size - shift)).
shift, window_size, n = 3, 5, 3

In [12]:
# Run the hyperparameter search for the current (shift, window_size, n) setting
# with UMAP as the only dimension-reduction method; all other settings come
# from `kwargs`. The CSV path encodes gamma, shift, window_size and n so each
# setting writes to its own file.
# The call returns four values, of which the visible cells only use the first
# two (by their names and the search log, the full results and the best-model
# results). The other two are bound to explicit throwaway names instead of
# `_` / `__`: in IPython those are the output-history shortcuts, and assigning
# to them stops IPython from updating them for the rest of the session.
(
    seqsignet_attention_encoder_umap_kfold_11,
    best_seqsignet_attention_encoder_umap_kfold_11,
    _unused_third_output,
    _unused_fourth_output,
) = seqsignet_attention_encoder_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_attention_encoder_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_attention_encoder_umap_focal_2_3_5_3_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_attention_encoder_umap_focal_2_3_5_3_kfold_best_model.csv


In [13]:
# Display the first frame returned by the search for history length 11.
seqsignet_attention_encoder_umap_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,seed,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.639272,0.379968,"[0.781563126252505, 0.17167381974248927, 0.186...",0.434493,"[0.68935041979673, 0.3341288782816229, 0.28]",0.385922,"[0.9022556390977443, 0.11551155115511551, 0.14]",,0.646114,...,1,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.64636,0.357379,"[0.7850030731407499, 0.1751824817518248, 0.111...",0.44979,"[0.6827025871285012, 0.3333333333333333, 0.333...",0.369817,"[0.9233661075766338, 0.1188118811881188, 0.067...",,0.6535,...,12,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.664368,0.429112,"[0.7986339488995698, 0.24816625916870413, 0.24...",0.499625,"[0.7097571942446043, 0.47877358490566035, 0.31...",0.425604,"[0.9129554655870445, 0.1674917491749175, 0.196...",,0.656872,...,123,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.660536,0.343206,"[0.7958393807450411, 0.17963314358001262, 0.05...",0.486353,"[0.683991683991684, 0.38482384823848237, 0.390...",0.36589,"[0.951417004048583, 0.11716171617161716, 0.029...",,0.648041,...,1,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.663793,0.342117,"[0.7957196104364556, 0.18656229615133724, 0.04...",0.483829,"[0.6810043218769294, 0.4454828660436137, 0.325]",0.366178,"[0.9569115095430885, 0.11798679867986799, 0.02...",,0.648362,...,12,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.661877,0.364149,"[0.7981106939566428, 0.14555256064690025, 0.14...",0.477551,"[0.686601375286518, 0.39705882352941174, 0.348...",0.378839,"[0.9528629265471371, 0.0891089108910891, 0.094...",,0.655267,...,123,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.650575,0.29041,"[0.788422917897224, 0.08280701754385965, 0.0]",0.314487,"[0.6664669462752147, 0.27699530516431925, 0.0]",0.337896,"[0.9650086755349914, 0.04867986798679868, 0.0]",,0.640013,...,1,focal,2,True,5,Conv1d,,concatenation,64,2
0,,0.635441,0.316496,"[0.777133188300701, 0.10774410774410774, 0.064...",0.376182,"[0.6675664451827242, 0.29304029304029305, 0.16...",0.345245,"[0.9297281665702718, 0.066006600660066, 0.04]",,0.631342,...,12,focal,2,True,5,Conv1d,,concatenation,64,2
0,,0.661877,0.314453,"[0.793737397698968, 0.14962251201098148, 0.0]",0.372577,"[0.6728332998190227, 0.4448979591836735, 0.0]",0.352515,"[0.9676113360323887, 0.08993399339933994, 0.0]",,0.642582,...,123,focal,2,True,5,Conv1d,,concatenation,64,2
0,,0.632759,0.497783,"[0.7643330876934413, 0.42455043002345577, 0.30...",0.496033,"[0.7793808235647731, 0.4034175334323923, 0.305...",0.500504,"[0.7498554077501446, 0.44801980198019803, 0.30...",,0.654785,...,1,focal,2,True,5,Conv1d,,concatenation,64,3


In [14]:
# Mean of the `f1` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_11["f1"].mean()

0.48906065803553905

In [15]:
# Mean of the `precision` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_11["precision"].mean()

0.4984881070870974

In [16]:
# Mean of the `recall` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_11["recall"].mean()

0.4833686372300234

In [17]:
# Stack the per-row `f1_scores` arrays and average over rows (axis=0), giving
# one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_11["f1_scores"]).mean(axis=0)

array([0.77671022, 0.38878632, 0.30168544])

In [18]:
# Stack the per-row `precision_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_11["precision_scores"]).mean(
    axis=0
)

array([0.75120283, 0.42770682, 0.31655467])

In [19]:
# Stack the per-row `recall_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_11["recall_scores"]).mean(axis=0)

array([0.8040293 , 0.3569857 , 0.28909091])

# history_length=20

In [20]:
# Window settings for this section; the search log reports them as
# "history length = 20" (consistent with shift * n + (window_size - shift)).
shift, window_size, n = 3, 5, 6

In [None]:
# Run the hyperparameter search for the current (shift, window_size, n) setting
# with UMAP as the only dimension-reduction method; all other settings come
# from `kwargs`. The CSV path encodes gamma, shift, window_size and n so each
# setting writes to its own file.
# The call returns four values, of which the visible cells only use the first
# two. The other two are bound to explicit throwaway names instead of
# `_` / `__`: in IPython those are the output-history shortcuts, and assigning
# to them stops IPython from updating them for the rest of the session.
(
    seqsignet_attention_encoder_umap_kfold_20,
    best_seqsignet_attention_encoder_umap_kfold_20,
    _unused_third_output,
    _unused_fourth_output,
) = seqsignet_attention_encoder_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_attention_encoder_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Display the first frame returned by the search for history length 20.
seqsignet_attention_encoder_umap_kfold_20

In [None]:
# Mean of the `f1` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_20["f1"].mean()

In [None]:
# Mean of the `precision` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_20["precision"].mean()

In [None]:
# Mean of the `recall` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_20["recall"].mean()

In [None]:
# Stack the per-row `f1_scores` arrays and average over rows (axis=0), giving
# one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_20["f1_scores"]).mean(axis=0)

In [None]:
# Stack the per-row `precision_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_20["precision_scores"]).mean(
    axis=0
)

In [None]:
# Stack the per-row `recall_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_20["recall_scores"]).mean(axis=0)

# history_length=35

In [None]:
# Window settings for the history_length=35 section
# (consistent with shift * n + (window_size - shift) = 35).
shift, window_size, n = 3, 5, 11

In [None]:
# Run the hyperparameter search for the current (shift, window_size, n) setting
# with UMAP as the only dimension-reduction method; all other settings come
# from `kwargs`. The CSV path encodes gamma, shift, window_size and n so each
# setting writes to its own file.
# The call returns four values, of which the visible cells only use the first
# two. The other two are bound to explicit throwaway names instead of
# `_` / `__`: in IPython those are the output-history shortcuts, and assigning
# to them stops IPython from updating them for the rest of the session.
(
    seqsignet_attention_encoder_umap_kfold_35,
    best_seqsignet_attention_encoder_umap_kfold_35,
    _unused_third_output,
    _unused_fourth_output,
) = seqsignet_attention_encoder_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_attention_encoder_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

In [None]:
# Display the first frame returned by the search for history length 35.
seqsignet_attention_encoder_umap_kfold_35

In [None]:
# Mean of the `f1` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_35["f1"].mean()

In [None]:
# Mean of the `precision` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_35["precision"].mean()

In [None]:
# Mean of the `recall` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_35["recall"].mean()

In [None]:
# Stack the per-row `f1_scores` arrays and average over rows (axis=0), giving
# one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_35["f1_scores"]).mean(axis=0)

In [None]:
# Stack the per-row `precision_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_35["precision_scores"]).mean(
    axis=0
)

In [None]:
# Stack the per-row `recall_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_35["recall_scores"]).mean(axis=0)

# history_length=80

In [None]:
# Window settings for the history_length=80 section
# (consistent with shift * n + (window_size - shift) = 80).
shift, window_size, n = 3, 5, 26

In [None]:
# Run the hyperparameter search for the current (shift, window_size, n) setting
# with UMAP as the only dimension-reduction method; all other settings come
# from `kwargs`. The CSV path encodes gamma, shift, window_size and n so each
# setting writes to its own file.
# The call returns four values, of which the visible cells only use the first
# two. The other two are bound to explicit throwaway names instead of
# `_` / `__`: in IPython those are the output-history shortcuts, and assigning
# to them stops IPython from updating them for the rest of the session.
(
    seqsignet_attention_encoder_umap_kfold_80,
    best_seqsignet_attention_encoder_umap_kfold_80,
    _unused_third_output,
    _unused_fourth_output,
) = seqsignet_attention_encoder_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_attention_encoder_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

In [None]:
# Display the first frame returned by the search for history length 80.
seqsignet_attention_encoder_umap_kfold_80

In [None]:
# Mean of the `f1` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_80["f1"].mean()

In [None]:
# Mean of the `precision` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_80["precision"].mean()

In [None]:
# Mean of the `recall` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_80["recall"].mean()

In [None]:
# Stack the per-row `f1_scores` arrays and average over rows (axis=0), giving
# one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_80["f1_scores"]).mean(axis=0)

In [None]:
# Stack the per-row `precision_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_80["precision_scores"]).mean(
    axis=0
)

In [None]:
# Stack the per-row `recall_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_80["recall_scores"]).mean(axis=0)

# history_length=110

In [None]:
# Window settings for the history_length=110 section
# (consistent with shift * n + (window_size - shift) = 110).
shift, window_size, n = 3, 5, 36

In [None]:
# Run the hyperparameter search for the current (shift, window_size, n) setting
# with UMAP as the only dimension-reduction method; all other settings come
# from `kwargs`. The CSV path encodes gamma, shift, window_size and n so each
# setting writes to its own file.
# The call returns four values, of which the visible cells only use the first
# two. The other two are bound to explicit throwaway names instead of
# `_` / `__`: in IPython those are the output-history shortcuts, and assigning
# to them stops IPython from updating them for the rest of the session.
(
    seqsignet_attention_encoder_umap_kfold_110,
    best_seqsignet_attention_encoder_umap_kfold_110,
    _unused_third_output,
    _unused_fourth_output,
) = seqsignet_attention_encoder_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=f"{output_dir}/seqsignet_attention_encoder_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv",
    **kwargs,
)

In [None]:
# Display the first frame returned by the search for history length 110.
seqsignet_attention_encoder_umap_kfold_110

In [None]:
# Mean of the `f1` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_110["f1"].mean()

In [None]:
# Mean of the `precision` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_110["precision"].mean()

In [None]:
# Mean of the `recall` column over all rows of the best-model frame.
best_seqsignet_attention_encoder_umap_kfold_110["recall"].mean()

In [None]:
# Stack the per-row `f1_scores` arrays and average over rows (axis=0), giving
# one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_110["f1_scores"]).mean(axis=0)

In [None]:
# Stack the per-row `precision_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_110["precision_scores"]).mean(
    axis=0
)

In [None]:
# Stack the per-row `recall_scores` arrays and average over rows (axis=0),
# giving one mean per position (presumably one per class — TODO confirm).
np.stack(best_seqsignet_attention_encoder_umap_kfold_110["recall_scores"]).mean(axis=0)