In [1]:
import numpy as np
import pickle
import os
from tqdm.notebook import tqdm

# NOTE(review): `seed` is not referenced in any of the cells shown here (the
# hyperparameter searches take the separate `seeds` list) — confirm it is used
# elsewhere, or remove it.
seed = 2023

In [2]:
from nlpsig_networks.scripts.swnu_network_functions import (
    swnu_network_hyperparameter_search
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Directory that receives the hyperparameter-search result CSVs
# (used to build the `results_output` paths further down).
output_dir = "client_talk_type_output"
# exist_ok=True makes this cell idempotent on re-run and avoids the
# check-then-create race of a separate `os.path.isdir` test.
os.makedirs(output_dir, exist_ok=True)

## AnnoMI

In [4]:
# Execute the shared loader script; the `anno_mi` dataframe used in the next
# cell is expected to come from it.
# NOTE(review): `y_data_client`, `output_dim_client` and `client_index`, which
# the search cells below pass in, are not defined in any visible cell —
# presumably they are also created by this script; confirm.
%run ../load_anno_mi.py

In [5]:
# Preview the first rows of the loaded dataframe as a quick sanity check.
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-08-01 00:00:13
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-08-01 00:00:24
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-08-01 00:00:25
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-08-01 00:00:34
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-08-01 00:00:34


In [6]:
# Load the precomputed embeddings (SBERT for AnnoMI, judging by the file name).
# NOTE: `pickle.load` can execute arbitrary code — only load this file when it
# comes from a trusted source.
with open("../anno_mi_sbert.pkl", "rb") as f:
    sbert_embeddings = pickle.load(f)
    
# Show the array shape; the recorded output is (13551, 384).
sbert_embeddings.shape

(13551, 384)

# SWNU Network

## Obtaining path by looking at post history

We can obtain a path by looking at the history of each post. Here we look at the last `size` posts, including the current post, and pad with vectors of zeros if there are fewer than `size` posts. `size` is set at the top of each search cell below (`size = 20` in the runs shown here).

We only want to consider paths that correspond to a client's utterance as we want to model a change in mood at that time. Their history will still contain the therapist's utterances too.

In [7]:
# Names of the time-derived features requested from the search function
# (passed as `time_feature=`).
time_features = [
    "time_encoding",
    "timeline_index",
]
# Passed as `standardise_method=`; presumably one entry per item of
# `time_features`, with None meaning "leave unscaled" — TODO confirm.
standardise_method = [
    "minmax",
    None,
]
# Forwarded as `add_time_in_path=` to the hyperparameter search.
add_time_in_path = True
# Number of configured time features.
num_time_features = len(time_features)

In [8]:
# --- Training schedule -------------------------------------------------
num_epochs = 100
patience = 5  # passed as `patience=`; presumably early-stopping patience — confirm
validation_metric = "f1"

# --- Loss --------------------------------------------------------------
loss = "focal"
gamma = 2  # passed as `gamma=` and also embedded in the result file names

# --- Input / dimensionality reduction ----------------------------------
embedding_dim = 384  # equals the second axis of the recorded embeddings shape
dimensions = [50, 15]  # target dimensions tried by the dimension-reduction step

# --- Network grid ------------------------------------------------------
# Each entry pairs a list of SWNU hidden sizes with a signature depth.
swnu_hidden_dim_sizes_and_sig_depths = [
    ([12], 3),
    ([10], 3),
    ([10], 4),
]
ffn_hidden_dim_sizes = [
    [64, 64],
    [256, 256],
    [512, 512],
]
dropout_rates = [0.5, 0.2, 0.1]
learning_rates = [1e-3, 1e-4, 5e-4]

# --- Repeats -----------------------------------------------------------
seeds = [0, 1, 12, 123, 1234]

In [None]:
# History window length handed to the search (also embedded in the CSV name).
size = 20

# SWNU network hyperparameter search using UMAP-reduced embeddings.
# The first two returned values are kept; the remaining two are discarded.
(
    swnu_network_umap_kfold_20,
    best_swnu_network_umap_kfold_20,
    _,
    __,
) = swnu_network_hyperparameter_search(
    # -- data --
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    path_indices=client_index,
    # -- path construction --
    history_lengths=[size],
    dim_reduce_methods=["umap"],
    dimensions=dimensions,
    log_signature=True,
    time_feature=time_features,
    standardise_method=standardise_method,
    add_time_in_path=add_time_in_path,
    # -- model grid --
    embedding_dim=embedding_dim,
    output_dim=output_dim_client,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=False,
    # -- training --
    num_epochs=num_epochs,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    # -- output --
    results_output=f"{output_dir}/swnu_network_umap_focal_{gamma}_{size}_kfold.csv",
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A

  0%|          | 0/1 [00:00<?, ?it/s][A[A


##################################################
dimension: 50 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.





  0%|          | 0/13551 [00:00<?, ?it/s][A[A[A


  0%|          | 56/13551 [00:00<00:24, 559.39it/s][A[A[A


  1%|          | 112/13551 [00:00<00:26, 500.52it/s][A[A[A


  1%|          | 163/13551 [00:00<00:27, 478.51it/s][A[A[A


  2%|▏         | 224/13551 [00:00<00:25, 525.58it/s][A[A[A


  2%|▏         | 309/13551 [00:00<00:20, 634.60it/s][A[A[A


  3%|▎         | 385/13551 [00:00<00:19, 675.78it/s][A[A[A


  3%|▎         | 454/13551 [00:00<00:19, 655.35it/s][A[A[A


  4%|▍         | 521/13551 [00:00<00:20, 649.95it/s][A[A[A


  4%|▍         | 605/13551 [00:00<00:18, 705.23it/s][A[A[A


  5%|▌         | 684/13551 [00:01<00:17, 729.53it/s][A[A[A


  6%|▌         | 758/13551 [00:01<00:17, 732.03it/s][A[A[A


  6%|▌         | 832/13551 [00:01<00:17, 729.73it/s][A[A[A


  7%|▋         | 906/13551 [00:01<00:17, 721.49it/s][A[A[A


  7%|▋         | 979/13551 [00:01<00:17, 713.28it/s][A[A[A


  8%|▊         | 1051/13551 [00:01<00:17, 702.04i

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.





  0%|          | 0/3 [00:00<?, ?it/s][A[A[A



  0%|          | 0/3 [00:00<?, ?it/s][A[A[A[A




  0%|          | 0/1 [00:00<?, ?it/s][A[A[A[A[A





  0%|          | 0/3 [00:00<?, ?it/s][A[A[A[A[A[A






  0%|          | 0/3 [00:00<?, ?it/s][A[A[A[A[A[A[A






 33%|███▎      | 1/3 [21:54<43:48, 1314.39s/it][A[A[A[A[A[A[A






 67%|██████▋   | 2/3 [1:14:46<40:07, 2407.07s/it][A[A[A[A[A[A[A

In [None]:
# History window length handed to the search (also embedded in the CSV name).
size = 20

# SWNU network hyperparameter search using Gaussian-random-projection-reduced
# embeddings. The first two returned values are kept; the remaining two are
# discarded.
swnu_network_grp_kfold_20, best_swnu_network_grp_kfold_20, _, __ = swnu_network_hyperparameter_search(
    num_epochs=num_epochs,
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    embedding_dim=embedding_dim,
    output_dim=output_dim_client,
    history_lengths=[size],
    dim_reduce_methods=["gaussian_random_projection"],
    dimensions=dimensions,
    log_signature=True,
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=False,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    time_feature=time_features,
    standardise_method=standardise_method,
    # Passed explicitly so this run uses the configured `add_time_in_path`
    # value, like the UMAP search, instead of the function's default.
    add_time_in_path=add_time_in_path,
    path_indices=client_index,
    k_fold=True,
    patience=patience,
    validation_metric=validation_metric,
    results_output=f"{output_dir}/swnu_network_grp_focal_{gamma}_{size}_kfold.csv",
    verbose=False
)