In [1]:
import numpy as np
import pickle
import os

# NOTE(review): `seed` is not referenced by any later cell visible in this notebook;
# the hyperparameter search is given the `seeds` list instead — confirm whether it is still needed.
seed = 2023

In [2]:
import torch

# Use the GPU when torch reports one as available, otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

In [3]:
from nlpsig_networks.scripts.seqsignet_functions import seqsignet_hyperparameter_search

In [4]:
# Directory that receives the CSV files written by the hyperparameter searches.
output_dir = "client_talk_type_output"
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.isdir() test.
os.makedirs(output_dir, exist_ok=True)

## AnnoMI

In [5]:
# Run the shared AnnoMI loading script in this kernel.
# NOTE(review): later cells use `anno_mi`, `y_data_client`, `output_dim_client`,
# `client_index` and `client_transcript_id` without defining them, so they are
# presumably created by this script — confirm against ../load_anno_mi.py.
%run ../load_anno_mi.py

In [6]:
# Preview the first rows of the AnnoMI dataframe.
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime,speaker
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-11-06 00:00:13,-1
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-11-06 00:00:24,1
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-11-06 00:00:25,-1
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-11-06 00:00:34,1
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-11-06 00:00:34,-1


In [7]:
# Load the pickled embeddings used as model input (SBERT embeddings, going by the file name).
# NOTE(review): pickle.load can execute arbitrary code — only load this file from a trusted source.
with open("../anno_mi_sbert.pkl", "rb") as f:
    sbert_embeddings = pickle.load(f)

# Sanity check of the array size; presumably one row per row of `anno_mi`, in the same order — TODO confirm.
sbert_embeddings.shape

(9699, 384)

In [8]:
# Extra (time-derived) features handed to the search through `kwargs`.
features = ["time_encoding", "timeline_index"]
# Presumably matched position-by-position with `features`
# ("z_score" for time_encoding, no standardisation for timeline_index) — TODO confirm.
standardise_method = ["z_score", None]
# Both feature-inclusion flags are switched on.
include_features_in_path = include_features_in_input = True

In [9]:
# --- training schedule ---
num_epochs = 100
patience = 5  # presumably early-stopping patience, in epochs — TODO confirm
validation_metric = "f1"

# --- loss ---
loss = "focal"
gamma = 2  # passed alongside loss="focal"; also embedded in the results file names

# --- search grid ---
# 1 dimension x 2 SWNU settings x 1 LSTM size x 2 FFN shapes x 2 dropout rates
# x 3 learning rates = 24 combinations (the results tables show model_id 0-23,
# presumably one id per combination), each evaluated for every entry of `seeds`.
dimensions = [15]
swnu_hidden_dim_sizes_and_sig_depths = [([12], 3), ([10], 4)]  # (hidden dim sizes, signature depth) pairs
lstm_hidden_dim_sizes = [384]
ffn_hidden_dim_sizes = [[256, 256], [512, 512]]
dropout_rates = [0.1, 0.2]
learning_rates = [1e-3, 1e-4, 5e-4]
seeds = [1, 12, 123]

In [10]:
# Keyword arguments shared by every `seqsignet_hyperparameter_search` call in this
# notebook; only shift / window_size / n and the output path change between runs.
kwargs = dict(
    num_epochs=num_epochs,
    # --- data ---
    # (`anno_mi`, `y_data_client` and `output_dim_client` are not defined in a
    # visible cell; presumably they come from the %run loading script)
    df=anno_mi,
    id_column="transcript_id",
    label_column="client_talk_type",
    embeddings=sbert_embeddings,
    y_data=y_data_client,
    output_dim=output_dim_client,
    # --- model / search grid ---
    dimensions=dimensions,
    log_signature=True,
    pooling="signature",
    swnu_hidden_dim_sizes_and_sig_depths=swnu_hidden_dim_sizes_and_sig_depths,
    lstm_hidden_dim_sizes=lstm_hidden_dim_sizes,
    ffn_hidden_dim_sizes=ffn_hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=True,
    seeds=seeds,
    # --- loss / hardware ---
    loss=loss,
    gamma=gamma,
    device=device,
    # --- extra features ---
    features=features,
    standardise_method=standardise_method,
    include_features_in_path=include_features_in_path,
    include_features_in_input=include_features_in_input,
    # --- evaluation ---
    # (`client_index` / `client_transcript_id` are likewise defined outside the visible cells)
    path_indices=client_index,
    split_ids=client_transcript_id,
    k_fold=True,  # the results tables report n_splits = 5
    patience=patience,
    validation_metric=validation_metric,
    verbose=False,
)

# history_length=11

In [11]:
# Window settings for this section; the search log reports "history length = 11" for them.
shift, window_size, n = 3, 5, 3

## umap

In [12]:
# UMAP search for the current (shift, window_size, n); the results are written to this CSV.
results_csv = f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
search_outputs = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=results_csv,
    **kwargs,
)
# Keep the results table and the best-model results; the last two returned items are unused here.
(
    seqsignet_network_umap_kfold_11,
    best_seqsignet_network_umap_kfold_11,
    _,
    __,
) = search_outputs

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 3: history length = 11
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_umap_focal_2_3_5_3_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_umap_focal_2_3_5_3_kfold_best_model.csv


In [13]:
# Display the hyperparameter-search results table for the history-length-11 run.
seqsignet_network_umap_kfold_11

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.611111,0.502581,"[0.7341233667578245, 0.4286815728604472, 0.344...",0.493668,"[0.7733674775928298, 0.402315484804631, 0.3053...",0.517926,"[0.6986697513013302, 0.45874587458745875, 0.39...",,0.658638,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.620881,0.501930,"[0.7482993197278912, 0.4057239057239057, 0.351...",0.494783,"[0.7657384987893463, 0.41408934707903783, 0.30...",0.515230,"[0.7316367842683632, 0.3976897689768977, 0.416...",,0.656551,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.629502,0.519934,"[0.7464012251148544, 0.4676056338028169, 0.345...",0.518975,"[0.7932942708333334, 0.40786240786240785, 0.35...",0.529654,"[0.7047426257952574, 0.5478547854785478, 0.336...",,0.653019,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.627011,0.506394,"[0.7606939874364345, 0.4149203688181056, 0.343...",0.498905,"[0.7877942998760843, 0.42163543441226575, 0.28...",0.523695,"[0.7353961827646038, 0.4084158415841584, 0.427...",,0.652698,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.625096,0.501356,"[0.755071599045346, 0.41999225106547855, 0.329...",0.496563,"[0.7797288971041282, 0.39590942293645, 0.31404...",0.508192,"[0.731925968768074, 0.4471947194719472, 0.3454...",,0.654624,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.631801,0.506223,"[0.7598097502972652, 0.4264761188416698, 0.332...",0.507136,"[0.781651376146789, 0.391845196959226, 0.34791...",0.508386,"[0.7391555812608445, 0.46782178217821785, 0.31...",,0.656390,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.630843,0.494234,"[0.75860075166233, 0.4068209500609013, 0.31728...",0.496307,"[0.7583815028901734, 0.40047961630695444, 0.33...",0.492547,"[0.7588201272411799, 0.41336633663366334, 0.30...",,0.664740,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.644253,0.514260,"[0.7732212722793799, 0.41723356009070295, 0.35...",0.512631,"[0.7749056055765321, 0.46324269889224573, 0.29...",0.526118,"[0.7715442452284558, 0.3795379537953795, 0.427...",,0.662331,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.613218,0.493774,"[0.7461263408820025, 0.39167374681393374, 0.34...",0.486533,"[0.7695144437615242, 0.40367775831873903, 0.28...",0.511191,"[0.7241179872758821, 0.3803630363036304, 0.429...",,0.655909,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [14]:
# Mean of the "f1" column over the rows of the best-model results (history length 11).
best_seqsignet_network_umap_kfold_11["f1"].mean()

0.5039883344206261

In [15]:
# Mean of the "precision" column over the rows of the best-model results (history length 11).
best_seqsignet_network_umap_kfold_11["precision"].mean()

0.4973872956702275

In [16]:
# Mean of the "recall" column over the rows of the best-model results (history length 11).
best_seqsignet_network_umap_kfold_11["recall"].mean()

0.5170708545695518

In [17]:
# Stack the per-row "f1_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_11["f1_scores"]).mean(axis=0)

array([0.75345718, 0.42934889, 0.32915893])

In [18]:
# Stack the per-row "precision_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_11["precision_scores"]).mean(axis=0)

array([0.78173743, 0.42128618, 0.28913827])

In [19]:
# Stack the per-row "recall_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_11["recall_scores"]).mean(axis=0)

array([0.7275882 , 0.44059406, 0.3830303 ])

# history_length=20

In [20]:
# Window settings for this section; the search log reports "history length = 20" for them.
shift, window_size, n = 3, 5, 6

## umap

In [21]:
# UMAP search for the current (shift, window_size, n); the results are written to this CSV.
results_csv = f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
search_outputs = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=results_csv,
    **kwargs,
)
# Keep the results table and the best-model results; the last two returned items are unused here.
(
    seqsignet_network_umap_kfold_20,
    best_seqsignet_network_umap_kfold_20,
    _,
    __,
) = search_outputs

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 6: history length = 20
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_umap_focal_2_3_5_6_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_umap_focal_2_3_5_6_kfold_best_model.csv


In [22]:
# Display the hyperparameter-search results table for the history-length-20 run.
seqsignet_network_umap_kfold_20

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.627395,0.504944,"[0.7528854690736905, 0.4312757201646091, 0.330...",0.498665,"[0.7709090909090909, 0.4302134646962233, 0.294...",0.514797,"[0.7356853672643147, 0.43234323432343236, 0.37...",,0.660244,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.619732,0.523631,"[0.7339246119733924, 0.486731843575419, 0.3502...",0.514527,"[0.8112745098039216, 0.4219128329297821, 0.310...",0.548980,"[0.6700404858299596, 0.5750825082508251, 0.401...",,0.633590,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.622031,0.504553,"[0.7447647951441577, 0.4556013488197827, 0.313...",0.498002,"[0.7835249042145593, 0.4172958133150309, 0.293...",0.515891,"[0.7096587622903412, 0.5016501650165016, 0.336...",,0.654143,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.638123,0.516378,"[0.7609437602018103, 0.4523618895116093, 0.335...",0.510052,"[0.7814690643096617, 0.4393468118195957, 0.309...",0.524971,"[0.741469057258531, 0.4661716171617162, 0.3672...",,0.669878,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.646743,0.521554,"[0.7699309939803259, 0.44391597304795877, 0.35...",0.519354,"[0.7819862809424396, 0.4271548436308162, 0.348...",0.524338,"[0.7582417582417582, 0.46204620462046203, 0.35...",,0.662492,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.650383,0.507753,"[0.7786062517923718, 0.42838929026774325, 0.31...",0.508838,"[0.7721843003412969, 0.44171779141104295, 0.31...",0.506993,"[0.785135916714864, 0.4158415841584158, 0.32]",,0.666506,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.634291,0.494963,"[0.7631349782293179, 0.418436873747495, 0.3033...",0.496572,"[0.7660256410256411, 0.406858924395947, 0.3168...",0.493956,"[0.760266049739734, 0.4306930693069307, 0.2909...",,0.660405,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.639847,0.509005,"[0.7692532088681447, 0.42726081258191356, 0.33...",0.505777,"[0.776044732195409, 0.45403899721448465, 0.287...",0.518379,"[0.7625795257374205, 0.4034653465346535, 0.389...",,0.664740,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.617625,0.490651,"[0.7479650732573628, 0.40951229343006856, 0.31...",0.485394,"[0.7659896938466202, 0.40031520882584715, 0.28...",0.497849,"[0.7307692307692307, 0.41914191419141916, 0.34...",,0.650289,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [23]:
# Mean of the "f1" column over the rows of the best-model results (history length 20).
best_seqsignet_network_umap_kfold_20["f1"].mean()

0.510861082467528

In [24]:
# Mean of the "precision" column over the rows of the best-model results (history length 20).
best_seqsignet_network_umap_kfold_20["precision"].mean()

0.5073385085213514

In [25]:
# Mean of the "recall" column over the rows of the best-model results (history length 20).
best_seqsignet_network_umap_kfold_20["recall"].mean()

0.5200645667034411

In [26]:
# Stack the per-row "f1_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_20["f1_scores"]).mean(axis=0)

array([0.76189555, 0.43781325, 0.33287445])

In [27]:
# Stack the per-row "precision_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_20["precision_scores"]).mean(axis=0)

array([0.78462886, 0.42840878, 0.30897789])

In [28]:
# Stack the per-row "recall_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_20["recall_scores"]).mean(axis=0)

array([0.74166185, 0.45489549, 0.36363636])

# history_length=35

In [29]:
# Window settings for this section; the search log reports "history length = 35" for them.
shift, window_size, n = 3, 5, 11

## umap

In [30]:
# UMAP search for the current (shift, window_size, n); the results are written to this CSV.
results_csv = f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
search_outputs = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=results_csv,
    **kwargs,
)
# Keep the results table and the best-model results; the last two returned items are unused here.
(
    seqsignet_network_umap_kfold_35,
    best_seqsignet_network_umap_kfold_35,
    _,
    __,
) = search_outputs

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

given shift 3, window size 5 and n 11: history length = 35
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.
saving results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_umap_focal_2_3_5_11_kfold.csv
saving the best model results dataframe to CSV for this hyperparameter search in client_talk_type_output/seqsignet_umap_focal_2_3_5_11_kfold_best_model.csv


In [31]:
# Display the hyperparameter-search results table for the history-length-35 run.
seqsignet_network_umap_kfold_35

Unnamed: 0,loss,accuracy,f1,f1_scores,precision,precision_scores,recall,recall_scores,valid_loss,valid_accuracy,...,BiLSTM,loss_function,gamma,k_fold,n_splits,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,,0.608621,0.478777,"[0.7390165842304335, 0.3722267871815941, 0.325...",0.476805,"[0.7435597189695551, 0.3707037643207856, 0.316...",0.480945,"[0.7345286292654714, 0.37376237623762376, 0.33...",,0.663455,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.618008,0.501971,"[0.7415292353823087, 0.43674456083803387, 0.32...",0.494202,"[0.7699252801992528, 0.4267716535433071, 0.285...",0.515328,"[0.7151532677848468, 0.4471947194719472, 0.383...",,0.656230,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.633142,0.505956,"[0.7573497147871875, 0.42446043165467623, 0.33...",0.504528,"[0.7662030186445694, 0.4116279069767442, 0.335...",0.507727,"[0.7486986697513013, 0.4381188118811881, 0.336...",,0.660083,...,True,focal,2,True,5,Conv1d,,concatenation,64,0
0,,0.635441,0.510980,"[0.7620459599703484, 0.4434463046141282, 0.327...",0.504646,"[0.7818679647094615, 0.43896523848019403, 0.29...",0.520711,"[0.7432041642567958, 0.44801980198019803, 0.37...",,0.669236,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
0,,0.640805,0.506184,"[0.7670830271858927, 0.4447900466562986, 0.306...",0.506043,"[0.7798028084852107, 0.42058823529411765, 0.31...",0.507694,"[0.7547715442452284, 0.47194719471947194, 0.29...",,0.669557,...,True,focal,2,True,5,Conv1d,,concatenation,64,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,,0.610920,0.490020,"[0.7406955460646736, 0.42945230322814654, 0.29...",0.486599,"[0.7837314396384765, 0.3831715210355987, 0.292...",0.499287,"[0.70213996529786, 0.4884488448844885, 0.30727...",,0.638728,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.644828,0.504444,"[0.7729187854133374, 0.43464441795788644, 0.30...",0.506763,"[0.7766423357664234, 0.4191570881226054, 0.324...",0.503214,"[0.7692307692307693, 0.4513201320132013, 0.289...",,0.666827,...,True,focal,2,True,5,Conv1d,,concatenation,64,22
0,,0.603831,0.501930,"[0.7270432313032503, 0.45050717033927945, 0.32...",0.495129,"[0.8, 0.3910139647844566, 0.2943722943722944]",0.522848,"[0.6662810873337189, 0.5313531353135313, 0.370...",,0.657675,...,True,focal,2,True,5,Conv1d,,concatenation,64,23
0,,0.616284,0.497625,"[0.7526363362458571, 0.3977226514843432, 0.342...",0.489238,"[0.7855345911949686, 0.3921411387329591, 0.290...",0.514677,"[0.7223828802776171, 0.4034653465346535, 0.418...",,0.655909,...,True,focal,2,True,5,Conv1d,,concatenation,64,23


In [None]:
best_seqsignet_network_umap_kfold_35["f1"].mean()

In [32]:
# Mean of the "precision" column over the rows of the best-model results (history length 35).
best_seqsignet_network_umap_kfold_35["precision"].mean()

0.5026076389223167

In [33]:
# Mean of the "recall" column over the rows of the best-model results (history length 35).
best_seqsignet_network_umap_kfold_35["recall"].mean()

0.5114010909477299

In [34]:
# Stack the per-row "f1_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_35["f1_scores"]).mean(axis=0)

array([0.76638787, 0.41034152, 0.33765119])

In [35]:
# Stack the per-row "precision_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_35["precision_scores"]).mean(axis=0)

array([0.77623315, 0.41592437, 0.3156654 ])

In [36]:
# Stack the per-row "recall_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_35["recall_scores"]).mean(axis=0)

array([0.7574706 , 0.41006601, 0.36666667])

# history_length=80

In [37]:
# Window settings for this section; the search log reports "history length = 80" for them.
shift, window_size, n = 3, 5, 26

## umap

In [None]:
# UMAP search for the current (shift, window_size, n); the results are written to this CSV.
results_csv = f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
search_outputs = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=results_csv,
    **kwargs,
)
# Keep the results table and the best-model results; the last two returned items are unused here.
(
    seqsignet_network_umap_kfold_80,
    best_seqsignet_network_umap_kfold_80,
    _,
    __,
) = search_outputs

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


##################################################
dimension: 15 | method: umap
given shift 3, window size 5 and n 26: history length = 80
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' feature...
[INFO] Adding 'time_diff' feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/9699 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Display the hyperparameter-search results table for the history-length-80 run.
seqsignet_network_umap_kfold_80

In [None]:
# Mean of the "f1" column over the rows of the best-model results (history length 80).
best_seqsignet_network_umap_kfold_80["f1"].mean()

In [None]:
# Mean of the "precision" column over the rows of the best-model results (history length 80).
best_seqsignet_network_umap_kfold_80["precision"].mean()

In [None]:
# Mean of the "recall" column over the rows of the best-model results (history length 80).
best_seqsignet_network_umap_kfold_80["recall"].mean()

In [None]:
# Stack the per-row "f1_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_80["f1_scores"]).mean(axis=0)

In [None]:
# Stack the per-row "precision_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_80["precision_scores"]).mean(axis=0)

In [None]:
# Stack the per-row "recall_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_80["recall_scores"]).mean(axis=0)

# history_length=110

In [None]:
# Window settings for this section; the section heading gives history length 110
# (this run has no recorded output in the notebook to confirm it).
shift, window_size, n = 3, 5, 36

## umap

In [None]:
# UMAP search for the current (shift, window_size, n); the results are written to this CSV.
results_csv = f"{output_dir}/seqsignet_umap_focal_{gamma}_{shift}_{window_size}_{n}_kfold.csv"
search_outputs = seqsignet_hyperparameter_search(
    shift=shift,
    window_size=window_size,
    n=n,
    dim_reduce_methods=["umap"],
    results_output=results_csv,
    **kwargs,
)
# Keep the results table and the best-model results; the last two returned items are unused here.
(
    seqsignet_network_umap_kfold_110,
    best_seqsignet_network_umap_kfold_110,
    _,
    __,
) = search_outputs

In [None]:
# Display the hyperparameter-search results table for the history-length-110 run.
seqsignet_network_umap_kfold_110

In [None]:
# Mean of the "f1" column over the rows of the best-model results (history length 110).
best_seqsignet_network_umap_kfold_110["f1"].mean()

In [None]:
# Mean of the "precision" column over the rows of the best-model results (history length 110).
best_seqsignet_network_umap_kfold_110["precision"].mean()

In [None]:
# Mean of the "recall" column over the rows of the best-model results (history length 110).
best_seqsignet_network_umap_kfold_110["recall"].mean()

In [None]:
# Stack the per-row "f1_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_110["f1_scores"]).mean(axis=0)

In [None]:
# Stack the per-row "precision_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_110["precision_scores"]).mean(axis=0)

In [None]:
# Stack the per-row "recall_scores" vectors and average element-wise (axis 0);
# presumably one entry per class — TODO confirm class order.
np.stack(best_seqsignet_network_umap_kfold_110["recall_scores"]).mean(axis=0)