In [1]:
import pandas as pd
import numpy as np
import torch
import transformers
import pickle
import os

import nlpsig
import nlpsig_networks

from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from nlpsig.classification_utils import split_dataset
from nlpsig_networks.pytorch_utils import training_pytorch, testing_pytorch, set_seed
from nlpsig_networks.ffn import FeedforwardNeuralNetModel
from nlpsig_networks.deepsignet import StackedDeepSigNet
from nlpsig_networks.focal_loss import FocalLoss, ClassBalanced_FocalLoss
from sklearn import metrics

from tqdm.notebook import tqdm

seed = 2023

In [2]:
import signatory

## AnnoMI

In [3]:
anno_mi = pd.read_csv("AnnoMI-full.csv")
anno_mi["datetime"] = pd.to_datetime(anno_mi["timestamp"])
anno_mi = anno_mi.drop(columns=["video_title", "video_url"])
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-05-10 00:00:13
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-05-10 00:00:24
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-05-10 00:00:25
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-05-10 00:00:34
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-05-10 00:00:34


In [4]:
len(anno_mi)

13551

In [5]:
anno_mi["client_talk_type"].value_counts() / anno_mi["interlocutor"].value_counts()["client"]

neutral    0.627063
change     0.248030
sustain    0.124907
Name: client_talk_type, dtype: float64

In [6]:
anno_mi["interlocutor"].value_counts()

therapist    6826
client       6725
Name: interlocutor, dtype: int64

In [7]:
anno_mi["topic"].value_counts()

reducing alcohol consumption                                                          2326
more exercise / increasing activity                                                   2034
reducing recidivism                                                                   1303
reducing drug use                                                                     1104
diabetes management                                                                    948
smoking cessation                                                                      923
smoking cessation                                                                      541
taking medicine / following medical procedure                                          448
asthma management                                                                      431
avoiding DOI                                                                           394
changing approach to disease                                                           315

In [8]:
len(anno_mi["transcript_id"].unique())

133

## Only considering client for now...

In [9]:
client_index = [isinstance(x, str) for x in anno_mi["client_talk_type"]]
sum(client_index)

6725

In [10]:
y_data = anno_mi["client_talk_type"][client_index]
y_data.shape

(6725,)

In [11]:
y_data[0:20]

1     neutral
3     neutral
5     neutral
7     neutral
9     neutral
11    neutral
13    neutral
15    neutral
17    neutral
19    neutral
21    neutral
23    neutral
25    neutral
27    neutral
29    neutral
31    neutral
33    neutral
35     change
37     change
39     change
Name: client_talk_type, dtype: object

In [12]:
label_to_id = {y_data.unique()[i]: i for i in range(len(y_data.unique()))}
id_to_label = {v: k for k, v in label_to_id.items()}

In [13]:
label_to_id

{'neutral': 0, 'change': 1, 'sustain': 2}

In [14]:
id_to_label

{0: 'neutral', 1: 'change', 2: 'sustain'}

In [15]:
y_data = [label_to_id[x] for x in y_data]
y_data[0:20]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]

## Obtaining SBERT Embeddings

We can use the `SentenceEncoder` class within `nlpsig` to obtain sentence embeddings from a model. This class uses the [`sentence-transformer`](https://www.sbert.net/docs/package_reference/SentenceTransformer.html) package and here, we have use the pre-trained `all-mpnet-base-v2` model by passing this name as a string to the class - alternative models can be found [here](https://www.sbert.net/docs/pretrained_models.html).

We can pass these into the constructor of the class to initialise our text encoder as follows:

In [17]:
# sbert_768_embeddings = np.load("anno_mi_sentence_embeddings_768.npy")

In [16]:
# initialise the Text Encoder
sbert_model_768 = "all-mpnet-base-v2"
text_encoder_sbert_768 = nlpsig.SentenceEncoder(df=anno_mi,
                                                feature_name="utterance_text",
                                                model_name=sbert_model_768)
text_encoder_sbert_768.load_pretrained_model()

The class has a `.encode_sentence_transformer()` method which first loads in the model (using the `model_name` and `model_args` attributes) and then obtains an embedding for each sentence. These sentence embeddings are then stored in the `embeddings_sentence` attribute of the object.

In [17]:
text_encoder_sbert_768.obtain_embeddings()
sbert_768_embeddings = text_encoder_sbert_768.sentence_embeddings

[INFO] number of sentences to encode: 13551


Batches:   0%|          | 0/212 [00:00<?, ?it/s]

In [18]:
np.save("anno_mi_sentence_embeddings_768", sbert_768_embeddings)

## SBERT with 384 dimension vectors

In [18]:
# sbert_384_embeddings = np.load("anno_mi_sentence_embeddings_384.npy")

In [20]:
# initialise the Text Encoder
sbert_model_384 = "all-MiniLM-L12-v2"
text_encoder_sbert_384 = nlpsig.SentenceEncoder(df=anno_mi,
                                                feature_name="utterance_text",
                                                model_name=sbert_model_384)
text_encoder_sbert_384.load_pretrained_model()

In [21]:
text_encoder_sbert_384.obtain_embeddings()
sbert_384_embeddings = text_encoder_sbert_384.sentence_embeddings

[INFO] number of sentences to encode: 13551


Batches:   0%|          | 0/212 [00:00<?, ?it/s]

In [22]:
np.save("anno_mi_sentence_embeddings_384", sbert_384_embeddings)

## Pretrained BERT and pooling

In [19]:
# pooled_mean_pretrained = np.load("anno_mi_pretrained_BERT_mean.npy")
# pooled_max_pretrained = np.load("anno_mi_pretrained_BERT_max.npy")
# pooled_sum_pretrained = np.load("anno_mi_pretrained_BERT_sum.npy")
# pooled_cls_pretrained = np.load("anno_mi_pretrained_BERT_cls.npy")

In [24]:
bert_model = "bert-base-uncased"

In [25]:
text_encoder_pretrained_BERT = nlpsig.TextEncoder(df=anno_mi,
                                                  feature_name="utterance_text",
                                                  model_name=bert_model)
text_encoder_pretrained_BERT.load_pretrained_model()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
text_encoder_pretrained_BERT.tokenize_text(skip_special_tokens=False)

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/13551 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/13551 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id


Dataset({
    features: ['mi_quality', 'transcript_id', 'topic', 'utterance_id', 'interlocutor', 'timestamp', 'utterance_text', 'annotator_id', 'therapist_input_exists', 'therapist_input_subtype', 'reflection_exists', 'reflection_subtype', 'question_exists', 'question_subtype', 'main_therapist_behaviour', 'client_talk_type', 'datetime', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'tokens'],
    num_rows: 13551
})

In [27]:
token_embeddings_pretrained = text_encoder_pretrained_BERT.obtain_embeddings(method="hidden_layer")

  0%|          | 0/136 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [28]:
pooled_mean_pretrained = text_encoder_pretrained_BERT.pool_token_embeddings()
pooled_max_pretrained = text_encoder_pretrained_BERT.pool_token_embeddings(method="max")
pooled_sum_pretrained = text_encoder_pretrained_BERT.pool_token_embeddings(method="sum") 
pooled_cls_pretrained = text_encoder_pretrained_BERT.pool_token_embeddings(method="cls")

  0%|          | 0/13551 [00:00<?, ?it/s]

  0%|          | 0/13551 [00:00<?, ?it/s]

  0%|          | 0/13551 [00:00<?, ?it/s]

  0%|          | 0/13551 [00:00<?, ?it/s]

In [29]:
pooled_mean_pretrained.shape

(13551, 768)

In [30]:
pooled_max_pretrained.shape

(13551, 768)

In [31]:
pooled_sum_pretrained.shape

(13551, 768)

In [32]:
pooled_cls_pretrained.shape

(13551, 768)

In [33]:
np.save("anno_mi_pretrained_BERT_mean", pooled_mean_pretrained)
np.save("anno_mi_pretrained_BERT_max", pooled_max_pretrained)
np.save("anno_mi_pretrained_BERT_sum", pooled_sum_pretrained)
np.save("anno_mi_pretrained_BERT_cls", pooled_cls_pretrained)

## Fine-tuning BERT and pooling

### (Ignoring this part for now while, but will run this on GPU cluster soon...)

In [20]:
# pooled_mean = np.load("anno_mi_BERT_mean.npy")
# pooled_max = np.load("anno_mi_BERT_max.npy")
# pooled_sum = np.load("anno_mi_BERT_sum.npy")
# pooled_cls = np.load("anno_mi_BERT_cls.npy")

In [35]:
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

model = AutoModelForMaskedLM.from_pretrained(bert_model)
tokenizer = AutoTokenizer.from_pretrained(bert_model)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
text_encoder_BERT = nlpsig.TextEncoder(df=anno_mi,
                                       feature_name="utterance_text",
                                       model=model,
                                       tokenizer=tokenizer,
                                       data_collator=data_collator)

In [37]:
text_encoder_BERT.tokenize_text(skip_special_tokens=False)

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/13551 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/13551 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id


Dataset({
    features: ['mi_quality', 'transcript_id', 'topic', 'utterance_id', 'interlocutor', 'timestamp', 'utterance_text', 'annotator_id', 'therapist_input_exists', 'therapist_input_subtype', 'reflection_exists', 'reflection_subtype', 'question_exists', 'question_subtype', 'main_therapist_behaviour', 'client_talk_type', 'datetime', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 13551
})

### Training the model

In [38]:
# set up data_collator for language modelling (has dynamic padding)
data_collator_for_LM = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                       mlm=True,
                                                       mlm_probability=0.15)

In [39]:
text_encoder_BERT.split_dataset(seed=seed)

[INFO] Splitting up dataset into train / validation / test sets, and saving to `.dataset_split`.


DatasetDict({
    train: Dataset({
        features: ['mi_quality', 'transcript_id', 'topic', 'utterance_id', 'interlocutor', 'timestamp', 'utterance_text', 'annotator_id', 'therapist_input_exists', 'therapist_input_subtype', 'reflection_exists', 'reflection_subtype', 'question_exists', 'question_subtype', 'main_therapist_behaviour', 'client_talk_type', 'datetime', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 10840
    })
    test: Dataset({
        features: ['mi_quality', 'transcript_id', 'topic', 'utterance_id', 'interlocutor', 'timestamp', 'utterance_text', 'annotator_id', 'therapist_input_exists', 'therapist_input_subtype', 'reflection_exists', 'reflection_subtype', 'question_exists', 'question_subtype', 'main_therapist_behaviour', 'client_talk_type', 'datetime', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 1356
    })
    validation: Dataset({
        features: ['mi_qua

In [40]:
type(text_encoder_BERT.dataset_split)

datasets.dataset_dict.DatasetDict

In [41]:
model_name = "bert-base-uncased-anno-mi"
text_encoder_BERT.set_up_training_args(output_dir=model_name,
                                  num_train_epochs=600,
                                  per_device_train_batch_size=128,
                                  disable_tqdm=False,
                                  save_strategy="steps",
                                  save_steps=10000,
                                  seed=seed)

[INFO] Setting up TrainingArguments object and saving to `.training_args`.


TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ign

In [42]:
type(text_encoder_BERT.training_args)

transformers.training_args.TrainingArguments

In [43]:
text_encoder_BERT.set_up_trainer(data_collator=data_collator_for_LM)

[INFO] Setting up Trainer object, and saving to `.trainer`.


<transformers.trainer.Trainer at 0x2b04c1e80>

In [44]:
type(text_encoder_BERT.trainer)

transformers.trainer.Trainer

In [45]:
torch.cuda.is_available()

False

In [46]:
torch.cuda.device_count()

0

In [47]:
# set to only report errors to avoid excessing logging
transformers.utils.logging.set_verbosity(40)

In [None]:
text_encoder_BERT.fit_transformer_with_trainer_api()

[INFO] Training model with 109514298 parameters...




Epoch,Training Loss,Validation Loss


In [None]:
text_encoder_BERT.trainer.save_model(model_name)

### Evaluating model on masked language modelling task

In [None]:
text_encoder_BERT.tokenizer.special_tokens_map

In [None]:
def compute_masked_character_accuracy(fill_mask, words):
    was_correct = []
    print(f"Evaluating with {len(words)} words")
    for word in tqdm(words):
        masked_strings = [word[:i] + '<mask>' + word[i+1:] for i in range(len(word))]
        predictions = [fill_mask(word)[0]['sequence'] for word in masked_strings]
        was_correct += [pred == word for pred in predictions]
    
    acc = np.sum(was_correct) / len(was_correct)
    print(f"Accuracy: {acc}")
    return acc

In [None]:
fill_mask = pipeline("fill-mask",
                     model=model_name,
                     tokenizer=model_name)

compute_masked_character_accuracy(fill_mask, text_encoder_BERT.dataset_split["test"]["word"])

### Obtain embeddings from model

In [None]:
# setting the model to CPU (might not be always necessary to run this)
text_encoder_BERT.model.to('cpu')
token_embeddings = text_encoder_BERT.obtain_embeddings(method="hidden_layer")

In [None]:
token_embeddings.shape

In [None]:
pooled_mean = text_encoder_BERT.pool_token_embeddings()
pooled_max = text_encoder_BERT.pool_token_embeddings(method="max")
pooled_sum = text_encoder_BERT.pool_token_embeddings(method="sum")
pooled_cls = text_encoder_BERT.pool_token_embeddings(method="cls")

In [None]:
pooled_mean.shape

In [None]:
pooled_max.shape

In [None]:
pooled_sum.shape

In [None]:
pooled_cls.shape

In [None]:
np.save("anno_mi_BERT_mean", pooled_mean)
np.save("anno_mi_BERT_max", pooled_max)
np.save("anno_mi_BERT_sum", pooled_sum)
np.save("anno_mi_BERT_cls", pooled_cls)

# Baseline 1: FFN baseline

Using the embeddings for the sentences directly in a FFN.

Below is a function that takes in some inputs x_data, y_data and fits a FFN. Will do early stopping if the F1 score continually gets worse.

In [81]:
def implement_ffn(x_data,
                  y_data,
                  hidden_dim,
                  learning_rate,
                  loss,
                  gamma=0):
    # set seed
    set_seed(seed)
    
    # initialise FFN
    ffn_model = FeedforwardNeuralNetModel(input_dim=x_data.shape[1],
                                          hidden_dim=hidden_dim,
                                          output_dim=len(label_to_id),
                                          dropout_rate=0.1)
    # print(ffn_model)
    
    # split dataset
    train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),
                                       y_data=torch.tensor(y_data),
                                       train_size=0.8,
                                       valid_size=0.5,
                                       shuffle=True,
                                       as_DataLoader=True,
                                       seed=seed)

    # define loss
    if loss == "focal":
        criterion = FocalLoss(gamma = gamma)
    elif loss == "cross_entropy":
        criterion = torch.nn.CrossEntropyLoss()

    # define optimizer
    optimizer = torch.optim.Adam(ffn_model.parameters(), lr=learning_rate)
    # define scheduler for adjusting the learning rate
    scheduler = ReduceLROnPlateau(optimizer, 'min')
    # scheduler = StepLR(optimizer, step_size = 4, gamma = 0.5)
    # scheduler = None
    
    ffn_model = training_pytorch(model=ffn_model,
                                 train_loader=train,
                                 criterion=criterion,
                                 optimizer=optimizer,
                                 num_epochs=10000,
                                 scheduler=scheduler,
                                 valid_loader=valid,
                                 seed=seed,
                                 early_stopping=True,
                                 early_stopping_metric="f1",
                                 patience=10,
                                 verbose=True,
                                 verbose_epoch=100)

    pred, label = testing_pytorch(ffn_model, test, criterion)
    print(f"proportion of labels in prediction: {[sum(pred==i)/len(pred) for i in label_to_id.values()]}")
    print(f"proportion of labels in data: {[sum(label==i)/len(label) for i in label_to_id.values()]}")
    
    f1_scores = metrics.f1_score(label, pred, average=None)
    print(f"- f1: {f1_scores}")
    print(f"- f1 (average): {sum(f1_scores)/len(f1_scores)}")
    print(f"- accuracy: {sum(pred==label)/len(pred)}")
    
    return ffn_model

Going to try out some variations (1 hidden layer, 2 hidden layers and 3 hidden layers - all of size 100)

In [82]:
hidden_dim_trials = [[100]*i for i in range(1, 6)]
learning_rate = 1e-4
loss = "cross_entropy"

In [83]:
hidden_dim_trials

[[100],
 [100, 100],
 [100, 100, 100],
 [100, 100, 100, 100],
 [100, 100, 100, 100, 100]]

## SBERT 768

In [84]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=sbert_768_embeddings[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0885392427444458
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.023568868637085
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.016163945198059 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 40!
Accuracy on dataset of size 672: 71.2797622680664 %.
Average loss: 0.6995780386707999
proportion of labels in prediction: [tensor(0.7173), tensor(0.1845), tensor(0.0982)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81243063 0.53284672 0.47337278]
- f1 (average): 0.6062167096746555
- accuracy: 0.7127976417541504

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.116580843925476
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0521271228790283
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.026239974932237 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 45!
Accuracy on dataset of size 672: 70.08928680419922 %.
Average loss: 0.7389487461610273
proportion of labels in prediction: [tensor(0.6875), tensor(0.2024), tensor(0.1101)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80817253 0.51748252 0.46327684]
- f1 (average): 0.5963106282850795
- accuracy: 0.7008928656578064

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0993850231170654
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0102291107177734
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.994836606762626 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 42!
Accuracy on dataset of size 672: 70.23809814453125 %.
Average loss: 0.7553030956875194
proportion of labels in prediction: [tensor(0.6801), tensor(0.2098), tensor(0.1101)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80821918 0.52920962 0.46327684]
- f1 (average): 0.600235212077837
- accuracy: 0.7023809552192688

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.072503924369812
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.993831992149353
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9791417772119696 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 47!
Accuracy on dataset of size 672: 69.04762268066406 %.
Average loss: 0.8187992193482139
proportion of labels in prediction: [tensor(0.6801), tensor(0.1890), tensor(0.1310)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79908676 0.50541516 0.46073298]
- f1 (average): 0.5884116349129783
- accuracy: 0.6904761791229248

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1058584451675415
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0399665832519531
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0396888039328835 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 50!
Accuracy on dataset of size 672: 69.19642639160156 %.
Average loss: 0.8239231001247059
proportion of labels in prediction: [tensor(0.7113), tensor(0.1815), tensor(0.1071)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79821628 0.51470588 0.42285714]
- f1 (average): 0.57859310056241
- accuracy: 0.6919642686843872


## SBERT 384

In [85]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=sbert_384_embeddings[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0692368745803833
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0163383483886719
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0099996599284085 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 52!
Accuracy on dataset of size 672: 71.875 %.
Average loss: 0.7035864103924144
proportion of labels in prediction: [tensor(0.7232), tensor(0.1696), tensor(0.1071)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81767956 0.51515152 0.51428571]
- f1 (average): 0.6157055958160931
- accuracy: 0.71875

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1142284870147705
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0404751300811768
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.043692480434071 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 45!
Accuracy on dataset of size 672: 70.53571319580078 %.
Average loss: 0.7237868959253485
proportion of labels in prediction: [tensor(0.7247), tensor(0.1741), tensor(0.1012)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80573951 0.51685393 0.46783626]
- f1 (average): 0.5968099014143323
- accuracy: 0.7053571343421936

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0598180294036865
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9991912245750427
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.004194275899367 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 67!
Accuracy on dataset of size 672: 70.68452453613281 %.
Average loss: 0.7685691226612438
proportion of labels in prediction: [tensor(0.6979), tensor(0.1830), tensor(0.1190)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.8018018  0.54212454 0.49180328]
- f1 (average): 0.6119098742049561
- accuracy: 0.706845223903656

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.075019359588623
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9795213937759399
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0088304281234741 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 31!
Accuracy on dataset of size 672: 65.92262268066406 %.
Average loss: 0.8153174790469083
proportion of labels in prediction: [tensor(0.7188), tensor(0.2812), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81374723 0.44837758 0.        ]
- f1 (average): 0.42070826983410625
- accuracy: 0.6592261791229248

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1322060823440552
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0921815633773804
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0522005774758079 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 49!
Accuracy on dataset of size 672: 66.81547546386719 %.
Average loss: 0.814079609784213
proportion of labels in prediction: [tensor(0.7009), tensor(0.2321), tensor(0.0670)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80449438 0.43137255 0.33783784]
- f1 (average): 0.524568256293306
- accuracy: 0.668154776096344


## Pretrained BERT

### Mean pooled

In [86]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=pooled_mean_pretrained[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0839786529541016
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7457855343818665
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8354992433027788 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 43!
Accuracy on dataset of size 672: 69.79166412353516 %.
Average loss: 0.7122157866304571
proportion of labels in prediction: [tensor(0.7039), tensor(0.2083), tensor(0.0878)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80269058 0.51724138 0.44444444]
- f1 (average): 0.5881254689048102
- accuracy: 0.6979166865348816

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1131868362426758
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8533899784088135
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8532686233520508 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 35!
Accuracy on dataset of size 672: 70.98213958740234 %.
Average loss: 0.701988697052002
proportion of labels in prediction: [tensor(0.6949), tensor(0.2128), tensor(0.0923)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81038375 0.53924915 0.47272727]
- f1 (average): 0.6074533888877603
- accuracy: 0.7098214030265808

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.098788857460022
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8470583558082581
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8576378388838335 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 36!
Accuracy on dataset of size 672: 70.83333587646484 %.
Average loss: 0.7304971489039335
proportion of labels in prediction: [tensor(0.6830), tensor(0.2188), tensor(0.0982)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80637813 0.55892256 0.46153846]
- f1 (average): 0.6089463841931573
- accuracy: 0.7083333134651184

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0730299949645996
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.835125207901001
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8732638359069824 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 45!
Accuracy on dataset of size 672: 70.68452453613281 %.
Average loss: 0.7486193478107452
proportion of labels in prediction: [tensor(0.7068), tensor(0.1994), tensor(0.0938)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81208054 0.52112676 0.45783133]
- f1 (average): 0.5970128742591122
- accuracy: 0.706845223903656

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1056439876556396
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9540625214576721
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.882446364922957 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 51!
Accuracy on dataset of size 672: 72.76786041259766 %.
Average loss: 0.7767329216003418
proportion of labels in prediction: [tensor(0.6905), tensor(0.1979), tensor(0.1116)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81993205 0.55123675 0.5505618 ]
- f1 (average): 0.6405768655665137
- accuracy: 0.7276785969734192


### Max pooled

In [87]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=pooled_max_pretrained[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0957067012786865
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7537550926208496
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8680760372768749 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 52!
Accuracy on dataset of size 672: 69.79166412353516 %.
Average loss: 0.7068305503238331
proportion of labels in prediction: [tensor(0.7173), tensor(0.2054), tensor(0.0774)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80133185 0.52777778 0.41290323]
- f1 (average): 0.580670952360115
- accuracy: 0.6979166865348816

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1076050996780396
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9455707669258118
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9177429025823419 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 53!
Accuracy on dataset of size 672: 71.42857360839844 %.
Average loss: 0.7085082368417219
proportion of labels in prediction: [tensor(0.7202), tensor(0.2024), tensor(0.0774)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80841639 0.53846154 0.49032258]
- f1 (average): 0.6124001696394794
- accuracy: 0.7142857313156128

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0997354984283447
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.929989755153656
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9196747053753246 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 53!
Accuracy on dataset of size 672: 70.08928680419922 %.
Average loss: 0.7250180461189963
proportion of labels in prediction: [tensor(0.7024), tensor(0.2143), tensor(0.0833)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.8013468  0.53741497 0.44025157]
- f1 (average): 0.5930044465534133
- accuracy: 0.7008928656578064

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0746076107025146
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9190363883972168
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9312605370174755 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 53!
Accuracy on dataset of size 672: 69.64286041259766 %.
Average loss: 0.7155440043319355
proportion of labels in prediction: [tensor(0.7083), tensor(0.2054), tensor(0.0863)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79553073 0.52083333 0.45962733]
- f1 (average): 0.5919971295942877
- accuracy: 0.6964285969734192

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1055570840835571
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.035354733467102
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9438045024871826 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 58!
Accuracy on dataset of size 672: 70.23809814453125 %.
Average loss: 0.7381011681123213
proportion of labels in prediction: [tensor(0.7024), tensor(0.2068), tensor(0.0908)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80359147 0.52595156 0.46341463]
- f1 (average): 0.5976525538326346
- accuracy: 0.7023809552192688


### Sum pooled

In [88]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=pooled_sum_pretrained[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 2.0271031856536865
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8685790300369263
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8226919986984946 || Accuracy: 0.6463595628738403 || F1-score: 0.4247232595366326
Early stopping at epoch 31!
Accuracy on dataset of size 672: 70.98213958740234 %.
Average loss: 0.774150935086337
proportion of labels in prediction: [tensor(0.7173), tensor(0.2039), tensor(0.0789)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81021088 0.55052265 0.42307692]
- f1 (average): 0.5946034826546994
- accuracy: 0.7098214030265808

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.114878535270691
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7568445205688477
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8589342778379266 || Accuracy: 0.6508172154426575 || F1-score: 0.46548398969451604
Early stopping at epoch 35!
Accuracy on dataset of size 672: 70.83333587646484 %.
Average loss: 0.7620173096656799
proportion of labels in prediction: [tensor(0.7039), tensor(0.2113), tensor(0.0848)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80493274 0.56849315 0.425     ]
- f1 (average): 0.5994752953703134
- accuracy: 0.7083333134651184

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.111946940422058
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8133658766746521
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8259054476564581 || Accuracy: 0.637444257736206 || F1-score: 0.4151506273197447
Early stopping at epoch 29!
Accuracy on dataset of size 672: 70.38690185546875 %.
Average loss: 0.8185790181159973
proportion of labels in prediction: [tensor(0.6815), tensor(0.2277), tensor(0.0908)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.8027366  0.55445545 0.45121951]
- f1 (average): 0.6028038532640426
- accuracy: 0.7038690447807312

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0758711099624634
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8967788219451904
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9189632643352855 || Accuracy: 0.6315007209777832 || F1-score: 0.30164060280829436
Early stopping at epoch 37!
Accuracy on dataset of size 672: 72.32142639160156 %.
Average loss: 0.8004374504089355
proportion of labels in prediction: [tensor(0.7024), tensor(0.1935), tensor(0.1042)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81481481 0.56428571 0.50867052]
- f1 (average): 0.6292570164439142
- accuracy: 0.7232142686843872

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1112335920333862
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9961893558502197
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.975874434817921 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 41!
Accuracy on dataset of size 672: 70.08928680419922 %.
Average loss: 0.9458087682723999
proportion of labels in prediction: [tensor(0.6830), tensor(0.2158), tensor(0.1012)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79954442 0.53559322 0.47953216]
- f1 (average): 0.6048899344053565
- accuracy: 0.7008928656578064


### CLS

In [89]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=pooled_cls_pretrained[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.072586178779602
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7834479808807373
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8409460566260598 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 43!
Accuracy on dataset of size 672: 71.875 %.
Average loss: 0.6774346611716531
proportion of labels in prediction: [tensor(0.7366), tensor(0.1786), tensor(0.0848)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81619256 0.53333333 0.475     ]
- f1 (average): 0.6081752978361293
- accuracy: 0.71875

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1158097982406616
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9066270589828491
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8640799901702187 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 38!
Accuracy on dataset of size 672: 72.02381134033203 %.
Average loss: 0.673450849272988
proportion of labels in prediction: [tensor(0.7351), tensor(0.1741), tensor(0.0908)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81489595 0.52434457 0.51219512]
- f1 (average): 0.617145212888559
- accuracy: 0.7202380895614624

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.099726676940918
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9140689969062805
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8757737441496416 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 44!
Accuracy on dataset of size 672: 71.57737731933594 %.
Average loss: 0.6988080143928528
proportion of labels in prediction: [tensor(0.7068), tensor(0.1964), tensor(0.0967)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81208054 0.5248227  0.52380952]
- f1 (average): 0.6202375852525788
- accuracy: 0.7157738208770752

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0739099979400635
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8497714996337891
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8862914334643971 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 36!
Accuracy on dataset of size 672: 70.08928680419922 %.
Average loss: 0.7242212187160145
proportion of labels in prediction: [tensor(0.6786), tensor(0.2381), tensor(0.0833)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79771429 0.5483871  0.46540881]
- f1 (average): 0.6038367291733087
- accuracy: 0.7008928656578064

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1050199270248413
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0113009214401245
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8908429525115273 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 60!
Accuracy on dataset of size 672: 70.38690185546875 %.
Average loss: 0.7129268104379828
proportion of labels in prediction: [tensor(0.7217), tensor(0.1771), tensor(0.1012)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80530973 0.49814126 0.49122807]
- f1 (average): 0.5982263562097445
- accuracy: 0.7038690447807312


## Fine-tuned BERT

### Mean pooled

In [None]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=pooled_mean_pretrained[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=1e-5,
                  loss="cross_entropy")

### Max pooled

In [None]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=pooled_max_pretrained[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### Sum pooled

In [None]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=pooled_sum_pretrained[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### CLS

In [None]:
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=pooled_sum_pretrained[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

# Baseline 2: Averaging history and use FFN

Here, we will use `nlpsig` to construct some paths of embeddings which we will average and use those in a FFN.

First, we define the arguments for how we want to construct our path. As we're going to just do a simple average of embeddings, I'll set zero padding as false, and construct the path by looking at the last `k` posts.

We will consider one where we average their histories and that is the only inputs to the FFN. Alternatively, we can concatenate the full post embedding as well.

In [90]:
path_specifics = {"pad_by": "history",
                  "zero_padding": False,
                  "method": "k_last",
                  "k": 5,
                  "time_feature": None,
                  "embeddings": "full",
                  "include_current_embedding": True}

In [91]:
def obtain_mean_history(embeddings, path_specifics, concatenate_current = True):
    paths = nlpsig.PrepareData(anno_mi,
                               id_column="transcript_id",
                               label_column="client_talk_type",
                               embeddings=embeddings)
    path = paths.pad(**path_specifics)
    # remove last two columns (which contains the id and the label)
    path = path[client_index][:,:,:-2]
    # average in the first dimension
    path = path.mean(1).astype("float")
    # concatenate with current embedding
    if concatenate_current:
        path = np.concatenate([path, embeddings[client_index]], axis=1)
    return path

## SBERT 768

In [92]:
path_history = obtain_mean_history(sbert_768_embeddings, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0953056812286377
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9892064332962036
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9894395037130876 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 33!
Accuracy on dataset of size 672: 71.13095092773438 %.
Average loss: 0.6916292418133129
proportion of labels in prediction: [tensor(0.7068), tensor(0.1964), tensor(0.0967)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80536913 0.57446809 0.44047619]
- f1 (average): 0.6067711343664507
- accuracy: 0.711309552192688

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0840489864349365
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9823988676071167
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9628760164434259 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 39!
Accuracy on dataset of size 672: 71.2797622680664 %.
Average loss: 0.7162079377607866
proportion of labels in prediction: [tensor(0.6920), tensor(0.1964), tensor(0.1116)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80542986 0.59574468 0.43820225]
- f1 (average): 0.6131255974318229
- accuracy: 0.7127976417541504

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0726913213729858
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.004361629486084
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9863766540180553 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 36!
Accuracy on dataset of size 672: 70.98213958740234 %.
Average loss: 0.7356059171936729
proportion of labels in prediction: [tensor(0.6801), tensor(0.2054), tensor(0.1146)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80593607 0.56944444 0.46666667]
- f1 (average): 0.6140157280568239
- accuracy: 0.7098214030265808

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.094172477722168
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0252065658569336
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9813522750681097 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 49!
Accuracy on dataset of size 672: 71.57737731933594 %.
Average loss: 0.8058877749876543
proportion of labels in prediction: [tensor(0.6667), tensor(0.2217), tensor(0.1116)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80968858 0.60200669 0.4494382 ]
- f1 (average): 0.6203778241750936
- accuracy: 0.7157738208770752

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.079013705253601
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0607026815414429
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.000501123341647 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 42!
Accuracy on dataset of size 672: 67.55952453613281 %.
Average loss: 0.7973260446028276
proportion of labels in prediction: [tensor(0.7083), tensor(0.2917), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82011173 0.50289017 0.        ]
- f1 (average): 0.44100063508466003
- accuracy: 0.675595223903656


In [93]:
path_history = obtain_mean_history(sbert_768_embeddings, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.088150978088379
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0530861616134644
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.024484547701749 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6441184282302856
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.6339896321296692
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8492850823835894 || Accuracy: 0.6285289525985718 || F1-score: 0.4134916549087558
Early stopping at epoch 109!
Accuracy on dataset of size 672: 61.60714340209961 %.
Average loss: 0.8589339093728499
proportion of labels in prediction: [tensor(0.8155), tensor(0.1429), tensor(0.0417)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.74870734 0.30081301 0.22900763]
- f1 (average): 0.4261759946

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1165460348129272
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0676894187927246
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.034143946387551 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 55!
Accuracy on dataset of size 672: 61.904762268066406 %.
Average loss: 0.8625814156098799
proportion of labels in prediction: [tensor(0.8482), tensor(0.1027), tensor(0.0491)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.760364   0.20091324 0.26470588]
- f1 (average): 0.4086610428021877
- accuracy: 0.6190476417541504

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0994921922683716
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0285676717758179
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0022970925677905 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 56!
Accuracy on dataset of size 672: 64.13690185546875 %.
Average loss: 0.9409157254479148
proportion of labels in prediction: [tensor(0.8080), tensor(0.1265), tensor(0.0655)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77754678 0.24680851 0.38095238]
- f1 (average): 0.46843588971248545
- accuracy: 0.6413690447807312

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0725284814834595
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0014491081237793
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.983965044671839 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 62!
Accuracy on dataset of size 672: 63.54166793823242 %.
Average loss: 0.9649328806183555
proportion of labels in prediction: [tensor(0.7708), tensor(0.1429), tensor(0.0863)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77267876 0.25203252 0.42236025]
- f1 (average): 0.4823571769262706
- accuracy: 0.6354166865348816

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.105851650238037
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0425540208816528
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.040925459428267 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 54!
Accuracy on dataset of size 672: 62.2023811340332 %.
Average loss: 1.0156917788765647
proportion of labels in prediction: [tensor(0.7440), tensor(0.1771), tensor(0.0789)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.75952122 0.29739777 0.37179487]
- f1 (average): 0.476237953342532
- accuracy: 0.6220238208770752


## SBERT 384

In [94]:
path_history = obtain_mean_history(sbert_384_embeddings, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0886486768722534
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.024571180343628
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0028003996068782 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 50!
Accuracy on dataset of size 672: 72.32142639160156 %.
Average loss: 0.7124848663806915
proportion of labels in prediction: [tensor(0.7217), tensor(0.1637), tensor(0.1146)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82522124 0.51538462 0.51111111]
- f1 (average): 0.6172389884779266
- accuracy: 0.7232142686843872

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1162652969360352
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0470809936523438
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0164382349361072 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 51!
Accuracy on dataset of size 672: 71.2797622680664 %.
Average loss: 0.7254285487261686
proportion of labels in prediction: [tensor(0.6964), tensor(0.1801), tensor(0.1235)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82074408 0.52398524 0.47311828]
- f1 (average): 0.6059492001982608
- accuracy: 0.7127976417541504

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.099664330482483
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0039958953857422
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.982631347396157 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 42!
Accuracy on dataset of size 672: 71.13095092773438 %.
Average loss: 0.7638534090735696
proportion of labels in prediction: [tensor(0.6801), tensor(0.2068), tensor(0.1131)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81278539 0.56747405 0.44692737]
- f1 (average): 0.6090622702908121
- accuracy: 0.711309552192688

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0726091861724854
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9895793199539185
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9734922322359952 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 58!
Accuracy on dataset of size 672: 70.08928680419922 %.
Average loss: 0.7831103612076152
proportion of labels in prediction: [tensor(0.7158), tensor(0.2188), tensor(0.0655)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81555556 0.51851852 0.36734694]
- f1 (average): 0.567140337616528
- accuracy: 0.7008928656578064

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1058921813964844
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0393548011779785
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0376879410310225 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 38!
Accuracy on dataset of size 672: 66.96428680419922 %.
Average loss: 0.8052765293554827
proportion of labels in prediction: [tensor(0.7083), tensor(0.2917), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82011173 0.47976879 0.        ]
- f1 (average): 0.4332935059902477
- accuracy: 0.6696428656578064


In [95]:
path_history = obtain_mean_history(sbert_384_embeddings, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0689338445663452
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.029910683631897
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0149357481436296 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 87!
Accuracy on dataset of size 672: 62.94643020629883 %.
Average loss: 0.8852156509052623
proportion of labels in prediction: [tensor(0.8497), tensor(0.1235), tensor(0.0268)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.75959596 0.29184549 0.21487603]
- f1 (average): 0.4221058287386808
- accuracy: 0.6294642686843872

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.114090919494629
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0463939905166626
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0466171394694934 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 55!
Accuracy on dataset of size 672: 63.244049072265625 %.
Average loss: 0.896749659018083
proportion of labels in prediction: [tensor(0.8899), tensor(0.0804), tensor(0.0298)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77089479 0.18627451 0.22764228]
- f1 (average): 0.3949371916068632
- accuracy: 0.632440447807312

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0598030090332031
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.002704381942749
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0059379068287937 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 54!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9146971485831521
proportion of labels in prediction: [tensor(0.8854), tensor(0.0774), tensor(0.0372)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77317554 0.13861386 0.203125  ]
- f1 (average): 0.37163813459748346
- accuracy: 0.6235119104385376

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0749456882476807
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9822454452514648
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0100843906402588 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 38!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9153450619090687
proportion of labels in prediction: [tensor(0.8854), tensor(0.1146), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.78106509 0.20264317 0.        ]
- f1 (average): 0.327902753521188
- accuracy: 0.6235119104385376

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.132216215133667
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0948266983032227
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0533526160500266 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 62!
Accuracy on dataset of size 672: 63.988094329833984 %.
Average loss: 0.9347266392274336
proportion of labels in prediction: [tensor(0.8854), tensor(0.0060), tensor(0.1086)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77712032 0.02597403 0.38636364]
- f1 (average): 0.39648599263983875
- accuracy: 0.6398809552192688


## Pretrained BERT

### Mean pooled

In [96]:
path_history = obtain_mean_history(pooled_mean_pretrained, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1021214723587036
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8234513998031616
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8453716310587797 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 30!
Accuracy on dataset of size 672: 70.83333587646484 %.
Average loss: 0.7448050704869357
proportion of labels in prediction: [tensor(0.6830), tensor(0.2664), tensor(0.0506)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81548975 0.55927052 0.37956204]
- f1 (average): 0.5847741033144898
- accuracy: 0.7083333134651184

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0823612213134766
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7924083471298218
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8414776433597911 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 46!
Accuracy on dataset of size 672: 70.23809814453125 %.
Average loss: 0.7604863047599792
proportion of labels in prediction: [tensor(0.6801), tensor(0.2336), tensor(0.0863)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80593607 0.54071661 0.44720497]
- f1 (average): 0.5979525514604368
- accuracy: 0.7023809552192688

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0727800130844116
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8618066906929016
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8695350614461032 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 36!
Accuracy on dataset of size 672: 71.57737731933594 %.
Average loss: 0.7291001677513123
proportion of labels in prediction: [tensor(0.7024), tensor(0.2039), tensor(0.0938)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81705948 0.54355401 0.46987952]
- f1 (average): 0.6101643362556937
- accuracy: 0.7157738208770752

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0944077968597412
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9358684420585632
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8738451112400402 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 49!
Accuracy on dataset of size 672: 71.2797622680664 %.
Average loss: 0.7574839646166022
proportion of labels in prediction: [tensor(0.7068), tensor(0.2039), tensor(0.0893)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81655481 0.52961672 0.46625767]
- f1 (average): 0.6041430677645776
- accuracy: 0.7127976417541504

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0790690183639526
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0292452573776245
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.893870928070762 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 48!
Accuracy on dataset of size 672: 72.32142639160156 %.
Average loss: 0.8019252961332147
proportion of labels in prediction: [tensor(0.6920), tensor(0.2068), tensor(0.1012)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82352941 0.55363322 0.49122807]
- f1 (average): 0.6227968999777415
- accuracy: 0.7232142686843872


In [97]:
path_history = obtain_mean_history(pooled_mean_pretrained, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0827751159667969
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9896537661552429
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9003090154040944 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 43!
Accuracy on dataset of size 672: 64.28571319580078 %.
Average loss: 0.8660504817962646
proportion of labels in prediction: [tensor(0.9241), tensor(0.0655), tensor(0.0104)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77307692 0.2371134  0.12727273]
- f1 (average): 0.3791543508038353
- accuracy: 0.6428571343421936

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1143455505371094
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9700237512588501
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8952573537826538 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 62!
Accuracy on dataset of size 672: 63.69047546386719 %.
Average loss: 0.8783302090384744
proportion of labels in prediction: [tensor(0.8512), tensor(0.1310), tensor(0.0179)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76286579 0.33613445 0.17391304]
- f1 (average): 0.42430442979631194
- accuracy: 0.636904776096344

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0997600555419922
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9804369211196899
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9060063145377419 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 63!
Accuracy on dataset of size 672: 65.0297622680664 %.
Average loss: 0.9121526696465232
proportion of labels in prediction: [tensor(0.7619), tensor(0.1830), tensor(0.0551)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76476907 0.42490842 0.32857143]
- f1 (average): 0.5060829730002662
- accuracy: 0.6502976417541504

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0729719400405884
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8793776631355286
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9026380343870684 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 42!
Accuracy on dataset of size 672: 63.69047546386719 %.
Average loss: 0.9111166000366211
proportion of labels in prediction: [tensor(0.8943), tensor(0.0491), tensor(0.0565)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76862745 0.12021858 0.35460993]
- f1 (average): 0.41448531976445974
- accuracy: 0.636904776096344

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1056631803512573
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0125389099121094
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9033064083619551 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 74!
Accuracy on dataset of size 672: 61.60714340209961 %.
Average loss: 0.9209303693337874
proportion of labels in prediction: [tensor(0.8765), tensor(0.0551), tensor(0.0685)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.75992063 0.11764706 0.26845638]
- f1 (average): 0.3820080231943635
- accuracy: 0.6160714030265808


### Max pooled

In [98]:
path_history = obtain_mean_history(pooled_max_pretrained, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1084880828857422
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7761728763580322
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8622845031998374 || Accuracy: 0.6255571842193604 || F1-score: 0.27174123669475575
Early stopping at epoch 65!
Accuracy on dataset of size 672: 70.98213958740234 %.
Average loss: 0.7004730430516329
proportion of labels in prediction: [tensor(0.7039), tensor(0.2143), tensor(0.0818)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80493274 0.55102041 0.46835443]
- f1 (average): 0.6081025246563404
- accuracy: 0.7098214030265808

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0752454996109009
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8784411549568176
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8661506284366954 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 53!
Accuracy on dataset of size 672: 69.94047546386719 %.
Average loss: 0.7430935881354592
proportion of labels in prediction: [tensor(0.6473), tensor(0.2798), tensor(0.0729)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79625293 0.56213018 0.46052632]
- f1 (average): 0.6063031402349117
- accuracy: 0.699404776096344

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0746275186538696
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8988773822784424
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8991819565946405 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 42!
Accuracy on dataset of size 672: 68.75 %.
Average loss: 0.7781992012804205
proportion of labels in prediction: [tensor(0.6696), tensor(0.2887), tensor(0.0417)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.8009206  0.54651163 0.30534351]
- f1 (average): 0.5509252459154371
- accuracy: 0.6875

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0943225622177124
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0057708024978638
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9081212932413275 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 69!
Accuracy on dataset of size 672: 71.2797622680664 %.
Average loss: 0.7408329302614386
proportion of labels in prediction: [tensor(0.6801), tensor(0.2068), tensor(0.1131)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80593607 0.55363322 0.51396648]
- f1 (average): 0.6245119238331226
- accuracy: 0.7127976417541504

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0788952112197876
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0040643215179443
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9158212109045549 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 77!
Accuracy on dataset of size 672: 71.875 %.
Average loss: 0.7152866341850974
proportion of labels in prediction: [tensor(0.6964), tensor(0.2039), tensor(0.0997)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80947012 0.57142857 0.49411765]
- f1 (average): 0.6250054475003078
- accuracy: 0.71875


In [99]:
path_history = obtain_mean_history(pooled_max_pretrained, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0947773456573486
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0492063760757446
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9014952128583734 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 43!
Accuracy on dataset of size 672: 61.60714340209961 %.
Average loss: 0.905403272672133
proportion of labels in prediction: [tensor(0.9866), tensor(0.0134), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76155268 0.02515723 0.        ]
- f1 (average): 0.26223663764207134
- accuracy: 0.6160714030265808

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1085178852081299
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9874736070632935
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8997597694396973 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 55!
Accuracy on dataset of size 672: 61.904762268066406 %.
Average loss: 0.9084995226426558
proportion of labels in prediction: [tensor(0.9464), tensor(0.0491), tensor(0.0045)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76208531 0.12021858 0.05660377]
- f1 (average): 0.3129692202922501
- accuracy: 0.6190476417541504

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.099276065826416
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9842522144317627
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9089005426927046 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 56!
Accuracy on dataset of size 672: 62.05356979370117 %.
Average loss: 0.9172859354452654
proportion of labels in prediction: [tensor(0.9107), tensor(0.0729), tensor(0.0164)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76042677 0.17085427 0.14035088]
- f1 (average): 0.3572106395586192
- accuracy: 0.6205357313156128

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0751796960830688
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8894482254981995
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9061743021011353 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 94!
Accuracy on dataset of size 672: 62.5 %.
Average loss: 0.9255607344887473
proportion of labels in prediction: [tensor(0.9643), tensor(0.0015), tensor(0.0342)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76663543 0.         0.17460317]
- f1 (average): 0.31374620034413847
- accuracy: 0.625

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1056452989578247
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0734694004058838
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9058432199738242 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 68!
Accuracy on dataset of size 672: 61.755950927734375 %.
Average loss: 0.9206284392963756
proportion of labels in prediction: [tensor(0.9643), tensor(0.0357), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76663543 0.06896552 0.        ]
- f1 (average): 0.27853364789020674
- accuracy: 0.617559552192688


### Sum pooled

In [100]:
path_history = obtain_mean_history(pooled_sum_pretrained, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.727465033531189
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.6842592358589172
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8152492263100364 || Accuracy: 0.6508172154426575 || F1-score: 0.45380069492888103
Early stopping at epoch 43!
Accuracy on dataset of size 672: 71.875 %.
Average loss: 0.8815002387220209
proportion of labels in prediction: [tensor(0.7054), tensor(0.2128), tensor(0.0818)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81522956 0.5665529  0.4556962 ]
- f1 (average): 0.6124928889418044
- accuracy: 0.71875

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1402323246002197
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7112229466438293
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8173842213370583 || Accuracy: 0.6448736786842346 || F1-score: 0.4025425170419203
Early stopping at epoch 39!
Accuracy on dataset of size 672: 71.2797622680664 %.
Average loss: 0.9226987957954407
proportion of labels in prediction: [tensor(0.6920), tensor(0.2143), tensor(0.0938)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80542986 0.55782313 0.4939759 ]
- f1 (average): 0.6190762990398507
- accuracy: 0.7127976417541504

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0828022956848145
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8738025426864624
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8608328808437694 || Accuracy: 0.6344724893569946 || F1-score: 0.3448035645816642
Early stopping at epoch 35!
Accuracy on dataset of size 672: 71.2797622680664 %.
Average loss: 0.84981154311787
proportion of labels in prediction: [tensor(0.6860), tensor(0.2247), tensor(0.0893)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80681818 0.57142857 0.46625767]
- f1 (average): 0.6148348073194699
- accuracy: 0.7127976417541504

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.094190001487732
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9194858074188232
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8628855564377524 || Accuracy: 0.6181277632713318 || F1-score: 0.25466789103152737
Early stopping at epoch 27!
Accuracy on dataset of size 672: 72.02381134033203 %.
Average loss: 0.8458069942214272
proportion of labels in prediction: [tensor(0.6533), tensor(0.2485), tensor(0.0982)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81118881 0.5615142  0.55621302]
- f1 (average): 0.642972008174629
- accuracy: 0.7202380895614624

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0835667848587036
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9686885476112366
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9038537090474908 || Accuracy: 0.6196136474609375 || F1-score: 0.2803511448714408
Early stopping at epoch 33!
Accuracy on dataset of size 672: 72.32142639160156 %.
Average loss: 0.8853492032397877
proportion of labels in prediction: [tensor(0.6786), tensor(0.2247), tensor(0.0967)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.816      0.55813953 0.53571429]
- f1 (average): 0.6366179401993355
- accuracy: 0.7232142686843872


In [101]:
path_history = obtain_mean_history(pooled_sum_pretrained, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.5255906581878662
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.133524775505066
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8961977579376914 || Accuracy: 0.6210995316505432 || F1-score: 0.25951624835194914
Early stopping at epoch 59!
Accuracy on dataset of size 672: 66.51786041259766 %.
Average loss: 0.9034843119707975
proportion of labels in prediction: [tensor(0.7589), tensor(0.1920), tensor(0.0491)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76856835 0.45878136 0.38235294]
- f1 (average): 0.5365675520838179
- accuracy: 0.6651785969734192

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0804401636123657
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0644162893295288
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9117570194331083 || Accuracy: 0.6210995316505432 || F1-score: 0.25542315918117936
Early stopping at epoch 51!
Accuracy on dataset of size 672: 66.2202377319336 %.
Average loss: 0.8689939108761874
proportion of labels in prediction: [tensor(0.8006), tensor(0.1503), tensor(0.0491)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76907001 0.43027888 0.33823529]
- f1 (average): 0.5125280630097064
- accuracy: 0.6622023582458496

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0874210596084595
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9809868931770325
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9239951751448892 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 41!
Accuracy on dataset of size 672: 65.17857360839844 %.
Average loss: 0.9270960688591003
proportion of labels in prediction: [tensor(0.7396), tensor(0.1875), tensor(0.0729)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.75982533 0.46376812 0.34210526]
- f1 (average): 0.5218995688702802
- accuracy: 0.6517857313156128

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0957541465759277
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0000782012939453
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.941019968553023 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 52!
Accuracy on dataset of size 672: 66.36904907226562 %.
Average loss: 0.9391943487254056
proportion of labels in prediction: [tensor(0.7723), tensor(0.1637), tensor(0.0640)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77185501 0.41538462 0.4109589 ]
- f1 (average): 0.5327328433850617
- accuracy: 0.663690447807312

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1101337671279907
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0370057821273804
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9489994157444347 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 48!
Accuracy on dataset of size 672: 64.58333587646484 %.
Average loss: 0.9552247036587108
proportion of labels in prediction: [tensor(0.7411), tensor(0.1890), tensor(0.0699)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.7568157  0.44043321 0.34666667]
- f1 (average): 0.5146385276812152
- accuracy: 0.6458333134651184


### CLS

In [102]:
path_history = obtain_mean_history(pooled_cls_pretrained, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.099766492843628
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7809126377105713
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8579818931492892 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 61!
Accuracy on dataset of size 672: 72.32142639160156 %.
Average loss: 0.6975206190889532
proportion of labels in prediction: [tensor(0.6979), tensor(0.2128), tensor(0.0893)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82207207 0.55972696 0.47852761]
- f1 (average): 0.6201088806304578
- accuracy: 0.7232142686843872

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0808454751968384
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8400460481643677
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8549019748514349 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 40!
Accuracy on dataset of size 672: 72.61904907226562 %.
Average loss: 0.6978309317068621
proportion of labels in prediction: [tensor(0.7485), tensor(0.1533), tensor(0.0982)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82429501 0.52173913 0.49704142]
- f1 (average): 0.614358520466371
- accuracy: 0.726190447807312

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0731648206710815
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8906587362289429
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8822337497364391 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 56!
Accuracy on dataset of size 672: 72.61904907226562 %.
Average loss: 0.6981977766210382
proportion of labels in prediction: [tensor(0.6979), tensor(0.2068), tensor(0.0952)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81756757 0.56747405 0.51497006]
- f1 (average): 0.6333372252969044
- accuracy: 0.726190447807312

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0950363874435425
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.977047324180603
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8841557882048867 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 49!
Accuracy on dataset of size 672: 71.875 %.
Average loss: 0.7547444159334357
proportion of labels in prediction: [tensor(0.6801), tensor(0.2158), tensor(0.1042)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81278539 0.53559322 0.55491329]
- f1 (average): 0.6344306344215082
- accuracy: 0.71875

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0791399478912354
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0230438709259033
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8975516286763278 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 49!
Accuracy on dataset of size 672: 73.66071319580078 %.
Average loss: 0.7267429070039229
proportion of labels in prediction: [tensor(0.6905), tensor(0.2113), tensor(0.0982)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82899207 0.57534247 0.53254438]
- f1 (average): 0.6456263056439435
- accuracy: 0.7366071343421936


In [103]:
path_history = obtain_mean_history(pooled_cls_pretrained, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: [100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0739248991012573
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9890086650848389
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9000857418233698 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 28!
Accuracy on dataset of size 672: 62.2023811340332 %.
Average loss: 0.9152315746654164
proportion of labels in prediction: [tensor(0.9955), tensor(0.0045), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76654412 0.0130719  0.        ]
- f1 (average): 0.25987200435729846
- accuracy: 0.6220238208770752

********** hidden_dim: [100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.116005539894104
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9669814705848694
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.8972187475724653 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 70!
Accuracy on dataset of size 672: 62.64881134033203 %.
Average loss: 0.8904514421116222
proportion of labels in prediction: [tensor(0.8705), tensor(0.1057), tensor(0.0238)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76095618 0.26244344 0.16806723]
- f1 (average): 0.3971556137011962
- accuracy: 0.6264880895614624

********** hidden_dim: [100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.100395917892456
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9996486306190491
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9076314135031267 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 56!
Accuracy on dataset of size 672: 62.64881134033203 %.
Average loss: 0.9038712111386386
proportion of labels in prediction: [tensor(0.7768), tensor(0.1830), tensor(0.0402)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.75451647 0.35164835 0.27692308]
- f1 (average): 0.4610293001366328
- accuracy: 0.6264880895614624

********** hidden_dim: [100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0736373662948608
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8722844123840332
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9062130667946555 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 72!
Accuracy on dataset of size 672: 63.244049072265625 %.
Average loss: 0.900691958990964
proportion of labels in prediction: [tensor(0.8289), tensor(0.1176), tensor(0.0536)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76434426 0.26200873 0.31654676]
- f1 (average): 0.44763325283648814
- accuracy: 0.632440447807312

********** hidden_dim: [100, 100, 100, 100, 100]


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1049774885177612
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0856366157531738
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9055847525596619 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 59!
Accuracy on dataset of size 672: 61.60714340209961 %.
Average loss: 0.8962898362766613
proportion of labels in prediction: [tensor(0.8988), tensor(0.0506), tensor(0.0506)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76441838 0.06521739 0.24817518]
- f1 (average): 0.3592703170359009
- accuracy: 0.6160714030265808


## Fine-tuned BERT

### Mean pooled

In [175]:
path_history = obtain_mean_history(pooled_mean, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

NameError: name 'pooled_mean' is not defined

In [None]:
path_history = obtain_mean_history(pooled_mean, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### Max pooled

In [None]:
path_history = obtain_mean_history(pooled_max, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

In [None]:
path_history = obtain_mean_history(pooled_max, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### Sum pooled

In [None]:
path_history = obtain_mean_history(pooled_sum, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

In [None]:
path_history = obtain_mean_history(pooled_sum, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### CLS

In [None]:
path_history = obtain_mean_history(pooled_cls, path_specifics)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

In [None]:
path_history = obtain_mean_history(pooled_cls, path_specifics, concatenate_current=False)
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=path_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

# Baseline 3: LSTM classification

# Baseline 4: FFN using signatures

First, we dimension reduce these and then take signatures. We use the path signature as input to the FFN for classification.

In [104]:
path_specifics = {"pad_by": "history",
                  "zero_padding": False,
                  "method": "k_last",
                  "k": 5,
                  "time_feature": None,
                  "embeddings": "dim_reduced",
                  "include_current_embedding": True}

In [114]:
def obtain_signatures_history(embeddings, path_specifics, dimension, sig_depth, concatenate_current=True):
    # dimension reduce
    reduction = nlpsig.DimReduce(method="gaussian_random_projection", n_components=dimension)
    # reduction = nlpsig.DimReduce(method="umap", n_components=dimension)
    embeddings_reduced = reduction.fit_transform(embeddings, random_state=seed)
    
    paths = nlpsig.PrepareData(anno_mi,
                               id_column="transcript_id",
                               label_column="client_talk_type",
                               embeddings=embeddings,
                               embeddings_reduced=embeddings_reduced)
    path = paths.pad(**path_specifics)
    # remove last two columns (which contains the id and the label)
    path = path[client_index][:,:,:-2].astype("float")
    
    # convert to torch tensor to compute signature using signatory
    path = torch.from_numpy(path).float()
    sig = signatory.signature(path, sig_depth).float()
    
    # concatenate with current embedding
    if concatenate_current:
        sig = torch.cat([sig, torch.tensor(embeddings[client_index])], dim=1)

    return sig

In [115]:
dimension = 5
sig_depth = 4

## SBERT 768

In [116]:
signature_history = obtain_signatures_history(sbert_768_embeddings, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 1548])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1171363592147827
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.053240180015564
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0347932902249424 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 40!
Accuracy on dataset of size 672: 67.11309814453125 %.
Average loss: 0.7697976990179582
proportion of labels in prediction: [tensor(0.6920), tensor(0.2143), tensor(0.0938)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.78054299 0.50340136 0.38554217]
- f1 (average): 0.5564955052147519
- accuracy: 0.6711309552192688

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1026197671890259
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0218136310577393
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9916267178275369 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 36!
Accuracy on dataset of size 672: 65.47618865966797 %.
Average loss: 0.8570250272750854
proportion of labels in prediction: [tensor(0.6786), tensor(0.2143), tensor(0.1071)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.768      0.4829932  0.37714286]
- f1 (average): 0.5427120181405897
- accuracy: 0.6547619104385376

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0876795053482056
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0153101682662964
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9809776111082598 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 34!
Accuracy on dataset of size 672: 64.58333587646484 %.
Average loss: 0.9729593179442666
proportion of labels in prediction: [tensor(0.6622), tensor(0.2247), tensor(0.1131)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76388889 0.47840532 0.3575419 ]
- f1 (average): 0.5332787013149493
- accuracy: 0.6458333134651184

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0767967700958252
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.02414870262146
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9755970781499689 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 26!
Accuracy on dataset of size 672: 62.05356979370117 %.
Average loss: 0.9041527916084636
proportion of labels in prediction: [tensor(0.6443), tensor(0.3557), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77230047 0.45244216 0.        ]
- f1 (average): 0.4082475429555337
- accuracy: 0.6205357313156128

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0699200630187988
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9752826690673828
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9633186188611117 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 25!
Accuracy on dataset of size 672: 61.904762268066406 %.
Average loss: 0.9764535427093506
proportion of labels in prediction: [tensor(0.6429), tensor(0.3571), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76850764 0.45641026 0.        ]
- f1 (average): 0.4083059648277039
- accuracy: 0.6190476417541504


In [120]:
signature_history = obtain_signatures_history(sbert_768_embeddings,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1296181678771973
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.106845498085022
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0863242040980945 || Accuracy: 0.49034175276756287 || F1-score: 0.27059271026782716
Early stopping at epoch 12!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9313619353554465
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- accuracy: 0.6235119104385376

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1389586925506592
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.082794189453125
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0830620310523293 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 26!
Accuracy on dataset of size 672: 61.904762268066406 %.
Average loss: 0.9399037523703142
proportion of labels in prediction: [tensor(0.9881), tensor(0.0119), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76638966 0.01265823 0.        ]
- f1 (average): 0.25968262873483955
- accuracy: 0.6190476417541504

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0805878639221191
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0350637435913086
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0200353373180737 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 35!
Accuracy on dataset of size 672: 62.2023811340332 %.
Average loss: 0.9935095581141385
proportion of labels in prediction: [tensor(0.9821), tensor(0.0179), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.7673772  0.04938272 0.        ]
- f1 (average): 0.2722533057205079
- accuracy: 0.6220238208770752

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0874937772750854
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0431901216506958
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0160185857252642 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8067488074302673
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.42302054166793823
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.12337866154584 || Accuracy: 0.570579469203949 || F1-score: 0.2948740694523326
Early stopping at epoch 102!
Accuracy on dataset of size 672: 57.58928680419922 %.
Average loss: 1.1595185507427563
proportion of labels in prediction: [tensor(0.8914), tensor(0.0729), tensor(0.0357)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.7367387  0.08040201 0.06299213]
- f1 (average): 0.2933776131

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0989593267440796
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0831507444381714
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0347781506451694 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8126439452171326
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.9172384142875671
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.0070092190395703 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 201/10000 || Item: 0/85 || Loss: 0.6922479271888733
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.134027123451233
--------------------------------------------------
Validation || Epoch: 201 || Loss: 1.0152975104071877 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 301/10000 

Epoch: 2501/10000 || Item: 0/85 || Loss: 0.8530833721160889
--------------------------------------------------
##### Epoch: 2501/10000 || Loss: 0.39883291721343994
--------------------------------------------------
Validation || Epoch: 2501 || Loss: 1.0210722034627742 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 2601/10000 || Item: 0/85 || Loss: 0.7698783874511719
--------------------------------------------------
##### Epoch: 2601/10000 || Loss: 0.5527031421661377
--------------------------------------------------
Validation || Epoch: 2601 || Loss: 1.0221493352543225 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 2701/10000 || Item: 0/85 || Loss: 0.8497977256774902
--------------------------------------------------
##### Epoch: 2701/10000 || Loss: 0.745418131351471
--------------------------------------------------
Validation || Epoch: 2701 || Loss: 1.045716865496202 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Ep

Epoch: 5001/10000 || Item: 0/85 || Loss: 0.7900695204734802
--------------------------------------------------
##### Epoch: 5001/10000 || Loss: 1.849651575088501
--------------------------------------------------
Validation || Epoch: 5001 || Loss: 1.0109401995485479 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 5101/10000 || Item: 0/85 || Loss: 0.7636962532997131
--------------------------------------------------
##### Epoch: 5101/10000 || Loss: 0.452621191740036
--------------------------------------------------
Validation || Epoch: 5101 || Loss: 1.0351335406303406 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 5201/10000 || Item: 0/85 || Loss: 0.8651379346847534
--------------------------------------------------
##### Epoch: 5201/10000 || Loss: 0.9647705554962158
--------------------------------------------------
Validation || Epoch: 5201 || Loss: 1.014183147387071 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoc

Epoch: 7501/10000 || Item: 0/85 || Loss: 0.7741328477859497
--------------------------------------------------
##### Epoch: 7501/10000 || Loss: 0.941972553730011
--------------------------------------------------
Validation || Epoch: 7501 || Loss: 1.0525735020637512 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 7601/10000 || Item: 0/85 || Loss: 0.935082197189331
--------------------------------------------------
##### Epoch: 7601/10000 || Loss: 0.6530162692070007
--------------------------------------------------
Validation || Epoch: 7601 || Loss: 1.0332807573405178 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 7701/10000 || Item: 0/85 || Loss: 0.7856796383857727
--------------------------------------------------
##### Epoch: 7701/10000 || Loss: 1.0814590454101562
--------------------------------------------------
Validation || Epoch: 7701 || Loss: 1.0234933441335505 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epo

Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 1.0316040895201943
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- accuracy: 0.6235119104385376


## SBERT 384

In [121]:
signature_history = obtain_signatures_history(sbert_384_embeddings, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 1164])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0521900653839111
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9816925525665283
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9814600131728433 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 33!
Accuracy on dataset of size 672: 67.55952453613281 %.
Average loss: 0.7989161989905618
proportion of labels in prediction: [tensor(0.7098), tensor(0.1920), tensor(0.0982)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79241071 0.48028674 0.37869822]
- f1 (average): 0.5504652258296799
- accuracy: 0.675595223903656

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.093984603881836
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.052046537399292
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0197868834842334 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 65!
Accuracy on dataset of size 672: 66.36904907226562 %.
Average loss: 0.8858089717951688
proportion of labels in prediction: [tensor(0.6905), tensor(0.2039), tensor(0.1057)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77689694 0.48083624 0.3908046 ]
- f1 (average): 0.5495125922924343
- accuracy: 0.663690447807312

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1012998819351196
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0739198923110962
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0168006257577376 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 41!
Accuracy on dataset of size 672: 62.94643020629883 %.
Average loss: 0.9017694700847972
proportion of labels in prediction: [tensor(0.6949), tensor(0.3051), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.78329571 0.42816901 0.        ]
- f1 (average): 0.40382157504848504
- accuracy: 0.6294642686843872

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1113770008087158
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0579935312271118
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.030761794610457 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 25!
Accuracy on dataset of size 672: 61.30952453613281 %.
Average loss: 0.9385421655394814
proportion of labels in prediction: [tensor(0.6741), tensor(0.3259), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.7706422  0.41192412 0.        ]
- f1 (average): 0.3941887736920182
- accuracy: 0.613095223903656

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0995882749557495
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0875303745269775
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0160398049788042 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 35!
Accuracy on dataset of size 672: 61.755950927734375 %.
Average loss: 1.000803914937106
proportion of labels in prediction: [tensor(0.6637), tensor(0.3363), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.77456647 0.42553191 0.        ]
- f1 (average): 0.40003279629401883
- accuracy: 0.617559552192688


In [122]:
signature_history = obtain_signatures_history(sbert_384_embeddings,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1300742626190186
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1073238849639893
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0831208662553267 || Accuracy: 0.5156017541885376 || F1-score: 0.284446079101693
Early stopping at epoch 12!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9457747556946494
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- accuracy: 0.6235119104385376

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1388251781463623
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0828475952148438
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0809132727709683 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 41!
Accuracy on dataset of size 672: 59.970237731933594 %.
Average loss: 1.0203760428862139
proportion of labels in prediction: [tensor(0.9598), tensor(0.0387), tensor(0.0015)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.75       0.04545455 0.        ]
- f1 (average): 0.2651515151515152
- accuracy: 0.5997023582458496

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0802912712097168
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0371514558792114
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0190406333316455 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8518695831298828
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.5540586709976196
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.9600368413058195 || Accuracy: 0.6181277632713318 || F1-score: 0.2816021054363023
Epoch: 201/10000 || Item: 0/85 || Loss: 0.8303055763244629
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.33418047428131104
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.9751427390358665 || Accuracy: 0.6181277632713318 || F1-score: 0.2816021054363023
Epoch: 301/10000 

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.087432861328125
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.043636679649353
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0156650976701216 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7902301549911499
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.42113643884658813
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.0998676256699995 || Accuracy: 0.5750371217727661 || F1-score: 0.30910314332554906
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7233343720436096
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.1722097396850586
--------------------------------------------------
Validation || Epoch: 201 || Loss: 1.0915379144928672 || Accuracy: 0.5750371217727661 || F1-score: 0.30910314332554906
Epoch: 301/10000 

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0990127325057983
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.083052635192871
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0344014926390215 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 41!
Accuracy on dataset of size 672: 57.886905670166016 %.
Average loss: 1.2762420014901594
proportion of labels in prediction: [tensor(0.8824), tensor(0.0982), tensor(0.0193)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.72924901 0.16666667 0.03448276]
- f1 (average): 0.3101328123816879
- accuracy: 0.5788690447807312


## Pretrained BERT

### Mean pooled

In [123]:
signature_history = obtain_signatures_history(pooled_mean_pretrained, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 1548])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 6.673676013946533
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8129488229751587
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.8461785208095203 || Accuracy: 0.5720653533935547 || F1-score: 0.3435318842984348
Early stopping at epoch 30!
Accuracy on dataset of size 672: 55.9523811340332 %.
Average loss: 1.7869080738587813
proportion of labels in prediction: [tensor(0.8006), tensor(0.1518), tensor(0.0476)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71891327 0.21428571 0.07407407]
- f1 (average): 0.335757686332399
- accuracy: 0.5595238208770752

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.5531699657440186
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 2.798264741897583
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0943772250955754 || Accuracy: 0.5824665427207947 || F1-score: 0.371300486514424
Early stopping at epoch 12!
Accuracy on dataset of size 672: 60.863094329833984 %.
Average loss: 1.1093834096735173
proportion of labels in prediction: [tensor(0.8810), tensor(0.0923), tensor(0.0268)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.75568744 0.20754717 0.08264463]
- f1 (average): 0.3486264120301714
- accuracy: 0.6086309552192688

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1850405931472778
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8447656631469727
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.035943871194666 || Accuracy: 0.6136701107025146 || F1-score: 0.26448658301845257
Early stopping at epoch 29!
Accuracy on dataset of size 672: 58.48214340209961 %.
Average loss: 1.5035460103641858
proportion of labels in prediction: [tensor(0.8006), tensor(0.1711), tensor(0.0283)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.74190178 0.24150943 0.09836066]
- f1 (average): 0.36059062202816805
- accuracy: 0.5848214030265808

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0213910341262817
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7883847951889038
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.986276084726507 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 27!
Accuracy on dataset of size 672: 54.31547546386719 %.
Average loss: 1.6418841318650679
proportion of labels in prediction: [tensor(0.7143), tensor(0.2664), tensor(0.0193)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.70300334 0.27355623 0.06896552]
- f1 (average): 0.34850836176185856
- accuracy: 0.543154776096344

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0394494533538818
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8005906939506531
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9860641251910817 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 29!
Accuracy on dataset of size 672: 56.39881134033203 %.
Average loss: 1.7352937134829434
proportion of labels in prediction: [tensor(0.7366), tensor(0.2083), tensor(0.0551)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71772429 0.28275862 0.14285714]
- f1 (average): 0.3811133507956869
- accuracy: 0.5639880895614624


In [124]:
signature_history = obtain_signatures_history(pooled_mean_pretrained,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 3.7441978454589844
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 4.3166351318359375
--------------------------------------------------
Validation || Epoch: 1 || Loss: 2.4711484258825127 || Accuracy: 0.4041604697704315 || F1-score: 0.3118328262267227
Early stopping at epoch 23!
Accuracy on dataset of size 672: 55.05952453613281 %.
Average loss: 1.7590262673117898
proportion of labels in prediction: [tensor(0.8452), tensor(0.1071), tensor(0.0476)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71529889 0.10810811 0.07407407]
- f1 (average): 0.2991603558979446
- accuracy: 0.550595223903656

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 2.1577088832855225
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.2402211427688599
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.284072149883617 || Accuracy: 0.4710252583026886 || F1-score: 0.32294703244466244
Early stopping at epoch 15!
Accuracy on dataset of size 672: 59.375 %.
Average loss: 1.221100395376032
proportion of labels in prediction: [tensor(0.9360), tensor(0.0491), tensor(0.0149)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.74427481 0.08743169 0.01769912]
- f1 (average): 0.2831352060645414
- accuracy: 0.59375

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.4558333158493042
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7245966792106628
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0402219241315669 || Accuracy: 0.601783037185669 || F1-score: 0.2927995350219556
Early stopping at epoch 50!
Accuracy on dataset of size 672: 57.886905670166016 %.
Average loss: 1.361782810904763
proportion of labels in prediction: [tensor(0.8720), tensor(0.0938), tensor(0.0342)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.73631841 0.12206573 0.0952381 ]
- f1 (average): 0.3178740769659416
- accuracy: 0.5788690447807312

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1106586456298828
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0407090187072754
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.006961995905096 || Accuracy: 0.6210995316505432 || F1-score: 0.25542315918117936
Early stopping at epoch 27!
Accuracy on dataset of size 672: 55.654762268066406 %.
Average loss: 1.5424025925722988
proportion of labels in prediction: [tensor(0.8333), tensor(0.1205), tensor(0.0461)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71910112 0.17316017 0.02985075]
- f1 (average): 0.3073706810081118
- accuracy: 0.5565476417541504

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0596250295639038
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8599212765693665
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0026457201350818 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 27!
Accuracy on dataset of size 672: 54.01785659790039 %.
Average loss: 1.7785337079655041
proportion of labels in prediction: [tensor(0.8110), tensor(0.1533), tensor(0.0357)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.70746888 0.15810277 0.03149606]
- f1 (average): 0.2990225698195315
- accuracy: 0.5401785969734192


### Max pooled

In [125]:
signature_history = obtain_signatures_history(pooled_max_pretrained, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 1548])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 29.369617462158203
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 7.37844181060791
--------------------------------------------------
Validation || Epoch: 1 || Loss: 5.658988670869307 || Accuracy: 0.5408617854118347 || F1-score: 0.33396321640666926
Early stopping at epoch 28!
Accuracy on dataset of size 672: 55.505950927734375 %.
Average loss: 3.3138548894362017
proportion of labels in prediction: [tensor(0.7723), tensor(0.1756), tensor(0.0521)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.70362473 0.25373134 0.13043478]
- f1 (average): 0.36259695312258583
- accuracy: 0.555059552192688

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 6.738647937774658
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 3.3066258430480957
--------------------------------------------------
Validation || Epoch: 1 || Loss: 2.2583379203622993 || Accuracy: 0.508172333240509 || F1-score: 0.3325111038910033
Early stopping at epoch 49!
Accuracy on dataset of size 672: 56.39881134033203 %.
Average loss: 1.5961687889966099
proportion of labels in prediction: [tensor(0.8006), tensor(0.1607), tensor(0.0387)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71891327 0.2248062  0.09302326]
- f1 (average): 0.34558090933391655
- accuracy: 0.5639880895614624

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 3.146319627761841
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1189898252487183
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.190718260678378 || Accuracy: 0.5661218166351318 || F1-score: 0.3281647443478431
Early stopping at epoch 12!
Accuracy on dataset of size 672: 59.375 %.
Average loss: 1.1160890730944546
proportion of labels in prediction: [tensor(0.9330), tensor(0.0595), tensor(0.0074)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.7456979  0.09473684 0.        ]
- f1 (average): 0.2801449129515951
- accuracy: 0.59375

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.9987678527832031
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.6346924304962158
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0325498147444292 || Accuracy: 0.601783037185669 || F1-score: 0.2609936055090999
Early stopping at epoch 39!
Accuracy on dataset of size 672: 58.48214340209961 %.
Average loss: 1.368440563028509
proportion of labels in prediction: [tensor(0.8244), tensor(0.1503), tensor(0.0253)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.73997945 0.23904382 0.05      ]
- f1 (average): 0.34300775657220384
- accuracy: 0.5848214030265808

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.020685076713562
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8861537575721741
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9861424511129205 || Accuracy: 0.6240713000297546 || F1-score: 0.2598877839178142
Early stopping at epoch 27!
Accuracy on dataset of size 672: 54.91071319580078 %.
Average loss: 1.835107304833152
proportion of labels in prediction: [tensor(0.7679), tensor(0.1860), tensor(0.0461)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71657754 0.20363636 0.08955224]
- f1 (average): 0.3365887141830952
- accuracy: 0.5491071343421936


In [126]:
signature_history = obtain_signatures_history(pooled_max_pretrained,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 21.10601234436035
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 7.336426734924316
--------------------------------------------------
Validation || Epoch: 1 || Loss: 8.051020622253418 || Accuracy: 0.40861812233924866 || F1-score: 0.29812342336125414
Early stopping at epoch 15!
Accuracy on dataset of size 672: 55.505950927734375 %.
Average loss: 3.8297226320613516
proportion of labels in prediction: [tensor(0.7753), tensor(0.1607), tensor(0.0640)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.70851064 0.23255814 0.1369863 ]
- f1 (average): 0.3593516930675397
- accuracy: 0.555059552192688

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 8.153376579284668
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 4.407326698303223
--------------------------------------------------
Validation || Epoch: 1 || Loss: 2.7375375032424927 || Accuracy: 0.4234769642353058 || F1-score: 0.30676960211418874
Early stopping at epoch 16!
Accuracy on dataset of size 672: 55.80356979370117 %.
Average loss: 1.5238312591205945
proportion of labels in prediction: [tensor(0.8393), tensor(0.1116), tensor(0.0491)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71617497 0.16       0.07352941]
- f1 (average): 0.31656812877745194
- accuracy: 0.5580357313156128

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 3.3812642097473145
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1157633066177368
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.3977787061171099 || Accuracy: 0.5690935850143433 || F1-score: 0.3374554877625502
Early stopping at epoch 12!
Accuracy on dataset of size 672: 58.779762268066406 %.
Average loss: 1.1003501631996848
proportion of labels in prediction: [tensor(0.9301), tensor(0.0476), tensor(0.0223)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.74137931 0.07692308 0.01694915]
- f1 (average): 0.27841717993675913
- accuracy: 0.5877976417541504

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.5172810554504395
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.377053141593933
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0480764779177578 || Accuracy: 0.6077265739440918 || F1-score: 0.2737765736092296
Early stopping at epoch 12!
Accuracy on dataset of size 672: 59.6726188659668 %.
Average loss: 1.0093801888552578
proportion of labels in prediction: [tensor(0.9628), tensor(0.0283), tensor(0.0089)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.74859287 0.01183432 0.01834862]
- f1 (average): 0.25959193797464275
- accuracy: 0.5967261791229248

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.110856533050537
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9218065738677979
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0166223970326511 || Accuracy: 0.6196136474609375 || F1-score: 0.255514705882353
Early stopping at epoch 33!
Accuracy on dataset of size 672: 58.779762268066406 %.
Average loss: 1.4010108384219082
proportion of labels in prediction: [tensor(0.8631), tensor(0.1086), tensor(0.0283)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.73473473 0.20627803 0.08196721]
- f1 (average): 0.3409933249184394
- accuracy: 0.5877976417541504


### Sum pooled

In [127]:
signature_history = obtain_signatures_history(pooled_sum_pretrained, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 1548])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 10158999.0
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 826402.75
--------------------------------------------------
Validation || Epoch: 1 || Loss: 3137068.1420454546 || Accuracy: 0.4799405634403229 || F1-score: 0.35294660410939477
Early stopping at epoch 28!
Accuracy on dataset of size 672: 51.04166793823242 %.
Average loss: 2561722.2400568184
proportion of labels in prediction: [tensor(0.7381), tensor(0.1949), tensor(0.0670)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.66448087 0.22064057 0.10810811]
- f1 (average): 0.33107651727335524
- accuracy: 0.5104166865348816

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 10571232.0
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1005705.8125
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1273091.6903409092 || Accuracy: 0.4814264476299286 || F1-score: 0.3493515365134195
Early stopping at epoch 34!
Accuracy on dataset of size 672: 50.89285659790039 %.
Average loss: 705194.6590909091
proportion of labels in prediction: [tensor(0.6458), tensor(0.2693), tensor(0.0848)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.66588511 0.26586103 0.175     ]
- f1 (average): 0.36891537952065395
- accuracy: 0.5089285969734192

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 4063484.0
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 236304.796875
--------------------------------------------------
Validation || Epoch: 1 || Loss: 255779.0028409091 || Accuracy: 0.4739970266819 || F1-score: 0.3305498948890851
Early stopping at epoch 35!
Accuracy on dataset of size 672: 52.97618865966797 %.
Average loss: 217720.61115056818
proportion of labels in prediction: [tensor(0.7500), tensor(0.1935), tensor(0.0565)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.68472373 0.22857143 0.11347518]
- f1 (average): 0.34225677761788037
- accuracy: 0.5297619104385376

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 2699980.5
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 10819.095703125
--------------------------------------------------
Validation || Epoch: 1 || Loss: 154303.06516335226 || Accuracy: 0.539375901222229 || F1-score: 0.2983546252699524
Early stopping at epoch 34!
Accuracy on dataset of size 672: 50.89285659790039 %.
Average loss: 55088.51191850142
proportion of labels in prediction: [tensor(0.6845), tensor(0.2589), tensor(0.0565)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.65756542 0.27777778 0.11347518]
- f1 (average): 0.3496061234424462
- accuracy: 0.5089285969734192

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 761477.125
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 19553.0
--------------------------------------------------
Validation || Epoch: 1 || Loss: 35425.541015625 || Accuracy: 0.4457652270793915 || F1-score: 0.3175629083374602
Early stopping at epoch 13!
Accuracy on dataset of size 672: 54.16666793823242 %.
Average loss: 19257.948508522728
proportion of labels in prediction: [tensor(0.7812), tensor(0.1414), tensor(0.0774)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.70550847 0.16326531 0.14193548]
- f1 (average): 0.33690308818989595
- accuracy: 0.5416666865348816


In [128]:
signature_history = obtain_signatures_history(pooled_sum_pretrained,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 24631632.0
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1984559.375
--------------------------------------------------
Validation || Epoch: 1 || Loss: 3590062.5795454546 || Accuracy: 0.3298662602901459 || F1-score: 0.29204124151612504
Early stopping at epoch 36!
Accuracy on dataset of size 672: 53.125 %.
Average loss: 2319916.460227273
proportion of labels in prediction: [tensor(0.7530), tensor(0.1667), tensor(0.0804)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.68756757 0.22137405 0.12738854]
- f1 (average): 0.34544338280031384
- accuracy: 0.53125

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 4012826.75
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1179324.75
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1133526.5369318181 || Accuracy: 0.46656760573387146 || F1-score: 0.3453807718135437
Early stopping at epoch 29!
Accuracy on dataset of size 672: 55.35714340209961 %.
Average loss: 682893.0625
proportion of labels in prediction: [tensor(0.7589), tensor(0.1295), tensor(0.1116)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71474704 0.21940928 0.15730337]
- f1 (average): 0.36381989777157026
- accuracy: 0.5535714030265808

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 4776060.5
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 455326.0
--------------------------------------------------
Validation || Epoch: 1 || Loss: 530180.6803977273 || Accuracy: 0.4739970266819 || F1-score: 0.3368789105631211
Early stopping at epoch 23!
Accuracy on dataset of size 672: 53.4226188659668 %.
Average loss: 243556.21839488635
proportion of labels in prediction: [tensor(0.7396), tensor(0.1994), tensor(0.0610)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.69650655 0.23239437 0.09722222]
- f1 (average): 0.342041046212582
- accuracy: 0.5342261791229248

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 2530993.0
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 15131.3095703125
--------------------------------------------------
Validation || Epoch: 1 || Loss: 184071.51988636365 || Accuracy: 0.5319464802742004 || F1-score: 0.29903258102710484
Early stopping at epoch 16!
Accuracy on dataset of size 672: 49.404762268066406 %.
Average loss: 65795.36257102272
proportion of labels in prediction: [tensor(0.6830), tensor(0.1652), tensor(0.1518)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.65375854 0.16091954 0.23414634]
- f1 (average): 0.34960814127817663
- accuracy: 0.494047611951828

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 318155.34375
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 30153.4765625
--------------------------------------------------
Validation || Epoch: 1 || Loss: 55488.094060724434 || Accuracy: 0.45170876383781433 || F1-score: 0.3292489229434595
Early stopping at epoch 29!
Accuracy on dataset of size 672: 52.23214340209961 %.
Average loss: 11308.965975674715
proportion of labels in prediction: [tensor(0.6964), tensor(0.2634), tensor(0.0402)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.68094701 0.26299694 0.09230769]
- f1 (average): 0.3454172155350232
- accuracy: 0.5223214030265808


### CLS

In [129]:
signature_history = obtain_signatures_history(pooled_cls_pretrained, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 1548])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 6.332112789154053
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.459240198135376
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.6063351522792468 || Accuracy: 0.5423476696014404 || F1-score: 0.34955472538253995
Early stopping at epoch 12!
Accuracy on dataset of size 672: 58.48214340209961 %.
Average loss: 1.2180937853726475
proportion of labels in prediction: [tensor(0.8497), tensor(0.1146), tensor(0.0357)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.73535354 0.18502203 0.12598425]
- f1 (average): 0.3487866045845858
- accuracy: 0.5848214030265808

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.6944917440414429
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.6130502820014954
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.088607582178983 || Accuracy: 0.560178279876709 || F1-score: 0.31353754359978375
Early stopping at epoch 36!
Accuracy on dataset of size 672: 57.14285659790039 %.
Average loss: 1.3183693994175305
proportion of labels in prediction: [tensor(0.7649), tensor(0.1964), tensor(0.0387)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.73097535 0.26241135 0.09302326]
- f1 (average): 0.36213665055679206
- accuracy: 0.5714285969734192

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.185005784034729
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8278284072875977
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0067377090454102 || Accuracy: 0.591381847858429 || F1-score: 0.25419086992167733
Early stopping at epoch 34!
Accuracy on dataset of size 672: 59.970237731933594 %.
Average loss: 1.3248264572837136
proportion of labels in prediction: [tensor(0.8199), tensor(0.1443), tensor(0.0357)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.74020619 0.27530364 0.15748031]
- f1 (average): 0.39099671475077885
- accuracy: 0.5997023582458496

********** hidden_dim: [100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0278944969177246
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8212054371833801
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9786758639595725 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 25!
Accuracy on dataset of size 672: 58.18452453613281 %.
Average loss: 1.6404736908999356
proportion of labels in prediction: [tensor(0.7321), tensor(0.2292), tensor(0.0387)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.73545554 0.30263158 0.15503876]
- f1 (average): 0.39770862733207907
- accuracy: 0.581845223903656

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0648689270019531
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.8715722560882568
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.960474046793851 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 31!
Accuracy on dataset of size 672: 60.26785659790039 %.
Average loss: 1.6285462054339321
proportion of labels in prediction: [tensor(0.7411), tensor(0.2039), tensor(0.0551)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.73718648 0.34146341 0.25714286]
- f1 (average): 0.44526424980716545
- accuracy: 0.6026785969734192


In [130]:
signature_history = obtain_signatures_history(pooled_cls_pretrained,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: [100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 3.4455976486206055
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 2.673224687576294
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.9785998301072554 || Accuracy: 0.4011887013912201 || F1-score: 0.31288628387717954
Early stopping at epoch 38!
Accuracy on dataset of size 672: 57.738094329833984 %.
Average loss: 1.6094523885033347
proportion of labels in prediction: [tensor(0.8304), tensor(0.1250), tensor(0.0446)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.72671443 0.23931624 0.07518797]
- f1 (average): 0.34707288039184814
- accuracy: 0.5773809552192688

********** hidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 2.1884889602661133
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.2756325006484985
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.1500525637106462 || Accuracy: 0.4606240689754486 || F1-score: 0.31100315939063394
Early stopping at epoch 21!
Accuracy on dataset of size 672: 58.779762268066406 %.
Average loss: 1.244567719372836
proportion of labels in prediction: [tensor(0.8958), tensor(0.0818), tensor(0.0223)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.74045054 0.14634146 0.03389831]
- f1 (average): 0.30689676906231367
- accuracy: 0.5877976417541504

********** hidden_dim: [100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.3909130096435547
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0126690864562988
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0167371251366355 || Accuracy: 0.6062406897544861 || F1-score: 0.2793916609706083
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6514841318130493
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.6830207705497742
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.1357428214766763 || Accuracy: 0.5898959636688232 || F1-score: 0.32296635643798327
Epoch: 201/10000 || Item: 0/85 || Loss: 0.662000834941864
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.311781108379364
--------------------------------------------------
Validation || Epoch: 201 || Loss: 1.15751404653896 || Accuracy: 0.5898959636688232 || F1-score: 0.32296635643798327
Epoch: 301/10000 || I

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.2129356861114502
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9796766042709351
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9908564253286882 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 38!
Accuracy on dataset of size 672: 56.69643020629883 %.
Average loss: 1.493797854943709
proportion of labels in prediction: [tensor(0.8408), tensor(0.1190), tensor(0.0402)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71747967 0.19130435 0.09230769]
- f1 (average): 0.3336972383101757
- accuracy: 0.5669642686843872

********** hidden_dim: [100, 100, 100, 100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0765931606292725
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9636225700378418
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9956892999735746 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 27!
Accuracy on dataset of size 672: 56.5476188659668 %.
Average loss: 1.5414887558330188
proportion of labels in prediction: [tensor(0.7738), tensor(0.1830), tensor(0.0432)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.71565495 0.25641026 0.13636364]
- f1 (average): 0.3694762816168568
- accuracy: 0.5654761791229248


### Fine-tuned BERT

### Mean pooled

In [None]:
signature_history = obtain_signatures_history(pooled_mean, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

In [None]:
signature_history = obtain_signatures_history(pooled_mean,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### Max pooled

In [None]:
signature_history = obtain_signatures_history(pooled_max,
                                              path_specifics,
                                              dimension,
                                              sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

In [None]:
signature_history = obtain_signatures_history(pooled_max,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### Sum pooled

In [None]:
signature_history = obtain_signatures_history(pooled_sum, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

In [None]:
signature_history = obtain_signatures_history(pooled_sum,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### CLS

In [None]:
signature_history = obtain_signatures_history(pooled_cls, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

In [None]:
signature_history = obtain_signatures_history(pooled_cls,
                                              path_specifics,
                                              dimension,
                                              sig_depth,
                                              concatenate_current=False)
print(f"signature_history.shape = {signature_history.shape}")
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

# StackedDeepSigNet

## Obtaining path by looking at post history

We can obtain a path by looking at the history of each post. Here we look at the last 10 posts (and pad with vectors of zeros if there are less than 10 posts) including the current post.

We only want to consider paths that correspond to a client's utterance as we want to model a change in mood at that time. Their history will still contain the therapist's utterances too.

In [173]:
time_features = ["time_encoding", "timeline_index"]
path_specifics = {"pad_by": "history",
                  "zero_padding": True,
                  "method": "k_last",
                  "k": 5,
                  "time_feature": time_features,
                  "standardise_method": ["minmax", None],
                  "embeddings": "dim_reduced",
                  "include_current_embedding": True,
                  "pad_from_below": False}

In [174]:
def obtain_SDSN_input(embeddings, path_specifics):
    reduction = nlpsig.DimReduce(method="gaussian_random_projection", n_components=50)
    embeddings_reduced = reduction.fit_transform(embeddings, random_state=seed)
    
    paths = nlpsig.PrepareData(anno_mi,
                               id_column="transcript_id",
                               label_column="client_talk_type",
                               embeddings=embeddings,
                               embeddings_reduced=embeddings_reduced)
    
    paths.pad(**path_specifics)
    
    paths.array_padded = paths.array_padded[client_index]
    paths.embeddings = paths.embeddings[client_index]
    paths.embeddings_reduced = paths.embeddings_reduced[client_index]
    
    return paths.get_torch_path_for_SDSN(
        include_time_features_in_path=True,
        include_time_features_in_input=True,
        include_embedding_in_input=True,
        reduced_embeddings=False
    )

In [175]:
def implement_sdsn(x_data,
                   y_data,
                   sig_depth,
                   input_channels,
                   output_channels,
                   lstm_hidden_dim,
                   ffn_hidden_dim,
                   BiLSTM,
                   learning_rate,
                   loss,
                   gamma = 0):
    SDSN_args = {
        "input_channels": input_channels,
        "output_channels": output_channels,
        "num_time_features": len(time_features),
        "embedding_dim": x_data.shape[2]-input_channels-len(time_features),
        "sig_depth": sig_depth,
        "hidden_dim_lstm": lstm_hidden_dim,
        "hidden_dim_ffn": ffn_hidden_dim,
        "output_dim": len(label_to_id),
        "dropout_rate": 0.1,
        "augmentation_type": "Conv1d",
        "BiLSTM": BiLSTM,
        "comb_method": "concatenation"
    }
    
    sdsn_model = StackedDeepSigNet(**SDSN_args)
    # print(sdsn_model)
    
    # split dataset
    train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),
                                       y_data=torch.tensor(y_data),
                                       train_size=0.8,
                                       valid_size=0.5,
                                       shuffle=True,
                                       as_DataLoader=True,
                                       seed=seed)
    
    # define loss
    if loss == "focal":    
        criterion = FocalLoss(gamma = gamma)
    elif loss == "cross_entropy":
        criterion = torch.nn.CrossEntropyLoss()

    # define optimizer
    optimizer = torch.optim.Adam(sdsn_model.parameters(), lr=learning_rate)
    # define scheduler for adjusting the learning rate
    scheduler = ReduceLROnPlateau(optimizer, 'min')
    # scheduler = StepLR(optimizer, step_size = 10, gamma = 0.5)
    # scheduler = CosineAnnealingWarmRestarts(optimizer, 
    #                                         T_0 = 8,# Number of iterations for the first restart
    #                                         T_mult = 1, # A factor increases TiTi after a restart
    #                                         eta_min = learning_rate*0.1)
    # scheduler = None
    
    sdsn_model = training_pytorch(model=sdsn_model,
                                  train_loader=train,
                                  criterion=criterion,
                                  optimizer=optimizer,
                                  num_epochs=10000,
                                  scheduler=scheduler,
                                  valid_loader=valid,
                                  early_stopping=True,
                                  early_stopping_metric="f1",
                                  patience=100,
                                  verbose=True,
                                  verbose_epoch=100,
                                  seed=seed)

    pred, label = testing_pytorch(sdsn_model, test, criterion)
    print(f"proportion of labels in prediction: {[sum(pred==i)/len(pred) for i in label_to_id.values()]}")
    print(f"proportion of labels in data: {[sum(label==i)/len(label) for i in label_to_id.values()]}")
    
    f1_scores = metrics.f1_score(label, pred, average=None)
    print(f"- f1: {f1_scores}")
    print(f"- f1 (average): {sum(f1_scores)/len(f1_scores)}")
    print(f"- accuracy: {sum(pred==label)/len(pred)}")
    
    return sdsn_model

In [176]:
lstm_hidden_dim_trial = [[8,8], [12,12,8], [12,12,12,8]]
ffn_hidden_dim_trial = [[100]*i for i in range(2, 6)]
sig_depth = 3
output_channels = 10
BiLSTM = True
learning_rate = 1e-4

## SBERT 768

In [177]:
input_channels

52

In [178]:
from __future__ import annotations
import signatory
import torch
import torch.nn as nn


class StackedDeepSigNet(nn.Module):
    """
    Stacked Deep Signature Neural Network for classification.
    """

    def __init__(
        self,
        input_channels: int,
        output_channels: int,
        num_time_features: int,
        embedding_dim: int,
        sig_depth: int,
        hidden_dim_lstm: list[int] | int,
        hidden_dim_ffn: list[int] | int,
        output_dim: int,
        dropout_rate: float,
        augmentation_type: str = "Conv1d",
        augmentation_args: dict | None = None,
        hidden_dim_aug: list[int] | int | None = None,
        BiLSTM: bool = False,
        comb_method: str = "gated_addition",
    ):
        """
        Stacked Deep Signature Neural Network for classification.

        Parameters
        ----------
        input_channels : int
            Dimension of the embeddings that will be passed in.
        output_channels : int
            Requested dimension of the embeddings after convolution layer.
        num_time_features : int
            Number of time features to add to FFN input. If none, set to zero.
        embedding_dim : int
            Dimension of embedding to add to FFN input. If none, set to zero.
        sig_depth : int
            The depth to truncate the path signature at.
        hidden_dim_lstm : list[int] | int
            Dimensions of the hidden layers in the LSTM blocks.
        hidden_dim_ffn : list[int] | int
            Dimension of the hidden layers in the FFN.
        output_dim : int
            Dimension of the output layer in the FFN.
        dropout_rate : float
            Dropout rate in the FFN.
        augmentation_type : str, optional
            Method of augmenting the path, by default "Conv1d".
            Options are:
            - "Conv1d": passes path through 1D convolution layer.
            - "signatory": passes path through `Augment` layer from `signatory` package.
        augmentation_args : dict | None, optional
            Arguments to pass into `torch.Conv1d` or `signatory.Augment`, by default None.
            If None, by default will set `kernel_size=3`, `stride=1`, `padding=0`.
        hidden_dim_aug : list[int] | int | None
            Dimensions of the hidden layers in the augmentation layer.
            Passed into `Augment` class from `signatory` package if
            `augmentation_type='signatory'`, by default None.
        BiLSTM : bool, optional
            Whether or not a birectional LSTM is used,
            by default False (unidirectional LSTM is used in this case).
        comb_method : str, optional
            Determines how to combine the path signature and embeddings,
            by default "gated_addition".
            Options are:
            - concatenation: concatenation of path signature and embedding vector
            - gated_addition: element-wise addition of path signature and embedding vector
        """
        super(StackedDeepSigNet, self).__init__()
        self.input_channels = input_channels
        
        if isinstance(hidden_dim_lstm, int):
            hidden_dim_lstm = [hidden_dim_lstm]
        if isinstance(hidden_dim_ffn, int):
            hidden_dim_ffn = [hidden_dim_ffn]
        self.hidden_dim_lstm = hidden_dim_lstm
        self.hidden_dim_ffn = hidden_dim_ffn
        
        self.embedding_dim = embedding_dim
        self.num_time_features = num_time_features
        if comb_method not in ["concatenation", "gated_addition"]:
            raise ValueError(
                "`comb_method` must be either 'concatenation' or 'gated_addition'."
            )
        self.comb_method = comb_method
        if augmentation_type not in ["Conv1d", "signatory"]:
            raise ValueError("`augmentation_type` must be 'Conv1d' or 'signatory'.")
        
        self.augmentation_type = augmentation_type
        if isinstance(hidden_dim_aug, int):
            hidden_dim_aug = [hidden_dim_aug]
        elif hidden_dim_aug is None:
            hidden_dim_aug = []
        self.hidden_dim_aug = hidden_dim_aug
        if augmentation_args is None:
            augmentation_args = {"kernel_size": 3,
                                 "stride": 1,
                                 "padding": 1}
        # convolution
        self.conv = nn.Conv1d(
            in_channels=input_channels,
            out_channels=output_channels,
            **augmentation_args,
        )
        self.augment = signatory.Augment(
            in_channels=input_channels,
            layer_sizes=self.hidden_dim_aug + [output_channels],
            include_original=False,
            include_time=False,
            **augmentation_args,
        )
        # non-linearity
        self.tanh1 = nn.Tanh()

        self.signature_layers = []
        self.lstm_layers = []
        for l in range(len(self.hidden_dim_lstm)):
            self.signature_layers.append(signatory.LogSignature(depth=sig_depth, stream=True))
            if l == 0:
                input_dim_lstm = signatory.logsignature_channels(output_channels, sig_depth)
            else:
                input_dim_lstm = signatory.logsignature_channels(self.hidden_dim_lstm[l-1], sig_depth)
            self.lstm_layers.append(nn.LSTM(
                input_size=input_dim_lstm,
                hidden_size=self.hidden_dim_lstm[l],
                num_layers=1,
                batch_first=True,
                bidirectional=False if l!=(len(self.hidden_dim_lstm)-1) else BiLSTM,
            ))
        
        self.signature_layers = nn.ModuleList(self.signature_layers)
        self.lstm_layers = nn.ModuleList(self.lstm_layers)

        # signature without lift (for passing into FFN)
        mult = 2 if BiLSTM else 1
        self.signature2 = signatory.LogSignature(depth=sig_depth, stream=False)

        # find dimension of features to pass through FFN
        if self.comb_method == "concatenation":
            input_dim = (
                signatory.logsignature_channels(
                    in_channels=mult * self.hidden_dim_lstm[-1], depth=sig_depth
                )
                + self.num_time_features
                + self.embedding_dim
            )
        elif self.comb_method == "gated_addition":
            input_dim = self.embedding_dim
            input_gated_linear = (
                signatory.logsignature_channels(
                    in_channels=mult * self.hidden_dim_lstm[-1], depth=sig_depth
                )
                + self.num_time_features
            )
            if self.embedding_dim > 0:
                self.fc_scale = nn.Linear(input_gated_linear, self.embedding_dim)
                self.scaler = torch.nn.Parameter(torch.zeros(1, self.embedding_dim))
            else:
                self.fc_scale = nn.Linear(input_gated_linear, input_gated_linear)
                self.scaler = torch.nn.Parameter(torch.zeros(1, input_gated_linear))
            # non-linearity
            self.tanh2 = nn.Tanh()

        # FFN: input layer
        self.ffn_input_layer = nn.Linear(input_dim, self.hidden_dim_ffn[0])
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        input_dim = self.hidden_dim_ffn[0]
        
        # FFN: hidden layers
        self.ffn_linear_layers = []
        self.ffn_non_linear_layers = []
        self.dropout_layers = []
        for l in range(len(self.hidden_dim_ffn)):
            self.ffn_linear_layers.append(nn.Linear(input_dim, self.hidden_dim_ffn[l]))
            self.ffn_non_linear_layers.append(nn.ReLU())
            self.dropout_layers.append(nn.Dropout(dropout_rate))
            input_dim = self.hidden_dim_ffn[l]
        
        self.ffn_linear_layers = nn.ModuleList(self.ffn_linear_layers)
        self.ffn_non_linear_layers = nn.ModuleList(self.ffn_non_linear_layers)
        self.dropout_layers = nn.ModuleList(self.dropout_layers)
        
        # FFN: readout
        self.ffn_final_layer = nn.Linear(input_dim, output_dim)

    def forward(self, x: torch.Tensor):
        # x has dimensions [batch, length of signal, channels]
        
        print(f"input size: {x.shape}")
        
        # convolution
        if self.augmentation_type == "Conv1d":
            # input has dimensions [batch, length of signal, channels]
            # swap dimensions to get [batch, channels, length of signal]
            # (nn.Conv1d expects this)
            out = torch.transpose(x, 1, 2)
            # get only the path information
            out = self.conv(out[:, : self.input_channels, :])
            out = self.tanh1(out)
            # make output have dimensions [batch, length of signal, channels]
            out = torch.transpose(out, 1, 2)
        elif self.augmentation_type == "signatory":
            # input has dimensions [batch, length of signal, channels]
            # (signatory.Augment expects this)
            # and get only the path information
            # output has dimensions [batch, length of signal, channels]
            out = self.augment(x[:, :, : self.input_channels])

        print(f"after conv: {out.shape}")
        
        # take signature lifts and lstm
        for l in range(len(self.hidden_dim_lstm)):
            out = self.signature_layers[l](out)
            print(f"after signature: {out.shape}")
            out, _ = self.lstm_layers[l](out)
            print(f"after lstm: {out.shape}")
        
        print(f"after snwu: {out.shape}")
        # signature
        out = self.signature2(out)
        print(f"after last signature: {out.shape}")

        # combine last post embedding
        if x.shape[2] > self.input_channels:
            # we have things to concatenate to the path
            if self.comb_method == "concatenation":
                if self.num_time_features > 0:
                    # concatenate any time features
                    # take the maximum for the latest time
                    out = torch.cat(
                        (
                            out,
                            x[
                                :,
                                :,
                                self.input_channels : (
                                    self.input_channels + self.num_time_features
                                ),
                            ].max(1)[0],
                        ),
                        dim=1,
                    )
                if x.shape[2] > self.input_channels + self.num_time_features:
                    # concatenate current post embedding if provided
                    out = torch.cat(
                        (
                            out,
                            x[:, 0, (self.input_channels + self.num_time_features) :],
                        ),
                        dim=1,
                    )
            elif self.comb_method == "gated_addition":
                if self.num_time_features > 0:
                    # concatenate any time features
                    out_gated = torch.cat(
                        (
                            out,
                            x[
                                :,
                                :,
                                self.input_channels : (
                                    self.input_channels + self.num_time_features
                                ),
                            ].max(1)[0],
                        ),
                        dim=1,
                    )
                else:
                    out_gated = out
                out_gated = self.fc_scale(out_gated.float())
                out_gated = self.tanh2(out_gated)
                out_gated = torch.mul(self.scaler, out_gated)
                if x.shape[2] > self.input_channels + self.num_time_features:
                    # concatenate current post embedding if provided
                    out = (
                        out_gated
                        + x[:, 0, (self.input_channels + self.num_time_features) :],
                    )
                else:
                    out = out_gated

        # FFN: input layer
        out = self.ffn_input_layer(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        # FFN: hidden layers    
        for l in range(len(self.hidden_dim_ffn)):
            out = self.ffn_linear_layers[l](out)
            out = self.ffn_non_linear_layers[l](out)
            out = self.dropout_layers[l](out)

        # FFN: readout
        out = self.ffn_final_layer(out)

        return out

In [179]:
from __future__ import annotations
import signatory
import torch
import torch.nn as nn


class StackedDeepSigNet(nn.Module):
    """
    Stacked Deep Signature Neural Network for classification.
    """

    def __init__(
        self,
        input_channels: int,
        output_channels: int,
        num_time_features: int,
        embedding_dim: int,
        sig_depth: int,
        hidden_dim_lstm: list[int] | int,
        hidden_dim_ffn: list[int] | int,
        output_dim: int,
        dropout_rate: float,
        augmentation_type: str = "Conv1d",
        augmentation_args: dict | None = None,
        hidden_dim_aug: list[int] | int | None = None,
        BiLSTM: bool = False,
        comb_method: str = "gated_addition",
    ):
        """
        Stacked Deep Signature Neural Network for classification.

        Parameters
        ----------
        input_channels : int
            Dimension of the embeddings that will be passed in.
        output_channels : int
            Requested dimension of the embeddings after convolution layer.
        num_time_features : int
            Number of time features to add to FFN input. If none, set to zero.
        embedding_dim : int
            Dimension of embedding to add to FFN input. If none, set to zero.
        sig_depth : int
            The depth to truncate the path signature at.
        hidden_dim_lstm : list[int] | int
            Dimensions of the hidden layers in the LSTM blocks.
        hidden_dim_ffn : list[int] | int
            Dimension of the hidden layers in the FFN.
        output_dim : int
            Dimension of the output layer in the FFN.
        dropout_rate : float
            Dropout rate in the FFN.
        augmentation_type : str, optional
            Method of augmenting the path, by default "Conv1d".
            Options are:
            - "Conv1d": passes path through 1D convolution layer.
            - "signatory": passes path through `Augment` layer from `signatory` package.
        augmentation_args : dict | None, optional
            Arguments to pass into `torch.Conv1d` or `signatory.Augment`, by default None.
            If None, by default will set `kernel_size=3`, `stride=1`, `padding=0`.
        hidden_dim_aug : list[int] | int | None
            Dimensions of the hidden layers in the augmentation layer.
            Passed into `Augment` class from `signatory` package if
            `augmentation_type='signatory'`, by default None.
        BiLSTM : bool, optional
            Whether or not a birectional LSTM is used,
            by default False (unidirectional LSTM is used in this case).
        comb_method : str, optional
            Determines how to combine the path signature and embeddings,
            by default "gated_addition".
            Options are:
            - concatenation: concatenation of path signature and embedding vector
            - gated_addition: element-wise addition of path signature and embedding vector
        """
        super(StackedDeepSigNet, self).__init__()
        self.input_channels = input_channels
        
        if isinstance(hidden_dim_lstm, int):
            hidden_dim_lstm = [hidden_dim_lstm]
        if isinstance(hidden_dim_ffn, int):
            hidden_dim_ffn = [hidden_dim_ffn]
        self.hidden_dim_lstm = hidden_dim_lstm
        self.hidden_dim_ffn = hidden_dim_ffn
        
        self.embedding_dim = embedding_dim
        self.num_time_features = num_time_features
        if comb_method not in ["concatenation", "gated_addition"]:
            raise ValueError(
                "`comb_method` must be either 'concatenation' or 'gated_addition'."
            )
        self.comb_method = comb_method
        if augmentation_type not in ["Conv1d", "signatory"]:
            raise ValueError("`augmentation_type` must be 'Conv1d' or 'signatory'.")
        
        self.augmentation_type = augmentation_type
        if isinstance(hidden_dim_aug, int):
            hidden_dim_aug = [hidden_dim_aug]
        elif hidden_dim_aug is None:
            hidden_dim_aug = []
        self.hidden_dim_aug = hidden_dim_aug
        if augmentation_args is None:
            augmentation_args = {"kernel_size": 3,
                                 "stride": 1,
                                 "padding": 1}
        # convolution
        self.conv = nn.Conv1d(
            in_channels=input_channels,
            out_channels=output_channels,
            **augmentation_args,
        )
        self.augment = signatory.Augment(
            in_channels=input_channels,
            layer_sizes=self.hidden_dim_aug + [output_channels],
            include_original=False,
            include_time=False,
            **augmentation_args,
        )
        # non-linearity
        self.tanh1 = nn.Tanh()

        self.signature_layers = []
        self.lstm_layers = []
        for l in range(len(self.hidden_dim_lstm)):
            self.signature_layers.append(signatory.LogSignature(depth=sig_depth, stream=True))
            if l == 0:
                input_dim_lstm = signatory.logsignature_channels(output_channels, sig_depth)
            else:
                input_dim_lstm = signatory.logsignature_channels(self.hidden_dim_lstm[l-1], sig_depth)
            self.lstm_layers.append(nn.LSTM(
                input_size=input_dim_lstm,
                hidden_size=self.hidden_dim_lstm[l],
                num_layers=1,
                batch_first=True,
                bidirectional=False if l!=(len(self.hidden_dim_lstm)-1) else BiLSTM,
            ))
        
        self.signature_layers = nn.ModuleList(self.signature_layers)
        self.lstm_layers = nn.ModuleList(self.lstm_layers)

        # signature without lift (for passing into FFN)
        mult = 2 if BiLSTM else 1
        self.signature2 = signatory.LogSignature(depth=sig_depth, stream=False)

        # find dimension of features to pass through FFN
        if self.comb_method == "concatenation":
            input_dim = (
                signatory.logsignature_channels(
                    in_channels=mult * self.hidden_dim_lstm[-1], depth=sig_depth
                )
                + self.num_time_features
                + self.embedding_dim
            )
        elif self.comb_method == "gated_addition":
            input_dim = self.embedding_dim
            input_gated_linear = (
                signatory.logsignature_channels(
                    in_channels=mult * self.hidden_dim_lstm[-1], depth=sig_depth
                )
                + self.num_time_features
            )
            if self.embedding_dim > 0:
                self.fc_scale = nn.Linear(input_gated_linear, self.embedding_dim)
                self.scaler = torch.nn.Parameter(torch.zeros(1, self.embedding_dim))
            else:
                self.fc_scale = nn.Linear(input_gated_linear, input_gated_linear)
                self.scaler = torch.nn.Parameter(torch.zeros(1, input_gated_linear))
            # non-linearity
            self.tanh2 = nn.Tanh()

        # FFN: input layer
        self.ffn_input_layer = nn.Linear(input_dim, self.hidden_dim_ffn[0])
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)
        input_dim = self.hidden_dim_ffn[0]
        
        # FFN: hidden layers
        self.ffn_linear_layers = []
        self.ffn_non_linear_layers = []
        self.dropout_layers = []
        for l in range(len(self.hidden_dim_ffn)):
            self.ffn_linear_layers.append(nn.Linear(input_dim, self.hidden_dim_ffn[l]))
            self.ffn_non_linear_layers.append(nn.ReLU())
            self.dropout_layers.append(nn.Dropout(dropout_rate))
            input_dim = self.hidden_dim_ffn[l]
        
        self.ffn_linear_layers = nn.ModuleList(self.ffn_linear_layers)
        self.ffn_non_linear_layers = nn.ModuleList(self.ffn_non_linear_layers)
        self.dropout_layers = nn.ModuleList(self.dropout_layers)
        
        # FFN: readout
        self.ffn_final_layer = nn.Linear(input_dim, output_dim)

    def forward(self, x: torch.Tensor):
        # x has dimensions [batch, length of signal, channels]

        # convolution
        if self.augmentation_type == "Conv1d":
            # input has dimensions [batch, length of signal, channels]
            # swap dimensions to get [batch, channels, length of signal]
            # (nn.Conv1d expects this)
            out = torch.transpose(x, 1, 2)
            # get only the path information
            out = self.conv(out[:, : self.input_channels, :])
            out = self.tanh1(out)
            # make output have dimensions [batch, length of signal, channels]
            out = torch.transpose(out, 1, 2)
        elif self.augmentation_type == "signatory":
            # input has dimensions [batch, length of signal, channels]
            # (signatory.Augment expects this)
            # and get only the path information
            # output has dimensions [batch, length of signal, channels]
            out = self.augment(x[:, :, : self.input_channels])

        # take signature lifts and lstm
        for l in range(len(self.hidden_dim_lstm)):
            out = self.signature_layers[l](out)
            out, _ = self.lstm_layers[l](out)
        
        # signature
        out = self.signature2(out)

        # combine last post embedding
        if x.shape[2] > self.input_channels:
            # we have things to concatenate to the path
            if self.comb_method == "concatenation":
                if self.num_time_features > 0:
                    # concatenate any time features
                    # take the maximum for the latest time
                    out = torch.cat(
                        (
                            out,
                            x[
                                :,
                                :,
                                self.input_channels : (
                                    self.input_channels + self.num_time_features
                                ),
                            ].max(1)[0],
                        ),
                        dim=1,
                    )
                if x.shape[2] > self.input_channels + self.num_time_features:
                    # concatenate current post embedding if provided
                    out = torch.cat(
                        (
                            out,
                            x[:, 0, (self.input_channels + self.num_time_features) :],
                        ),
                        dim=1,
                    )
            elif self.comb_method == "gated_addition":
                if self.num_time_features > 0:
                    # concatenate any time features
                    out_gated = torch.cat(
                        (
                            out,
                            x[
                                :,
                                :,
                                self.input_channels : (
                                    self.input_channels + self.num_time_features
                                ),
                            ].max(1)[0],
                        ),
                        dim=1,
                    )
                else:
                    out_gated = out
                out_gated = self.fc_scale(out_gated.float())
                out_gated = self.tanh2(out_gated)
                out_gated = torch.mul(self.scaler, out_gated)
                if x.shape[2] > self.input_channels + self.num_time_features:
                    # concatenate current post embedding if provided
                    out = (
                        out_gated
                        + x[:, 0, (self.input_channels + self.num_time_features) :],
                    )
                else:
                    out = out_gated

        # FFN: input layer
        out = self.ffn_input_layer(out)
        out = self.relu(out)
        out = self.dropout(out)
        
        # FFN: hidden layers    
        for l in range(len(self.hidden_dim_ffn)):
            out = self.ffn_linear_layers[l](out)
            out = self.ffn_non_linear_layers[l](out)
            out = self.dropout_layers[l](out)

        # FFN: readout
        out = self.ffn_final_layer(out)

        return out


In [180]:
x_data, input_channels = obtain_SDSN_input(sbert_768_embeddings, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.

********** lstm_hidden_dim: [8, 8] || ffnhidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.265817642211914
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0676361322402954
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9867492589083585 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.721148669719696
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.2635515034198761
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.6812250993468545 || Accuracy: 0.7043090462684631 || F1-score: 0.5708052355774097
Early stopping at epoch 143!
Accuracy on dataset of size 672: 70.98213958740234 %.
Average loss: 0.7221233248710632
proportion of labels in prediction: [tensor(0.7411), tensor(0.1696), tensor(0.0893)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81570338 0.52272727 0.41717791]
- f1 (average): 0.5852028558

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.088348150253296
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0833232402801514
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0117262439294294 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7248660922050476
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.12598766386508942
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.6911819739775225 || Accuracy: 0.7087666988372803 || F1-score: 0.5776151579459678
Early stopping at epoch 146!
Accuracy on dataset of size 672: 72.17262268066406 %.
Average loss: 0.7211203033273871
proportion of labels in prediction: [tensor(0.7351), tensor(0.1592), tensor(0.1057)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82146769 0.52918288 0.48275862]
- f1 (average): 0.61113639

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0947375297546387
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0724949836730957
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0100325345993042 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6952031850814819
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.0778966173529625
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7145196145231073 || Accuracy: 0.710252583026886 || F1-score: 0.5707901727575452
Early stopping at epoch 149!
Accuracy on dataset of size 672: 70.83333587646484 %.
Average loss: 0.7661464701999318
proportion of labels in prediction: [tensor(0.7351), tensor(0.1562), tensor(0.1086)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81270537 0.50980392 0.45454545]
- f1 (average): 0.592351581

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.092410922050476
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0839177370071411
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9791089675643228 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.5057088732719421
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.7203328609466553
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7227513627572493 || Accuracy: 0.7072808146476746 || F1-score: 0.5747369384667627
Early stopping at epoch 154!
Accuracy on dataset of size 672: 72.02381134033203 %.
Average loss: 0.7385878400369124
proportion of labels in prediction: [tensor(0.7500), tensor(0.1518), tensor(0.0982)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82340195 0.52380952 0.44970414]
- f1 (average): 0.598971871

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.107893466949463
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0447618961334229
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9967642751607028 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6416645646095276
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.23477709293365479
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.6964212222532793 || Accuracy: 0.7057949304580688 || F1-score: 0.57327064568848
Early stopping at epoch 162!
Accuracy on dataset of size 672: 72.32142639160156 %.
Average loss: 0.733739587393674
proportion of labels in prediction: [tensor(0.7321), tensor(0.1696), tensor(0.0982)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82107574 0.54545455 0.47337278]
- f1 (average): 0.61330102248

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.080686092376709
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0380885601043701
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9953892230987549 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6616314649581909
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.2370094209909439
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7219040610573508 || Accuracy: 0.7087666988372803 || F1-score: 0.5653091474687719
Early stopping at epoch 146!
Accuracy on dataset of size 672: 70.38690185546875 %.
Average loss: 0.7294310277158563
proportion of labels in prediction: [tensor(0.7173), tensor(0.1801), tensor(0.1027)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81243063 0.51660517 0.43023256]
- f1 (average): 0.586422785

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1386523246765137
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.054408073425293
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0090482343326916 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7550916075706482
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.07384757697582245
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7554692409255288 || Accuracy: 0.6968796253204346 || F1-score: 0.550924250528957
Early stopping at epoch 183!
Accuracy on dataset of size 672: 69.94047546386719 %.
Average loss: 0.749945645982569
proportion of labels in prediction: [tensor(0.7515), tensor(0.1533), tensor(0.0952)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81385281 0.4743083  0.40718563]
- f1 (average): 0.5651155809

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0972087383270264
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0582512617111206
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9941164092584089 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.632483959197998
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.9149890542030334
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7416840249841864 || Accuracy: 0.6745913624763489 || F1-score: 0.42293048962640895
Early stopping at epoch 132!
Accuracy on dataset of size 672: 67.11309814453125 %.
Average loss: 0.7763690244067799
proportion of labels in prediction: [tensor(0.7738), tensor(0.2247), tensor(0.0015)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80724175 0.4717608  0.01923077]
- f1 (average): 0.43274443

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

ValueError: Argument 'path' must have stream dimension of size at least 2. (Need at least this many points to define a path.)

## SBERT 384

In [None]:
x_data, input_channels = obtain_SDSN_input(sbert_384_embeddings, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

## Pretrained BERT

### Mean pooled

In [None]:
x_data, input_channels = obtain_SDSN_input(pooled_mean_pretrained, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

### Max pooled

In [None]:
x_data, input_channels = obtain_SDSN_input(pooled_max_pretrained, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

### Sum pooled

In [None]:
x_data, input_channels = obtain_SDSN_input(pooled_sum_pretrained, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

### CLS

In [None]:
x_data, input_channels = obtain_SDSN_input(pooled_cls_pretrained, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

## Fine-tuned BERT

### Mean pooled

In [None]:
x_data, input_channels = obtain_SDSN_input(pooled_mean, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

### Max pooled

In [None]:
x_data, input_channels = obtain_SDSN_input(pooled_max, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

### Sum pooled

In [None]:
x_data, input_channels = obtain_SDSN_input(pooled_sum, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

### CLS

In [None]:
x_data, input_channels = obtain_SDSN_input(pooled_cls, path_specifics)
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

Baselines:
   - just looking at the sentence embeddings (encodes nothing about the history on the post)
       - highlights importance of looking at the sequence
   - averaging history
   - comparing the cosine similarity between previous post and current post to see if switch
   
Test for:
- How many posts do you need to look back?