In [2]:
import pandas as pd
import numpy as np
import torch
import transformers
import pickle
import os

import nlpsig
import nlpsig_networks

from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from nlpsig.classification_utils import split_dataset
from nlpsig_networks.pytorch_utils import training_pytorch, testing_pytorch, set_seed
from nlpsig_networks.ffn import FeedforwardNeuralNetModel
from nlpsig_networks.deepsignet import StackedDeepSigNet
from nlpsig_networks.focal_loss import FocalLoss, ClassBalanced_FocalLoss
from sklearn import metrics

from tqdm.notebook import tqdm

seed = 2023

In [3]:
import signatory

## AnnoMI

In [4]:
# Load the AnnoMI transcripts, add a parsed `datetime` column built from the
# `timestamp` strings, and drop the video metadata columns.
# NOTE(review): the timestamps carry no date, yet the output below shows a
# 2023-05-05 date attached - presumably the day the cell was run; confirm that
# this run-dependent date is acceptable downstream.
anno_mi = (
    pd.read_csv("AnnoMI-full.csv")
    .assign(datetime=lambda frame: pd.to_datetime(frame["timestamp"]))
    .drop(columns=["video_title", "video_url"])
)
anno_mi.head()

Unnamed: 0,mi_quality,transcript_id,topic,utterance_id,interlocutor,timestamp,utterance_text,annotator_id,therapist_input_exists,therapist_input_subtype,reflection_exists,reflection_subtype,question_exists,question_subtype,main_therapist_behaviour,client_talk_type,datetime
0,high,0,reducing alcohol consumption,0,therapist,00:00:13,Thanks for filling it out. We give this form t...,3,False,,False,,True,open,question,,2023-05-05 00:00:13
1,high,0,reducing alcohol consumption,1,client,00:00:24,Sure.,3,,,,,,,,neutral,2023-05-05 00:00:24
2,high,0,reducing alcohol consumption,2,therapist,00:00:25,"So, let's see. It looks that you put-- You dri...",3,True,information,False,,False,,therapist_input,,2023-05-05 00:00:25
3,high,0,reducing alcohol consumption,3,client,00:00:34,Mm-hmm.,3,,,,,,,,neutral,2023-05-05 00:00:34
4,high,0,reducing alcohol consumption,4,therapist,00:00:34,-and you usually have three to four drinks whe...,3,True,information,False,,False,,therapist_input,,2023-05-05 00:00:34


In [5]:
len(anno_mi)

13551

In [6]:
# Share of each client talk type, relative to the number of client utterances.
anno_mi["client_talk_type"].value_counts().div(
    anno_mi["interlocutor"].value_counts()["client"]
)

neutral    0.627063
change     0.248030
sustain    0.124907
Name: client_talk_type, dtype: float64

In [7]:
anno_mi["interlocutor"].value_counts()

therapist    6826
client       6725
Name: interlocutor, dtype: int64

In [8]:
anno_mi["topic"].value_counts()

reducing alcohol consumption                                                          2326
more exercise / increasing activity                                                   2034
reducing recidivism                                                                   1303
reducing drug use                                                                     1104
diabetes management                                                                    948
smoking cessation                                                                      923
smoking cessation                                                                      541
taking medicine / following medical procedure                                          448
asthma management                                                                      431
avoiding DOI                                                                           394
changing approach to disease                                                           315

In [9]:
len(anno_mi["transcript_id"].unique())

133

## Only considering client for now...

In [10]:
# Boolean mask over all utterances: True where `client_talk_type` holds a string
# label (the therapist rows displayed above have NaN in that column).
client_index = (
    anno_mi["client_talk_type"]
    .map(lambda talk_type: isinstance(talk_type, str))
    .tolist()
)
sum(client_index)

6725

In [11]:
# Keep only the labelled utterances as the classification targets.
y_data = anno_mi.loc[client_index, "client_talk_type"]
y_data.shape

(6725,)

In [12]:
y_data[0:20]

1     neutral
3     neutral
5     neutral
7     neutral
9     neutral
11    neutral
13    neutral
15    neutral
17    neutral
19    neutral
21    neutral
23    neutral
25    neutral
27    neutral
29    neutral
31    neutral
33    neutral
35     change
37     change
39     change
Name: client_talk_type, dtype: object

In [13]:
# Map each talk-type label to an integer id (in the order `unique()` returns
# them) and build the reverse lookup. `unique()` is evaluated once here instead
# of twice per iteration of the comprehension.
label_to_id = {label: idx for idx, label in enumerate(y_data.unique())}
id_to_label = {idx: label for label, idx in label_to_id.items()}

In [14]:
label_to_id

{'neutral': 0, 'change': 1, 'sustain': 2}

In [15]:
id_to_label

{0: 'neutral', 1: 'change', 2: 'sustain'}

In [16]:
# Encode the string labels as integer ids. The labels are read from the
# dataframe rather than from `y_data` itself, so re-running this cell is safe:
# previously a second run looked up the already-encoded integers in
# `label_to_id` and raised a KeyError.
y_data = [label_to_id[label] for label in anno_mi.loc[client_index, "client_talk_type"]]
y_data[0:20]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]

## Obtaining SBERT Embeddings

We can use the `SentenceEncoder` class within `nlpsig` to obtain sentence embeddings from a model. This class uses the [`sentence-transformers`](https://www.sbert.net/docs/package_reference/SentenceTransformer.html) package. Here, we use the pre-trained `all-mpnet-base-v2` model by passing its name as a string to the class - alternative models can be found [here](https://www.sbert.net/docs/pretrained_models.html).

We can pass these into the constructor of the class to initialise our text encoder as follows:

In [17]:
# sbert_768_embeddings = np.load("anno_mi_sentence_embeddings_768.npy")

In [16]:
# Build the sentence encoder for the `all-mpnet-base-v2` SBERT model over the
# utterance text, then load the pretrained weights.
sbert_model_768 = "all-mpnet-base-v2"
text_encoder_sbert_768 = nlpsig.SentenceEncoder(
    df=anno_mi,
    feature_name="utterance_text",
    model_name=sbert_model_768,
)
text_encoder_sbert_768.load_pretrained_model()

The class has an `.obtain_embeddings()` method which, once the model has been loaded with `.load_pretrained_model()` (using the `model_name` and `model_args` attributes), obtains an embedding for each sentence. These sentence embeddings are then stored in the `sentence_embeddings` attribute of the object.

In [17]:
# Encode every utterance in `anno_mi` (client and therapist rows alike; the log
# below reports 13551 sentences), then keep a reference to the embeddings that
# the encoder stores on itself.
text_encoder_sbert_768.obtain_embeddings()
sbert_768_embeddings = text_encoder_sbert_768.sentence_embeddings

[INFO] number of sentences to encode: 13551


Batches:   0%|          | 0/212 [00:00<?, ?it/s]

In [18]:
np.save("anno_mi_sentence_embeddings_768", sbert_768_embeddings)

## SBERT with 384 dimension vectors

In [18]:
# sbert_384_embeddings = np.load("anno_mi_sentence_embeddings_384.npy")

In [20]:
# Build the sentence encoder for the `all-MiniLM-L12-v2` SBERT model over the
# utterance text, then load the pretrained weights.
sbert_model_384 = "all-MiniLM-L12-v2"
text_encoder_sbert_384 = nlpsig.SentenceEncoder(
    df=anno_mi,
    feature_name="utterance_text",
    model_name=sbert_model_384,
)
text_encoder_sbert_384.load_pretrained_model()

In [21]:
# Encode every utterance in `anno_mi` (the log below reports 13551 sentences),
# then keep a reference to the embeddings that the encoder stores on itself.
text_encoder_sbert_384.obtain_embeddings()
sbert_384_embeddings = text_encoder_sbert_384.sentence_embeddings

[INFO] number of sentences to encode: 13551


Batches:   0%|          | 0/212 [00:00<?, ?it/s]

In [22]:
np.save("anno_mi_sentence_embeddings_384", sbert_384_embeddings)

## Pretrained BERT and pooling

In [19]:
# pooled_mean_pretrained = np.load("anno_mi_pretrained_BERT_mean.npy")
# pooled_max_pretrained = np.load("anno_mi_pretrained_BERT_max.npy")
# pooled_sum_pretrained = np.load("anno_mi_pretrained_BERT_sum.npy")
# pooled_cls_pretrained = np.load("anno_mi_pretrained_BERT_cls.npy")

In [24]:
bert_model = "bert-base-uncased"

In [25]:
# Token-level encoder over the utterance text using the off-the-shelf
# `bert_model` checkpoint (no fine-tuning), followed by loading its weights.
text_encoder_pretrained_BERT = nlpsig.TextEncoder(
    df=anno_mi,
    feature_name="utterance_text",
    model_name=bert_model,
)
text_encoder_pretrained_BERT.load_pretrained_model()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
text_encoder_pretrained_BERT.tokenize_text(skip_special_tokens=False)

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/13551 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/13551 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id


Dataset({
    features: ['mi_quality', 'transcript_id', 'topic', 'utterance_id', 'interlocutor', 'timestamp', 'utterance_text', 'annotator_id', 'therapist_input_exists', 'therapist_input_subtype', 'reflection_exists', 'reflection_subtype', 'question_exists', 'question_subtype', 'main_therapist_behaviour', 'client_talk_type', 'datetime', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask', 'tokens'],
    num_rows: 13551
})

In [27]:
token_embeddings_pretrained = text_encoder_pretrained_BERT.obtain_embeddings(method="hidden_layer")

  0%|          | 0/136 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [28]:
# Collapse the token embeddings of each utterance into a single vector, once per
# pooling strategy. The first call uses the method's default pooling; the other
# three are requested by name, in the same order as before.
pooled_mean_pretrained = text_encoder_pretrained_BERT.pool_token_embeddings()
pooled_max_pretrained, pooled_sum_pretrained, pooled_cls_pretrained = (
    text_encoder_pretrained_BERT.pool_token_embeddings(method=pooling)
    for pooling in ("max", "sum", "cls")
)

  0%|          | 0/13551 [00:00<?, ?it/s]

  0%|          | 0/13551 [00:00<?, ?it/s]

  0%|          | 0/13551 [00:00<?, ?it/s]

  0%|          | 0/13551 [00:00<?, ?it/s]

In [29]:
pooled_mean_pretrained.shape

(13551, 768)

In [30]:
pooled_max_pretrained.shape

(13551, 768)

In [31]:
pooled_sum_pretrained.shape

(13551, 768)

In [32]:
pooled_cls_pretrained.shape

(13551, 768)

In [33]:
# Cache the pooled pretrained-BERT embeddings on disk so that later sessions can
# reload them (see the commented-out `np.load(...".npy")` cell at the top of this
# section) instead of recomputing them.
np.save("anno_mi_pretrained_BERT_mean", pooled_mean_pretrained)
np.save("anno_mi_pretrained_BERT_max", pooled_max_pretrained)
np.save("anno_mi_pretrained_BERT_sum", pooled_sum_pretrained)
np.save("anno_mi_pretrained_BERT_cls", pooled_cls_pretrained)

## Fine-tuning BERT and pooling

### (Ignoring this part for now, but will run this on a GPU cluster soon...)

In [20]:
# pooled_mean = np.load("anno_mi_BERT_mean.npy")
# pooled_max = np.load("anno_mi_BERT_max.npy")
# pooled_sum = np.load("anno_mi_BERT_sum.npy")
# pooled_cls = np.load("anno_mi_BERT_cls.npy")

In [35]:
from transformers import (
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorWithPadding,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    pipeline,
)

# Load the `bert_model` checkpoint with a masked-language-modelling head,
# together with its matching tokenizer.
model = AutoModelForMaskedLM.from_pretrained(bert_model)
tokenizer = AutoTokenizer.from_pretrained(bert_model)
# Padding collator handed to the TextEncoder constructor below; the separate
# language-modelling collator used for training is created further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [36]:
# Encoder for fine-tuning: instead of a model name, it is given the already
# instantiated masked-LM model, tokenizer and padding collator.
text_encoder_BERT = nlpsig.TextEncoder(
    df=anno_mi,
    feature_name="utterance_text",
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [37]:
text_encoder_BERT.tokenize_text(skip_special_tokens=False)

[INFO] Setting return_special_tokens_mask=True
[INFO] Tokenizing the dataset...


Map:   0%|          | 0/13551 [00:00<?, ? examples/s]

[INFO] Saving the tokenized text for each sentence into `.df['tokens']`...


Map:   0%|          | 0/13551 [00:00<?, ? examples/s]

[INFO] Creating tokenized dataframe and setting in `.tokenized_df` attribute...
[INFO] Note: 'text_id' is the column name for denoting the corresponding text id


Dataset({
    features: ['mi_quality', 'transcript_id', 'topic', 'utterance_id', 'interlocutor', 'timestamp', 'utterance_text', 'annotator_id', 'therapist_input_exists', 'therapist_input_subtype', 'reflection_exists', 'reflection_subtype', 'question_exists', 'question_subtype', 'main_therapist_behaviour', 'client_talk_type', 'datetime', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 13551
})

### Training the model

In [38]:
# set up data_collator for language modelling (has dynamic padding)
data_collator_for_LM = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                       mlm=True,
                                                       mlm_probability=0.15)

In [39]:
text_encoder_BERT.split_dataset(seed=seed)

[INFO] Splitting up dataset into train / validation / test sets, and saving to `.dataset_split`.


DatasetDict({
    train: Dataset({
        features: ['mi_quality', 'transcript_id', 'topic', 'utterance_id', 'interlocutor', 'timestamp', 'utterance_text', 'annotator_id', 'therapist_input_exists', 'therapist_input_subtype', 'reflection_exists', 'reflection_subtype', 'question_exists', 'question_subtype', 'main_therapist_behaviour', 'client_talk_type', 'datetime', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 10840
    })
    test: Dataset({
        features: ['mi_quality', 'transcript_id', 'topic', 'utterance_id', 'interlocutor', 'timestamp', 'utterance_text', 'annotator_id', 'therapist_input_exists', 'therapist_input_subtype', 'reflection_exists', 'reflection_subtype', 'question_exists', 'question_subtype', 'main_therapist_behaviour', 'client_talk_type', 'datetime', 'tokens', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 1356
    })
    validation: Dataset({
        features: ['mi_qua

In [40]:
type(text_encoder_BERT.dataset_split)

datasets.dataset_dict.DatasetDict

In [41]:
# Output directory / name for the fine-tuned model, and the training
# configuration: checkpoints are written every 10000 steps and the notebook
# seed is reused for training.
model_name = "bert-base-uncased-anno-mi"
text_encoder_BERT.set_up_training_args(
    output_dir=model_name,
    num_train_epochs=600,
    per_device_train_batch_size=128,
    disable_tqdm=False,
    save_strategy="steps",
    save_steps=10000,
    seed=seed,
)

[INFO] Setting up TrainingArguments object and saving to `.training_args`.


TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ign

In [42]:
type(text_encoder_BERT.training_args)

transformers.training_args.TrainingArguments

In [43]:
text_encoder_BERT.set_up_trainer(data_collator=data_collator_for_LM)

[INFO] Setting up Trainer object, and saving to `.trainer`.


<transformers.trainer.Trainer at 0x2b04c1e80>

In [44]:
type(text_encoder_BERT.trainer)

transformers.trainer.Trainer

In [45]:
torch.cuda.is_available()

False

In [46]:
torch.cuda.device_count()

0

In [47]:
# only report errors, to avoid excessive logging
# (same effect as the previous `set_verbosity(40)`: 40 is the ERROR level)
transformers.utils.logging.set_verbosity_error()

In [None]:
text_encoder_BERT.fit_transformer_with_trainer_api()

[INFO] Training model with 109514298 parameters...




Epoch,Training Loss,Validation Loss


In [None]:
text_encoder_BERT.trainer.save_model(model_name)

### Evaluating model on masked language modelling task

In [None]:
text_encoder_BERT.tokenizer.special_tokens_map

In [None]:
def compute_masked_character_accuracy(fill_mask, words, mask_token=None):
    """
    Mask every character position of every string in `words` in turn and
    measure how often the top fill-mask prediction reproduces the original.

    Parameters
    ----------
    fill_mask : callable
        Called with one masked string; `fill_mask(text)[0]['sequence']` is
        taken as the top prediction (the shape of a fill-mask pipeline result).
    words : sequence of str
        Strings to evaluate.
    mask_token : str, optional
        Placeholder inserted in place of the masked character. By default it
        is read from `fill_mask.tokenizer.mask_token` (e.g. "[MASK]" for BERT
        tokenizers), so the placeholder always matches the model being
        evaluated. If the callable exposes no tokenizer, the previously
        hard-coded "<mask>" is used.

    Returns
    -------
    float
        Fraction of masked positions whose top prediction equals the original
        string exactly; NaN if there was nothing to evaluate.

    Notes
    -----
    NOTE(review): the comparison is an exact string match against the
    pipeline's decoded 'sequence', so it presumably also depends on the
    tokenizer's normalisation (e.g. lower-casing) - confirm before relying on
    the number.
    """
    if mask_token is None:
        # ask the pipeline's own tokenizer which mask token the model expects
        pipeline_tokenizer = getattr(fill_mask, "tokenizer", None)
        mask_token = getattr(pipeline_tokenizer, "mask_token", None) or "<mask>"

    was_correct = []
    print(f"Evaluating with {len(words)} words")
    for word in tqdm(words):
        # one masked variant per character position of `word`
        masked_strings = [word[:i] + mask_token + word[i+1:] for i in range(len(word))]
        predictions = [fill_mask(masked)[0]['sequence'] for masked in masked_strings]
        was_correct += [pred == word for pred in predictions]

    if was_correct:
        acc = np.sum(was_correct) / len(was_correct)
    else:
        # nothing was evaluated (no words, or only empty strings): report NaN
        # explicitly rather than dividing by zero
        acc = float("nan")
    print(f"Accuracy: {acc}")
    return acc

In [None]:
# Fill-mask pipeline backed by the fine-tuned model saved under `model_name`.
fill_mask = pipeline("fill-mask",
                     model=model_name,
                     tokenizer=model_name)

# Evaluate on the held-out split. The split dataset has no "word" column (see
# the features printed by `split_dataset` above), so the previous lookup raised
# a KeyError; use the text column the encoder was built on instead.
compute_masked_character_accuracy(fill_mask, text_encoder_BERT.dataset_split["test"]["utterance_text"])

### Obtain embeddings from model

In [None]:
# setting the model to CPU (might not be always necessary to run this)
text_encoder_BERT.model.to('cpu')
token_embeddings = text_encoder_BERT.obtain_embeddings(method="hidden_layer")

In [None]:
token_embeddings.shape

In [None]:
# Collapse the fine-tuned model's token embeddings into one vector per
# utterance, once per pooling strategy. The first call uses the method's default
# pooling; the other three are requested by name, in the same order as before.
pooled_mean = text_encoder_BERT.pool_token_embeddings()
pooled_max, pooled_sum, pooled_cls = (
    text_encoder_BERT.pool_token_embeddings(method=pooling)
    for pooling in ("max", "sum", "cls")
)

In [None]:
pooled_mean.shape

In [None]:
pooled_max.shape

In [None]:
pooled_sum.shape

In [None]:
pooled_cls.shape

In [None]:
# Cache the pooled fine-tuned-BERT embeddings on disk so that later sessions can
# reload them (see the commented-out `np.load(...".npy")` cell at the top of this
# section) instead of recomputing them.
np.save("anno_mi_BERT_mean", pooled_mean)
np.save("anno_mi_BERT_max", pooled_max)
np.save("anno_mi_BERT_sum", pooled_sum)
np.save("anno_mi_BERT_cls", pooled_cls)

# Baseline 1: FFN baseline

Using the embeddings for the sentences directly in a FFN.

Below is a function that takes some inputs `x_data`, `y_data` and fits an FFN. Training stops early if the validation F1 score repeatedly fails to improve. The patience is set fairly high because an epoch in which the F1 score is merely unchanged also counts towards it, so a larger value lets the model stay at the same F1 score for a few epochs before stopping.

In [21]:
def implement_ffn(x_data,
                  y_data,
                  hidden_dim,
                  learning_rate,
                  loss,
                  gamma=0):
    """
    Train a feed-forward network on (x_data, y_data) and print test metrics.

    The data is shuffled and split into train / validation / test loaders, the
    network is trained with Adam (with a ReduceLROnPlateau scheduler) and early
    stopping on the validation F1 score, and the per-class F1 scores, their
    unweighted mean and the accuracy on the test split are printed.

    Relies on the notebook-level `seed` and `label_to_id` variables.

    Parameters
    ----------
    x_data : array-like
        Features; `x_data.shape[1]` is used as the network's input dimension.
    y_data : array-like
        Class ids, converted with `torch.tensor` (presumably one integer id
        from `label_to_id` per row of `x_data` - confirm against the caller).
    hidden_dim : int or list of int
        Passed through as `hidden_dim` to `FeedforwardNeuralNetModel`.
    learning_rate : float
        Learning rate for the Adam optimizer.
    loss : str
        Either "focal" or "cross_entropy".
    gamma : float, optional
        Passed to `FocalLoss`; only used when `loss == "focal"`.

    Returns
    -------
    The trained model, as returned by `training_pytorch`.

    Raises
    ------
    ValueError
        If `loss` is not one of the supported names.
    """
    # validate `loss` before doing any work: an unrecognised value used to leave
    # `criterion` unbound and only failed, with an UnboundLocalError, once
    # training was about to start
    if loss not in ("focal", "cross_entropy"):
        raise ValueError(
            f"loss must be 'focal' or 'cross_entropy', got {loss!r}"
        )

    # set seed
    set_seed(seed)

    # initialise FFN
    ffn_model = FeedforwardNeuralNetModel(input_dim=x_data.shape[1],
                                          hidden_dim=hidden_dim,
                                          output_dim=len(label_to_id),
                                          dropout_rate=0.1)

    # split dataset
    train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),
                                       y_data=torch.tensor(y_data),
                                       train_size=0.8,
                                       valid_size=0.5,
                                       shuffle=True,
                                       as_DataLoader=True,
                                       seed=seed)

    # define loss
    if loss == "focal":
        criterion = FocalLoss(gamma = gamma)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    # define optimizer
    optimizer = torch.optim.Adam(ffn_model.parameters(), lr=learning_rate)
    # define scheduler for adjusting the learning rate
    scheduler = ReduceLROnPlateau(optimizer, 'min')
    # scheduler = StepLR(optimizer, step_size = 4, gamma = 0.5)
    # scheduler = None

    ffn_model = training_pytorch(model=ffn_model,
                                 train_loader=train,
                                 criterion=criterion,
                                 optimizer=optimizer,
                                 num_epochs=10000,
                                 scheduler=scheduler,
                                 valid_loader=valid,
                                 seed=seed,
                                 early_stopping=True,
                                 early_stopping_metric="f1",
                                 patience=100,
                                 verbose=True,
                                 verbose_epoch=100)

    # evaluate on the held-out test split
    pred, label = testing_pytorch(ffn_model, test, criterion)
    print(f"proportion of labels in prediction: {[sum(pred==i)/len(pred) for i in label_to_id.values()]}")
    print(f"proportion of labels in data: {[sum(label==i)/len(label) for i in label_to_id.values()]}")

    # per-class F1 scores, followed by their unweighted mean
    f1_scores = metrics.f1_score(label, pred, average=None)
    print(f"- f1: {f1_scores}")
    print(f"- f1 (average): {sum(f1_scores)/len(f1_scores)}")
    print(f"- accuracy: {sum(pred==label)/len(pred)}")

    return ffn_model

Going to try out some variations (1 hidden layer, 2 hidden layers and 3 hidden layers - all of size 100)

In [22]:
# Hyperparameters shared by the FFN baseline runs below: one, two and three
# hidden layers (all of size 100), a fixed learning rate, and the loss name.
hidden_dim_trials = [100, [100] * 2, [100] * 3]
learning_rate = 2e-5
loss = "cross_entropy"

## SBERT 768

In [156]:
# FFN baseline on the SBERT (all-mpnet-base-v2) embeddings of the labelled
# utterances, once per hidden-layer configuration.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=sbert_768_embeddings[client_index],
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0885392427444458
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0795083045959473
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0757082592357288 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.5589850544929504
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.9038074016571045
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.6840004433285106 || Accuracy: 0.6953937411308289 || F1-score: 0.532593092563333
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7341877222061157
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.368276834487915
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.6921473199670966 || Accuracy: 0.7013372778892517 || F1-score: 0.5449254096356076
Early stopping at ep

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.116580843925476
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.118660807609558
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.1014031171798706 || Accuracy: 0.12184249609708786 || F1-score: 0.07240618101545253
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7407606244087219
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.4813379645347595
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.685649649663405 || Accuracy: 0.6939078569412231 || F1-score: 0.521815389563948
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7610892653465271
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.2291091680526733
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.6832592649893328 || Accuracy: 0.7028231620788574 || F1-score: 0.5480318366095817
Early stopping at epo

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0993850231170654
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.084618330001831
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0871503244746814 || Accuracy: 0.37147101759910583 || F1-score: 0.2602133749674733
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6977670192718506
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.1788264811038971
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7017621939832513 || Accuracy: 0.7013372778892517 || F1-score: 0.4817348905283425
Epoch: 201/10000 || Item: 0/85 || Loss: 0.6531517505645752
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.25363898277282715
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.6932478276166049 || Accuracy: 0.7028231620788574 || F1-score: 0.5252829038733572
Early stopping at 

## SBERT 384

In [95]:
# FFN baseline on the SBERT (all-MiniLM-L12-v2) embeddings of the labelled
# utterances, once per hidden-layer configuration.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=sbert_384_embeddings[client_index],
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0692368745803833
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0597608089447021
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0575233806263318 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.5714445114135742
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.993977427482605
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7121773145415566 || Accuracy: 0.6775631308555603 || F1-score: 0.4852881287212332
Early stopping at epoch 193!
Accuracy on dataset of size 672: 69.49404907226562 %.
Average loss: 0.7372136386958036
proportion of labels in prediction: [tensor(0.7693), tensor(0.1786), tensor(0.0521)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80982906 0.51111111 0.27536232]
- f1 (average): 0.532100829

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1142284870147705
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0898610353469849
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.1067617481405085 || Accuracy: 0.25557205080986023 || F1-score: 0.13570019723865875
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7563970685005188
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.24487124383449554
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7194767540151422 || Accuracy: 0.6864784359931946 || F1-score: 0.48013654513009985
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7156612277030945
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.055163860321045
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7143634991212324 || Accuracy: 0.6924219727516174 || F1-score: 0.5152181344117231
Early stopping a

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0598180294036865
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0403138399124146
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.051291660829024 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8030747175216675
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.3111073076725006
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7629680037498474 || Accuracy: 0.6627042889595032 || F1-score: 0.41140123154470193
Early stopping at epoch 174!
Accuracy on dataset of size 672: 66.51786041259766 %.
Average loss: 0.7649482705376365
proportion of labels in prediction: [tensor(0.7321), tensor(0.2679), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82107574 0.44242424 0.        ]
- f1 (average): 0.421166661122

## Pretrained BERT

### Mean pooled

In [96]:
# FFN baseline on the mean-pooled pretrained-BERT embeddings of the labelled
# utterances, once per hidden-layer configuration.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=pooled_mean_pretrained[client_index],
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0839786529541016
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0357847213745117
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0042308189652183 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.5391806364059448
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.6543185114860535
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7084653865207325 || Accuracy: 0.6879643201828003 || F1-score: 0.5493394831286408
Early stopping at epoch 186!
Accuracy on dataset of size 672: 69.04762268066406 %.
Average loss: 0.7226935570890253
proportion of labels in prediction: [tensor(0.7396), tensor(0.2009), tensor(0.0595)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80567686 0.51929825 0.29370629]
- f1 (average): 0.53956046

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1131868362426758
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0881928205490112
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.070029453797774 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7127247452735901
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.4683959186077118
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.6954461498693987 || Accuracy: 0.7028231620788574 || F1-score: 0.5731369952368576
Early stopping at epoch 194!
Accuracy on dataset of size 672: 68.75 %.
Average loss: 0.7084680687297474
proportion of labels in prediction: [tensor(0.7277), tensor(0.2024), tensor(0.0699)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80396476 0.51748252 0.30666667]
- f1 (average): 0.5427046472861451
- ac

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.098788857460022
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0711716413497925
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0712726332924583 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6781942248344421
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.12910042703151703
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.6977984309196472 || Accuracy: 0.6939078569412231 || F1-score: 0.5608619843873767
Early stopping at epoch 180!
Accuracy on dataset of size 672: 69.19642639160156 %.
Average loss: 0.7155460823665966
proportion of labels in prediction: [tensor(0.7277), tensor(0.1979), tensor(0.0744)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81057269 0.50883392 0.32679739]
- f1 (average): 0.54873466

### Max pooled

In [97]:
# Pretrained BERT, max-pooled: run `implement_ffn` once per entry of
# `hidden_dim_trials` on the rows selected by `client_index`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=pooled_max_pretrained[client_index],
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0957067012786865
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0682802200317383
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.969028727574782 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.571304440498352
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.8043275475502014
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7314741611480713 || Accuracy: 0.6716195940971375 || F1-score: 0.506992275294162
Early stopping at epoch 178!
Accuracy on dataset of size 672: 68.00595092773438 %.
Average loss: 0.7330376018177379
proportion of labels in prediction: [tensor(0.7619), tensor(0.2068), tensor(0.0312)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80558539 0.47750865 0.20967742]
- f1 (average): 0.49759048730

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1076050996780396
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0695494413375854
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0264393470504067 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8345953226089478
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.3302067220211029
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.721508036960255 || Accuracy: 0.6775631308555603 || F1-score: 0.5272523330049675
Early stopping at epoch 193!
Accuracy on dataset of size 672: 69.3452377319336 %.
Average loss: 0.7208894978870045
proportion of labels in prediction: [tensor(0.7500), tensor(0.2009), tensor(0.0491)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80823402 0.5122807  0.29411765]
- f1 (average): 0.5382107894

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0997354984283447
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0507149696350098
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0337201248515735 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7388246655464172
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.18091349303722382
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7373271042650397 || Accuracy: 0.6760772466659546 || F1-score: 0.5229165624322477
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7790274620056152
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.18720248341560364
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7315309806303545 || Accuracy: 0.6805348992347717 || F1-score: 0.5289228692454498
Early stopping a

### Sum pooled

In [98]:
# Pretrained BERT, sum-pooled: run `implement_ffn` once per entry of
# `hidden_dim_trials` on the rows selected by `client_index`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=pooled_sum_pretrained[client_index],
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 2.0271031856536865
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0212513208389282
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0024839531291614 || Accuracy: 0.6225854158401489 || F1-score: 0.3232245025425277
Epoch: 101/10000 || Item: 0/85 || Loss: 0.4351794719696045
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.2922021746635437
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.72612741318616 || Accuracy: 0.6909360885620117 || F1-score: 0.565352009350549
Early stopping at epoch 136!
Accuracy on dataset of size 672: 70.68452453613281 %.
Average loss: 0.7379533269188621
proportion of labels in prediction: [tensor(0.7485), tensor(0.1786), tensor(0.0729)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80477223 0.54074074 0.40789474]
- f1 (average): 0.584469237285

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.114878535270691
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0557730197906494
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.054190386425365 || Accuracy: 0.6151559948921204 || F1-score: 0.34358449687048626
Epoch: 101/10000 || Item: 0/85 || Loss: 0.736968994140625
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.3847038149833679
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7197055708278309 || Accuracy: 0.7028231620788574 || F1-score: 0.5803708684031409
Early stopping at epoch 151!
Accuracy on dataset of size 672: 71.57737731933594 %.
Average loss: 0.7132322219285098
proportion of labels in prediction: [tensor(0.7426), tensor(0.1860), tensor(0.0714)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81263617 0.56       0.41059603]
- f1 (average): 0.59441073068

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.111946940422058
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1200898885726929
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0467318513176658 || Accuracy: 0.6344724893569946 || F1-score: 0.293682004525378
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6803026795387268
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.07645384967327118
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7378906770185991 || Accuracy: 0.7072808146476746 || F1-score: 0.5981767162832544
Early stopping at epoch 139!
Accuracy on dataset of size 672: 71.13095092773438 %.
Average loss: 0.736527681350708
proportion of labels in prediction: [tensor(0.7232), tensor(0.1860), tensor(0.0908)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81104972 0.53818182 0.45121951]
- f1 (average): 0.60015035137

### CLS

In [99]:
# Pretrained BERT, CLS embedding: run `implement_ffn` once per entry of
# `hidden_dim_trials` on the rows selected by `client_index`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=pooled_cls_pretrained[client_index],
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.072586178779602
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.054619312286377
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9783457517623901 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.5552260279655457
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.6392856240272522
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7163692333481528 || Accuracy: 0.6849925518035889 || F1-score: 0.5467744526948758
Early stopping at epoch 186!
Accuracy on dataset of size 672: 71.13095092773438 %.
Average loss: 0.7055106813257391
proportion of labels in prediction: [tensor(0.7515), tensor(0.1801), tensor(0.0685)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81818182 0.51660517 0.40268456]
- f1 (average): 0.5791571826

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1158097982406616
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0941648483276367
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0671783143823796 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8176752924919128
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.4519164562225342
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7216152494603937 || Accuracy: 0.6805348992347717 || F1-score: 0.5284876279863481
Early stopping at epoch 149!
Accuracy on dataset of size 672: 69.3452377319336 %.
Average loss: 0.7150283401662653
proportion of labels in prediction: [tensor(0.7560), tensor(0.1696), tensor(0.0744)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80690399 0.46969697 0.39215686]
- f1 (average): 0.556252607

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.099726676940918
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0698367357254028
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.062767429785295 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7204791903495789
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.13758376240730286
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7134658856825395 || Accuracy: 0.689450204372406 || F1-score: 0.56131034880422
Early stopping at epoch 182!
Accuracy on dataset of size 672: 70.68452453613281 %.
Average loss: 0.6856141903183677
proportion of labels in prediction: [tensor(0.7336), tensor(0.1830), tensor(0.0833)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81578947 0.51282051 0.41509434]
- f1 (average): 0.581234775375

## Fine-tuned BERT

### Mean pooled

In [None]:
# Mean-pooled embeddings -> FFN, one run per entry of `hidden_dim_trials`.
# Unlike the neighbouring cells, this one hard-codes the learning rate (1e-5)
# and the loss ("cross_entropy") instead of using the notebook-level
# `learning_rate` / `loss` variables.
# NOTE(review): this cell sits under the "Fine-tuned BERT" heading but passes
# `pooled_mean_pretrained` -- confirm whether fine-tuned embeddings were intended.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=pooled_mean_pretrained[client_index],
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=1e-5,
        loss="cross_entropy",
    )

### Max pooled

In [None]:
# Max-pooled embeddings -> FFN, one run per entry of `hidden_dim_trials`.
# NOTE(review): this cell sits under the "Fine-tuned BERT" heading but passes
# `pooled_max_pretrained` -- confirm whether fine-tuned embeddings were intended.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=pooled_max_pretrained[client_index],
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

### Sum pooled

In [None]:
# Sum-pooled embeddings -> FFN, one run per entry of `hidden_dim_trials`.
# NOTE(review): this cell sits under the "Fine-tuned BERT" heading but passes
# `pooled_sum_pretrained` -- confirm whether fine-tuned embeddings were intended.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=pooled_sum_pretrained[client_index],
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

### CLS

In [None]:
# CLS embeddings -> FFN, one run per entry of `hidden_dim_trials`.
# Fix: this cell previously passed `pooled_sum_pretrained`, which duplicated
# the "Sum pooled" cell above; the CLS embeddings are what the heading asks for.
# NOTE(review): this cell sits under the "Fine-tuned BERT" heading but still
# uses the *pretrained* CLS embeddings -- swap in the fine-tuned ones if they
# exist elsewhere in the notebook.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=pooled_cls_pretrained[client_index],
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

# Baseline 2: Averaging history and use FFN

Here, we use `nlpsig` to construct paths of embeddings, average the embeddings along each path, and use the averaged embedding as the input to an FFN.

First, we define the arguments that control how each path is constructed. Since we only take a simple average of the embeddings, we set `zero_padding` to `False` and construct the path from the last `k` posts.

In [54]:
# Keyword arguments that `obtain_mean_history` unpacks into `.pad(...)`
# when building the paths of embeddings.
path_specifics = {
    "pad_by": "history",
    "zero_padding": False,
    "method": "k_last",
    "k": 5,
    "time_feature": None,
    "embeddings": "full",
    "include_current_embedding": True,
}

In [161]:
def obtain_mean_history(embeddings, path_specifics):
    """Build padded paths of embeddings with `nlpsig` and average each path.

    Relies on the notebook-level globals `anno_mi` (the dataframe handed to
    `nlpsig.PrepareData`) and `client_index` (used to select entries of the
    padded array).

    Parameters
    ----------
    embeddings
        Passed to `nlpsig.PrepareData` as its `embeddings` argument.
    path_specifics : dict
        Keyword arguments unpacked into `PrepareData.pad`.

    Returns
    -------
    The selected paths with their last two trailing columns removed,
    averaged over axis 1 and cast with `.astype("float")`.
    """
    prepared = nlpsig.PrepareData(
        anno_mi,
        id_column="transcript_id",
        label_column="client_talk_type",
        embeddings=embeddings,
    )
    padded = prepared.pad(**path_specifics)
    # keep only the entries selected by `client_index`
    selected = padded[client_index]
    # drop the two trailing columns (they hold the id and the label)
    features = selected[:, :, :-2]
    # collapse axis 1 by averaging -- presumably the history axis; TODO confirm
    return features.mean(1).astype("float")

## SBERT 768

In [169]:
# SBERT-768: average the embedding history once, then run `implement_ffn`
# on it for every entry of `hidden_dim_trials`.
path_history = obtain_mean_history(sbert_768_embeddings, path_specifics)

for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.088150978088379
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0829229354858398
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.075862082568082 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.831689178943634
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.9813627004623413
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8894803578203375 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9005365100773898
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- ac

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1165460348129272
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1194690465927124
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.1018761504780163 || Accuracy: 0.12184249609708786 || F1-score: 0.07240618101545253
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9759571552276611
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.4108854830265045
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8946103345264088 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 102!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9107744693756104
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0994921922683716
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0858962535858154
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0874490196054631 || Accuracy: 0.2615155875682831 || F1-score: 0.14264979681024412
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9241882562637329
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.6933719515800476
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8937450593168085 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 102!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9194471185857599
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
-

## SBERT 384

In [170]:
# SBERT-384: average the embedding history once, then run `implement_ffn`
# on it for every entry of `hidden_dim_trials`.
path_history = obtain_mean_history(sbert_384_embeddings, path_specifics)

for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0689338445663452
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.059541940689087
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.05829148942774 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8537594079971313
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 1.0656259059906006
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.896614210172133 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.912129510532726
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- accu

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.114090919494629
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0903785228729248
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.106756405396895 || Accuracy: 0.25557205080986023 || F1-score: 0.13570019723865875
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9462771415710449
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.4005976617336273
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8934355920011346 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 103!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9252154231071472
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- 

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0598030090332031
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0398383140563965
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0513845031911677 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9368603229522705
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.695285439491272
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.893760393966328 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9282568964091215
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- a

## Pretrained BERT

### Mean pooled

In [171]:
# Pretrained BERT, mean-pooled: average the embedding history once, then run
# `implement_ffn` on it for every entry of `hidden_dim_trials`.
path_history = obtain_mean_history(pooled_mean_pretrained, path_specifics)

for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0827751159667969
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.067988395690918
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.00626910274679 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8487571477890015
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 1.1112945079803467
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.886727511882782 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9075098742138256
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- acc

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1143455505371094
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0963679552078247
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0698934359983965 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.976948618888855
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.4426864683628082
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8920709815892306 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9157748547467318
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- 

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0997600555419922
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0753464698791504
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0727270191366023 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9181236028671265
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.721269428730011
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8867454582994635 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9253213839097456
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- 

### Max pooled

In [172]:
# Pretrained BERT, max-pooled: average the embedding history once, then run
# `implement_ffn` on it for every entry of `hidden_dim_trials`.
path_history = obtain_mean_history(pooled_max_pretrained, path_specifics)

for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0947773456573486
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.159409999847412
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9376135522669012 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8620471954345703
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 1.1281884908676147
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8903347687287764 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9101777943697843
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- 

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1085178852081299
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0852488279342651
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0026371316476301 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9753608107566833
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.4552372694015503
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8919325145808134 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.916386983611367
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- 

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.099276065826416
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0503199100494385
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0202285701578313 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.926359236240387
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.7107961177825928
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8936019160530784 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9287005175243724
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- a

### Sum pooled

In [173]:
# Pretrained BERT, sum-pooled: average the embedding history once, then run
# `implement_ffn` on it for every entry of `hidden_dim_trials`.
path_history = obtain_mean_history(pooled_sum_pretrained, path_specifics)

for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.5255906581878662
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.2541639804840088
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9284746050834656 || Accuracy: 0.6240713000297546 || F1-score: 0.26351130351130353
Epoch: 101/10000 || Item: 0/85 || Loss: 0.5938670039176941
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.7251362800598145
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8142873265526511 || Accuracy: 0.6493313312530518 || F1-score: 0.41329564888064
Early stopping at epoch 171!
Accuracy on dataset of size 672: 65.32737731933594 %.
Average loss: 0.8633222579956055
proportion of labels in prediction: [tensor(0.8795), tensor(0.1042), tensor(0.0164)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.78019802 0.31818182 0.1754386 ]
- f1 (average): 0.4246061448

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0804401636123657
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.2311005592346191
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.953440628268502 || Accuracy: 0.6210995316505432 || F1-score: 0.25542315918117936
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8102289438247681
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.4154810905456543
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8209891265088861 || Accuracy: 0.6419019103050232 || F1-score: 0.4145659757894998
Early stopping at epoch 184!
Accuracy on dataset of size 672: 65.77381134033203 %.
Average loss: 0.8609610958532854
proportion of labels in prediction: [tensor(0.8557), tensor(0.1220), tensor(0.0223)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.7806841  0.36206897 0.20338983]
- f1 (average): 0.448714300

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0874210596084595
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1935654878616333
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9624836119738492 || Accuracy: 0.6210995316505432 || F1-score: 0.25542315918117936
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7057756781578064
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.6206573247909546
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8404642180962996 || Accuracy: 0.6389301419258118 || F1-score: 0.44752010597029046
Early stopping at epoch 172!
Accuracy on dataset of size 672: 64.73213958740234 %.
Average loss: 0.8940324891697277
proportion of labels in prediction: [tensor(0.8259), tensor(0.1414), tensor(0.0327)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76796715 0.3755102  0.24      ]
- f1 (average): 0.4611591

### CLS

In [174]:
# Pretrained-BERT CLS embeddings: build the history features with
# `obtain_mean_history`, then fit the FFN baseline on them.
path_history = obtain_mean_history(
    pooled_cls_pretrained,
    path_specifics,
)

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]


********** hidden_dim: 100


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0739248991012573
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0877466201782227
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9801731163805182 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8488182425498962
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 1.0061792135238647
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8912230188196356 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9096886840733615
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
-

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.116005539894104
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0968785285949707
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0649296695535833 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9889340400695801
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.43216484785079956
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8960904695770957 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9185672185637734
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
-

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.100395917892456
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0688506364822388
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0630166205492886 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9245232343673706
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.6860083341598511
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8921460184183988 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9267005866224115
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- 

## Fine-tuned BERT

### Mean pooled

In [175]:
# Fine-tuned BERT, mean-pooled embeddings -> history features -> FFN baseline.
# NOTE(review): the saved run of this cell stopped with
# "NameError: name 'pooled_mean' is not defined"; `pooled_mean` has to be
# created earlier in the notebook before this cell can run.
path_history = obtain_mean_history(
    pooled_mean,
    path_specifics,
)

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

NameError: name 'pooled_mean' is not defined

### Max pooled

In [None]:
# Fine-tuned BERT, max-pooled embeddings -> history features -> FFN baseline.
# NOTE(review): this cell has no execution count in the saved notebook;
# it presumably needs `pooled_max` from the fine-tuning step - confirm.
path_history = obtain_mean_history(
    pooled_max,
    path_specifics,
)

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

### Sum pooled

In [None]:
# Fine-tuned BERT, sum-pooled embeddings -> history features -> FFN baseline.
# NOTE(review): this cell has no execution count in the saved notebook;
# it presumably needs `pooled_sum` from the fine-tuning step - confirm.
path_history = obtain_mean_history(
    pooled_sum,
    path_specifics,
)

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

### CLS

In [None]:
# Fine-tuned BERT, CLS embeddings -> history features -> FFN baseline.
# NOTE(review): this cell has no execution count in the saved notebook;
# it presumably needs `pooled_cls` from the fine-tuning step - confirm.
path_history = obtain_mean_history(
    pooled_cls,
    path_specifics,
)

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=path_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

# Baseline 3: LSTM classification

# Baseline 4: FFN using signatures

First, we reduce the dimension of the embeddings and then compute the path signature of each utterance's history. The path signatures are then used as the input to the FFN for classification.

In [23]:
# Keyword arguments forwarded to `PrepareData.pad` when building the paths
# for the signature baseline (see `paths.pad(**path_specifics)`).
# NOTE: this rebinds the notebook-level `path_specifics` that the
# `obtain_mean_history` cells above also read, so re-running those cells
# after this one will pick up this configuration instead.
path_specifics = dict(
    pad_by="history",
    zero_padding=False,
    method="k_last",
    k=5,
    time_feature=None,
    embeddings="dim_reduced",
    include_current_embedding=True,
)

In [24]:
def obtain_signatures_history(embeddings, path_specifics, dimension, sig_depth):
    """Compute path signatures of dimension-reduced embedding histories.

    The embeddings are projected to `dimension` components with a Gaussian
    random projection (seeded with the notebook-level `seed`), padded into
    paths via `nlpsig.PrepareData.pad`, and passed to `signatory.signature`.

    Relies on the notebook globals `anno_mi`, `client_index` and `seed`.

    Parameters
    ----------
    embeddings
        Embedding matrix handed to `nlpsig.DimReduce.fit_transform` and to
        `nlpsig.PrepareData` (presumably one row per row of `anno_mi` -
        TODO confirm).
    path_specifics : dict
        Keyword arguments unpacked into `PrepareData.pad`.
    dimension : int
        Number of components kept by the random projection.
    sig_depth : int
        Truncation depth passed to `signatory.signature`.

    Returns
    -------
    torch.Tensor
        Float tensor of signatures for the rows selected by `client_index`.
    """
    # Dimension reduction of the raw embeddings.
    reducer = nlpsig.DimReduce(
        method="gaussian_random_projection",
        n_components=dimension,
    )
    reduced = reducer.fit_transform(embeddings, random_state=seed)

    # Attach both the full and the reduced embeddings to the dataframe,
    # then build the padded paths.
    prepared = nlpsig.PrepareData(
        anno_mi,
        id_column="transcript_id",
        label_column="client_talk_type",
        embeddings=embeddings,
        embeddings_reduced=reduced,
    )
    padded = prepared.pad(**path_specifics)

    # Keep the rows picked out by `client_index` and drop the last two
    # columns, which hold the id and the label.
    client_paths = padded[client_index][:, :, :-2].astype("float")

    # signatory operates on torch tensors, so convert before the call.
    path_tensor = torch.from_numpy(client_paths).float()
    return signatory.signature(path_tensor, sig_depth).float()

In [28]:
# Settings for the signature baseline: number of components kept by the
# dimension reduction, and the truncation depth of the signature.
# With these values the recorded runs report 780 signature features
# (5 + 5**2 + 5**3 + 5**4).
dimension, sig_depth = 5, 4

## SBERT 768

In [26]:
# SBERT-768 embeddings: signatures of the dimension-reduced histories,
# used as the FFN input.
signature_history = obtain_signatures_history(
    sbert_768_embeddings,
    path_specifics,
    dimension,
    sig_depth,
)
print(f"signature_history.shape = {signature_history.shape}")

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=signature_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: 100


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1296181678771973
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1276369094848633
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.1229537291960283 || Accuracy: 0.12184249609708786 || F1-score: 0.07240618101545253
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8314493894577026
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.9525586366653442
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.9158739393407648 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 106!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9345286271788857
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1389586925506592
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1151443719863892
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.1336447325619785 || Accuracy: 0.1367013305425644 || F1-score: 0.11381436427262122
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9477335810661316
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.45682698488235474
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.9138921065763994 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 104!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.931088463826613
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
-

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0805878639221191
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0740684270858765
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0679300915111194 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.936995804309845
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.6773010492324829
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.9045545350421559 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9375779899683866
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- 

## SBERT 384

In [29]:
# SBERT-384 embeddings: signatures of the dimension-reduced histories,
# used as the FFN input.
signature_history = obtain_signatures_history(
    sbert_384_embeddings,
    path_specifics,
    dimension,
    sig_depth,
)
print(f"signature_history.shape = {signature_history.shape}")

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=signature_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: 100


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1300742626190186
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1269279718399048
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.1223173466595737 || Accuracy: 0.12184249609708786 || F1-score: 0.07240618101545253
Epoch: 101/10000 || Item: 0/85 || Loss: 0.805180549621582
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 1.0327091217041016
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.9099448052319613 || Accuracy: 0.6240713000297546 || F1-score: 0.2598877839178142
Early stopping at epoch 105!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9492884224111383
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- 

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1388251781463623
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1151952743530273
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.133223761211742 || Accuracy: 0.12927190959453583 || F1-score: 0.10275403608736942
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9606556296348572
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.45526576042175293
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.9071839939464222 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 104!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9430014870383523
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0802912712097168
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0748323202133179
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0678671490062366 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9114990234375
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.6415907740592957
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.9043075767430392 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Early stopping at epoch 101!
Accuracy on dataset of size 672: 62.35118865966797 %.
Average loss: 0.9514352083206177
proportion of labels in prediction: [tensor(1.), tensor(0.), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76810266 0.         0.        ]
- f1 (average): 0.25603421937060805
- ac

## Pretrained BERT

### Mean pooled

In [30]:
# Pretrained BERT, mean-pooled embeddings: signatures of the
# dimension-reduced histories, used as the FFN input.
signature_history = obtain_signatures_history(
    pooled_mean_pretrained,
    path_specifics,
    dimension,
    sig_depth,
)
print(f"signature_history.shape = {signature_history.shape}")

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=signature_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: 100


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 3.7441978454589844
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 6.150984287261963
--------------------------------------------------
Validation || Epoch: 1 || Loss: 3.259506008841775 || Accuracy: 0.3789004385471344 || F1-score: 0.2955251625410387
Epoch: 101/10000 || Item: 0/85 || Loss: 0.5173958539962769
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.5215308666229248
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.6734585491093723 || Accuracy: 0.5378900170326233 || F1-score: 0.30862251887560127
Early stopping at epoch 163!
Accuracy on dataset of size 672: 55.654762268066406 %.
Average loss: 1.819061886180531
proportion of labels in prediction: [tensor(0.8452), tensor(0.1101), tensor(0.0446)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.72745694 0.11607143 0.03007519]
- f1 (average): 0.2912011855

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 2.1577088832855225
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.8434455394744873
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.41989220272411 || Accuracy: 0.4041604697704315 || F1-score: 0.3026431397979094
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7993342876434326
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.29983827471733093
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.125967020338232 || Accuracy: 0.5854383111000061 || F1-score: 0.2882955502925611
Early stopping at epoch 111!
Accuracy on dataset of size 672: 58.779762268066406 %.
Average loss: 1.2293318726799705
proportion of labels in prediction: [tensor(0.9062), tensor(0.0759), tensor(0.0179)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.74513619 0.10945274 0.0173913 ]
- f1 (average): 0.2906600758

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.4558333158493042
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.6769936084747314
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.1019763621416958 || Accuracy: 0.5898959636688232 || F1-score: 0.3101851851851852
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8751983046531677
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.7145705223083496
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.9868716489184987 || Accuracy: 0.6166418790817261 || F1-score: 0.30221593665319496
Early stopping at epoch 104!
Accuracy on dataset of size 672: 61.904762268066406 %.
Average loss: 1.0826639424670825
proportion of labels in prediction: [tensor(0.9613), tensor(0.0342), tensor(0.0045)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76431925 0.09248555 0.01886792]
- f1 (average): 0.2918909

### Max pooled

In [31]:
# Pretrained BERT, max-pooled embeddings: signatures of the
# dimension-reduced histories, used as the FFN input.
signature_history = obtain_signatures_history(
    pooled_max_pretrained,
    path_specifics,
    dimension,
    sig_depth,
)
print(f"signature_history.shape = {signature_history.shape}")

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=signature_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: 100


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 21.10601234436035
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 12.224977493286133
--------------------------------------------------
Validation || Epoch: 1 || Loss: 10.66905671899969 || Accuracy: 0.3789004385471344 || F1-score: 0.2881857171991343
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7429050803184509
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.7301334738731384
--------------------------------------------------
Validation || Epoch: 101 || Loss: 3.5669758319854736 || Accuracy: 0.518573522567749 || F1-score: 0.30701727642276416
Early stopping at epoch 166!
Accuracy on dataset of size 672: 54.761905670166016 %.
Average loss: 3.691923423246904
proportion of labels in prediction: [tensor(0.7708), tensor(0.1920), tensor(0.0372)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.69797225 0.23655914 0.125     ]
- f1 (average): 0.35317713055

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 8.153376579284668
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 5.994278907775879
--------------------------------------------------
Validation || Epoch: 1 || Loss: 3.633267944509333 || Accuracy: 0.4160475432872772 || F1-score: 0.3395383192400591
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9116073250770569
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.30709612369537354
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.4584283774549311 || Accuracy: 0.528974711894989 || F1-score: 0.2895468472826963
Early stopping at epoch 101!
Accuracy on dataset of size 672: 55.20833206176758 %.
Average loss: 1.5717109658501365
proportion of labels in prediction: [tensor(0.8393), tensor(0.1250), tensor(0.0357)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.7121058  0.14529915 0.06299213]
- f1 (average): 0.306799023286

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 3.3812642097473145
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.7356096506118774
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.7038150375539607 || Accuracy: 0.5572065114974976 || F1-score: 0.31691976419027806
Epoch: 101/10000 || Item: 0/85 || Loss: 1.0301419496536255
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.7248117923736572
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.0454745401035657 || Accuracy: 0.5958395004272461 || F1-score: 0.2886131204789224
Early stopping at epoch 104!
Accuracy on dataset of size 672: 58.18452453613281 %.
Average loss: 1.1007776260375977
proportion of labels in prediction: [tensor(0.9226), tensor(0.0625), tensor(0.0149)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.73724735 0.08333333 0.        ]
- f1 (average): 0.27352689

### Sum pooled

In [32]:
# Pretrained BERT, sum-pooled embeddings: signatures of the
# dimension-reduced histories, used as the FFN input.
# NOTE(review): the saved output of this cell reports losses in the range of
# roughly 1e4-1e7, far above the other pooling variants; the sum-pooled
# signatures may need rescaling before training - confirm.
signature_history = obtain_signatures_history(
    pooled_sum_pretrained,
    path_specifics,
    dimension,
    sig_depth,
)
print(f"signature_history.shape = {signature_history.shape}")

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=signature_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: 100


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 24631632.0
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 4711557.0
--------------------------------------------------
Validation || Epoch: 1 || Loss: 5138354.7727272725 || Accuracy: 0.30312034487724304 || F1-score: 0.24780222971615964
Epoch: 101/10000 || Item: 0/85 || Loss: 438294.0625
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 14429.716796875
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1813593.4460227273 || Accuracy: 0.4918276369571686 || F1-score: 0.32324284347418575
Early stopping at epoch 157!
Accuracy on dataset of size 672: 53.27381134033203 %.
Average loss: 2374243.1136363638
proportion of labels in prediction: [tensor(0.7500), tensor(0.1815), tensor(0.0685)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.69122427 0.23529412 0.09395973]
- f1 (average): 0.3401593726265802
- accuracy: 0.53

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 4012826.75
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1845360.5
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1519848.6420454546 || Accuracy: 0.3893016278743744 || F1-score: 0.33526926716779265
Epoch: 101/10000 || Item: 0/85 || Loss: 152850.84375
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 116427.6015625
--------------------------------------------------
Validation || Epoch: 101 || Loss: 528084.4169034091 || Accuracy: 0.5408617854118347 || F1-score: 0.3710585187801719
Epoch: 201/10000 || Item: 0/85 || Loss: 57562.52734375
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 136120.421875
--------------------------------------------------
Validation || Epoch: 201 || Loss: 382654.3352272727 || Accuracy: 0.5334323644638062 || F1-score: 0.37047981318734907
Early stopping at epoch 254!
Accuracy on dataset of siz

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 4776060.5
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 588908.25
--------------------------------------------------
Validation || Epoch: 1 || Loss: 848951.8991477273 || Accuracy: 0.49479940533638 || F1-score: 0.3280075672438099
Epoch: 101/10000 || Item: 0/85 || Loss: 81126.34375
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 75532.5703125
--------------------------------------------------
Validation || Epoch: 101 || Loss: 243149.96661931818 || Accuracy: 0.5126299858093262 || F1-score: 0.34342909084755435
Early stopping at epoch 119!
Accuracy on dataset of size 672: 51.636905670166016 %.
Average loss: 230526.19850852274
proportion of labels in prediction: [tensor(0.7143), tensor(0.2158), tensor(0.0699)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.67408231 0.23050847 0.13333333]
- f1 (average): 0.3459747071971577
- accuracy: 0.516369044

### CLS

In [33]:
# Pretrained BERT, CLS embeddings: signatures of the
# dimension-reduced histories, used as the FFN input.
signature_history = obtain_signatures_history(
    pooled_cls_pretrained,
    path_specifics,
    dimension,
    sig_depth,
)
print(f"signature_history.shape = {signature_history.shape}")

# One call to `implement_ffn` per entry of `hidden_dim_trials`.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(
        x_data=signature_history,
        y_data=y_data,
        hidden_dim=hidden_dim,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

signature_history.shape = torch.Size([6725, 780])

********** hidden_dim: 100


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 3.4455976486206055
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 4.314119815826416
--------------------------------------------------
Validation || Epoch: 1 || Loss: 2.479116439819336 || Accuracy: 0.37444278597831726 || F1-score: 0.29591340220143464
Epoch: 101/10000 || Item: 0/85 || Loss: 0.5990457534790039
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.5958691835403442
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.4913460124622693 || Accuracy: 0.5646359324455261 || F1-score: 0.3106762934164993
Early stopping at epoch 170!
Accuracy on dataset of size 672: 56.69643020629883 %.
Average loss: 1.5198925516822122
proportion of labels in prediction: [tensor(0.8333), tensor(0.1295), tensor(0.0372)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.72318693 0.19409283 0.0625    ]
- f1 (average): 0.326593250

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 2.1884889602661133
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.824597716331482
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.207720864902843 || Accuracy: 0.43982169032096863 || F1-score: 0.3415113871635611
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7532700300216675
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.5469624996185303
--------------------------------------------------
Validation || Epoch: 101 || Loss: 1.0839504220268943 || Accuracy: 0.5973253846168518 || F1-score: 0.31075292640596736
Early stopping at epoch 121!
Accuracy on dataset of size 672: 59.0773811340332 %.
Average loss: 1.1274788813157515
proportion of labels in prediction: [tensor(0.9241), tensor(0.0625), tensor(0.0134)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.74230769 0.10416667 0.01785714]
- f1 (average): 0.2881105006

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.3909130096435547
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 0.9869392514228821
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0498062426393682 || Accuracy: 0.5824665427207947 || F1-score: 0.2753911679371505
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8995655179023743
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.7283216714859009
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.9988734071904962 || Accuracy: 0.6136701107025146 || F1-score: 0.2773923113307563
Early stopping at epoch 103!
Accuracy on dataset of size 672: 61.755950927734375 %.
Average loss: 1.0209869037974963
proportion of labels in prediction: [tensor(0.9658), tensor(0.0312), tensor(0.0030)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.76217228 0.09356725 0.        ]
- f1 (average): 0.28524651

### Fine-tuned BERT

### Mean pooled

In [None]:
# Compute `signature_history` from the `pooled_mean` embeddings.
# `path_specifics`, `dimension` and `sig_depth` are notebook-level variables set in earlier cells.
signature_history = obtain_signatures_history(pooled_mean, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
# Call implement_ffn once per candidate in `hidden_dim_trials`;
# `y_data`, `learning_rate` and `loss` are notebook-level variables and are the same for every call.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### Max pooled

In [None]:
# Compute `signature_history` from the `pooled_max` embeddings.
# `path_specifics`, `dimension` and `sig_depth` are notebook-level variables set in earlier cells.
signature_history = obtain_signatures_history(pooled_max, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
# Call implement_ffn once per candidate in `hidden_dim_trials`;
# `y_data`, `learning_rate` and `loss` are notebook-level variables and are the same for every call.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### Sum pooled

In [None]:
# Compute `signature_history` from the `pooled_sum` embeddings.
# `path_specifics`, `dimension` and `sig_depth` are notebook-level variables set in earlier cells.
signature_history = obtain_signatures_history(pooled_sum, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
# Call implement_ffn once per candidate in `hidden_dim_trials`;
# `y_data`, `learning_rate` and `loss` are notebook-level variables and are the same for every call.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

### CLS

In [None]:
# Compute `signature_history` from the `pooled_cls` embeddings.
# `path_specifics`, `dimension` and `sig_depth` are notebook-level variables set in earlier cells.
signature_history = obtain_signatures_history(pooled_cls, path_specifics, dimension, sig_depth)
print(f"signature_history.shape = {signature_history.shape}")
# Call implement_ffn once per candidate in `hidden_dim_trials`;
# `y_data`, `learning_rate` and `loss` are notebook-level variables and are the same for every call.
for hidden_dim in hidden_dim_trials:
    print(f"\n********** hidden_dim: {hidden_dim}")
    implement_ffn(x_data=signature_history,
                  y_data=y_data,
                  hidden_dim=hidden_dim,
                  learning_rate=learning_rate,
                  loss=loss)

# StackedDeepSigNet

## Obtaining path by looking at post history

We can obtain a path by looking at the history of each post. Here we look at the last 20 posts (and pad with vectors of zeros if there are fewer than 20 posts) including the current post.

We only want to consider paths that correspond to a client's utterance as we want to model a change in mood at that time. Their history will still contain the therapist's utterances too.

In [35]:
# Names of the time features to use; implement_sdsn also reads this list
# (via len(time_features)) to size the model.
time_features = ["time_encoding", "timeline_index"]
# Keyword arguments that obtain_SDSN_input forwards to `paths.pad(**path_specifics)`.
# NOTE: this rebinds the notebook-level `path_specifics` that the earlier
# obtain_signatures_history(...) cells also receive, so re-running those cells
# after this one picks up these settings instead.
path_specifics = {"pad_by": "history",
                  "zero_padding": True,
                  "method": "k_last",
                  # presumably the history window length for "k_last" -- confirm against the nlpsig docs
                  "k": 20,
                  "time_feature": time_features,
                  # presumably one entry per time feature, in the same order -- confirm against the nlpsig docs
                  "standardise_method": ["minmax", None],
                  "embeddings": "dim_reduced",
                  "include_current_embedding": True,
                  "pad_from_below": False}

In [36]:
def obtain_SDSN_input(embeddings, path_specifics, n_components=50):
    """
    Build the StackedDeepSigNet input for the client utterances.

    Steps:
      1. reduce `embeddings` with a Gaussian random projection,
      2. wrap the AnnoMI dataframe and both embedding arrays in `nlpsig.PrepareData`,
      3. pad with `path_specifics`,
      4. keep only the rows selected by `client_index`,
      5. return the result of `get_torch_path_for_SDSN`.

    Reads the notebook-level variables `anno_mi`, `client_index` and `seed`.

    Parameters
    ----------
    embeddings
        Embeddings passed both to the dimension reduction and to
        `nlpsig.PrepareData` (presumably one row per row of `anno_mi` -- confirm).
    path_specifics : dict
        Keyword arguments forwarded unchanged to `PrepareData.pad`.
    n_components : int, optional
        Number of components kept by the random projection. Defaults to 50,
        the value that was previously hard-coded.

    Returns
    -------
    Whatever `PrepareData.get_torch_path_for_SDSN` returns; the cells in this
    notebook unpack it as `(x_data, input_channels)`.
    """
    reduction = nlpsig.DimReduce(method="gaussian_random_projection", n_components=n_components)
    embeddings_reduced = reduction.fit_transform(embeddings, random_state=seed)

    paths = nlpsig.PrepareData(anno_mi,
                               id_column="transcript_id",
                               label_column="client_talk_type",
                               embeddings=embeddings,
                               embeddings_reduced=embeddings_reduced)

    paths.pad(**path_specifics)

    # Padding is done on the full dataframe first; only afterwards are the padded
    # array and both embedding arrays restricted to the `client_index` rows.
    paths.array_padded = paths.array_padded[client_index]
    paths.embeddings = paths.embeddings[client_index]
    paths.embeddings_reduced = paths.embeddings_reduced[client_index]

    return paths.get_torch_path_for_SDSN(
        include_time_features_in_path=True,
        include_time_features_in_input=True,
        include_embedding_in_input=True,
        reduced_embeddings=False
    )

In [37]:
def _as_new_tensor(data):
    """Return `data` as a tensor that is a copy of the input."""
    if isinstance(data, torch.Tensor):
        # torch.tensor(<Tensor>) emits a UserWarning; detach().clone() is the
        # equivalent copy that PyTorch recommends instead.
        return data.detach().clone()
    return torch.tensor(data)


def implement_sdsn(x_data,
                   y_data,
                   sig_depth,
                   input_channels,
                   output_channels,
                   lstm_hidden_dim,
                   ffn_hidden_dim,
                   BiLSTM,
                   learning_rate,
                   loss,
                   gamma = 0):
    """
    Train a StackedDeepSigNet on (x_data, y_data), print test metrics and return the model.

    Reads the notebook-level variables `time_features`, `label_to_id` and `seed`.

    Parameters
    ----------
    x_data
        Model input; must have at least three axes because `x_data.shape[2]`
        is used to derive the embedding dimension.
    y_data
        Labels; converted to a tensor before splitting.
    sig_depth, input_channels, output_channels
        Forwarded to StackedDeepSigNet as "sig_depth", "input_channels"
        and "output_channels".
    lstm_hidden_dim, ffn_hidden_dim
        Forwarded to StackedDeepSigNet as "hidden_dim_lstm" and "hidden_dim_ffn".
    BiLSTM
        Forwarded to StackedDeepSigNet as "BiLSTM".
    learning_rate
        Learning rate for the Adam optimizer.
    loss : str
        Either "focal" (FocalLoss) or "cross_entropy" (torch.nn.CrossEntropyLoss).
    gamma
        `gamma` argument of FocalLoss; only used when `loss == "focal"`.

    Returns
    -------
    The model returned by `training_pytorch`.

    Raises
    ------
    ValueError
        If `loss` is not one of the supported names.
    """
    # define loss first, so that an unsupported name fails immediately instead of
    # surfacing as a NameError on `criterion` after the model is built and the data split
    if loss == "focal":
        criterion = FocalLoss(gamma = gamma)
    elif loss == "cross_entropy":
        criterion = torch.nn.CrossEntropyLoss()
    else:
        raise ValueError(
            f"loss must be 'focal' or 'cross_entropy', but got {loss!r}"
        )

    SDSN_args = {
        "input_channels": input_channels,
        "output_channels": output_channels,
        "num_time_features": len(time_features),
        # whatever remains on the last axis once the path channels and the
        # time features have been accounted for
        "embedding_dim": x_data.shape[2]-input_channels-len(time_features),
        "sig_depth": sig_depth,
        "hidden_dim_lstm": lstm_hidden_dim,
        "hidden_dim_ffn": ffn_hidden_dim,
        "output_dim": len(label_to_id),
        "dropout_rate": 0.1,
        "augmentation_type": "Conv1d",
        "BiLSTM": BiLSTM,
        "comb_method": "concatenation"
    }

    sdsn_model = StackedDeepSigNet(**SDSN_args)

    # split dataset: train_size and valid_size are passed through to split_dataset as-is
    train, valid, test = split_dataset(x_data=_as_new_tensor(x_data).float(),
                                       y_data=_as_new_tensor(y_data),
                                       train_size=0.8,
                                       valid_size=0.5,
                                       shuffle=True,
                                       as_DataLoader=True,
                                       seed=seed)

    # define optimizer
    optimizer = torch.optim.Adam(sdsn_model.parameters(), lr=learning_rate)
    # define scheduler for adjusting the learning rate
    scheduler = ReduceLROnPlateau(optimizer, 'min')

    sdsn_model = training_pytorch(model=sdsn_model,
                                  train_loader=train,
                                  criterion=criterion,
                                  optimizer=optimizer,
                                  num_epochs=10000,
                                  scheduler=scheduler,
                                  valid_loader=valid,
                                  early_stopping=True,
                                  early_stopping_metric="f1",
                                  patience=100,
                                  verbose=True,
                                  verbose_epoch=100,
                                  seed=seed)

    # evaluate on the held-out test split and report the class balance of the
    # predictions next to the class balance of the true labels
    pred, label = testing_pytorch(sdsn_model, test, criterion)
    print(f"proportion of labels in prediction: {[sum(pred==i)/len(pred) for i in label_to_id.values()]}")
    print(f"proportion of labels in data: {[sum(label==i)/len(label) for i in label_to_id.values()]}")

    # per-class F1, followed by their unweighted mean
    f1_scores = metrics.f1_score(label, pred, average=None)
    print(f"- f1: {f1_scores}")
    print(f"- f1 (average): {sum(f1_scores)/len(f1_scores)}")
    print(f"- accuracy: {sum(pred==label)/len(pred)}")

    return sdsn_model

In [38]:
# Hyperparameters shared by the StackedDeepSigNet cells below. Each of those cells
# loops over every (lstm_hidden_dim, ffn_hidden_dim) pair drawn from these two lists.
lstm_hidden_dim_trial = [[8,8], [12,12,8]]
ffn_hidden_dim_trial = [[100,100], [100,100,100], [100,100,100,100]]
# NOTE: this rebinds the notebook-level `sig_depth` that the earlier
# obtain_signatures_history(...) cells also read, so re-running those cells
# after this one uses 3.
sig_depth = 3
output_channels = 10
BiLSTM = True

## SBERT 768

In [204]:
# Build the model input from `sbert_768_embeddings`; the result of
# obtain_SDSN_input is unpacked as (x_data, input_channels).
x_data, input_channels = obtain_SDSN_input(sbert_768_embeddings, path_specifics)
# Grid over every LSTM / FFN hidden-layer configuration. The model returned by
# implement_sdsn is discarded; only the printed metrics are kept.
# `y_data`, `learning_rate` and `loss` are notebook-level variables set in earlier cells.
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.

********** lstm_hidden_dim: [8, 8] || ffnhidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0969207286834717
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0233066082000732
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.023309436711398 || Accuracy: 0.46359583735466003 || F1-score: 0.3083223136970524
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8304529190063477
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.41528797149658203
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7428785833445463 || Accuracy: 0.6745913624763489 || F1-score: 0.4143467517817599
Epoch: 201/10000 || Item: 0/85 || Loss: 0.782322347164154
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.4106078147888184
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7343586520715193 || Accuracy: 0.6745913624763489 || F1-score: 0.41507797124235485
Early stopping at 

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.12066650390625
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.067062497138977
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.054376938126304 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8029945492744446
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.28940171003341675
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.749141666022214 || Accuracy: 0.6760772466659546 || F1-score: 0.41843096051133877
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7253010272979736
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.31466588377952576
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7473348433321173 || Accuracy: 0.6775631308555603 || F1-score: 0.41826697015446385
Early stopping at e

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0944191217422485
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0965360403060913
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0433697700500488 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9591290354728699
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.40191689133644104
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7622772238471291 || Accuracy: 0.6671619415283203 || F1-score: 0.4036286381697556
Epoch: 201/10000 || Item: 0/85 || Loss: 0.6861003637313843
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.047218680381775
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7436184287071228 || Accuracy: 0.6760772466659546 || F1-score: 0.4207838982127052
Early stopping at 

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1812686920166016
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0823497772216797
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0679176612333818 || Accuracy: 0.39524516463279724 || F1-score: 0.2803170874828573
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7879034876823425
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.43962371349334717
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7241770408370278 || Accuracy: 0.679049015045166 || F1-score: 0.4252490292550386
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7371249198913574
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.613956332206726
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7125839320096102 || Accuracy: 0.679049015045166 || F1-score: 0.43306966655334533
Early stopping at e

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0663707256317139
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0678725242614746
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0295921997590498 || Accuracy: 0.5542347431182861 || F1-score: 0.32125706214689265
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8314594030380249
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.26384830474853516
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7574997706846758 || Accuracy: 0.6775631308555603 || F1-score: 0.41700782877253467
Early stopping at epoch 199!
Accuracy on dataset of size 672: 66.96428680419922 %.
Average loss: 0.7706493789499457
proportion of labels in prediction: [tensor(0.7976), tensor(0.2024), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81675393 0.41958042 0.        ]
- f1 (average): 0.4121114487

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1014739274978638
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.110789179801941
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0646752227436413 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9555978775024414
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.3831523060798645
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7607014016671614 || Accuracy: 0.668647825717926 || F1-score: 0.40675829701642635
Epoch: 201/10000 || Item: 0/85 || Loss: 0.6827191114425659
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.9079384207725525
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7459247383204374 || Accuracy: 0.668647825717926 || F1-score: 0.40772953940552764
Early stopping at e

## SBERT 384

In [39]:
# Build the model input from `sbert_384_embeddings`; the result of
# obtain_SDSN_input is unpacked as (x_data, input_channels).
x_data, input_channels = obtain_SDSN_input(sbert_384_embeddings, path_specifics)
# Grid over every LSTM / FFN hidden-layer configuration. The model returned by
# implement_sdsn is discarded; only the printed metrics are kept.
# `y_data`, `learning_rate` and `loss` are notebook-level variables set in earlier cells.
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.

********** lstm_hidden_dim: [8, 8] || ffnhidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0764864683151245
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0873043537139893
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.020038989457217 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8260774612426758
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.23308005928993225
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7744331630793485 || Accuracy: 0.6552748680114746 || F1-score: 0.3925271304903637
Early stopping at epoch 198!
Accuracy on dataset of size 672: 65.32737731933594 %.
Average loss: 0.7782417156479575
proportion of labels in prediction: [tensor(0.7768), tensor(0.2232), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80765143 0.39333333 0.        ]
- f1 (average): 0.400328255992

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.072562336921692
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1152584552764893
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0209932814944873 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8427107930183411
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.40092605352401733
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7744652357968417 || Accuracy: 0.6493313312530518 || F1-score: 0.37042501925131033
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7869120240211487
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.3241257071495056
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7755436572161588 || Accuracy: 0.6508172154426575 || F1-score: 0.3741355157413551
Early stopping at

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1326649188995361
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1599905490875244
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0839721831408413 || Accuracy: 0.2956909239292145 || F1-score: 0.217697083376074
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9524648189544678
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.573074221611023
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7800712422891096 || Accuracy: 0.6552748680114746 || F1-score: 0.3853948816005455
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7340773344039917
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.9599273800849915
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7671386328610507 || Accuracy: 0.6493313312530518 || F1-score: 0.39364179029182006
Early stopping at ep

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0632113218307495
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0915249586105347
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0186090794476597 || Accuracy: 0.549777090549469 || F1-score: 0.3190642105043819
Epoch: 101/10000 || Item: 0/85 || Loss: 0.824112057685852
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.31476056575775146
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7664196220311251 || Accuracy: 0.6627042889595032 || F1-score: 0.39176008431541637
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7843072414398193
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.452071189880371
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7611909942193464 || Accuracy: 0.6671619415283203 || F1-score: 0.40130826825070187
Early stopping at e

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1042710542678833
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0711721181869507
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0595148368315264 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8390007019042969
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.36518001556396484
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7754646431316029 || Accuracy: 0.647845447063446 || F1-score: 0.38405269615876003
Early stopping at epoch 194!
Accuracy on dataset of size 672: 66.96428680419922 %.
Average loss: 0.7810252796519886
proportion of labels in prediction: [tensor(0.7560), tensor(0.2440), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82200647 0.43949045 0.        ]
- f1 (average): 0.42049897278

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0992392301559448
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1072020530700684
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0587024905464866 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9701166749000549
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.5604714751243591
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7806987599893049 || Accuracy: 0.647845447063446 || F1-score: 0.38134144816374177
Early stopping at epoch 187!
Accuracy on dataset of size 672: 66.51786041259766 %.
Average loss: 0.765200219371102
proportion of labels in prediction: [tensor(0.7738), tensor(0.2262), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.8200213  0.41059603 0.        ]
- f1 (average): 0.4102057752481

## Pretrained BERT

### Mean pooled

In [40]:
# Build the model input from `pooled_mean_pretrained`; the result of
# obtain_SDSN_input is unpacked as (x_data, input_channels).
x_data, input_channels = obtain_SDSN_input(pooled_mean_pretrained, path_specifics)
# Grid over every LSTM / FFN hidden-layer configuration. The model returned by
# implement_sdsn is discarded; only the printed metrics are kept.
# `y_data`, `learning_rate` and `loss` are notebook-level variables set in earlier cells.
for lstm_hidden_dim in lstm_hidden_dim_trial:
    for ffn_hidden_dim in ffn_hidden_dim_trial:
        print(f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
              f"|| ffnhidden_dim: {ffn_hidden_dim}")
        implement_sdsn(x_data=x_data,
                       y_data=y_data,
                       sig_depth=sig_depth,
                       input_channels=input_channels,
                       output_channels=output_channels,
                       lstm_hidden_dim=lstm_hidden_dim,
                       ffn_hidden_dim=ffn_hidden_dim,
                       BiLSTM=BiLSTM,
                       learning_rate=learning_rate,
                       loss=loss)

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.

********** lstm_hidden_dim: [8, 8] || ffnhidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.092307686805725
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0484129190444946
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9850388440218839 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7668509483337402
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.4241316318511963
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7366883158683777 || Accuracy: 0.6805348992347717 || F1-score: 0.4956525368198217
Early stopping at epoch 167!
Accuracy on dataset of size 672: 67.70833587646484 %.
Average loss: 0.7894256440075961
proportion of labels in prediction: [tensor(0.7604), tensor(0.1905), tensor(0.0491)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79784946 0.47482014 0.26470588]
- f1 (average): 0.512458496

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.086983561515808
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0657081604003906
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0310974771326238 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.704171895980835
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.16407299041748047
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7339336167682301 || Accuracy: 0.6775631308555603 || F1-score: 0.5232700427496731
Early stopping at epoch 182!
Accuracy on dataset of size 672: 68.1547622680664 %.
Average loss: 0.7501686919819225
proportion of labels in prediction: [tensor(0.7396), tensor(0.1860), tensor(0.0744)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80786026 0.46545455 0.31372549]
- f1 (average): 0.5290134325

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1314183473587036
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1348956823349
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0881675698540427 || Accuracy: 0.31797918677330017 || F1-score: 0.23029510885582852
Epoch: 101/10000 || Item: 0/85 || Loss: 0.891852080821991
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.16898664832115173
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7633115215735002 || Accuracy: 0.6775631308555603 || F1-score: 0.49486527754501614
Early stopping at epoch 191!
Accuracy on dataset of size 672: 67.41071319580078 %.
Average loss: 0.7805675376545299
proportion of labels in prediction: [tensor(0.7232), tensor(0.2277), tensor(0.0491)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80883978 0.4620462  0.25      ]
- f1 (average): 0.506961994

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.2139078378677368
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0814762115478516
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0501754392277112 || Accuracy: 0.48885586857795715 || F1-score: 0.32113079729025534
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7437731623649597
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.43895959854125977
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7175708358938043 || Accuracy: 0.6835066676139832 || F1-score: 0.5386153975538449
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7395185232162476
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.5451140403747559
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7181831652467902 || Accuracy: 0.6835066676139832 || F1-score: 0.5474054215779308
Early stopping a

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.116066813468933
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1265699863433838
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0330844088034197 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7519819140434265
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.15710219740867615
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.738428451798179 || Accuracy: 0.689450204372406 || F1-score: 0.5121958121109225
Early stopping at epoch 179!
Accuracy on dataset of size 672: 68.1547622680664 %.
Average loss: 0.7435725439678539
proportion of labels in prediction: [tensor(0.7336), tensor(0.1994), tensor(0.0670)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80921053 0.46478873 0.31081081]
- f1 (average): 0.52827002317

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1306880712509155
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0889372825622559
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0812130624597722 || Accuracy: 0.365527480840683 || F1-score: 0.2590785171999675
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9445035457611084
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.1216757744550705
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.732742808081887 || Accuracy: 0.6760772466659546 || F1-score: 0.497789699856748
Epoch: 201/10000 || Item: 0/85 || Loss: 0.679328978061676
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.5225780606269836
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7117806483398784 || Accuracy: 0.6879643201828003 || F1-score: 0.5363720172205836
Early stopping at epoch

### Max pooled

In [41]:
# Build the network input (and its channel count) from `pooled_max_pretrained`.
x_data, input_channels = obtain_SDSN_input(pooled_max_pretrained, path_specifics)

# Run `implement_sdsn` once per (LSTM hidden dims, FFN hidden dims) pairing.
# The generator yields pairs lazily, in the same order as two nested loops
# with the LSTM setting on the outside.
for lstm_hidden_dim, ffn_hidden_dim in (
    (lstm_dims, ffn_dims)
    for lstm_dims in lstm_hidden_dim_trial
    for ffn_dims in ffn_hidden_dim_trial
):
    # Banner separating the logs of each configuration (label text kept
    # unchanged so it matches the output of earlier runs).
    print(
        f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
        f"|| ffnhidden_dim: {ffn_hidden_dim}"
    )
    implement_sdsn(
        x_data=x_data,
        y_data=y_data,
        sig_depth=sig_depth,
        input_channels=input_channels,
        output_channels=output_channels,
        lstm_hidden_dim=lstm_hidden_dim,
        ffn_hidden_dim=ffn_hidden_dim,
        BiLSTM=BiLSTM,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.

********** lstm_hidden_dim: [8, 8] || ffnhidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1773746013641357
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0755505561828613
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0181242064996199 || Accuracy: 0.6077265739440918 || F1-score: 0.2556888677370605
Epoch: 101/10000 || Item: 0/85 || Loss: 0.814765214920044
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.29490041732788086
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7640200203115289 || Accuracy: 0.6597325205802917 || F1-score: 0.4036568213783404
Epoch: 201/10000 || Item: 0/85 || Loss: 0.7249236702919006
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 1.3043550252914429
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.767925045707009 || Accuracy: 0.6627042889595032 || F1-score: 0.41408009700438103
Early stopping at e

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1322405338287354
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0741022825241089
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0415133454582908 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8184701204299927
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.3524358868598938
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7725036577744917 || Accuracy: 0.6508172154426575 || F1-score: 0.3933262711864407
Early stopping at epoch 172!
Accuracy on dataset of size 672: 65.0297622680664 %.
Average loss: 0.8041848648678173
proportion of labels in prediction: [tensor(0.7560), tensor(0.2440), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.802589   0.41401274 0.        ]
- f1 (average): 0.4055339118724

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.113893985748291
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.03090500831604
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0526592081243342 || Accuracy: 0.33729568123817444 || F1-score: 0.2380251394770451
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9875040650367737
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.22272755205631256
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7775280638174578 || Accuracy: 0.6552748680114746 || F1-score: 0.39859559825315866
Early stopping at epoch 183!
Accuracy on dataset of size 672: 63.83928680419922 %.
Average loss: 0.7828834273598411
proportion of labels in prediction: [tensor(0.7485), tensor(0.2515), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79609544 0.38871473 0.        ]
- f1 (average): 0.3949367260759

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.2054002285003662
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.063188910484314
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0061352957378735 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8090469837188721
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.27597397565841675
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7800249023870989 || Accuracy: 0.6508172154426575 || F1-score: 0.39815406460773867
Early stopping at epoch 166!
Accuracy on dataset of size 672: 66.66666412353516 %.
Average loss: 0.7648813724517822
proportion of labels in prediction: [tensor(0.7738), tensor(0.2262), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80937167 0.45033113 0.        ]
- f1 (average): 0.41990093260

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0991864204406738
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0555108785629272
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0443167686462402 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8127844929695129
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.300264447927475
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7771428335796703 || Accuracy: 0.6523030996322632 || F1-score: 0.4005143704391825
Early stopping at epoch 171!
Accuracy on dataset of size 672: 64.73213958740234 %.
Average loss: 0.8017999909140847
proportion of labels in prediction: [tensor(0.7307), tensor(0.2693), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.8043956  0.41691843 0.        ]
- f1 (average): 0.4071046777995

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1318442821502686
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.10346519947052
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.083070993423462 || Accuracy: 0.3878157436847687 || F1-score: 0.2741996857199964
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9358991384506226
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.35324254631996155
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.8188418366692283 || Accuracy: 0.637444257736206 || F1-score: 0.38353693495518376
Early stopping at epoch 139!
Accuracy on dataset of size 672: 61.30952453613281 %.
Average loss: 0.8005170659585432
proportion of labels in prediction: [tensor(0.7634), tensor(0.2366), tensor(0.)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.78111588 0.31067961 0.        ]
- f1 (average): 0.363931830492937

### Sum pooled

In [42]:
# Build the network input (and its channel count) from `pooled_sum_pretrained`.
x_data, input_channels = obtain_SDSN_input(pooled_sum_pretrained, path_specifics)

# Run `implement_sdsn` once per (LSTM hidden dims, FFN hidden dims) pairing.
# The generator yields pairs lazily, in the same order as two nested loops
# with the LSTM setting on the outside.
for lstm_hidden_dim, ffn_hidden_dim in (
    (lstm_dims, ffn_dims)
    for lstm_dims in lstm_hidden_dim_trial
    for ffn_dims in ffn_hidden_dim_trial
):
    # Banner separating the logs of each configuration (label text kept
    # unchanged so it matches the output of earlier runs).
    print(
        f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
        f"|| ffnhidden_dim: {ffn_hidden_dim}"
    )
    implement_sdsn(
        x_data=x_data,
        y_data=y_data,
        sig_depth=sig_depth,
        input_channels=input_channels,
        output_channels=output_channels,
        lstm_hidden_dim=lstm_hidden_dim,
        ffn_hidden_dim=ffn_hidden_dim,
        BiLSTM=BiLSTM,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.

********** lstm_hidden_dim: [8, 8] || ffnhidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.2181402444839478
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.066859245300293
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.98489126292142 || Accuracy: 0.6166418790817261 || F1-score: 0.2900937871846043
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6420540809631348
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.24827660620212555
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.716571103442799 || Accuracy: 0.7191678881645203 || F1-score: 0.6066606145470689
Early stopping at epoch 154!
Accuracy on dataset of size 672: 70.68452453613281 %.
Average loss: 0.7215899445793845
proportion of labels in prediction: [tensor(0.7411), tensor(0.1830), tensor(0.0759)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81134133 0.52014652 0.41558442]
- f1 (average): 0.582357422052

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.139095664024353
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0661197900772095
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0196888284249739 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6759999990463257
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.10273061692714691
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.743569168177518 || Accuracy: 0.7013372778892517 || F1-score: 0.5876951020676172
Early stopping at epoch 149!
Accuracy on dataset of size 672: 71.13095092773438 %.
Average loss: 0.7319929653948004
proportion of labels in prediction: [tensor(0.7351), tensor(0.1741), tensor(0.0908)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81270537 0.53932584 0.42682927]
- f1 (average): 0.592953492

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1228188276290894
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0800007581710815
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0646044232628562 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7415486574172974
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.23596514761447906
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.762010167945515 || Accuracy: 0.7043090462684631 || F1-score: 0.5941088590095213
Early stopping at epoch 156!
Accuracy on dataset of size 672: 71.875 %.
Average loss: 0.7243921648372303
proportion of labels in prediction: [tensor(0.7158), tensor(0.1801), tensor(0.1042)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81777778 0.54612546 0.47398844]
- f1 (average): 0.6126305594462497
- 

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.2287657260894775
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1224294900894165
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0203510902144692 || Accuracy: 0.6300148367881775 || F1-score: 0.3143051308188923
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6053046584129333
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.24841231107711792
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7670120217583396 || Accuracy: 0.6835066676139832 || F1-score: 0.561768259551016
Early stopping at epoch 132!
Accuracy on dataset of size 672: 69.49404907226562 %.
Average loss: 0.7803263447501443
proportion of labels in prediction: [tensor(0.7411), tensor(0.1801), tensor(0.0789)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80043621 0.47232472 0.46153846]
- f1 (average): 0.578099796

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.034242033958435
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0073752403259277
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9958898479288275 || Accuracy: 0.6225854158401489 || F1-score: 0.26099318277276984
Epoch: 101/10000 || Item: 0/85 || Loss: 0.5993335247039795
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.16584578156471252
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7585349841551348 || Accuracy: 0.7087666988372803 || F1-score: 0.5956471115321761
Early stopping at epoch 153!
Accuracy on dataset of size 672: 71.72618865966797 %.
Average loss: 0.725405137647282
proportion of labels in prediction: [tensor(0.7143), tensor(0.2024), tensor(0.0833)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.82313682 0.53846154 0.44025157]
- f1 (average): 0.600616643

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0629332065582275
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0641825199127197
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0240105227990584 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.6873794794082642
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.13534866273403168
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7940189025618813 || Accuracy: 0.6849925518035889 || F1-score: 0.5937051930568719
Early stopping at epoch 138!
Accuracy on dataset of size 672: 68.75 %.
Average loss: 0.7797680172053251
proportion of labels in prediction: [tensor(0.7039), tensor(0.2009), tensor(0.0952)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.79147982 0.49122807 0.46706587]
- f1 (average): 0.5832579196889048
- 

### CLS

In [43]:
# Build the network input (and its channel count) from `pooled_cls_pretrained`.
x_data, input_channels = obtain_SDSN_input(pooled_cls_pretrained, path_specifics)

# Run `implement_sdsn` once per (LSTM hidden dims, FFN hidden dims) pairing.
# The generator yields pairs lazily, in the same order as two nested loops
# with the LSTM setting on the outside.
for lstm_hidden_dim, ffn_hidden_dim in (
    (lstm_dims, ffn_dims)
    for lstm_dims in lstm_hidden_dim_trial
    for ffn_dims in ffn_hidden_dim_trial
):
    # Banner separating the logs of each configuration (label text kept
    # unchanged so it matches the output of earlier runs).
    print(
        f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
        f"|| ffnhidden_dim: {ffn_hidden_dim}"
    )
    implement_sdsn(
        x_data=x_data,
        y_data=y_data,
        sig_depth=sig_depth,
        input_channels=input_channels,
        output_channels=output_channels,
        lstm_hidden_dim=lstm_hidden_dim,
        ffn_hidden_dim=ffn_hidden_dim,
        BiLSTM=BiLSTM,
        learning_rate=learning_rate,
        loss=loss,
    )

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/13551 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.

********** lstm_hidden_dim: [8, 8] || ffnhidden_dim: [100, 100]


  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1161494255065918
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0059633255004883
--------------------------------------------------
Validation || Epoch: 1 || Loss: 0.9993689656257629 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7387819886207581
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.28736263513565063
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7172420187429949 || Accuracy: 0.6953937411308289 || F1-score: 0.5449024011323456
Early stopping at epoch 190!
Accuracy on dataset of size 672: 69.94047546386719 %.
Average loss: 0.7094901095737111
proportion of labels in prediction: [tensor(0.7381), tensor(0.1756), tensor(0.0863)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.81530055 0.48507463 0.39751553]
- f1 (average): 0.5659635

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1231155395507812
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0709043741226196
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.043758901682767 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7524192333221436
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.17940518260002136
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7213849696246061 || Accuracy: 0.6864784359931946 || F1-score: 0.5292301182466207
Early stopping at epoch 188!
Accuracy on dataset of size 672: 68.45237731933594 %.
Average loss: 0.730201008644971
proportion of labels in prediction: [tensor(0.7619), tensor(0.1652), tensor(0.0729)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80988185 0.43678161 0.34210526]
- f1 (average): 0.529589573

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1191399097442627
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0814623832702637
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0798902186480435 || Accuracy: 0.35066863894462585 || F1-score: 0.24751491973154996
Epoch: 101/10000 || Item: 0/85 || Loss: 0.8690895438194275
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.2705612778663635
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7456639625809409 || Accuracy: 0.6924219727516174 || F1-score: 0.5419908486575153
Epoch: 201/10000 || Item: 0/85 || Loss: 0.6533145904541016
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.6370545625686646
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7173359231515364 || Accuracy: 0.7043090462684631 || F1-score: 0.5707119983017303
Early stopping at

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.0801074504852295
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.1135045289993286
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0021320906552402 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7768341302871704
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.33637309074401855
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7227382118051703 || Accuracy: 0.6953937411308289 || F1-score: 0.552636199297143
Early stopping at epoch 182!
Accuracy on dataset of size 672: 68.60118865966797 %.
Average loss: 0.7287239161404696
proportion of labels in prediction: [tensor(0.7426), tensor(0.1726), tensor(0.0848)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80392157 0.45864662 0.3875    ]
- f1 (average): 0.55002272

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1512285470962524
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0863783359527588
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.061882495880127 || Accuracy: 0.42050519585609436 || F1-score: 0.29243233165539567
Epoch: 101/10000 || Item: 0/85 || Loss: 0.7626064419746399
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.24707303941249847
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7200096087022261 || Accuracy: 0.6939078569412231 || F1-score: 0.5443017935602079
Early stopping at epoch 182!
Accuracy on dataset of size 672: 68.1547622680664 %.
Average loss: 0.734047683802518
proportion of labels in prediction: [tensor(0.7470), tensor(0.1711), tensor(0.0818)]
proportion of labels in data: [tensor(0.6235), tensor(0.2232), tensor(0.1533)]
- f1: [0.80564604 0.44528302 0.35443038]
- f1 (average): 0.535119811

  train, valid, test = split_dataset(x_data=torch.tensor(x_data).float(),


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch: 1/10000 || Item: 0/85 || Loss: 1.1070767641067505
--------------------------------------------------
##### Epoch: 1/10000 || Loss: 1.0723456144332886
--------------------------------------------------
Validation || Epoch: 1 || Loss: 1.0553485263477673 || Accuracy: 0.6225854158401489 || F1-score: 0.25579975579975583
Epoch: 101/10000 || Item: 0/85 || Loss: 0.9206050634384155
--------------------------------------------------
##### Epoch: 101/10000 || Loss: 0.18701200187206268
--------------------------------------------------
Validation || Epoch: 101 || Loss: 0.7487882592461326 || Accuracy: 0.6775631308555603 || F1-score: 0.4986432148426074
Epoch: 201/10000 || Item: 0/85 || Loss: 0.6725772619247437
--------------------------------------------------
##### Epoch: 201/10000 || Loss: 0.6710101366043091
--------------------------------------------------
Validation || Epoch: 201 || Loss: 0.7234085418961265 || Accuracy: 0.689450204372406 || F1-score: 0.5484433300950712
Early stopping at 

## Fine-tuned BERT

### Mean pooled

In [None]:
# Build the network input (and its channel count) from `pooled_mean`.
x_data, input_channels = obtain_SDSN_input(pooled_mean, path_specifics)

# Run `implement_sdsn` once per (LSTM hidden dims, FFN hidden dims) pairing.
# The generator yields pairs lazily, in the same order as two nested loops
# with the LSTM setting on the outside.
for lstm_hidden_dim, ffn_hidden_dim in (
    (lstm_dims, ffn_dims)
    for lstm_dims in lstm_hidden_dim_trial
    for ffn_dims in ffn_hidden_dim_trial
):
    # Banner separating the logs of each configuration (label text kept
    # unchanged so it matches the output of earlier runs).
    print(
        f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
        f"|| ffnhidden_dim: {ffn_hidden_dim}"
    )
    implement_sdsn(
        x_data=x_data,
        y_data=y_data,
        sig_depth=sig_depth,
        input_channels=input_channels,
        output_channels=output_channels,
        lstm_hidden_dim=lstm_hidden_dim,
        ffn_hidden_dim=ffn_hidden_dim,
        BiLSTM=BiLSTM,
        learning_rate=learning_rate,
        loss=loss,
    )

### Max pooled

In [None]:
# Build the network input (and its channel count) from `pooled_max`.
x_data, input_channels = obtain_SDSN_input(pooled_max, path_specifics)

# Run `implement_sdsn` once per (LSTM hidden dims, FFN hidden dims) pairing.
# The generator yields pairs lazily, in the same order as two nested loops
# with the LSTM setting on the outside.
for lstm_hidden_dim, ffn_hidden_dim in (
    (lstm_dims, ffn_dims)
    for lstm_dims in lstm_hidden_dim_trial
    for ffn_dims in ffn_hidden_dim_trial
):
    # Banner separating the logs of each configuration (label text kept
    # unchanged so it matches the output of earlier runs).
    print(
        f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
        f"|| ffnhidden_dim: {ffn_hidden_dim}"
    )
    implement_sdsn(
        x_data=x_data,
        y_data=y_data,
        sig_depth=sig_depth,
        input_channels=input_channels,
        output_channels=output_channels,
        lstm_hidden_dim=lstm_hidden_dim,
        ffn_hidden_dim=ffn_hidden_dim,
        BiLSTM=BiLSTM,
        learning_rate=learning_rate,
        loss=loss,
    )

### Sum pooled

In [None]:
# Build the network input (and its channel count) from `pooled_sum`.
x_data, input_channels = obtain_SDSN_input(pooled_sum, path_specifics)

# Run `implement_sdsn` once per (LSTM hidden dims, FFN hidden dims) pairing.
# The generator yields pairs lazily, in the same order as two nested loops
# with the LSTM setting on the outside.
for lstm_hidden_dim, ffn_hidden_dim in (
    (lstm_dims, ffn_dims)
    for lstm_dims in lstm_hidden_dim_trial
    for ffn_dims in ffn_hidden_dim_trial
):
    # Banner separating the logs of each configuration (label text kept
    # unchanged so it matches the output of earlier runs).
    print(
        f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
        f"|| ffnhidden_dim: {ffn_hidden_dim}"
    )
    implement_sdsn(
        x_data=x_data,
        y_data=y_data,
        sig_depth=sig_depth,
        input_channels=input_channels,
        output_channels=output_channels,
        lstm_hidden_dim=lstm_hidden_dim,
        ffn_hidden_dim=ffn_hidden_dim,
        BiLSTM=BiLSTM,
        learning_rate=learning_rate,
        loss=loss,
    )

### CLS

In [None]:
# Build the network input (and its channel count) from `pooled_cls`.
x_data, input_channels = obtain_SDSN_input(pooled_cls, path_specifics)

# Run `implement_sdsn` once per (LSTM hidden dims, FFN hidden dims) pairing.
# The generator yields pairs lazily, in the same order as two nested loops
# with the LSTM setting on the outside.
for lstm_hidden_dim, ffn_hidden_dim in (
    (lstm_dims, ffn_dims)
    for lstm_dims in lstm_hidden_dim_trial
    for ffn_dims in ffn_hidden_dim_trial
):
    # Banner separating the logs of each configuration (label text kept
    # unchanged so it matches the output of earlier runs).
    print(
        f"\n********** lstm_hidden_dim: {lstm_hidden_dim} "
        f"|| ffnhidden_dim: {ffn_hidden_dim}"
    )
    implement_sdsn(
        x_data=x_data,
        y_data=y_data,
        sig_depth=sig_depth,
        input_channels=input_channels,
        output_channels=output_channels,
        lstm_hidden_dim=lstm_hidden_dim,
        ffn_hidden_dim=ffn_hidden_dim,
        BiLSTM=BiLSTM,
        learning_rate=learning_rate,
        loss=loss,
    )

Baselines:
   - using only the sentence embeddings (encodes nothing about the history of the post)
       - highlights importance of looking at the sequence
   - averaging history
   - comparing the cosine similarity between the previous post and the current post to see if there is a switch
   
Test for:
- How many posts do you need to look back?