In [1]:
import pickle
import numpy as np
import pandas as pd
import re

# Notebook-level seed value.
# NOTE(review): the hyperparameter search further down is driven by its own `seeds` list
# and `data_split_seed` argument; confirm where (if anywhere) this value is consumed.
seed = 2023

In [2]:
from nlpsig_networks.pytorch_utils import SaveBestModel, training_pytorch, testing_pytorch, set_seed, KFold_pytorch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from nlpsig_networks.scripts.swnu_network_functions import (
    swnu_network_hyperparameter_search, obtain_SWNUNetwork_input
)

Rumours

In [5]:
# Execute the embedding-loading script so that the names it defines become available here.
# The following cells rely on `df_rumours` and `sbert_embeddings` existing after this runs.
# NOTE(review): `y_data` and `output_dim` are used by the hyperparameter search without a
# visible definition in this notebook — presumably this script defines them too; confirm.
%run load_sbert-embeddings.py

In [6]:
# Preview the first rows of the rumours dataframe (columns include `timeline_id`, `label`, `set`).
df_rumours.head()

Unnamed: 0,id,label,datetime,text,timeline_id,set
0,5.249902e+17,0,2014-10-22 18:26:23,Police have clarified that there were two shoo...,0,train
1,5.249906e+17,0,2014-10-22 18:27:58,"@CTVNews you guys ""confirmed"" there were 3 sho...",0,train
2,5.249908e+17,1,2014-10-22 18:28:46,@CTVNews get it right. http://t.co/GHYxMuzPG9,0,train
3,5.249927e+17,1,2014-10-22 18:36:29,RT @CTVNews Police have clarified that there w...,0,train
4,5.250038e+17,1,2014-10-22 19:20:41,@CTVNews @ctvsaskatoon so what happened at Rid...,0,train


In [7]:
# Shape of the SBERT embedding tensor; axis 1 is later used as `embedding_dim`.
# Presumably axis 0 holds one row per post in `df_rumours` — TODO confirm.
sbert_embeddings.shape

torch.Size([5568, 384])

In [8]:
# Build a single SWNU network input up front so its shape can be inspected
# before launching the (much slower) hyperparameter search.
x_data = obtain_SWNUNetwork_input(
    # dimensionality reduction applied to the embeddings
    method="umap",
    dimension=30,
    # dataframe plus the columns identifying each timeline and its label
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings.numpy(),
    # presumably the number of posts of history per path — TODO confirm
    k=5,
    # time-feature handling
    time_feature="time_encoding",
    standardise_method=None,
    add_time_in_path=False,
)

# Display the shape of the first returned element.
x_data[0].shape

[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


  0%|          | 0/5568 [00:00<?, ?it/s]

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


torch.Size([5568, 5, 415])

SWNU Network

Obtaining path by looking at post history
We can obtain a path by looking at the history of each post. Here we look at the last 20 posts (padding with zero vectors if there are fewer than 20 posts), including the current post.

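We only want to consider paths that correspond to a client's utterance as we want to model a change in mood at that time. Their history will still contain the therapist's utterances too. (Note: this wording describes a client/therapist conversation dataset; for the rumours data used in this notebook, the search below is called with `path_indices=None` — confirm whether this paragraph still applies.)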

In [8]:
# ---------------------------------------------------------------------------
# Search configuration. List-valued settings are the grids handed to the
# hyperparameter search; the trailing comments keep the alternative values
# that were previously tried / commented out.
# ---------------------------------------------------------------------------

# --- dimensionality reduction ---
embedding_dim = sbert_embeddings.shape[1]
dim_reduce_method = ["umap"]  # alternative: ["gaussian_random_projection", "umap"]
dimensions = [20]  # alternative: [50, 30, 15]

# --- time features ---
time_features = "time_encoding"
standardise_method = "standardise"
add_time_in_path = False

# --- SWNU block ---
augmentation_tp = "Conv1d"
hidden_dim_aug = None
comb_m = "concatenation"
log_sig = True
conv_output_channels = [10]  # alternative: [20, 10, 5]
# each pair is unpacked by index in the search call: [0] -> SWNU hidden dim, [1] -> signature depth
log_signature_dimensions_and_sig_depths = [(8, 3)]  # alternative: [(30, 2), (10, 3), (6, 4)]
bidirectional = False

# --- feed-forward network ---
hidden_dim_sizes = [64]  # alternative: [32, 64]
dropout_rates = [0.2]  # alternative: [0.5, 0.2, 0.1]

# --- overall training ---
num_epochs = 100
batch = 64
patience = 4
learning_rates = [1e-4]  # alternative: [1e-3, 1e-4, 5e-4]
seeds = [0, 1, 12, 123, 1234]
loss = "focal"
gamma = 2
validation_metric = "f1"

# Row indices of the train / dev / test rows, in that order, taken from the `set` column.
split_indices = tuple(
    df_rumours[df_rumours["set"] == split_name].index
    for split_name in ("train", "dev", "test")
)

In [9]:
# History length passed to the search (as a one-element grid).
size = 20

# Run the hyperparameter search with the configuration defined above.
# Only the first two returned objects are kept; the remaining two are discarded.
(
    swnu_network_log_signature,
    best_swnu_network_log_signature,
    _,
    __,
) = swnu_network_hyperparameter_search(
    num_epochs=num_epochs,
    # data, labels and embeddings
    df=df_rumours,
    id_column="timeline_id",
    label_column="label",
    embeddings=sbert_embeddings.numpy(),
    y_data=y_data,
    embedding_dim=embedding_dim,
    output_dim=output_dim,
    # path construction and dimensionality reduction grids
    history_lengths=[size],
    dim_reduce_methods=dim_reduce_method,
    dimensions=dimensions,
    # second element of each configured pair
    sig_depths=[pair[1] for pair in log_signature_dimensions_and_sig_depths],
    log_signature=log_sig,
    conv_output_channels=conv_output_channels,
    # first element of each configured pair
    swnu_hidden_dim_sizes=[pair[0] for pair in log_signature_dimensions_and_sig_depths],
    # feed-forward network and optimisation grids
    ffn_hidden_dim_sizes=hidden_dim_sizes,
    dropout_rates=dropout_rates,
    learning_rates=learning_rates,
    BiLSTM=bidirectional,
    seeds=seeds,
    loss=loss,
    gamma=gamma,
    batch_size=batch,
    # time features and augmentation
    time_feature=time_features,
    standardise_method=standardise_method,
    add_time_in_path=add_time_in_path,
    augmentation_type=augmentation_tp,
    hidden_dim_aug=hidden_dim_aug,
    comb_method=comb_m,
    # data splitting: explicit train/dev/test indices are supplied
    path_indices=None,
    data_split_seed=123,
    split_ids=None,  # torch.tensor(df_rumours['timeline_id'].astype(int)),
    split_indices=split_indices,
    k_fold=False,
    # early stopping / model selection
    patience=patience,
    validation_metric=validation_metric,
    results_output=None,
    verbose=False,
)

  0%|          | 0/1 [00:00<?, ?it/s]
[A


##################################################
dimension: 20 | method: umap
[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.




[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A






100%|██████████| 1/1 [07:30<00:00, 450.15s/it]






100%|██████████| 1/1 [07:30<00:00, 450.15s/it]





100%|██████████| 1/1 [07:30<00:00, 450.16s/it]




100%|██████████| 1/1 [07:30<00:00, 450.16s/it]



100%|██████████| 1/1 [07:30<00:00, 450.16s/it]


100%|██████████| 1/1 [07:30<00:00, 450.17s/it]

100%|██████████| 1/1 [08:36<00:00, 516.52s/it]
100%|██████████| 1/1 [08:36<00:00, 516.52s/it]
100%|██████████| 1/1 [08:36<00:00, 516.53s/it]

- average (validation) metric score: 0.6242551680788517
scores for the different seeds: [0.5689009964872034, 0.6474855459801461, 0.6120501306882915, 0.6142759094028827, 0.6785632578357356]





[INFO] Concatenating the embeddings to the dataframe...
[INFO] - columns beginning with 'e' denote the full embddings.
[INFO] - columns beginning with 'd' denote the dimension reduced embeddings.
[INFO] Adding time feature columns into dataframe in `.df`.
[INFO] Adding 'time_encoding' and feature...
[INFO] Adding 'time_diff' and feature...
[INFO] Adding 'timeline_index' feature...
[INFO] Padding ids and storing in `.df_padded` and `.array_padded` attributes.


100%|██████████| 5568/5568 [00:18<00:00, 296.96it/s]


[INFO] The path was created for each item in the dataframe, by looking at its history, so to include embeddings in the FFN input, we concatenate the embeddings for each sentence / text.


In [10]:
# Display the results dataframe that the hyperparameter search returned first.
swnu_network_log_signature

Unnamed: 0,loss,accuracy,f1,f1_scores,valid_loss,valid_accuracy,valid_f1,valid_f1_scores,k,dimensions,...,learning_rate,seed,BiLSTM,gamma,k_fold,augmentation_type,hidden_dim_aug,comb_method,batch_size,model_id
0,focal,0.593899,0.572441,"[0.6682242990654205, 0.47665847665847666]",0.221828,0.619217,0.568901,"[0.7161803713527852, 0.4216216216216216]",20,20,...,0.0001,0,False,2,False,Conv1d,,concatenation,64,0
0,focal,0.585319,0.549445,"[0.6765799256505576, 0.42231075697211157]",0.216514,0.672598,0.647486,"[0.7415730337078651, 0.5533980582524272]",20,20,...,0.0001,1,False,2,False,Conv1d,,concatenation,64,0
0,focal,0.5796,0.549297,"[0.6661619984859953, 0.4324324324324324]",0.209865,0.633452,0.61205,"[0.7031700288184437, 0.5209302325581394]",20,20,...,0.0001,12,False,2,False,Conv1d,,concatenation,64,0
0,focal,0.57388,0.538618,"[0.666168782673637, 0.41106719367588934]",0.220015,0.658363,0.614276,"[0.7446808510638298, 0.4838709677419355]",20,20,...,0.0001,123,False,2,False,Conv1d,,concatenation,64,0
0,focal,0.553861,0.518841,"[0.6486486486486487, 0.3890339425587467]",0.22477,0.715302,0.678563,"[0.7872340425531915, 0.5698924731182796]",20,20,...,0.0001,1234,False,2,False,Conv1d,,concatenation,64,0


In [None]:
# Display the second dataframe returned by the search
# (presumably the results for the best configuration — confirm against the library).
best_swnu_network_log_signature

Unnamed: 0,loss,accuracy,f1,f1_scores,valid_loss,valid_accuracy,valid_f1,valid_f1_scores,k,dimensions,...,dropout_rate,learning_rate,seed,BiLSTM,gamma,k_fold,augmentation_type,hidden_dim_aug,comb_method,batch_size
0,focal,0.593899,0.572441,"[0.6682242990654205, 0.47665847665847666]",0.221828,0.619217,0.568901,"[0.7161803713527852, 0.4216216216216216]",20,20,...,0.2,0.0001,0,False,2,False,Conv1d,,concatenation,64
0,focal,0.585319,0.549445,"[0.6765799256505576, 0.42231075697211157]",0.216514,0.672598,0.647486,"[0.7415730337078651, 0.5533980582524272]",20,20,...,0.2,0.0001,1,False,2,False,Conv1d,,concatenation,64
0,focal,0.5796,0.549297,"[0.6661619984859953, 0.4324324324324324]",0.209865,0.633452,0.61205,"[0.7031700288184437, 0.5209302325581394]",20,20,...,0.2,0.0001,12,False,2,False,Conv1d,,concatenation,64
0,focal,0.57388,0.538618,"[0.666168782673637, 0.41106719367588934]",0.220015,0.658363,0.614276,"[0.7446808510638298, 0.4838709677419355]",20,20,...,0.2,0.0001,123,False,2,False,Conv1d,,concatenation,64
0,focal,0.553861,0.518841,"[0.6486486486486487, 0.3890339425587467]",0.22477,0.715302,0.678563,"[0.7872340425531915, 0.5698924731182796]",20,20,...,0.2,0.0001,1234,False,2,False,Conv1d,,concatenation,64


In [12]:
# Average the `f1` column over all rows of the table (the rows differ by `seed`).
# NOTE(review): `f1` is distinct from `valid_f1`; presumably it is the test-set score — confirm.
best_swnu_network_log_signature["f1"].mean()

0.5457286456821915

In [13]:
# Stack the per-row `f1_scores` lists into one 2-D array and average over rows (axis 0),
# giving one mean score per list position (presumably one per class label — confirm).
np.stack(best_swnu_network_log_signature["f1_scores"]).mean(axis=0)

array([0.66515673, 0.42630056])