Libraries

In [1]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
import pickle
import sys
import re

sys.path.insert(0, "../../timeline_generation/")  # Adds higher directory to python modules path
import data_handler

  from .autonotebook import tqdm as notebook_tqdm


Read raw data

In [2]:
# Load the annotated timelines through the project data handler
# (load_from_pickle=False is passed explicitly).
TalkLifeDataset = data_handler.TalkLifeDataset()
annotations = TalkLifeDataset.return_annotated_timelines(load_from_pickle=False)

# Keep only the rows whose 'content' is not the literal string 'nan'.
has_content = annotations['content'] != 'nan'
annotations = annotations[has_content]

# Number of rows left after the filter.
sample_size = annotations.shape[0]
print(sample_size)

18604


Define Model

In [3]:
# ================================
# Model configuration: every entry below is read by later cells (and most of them are
# embedded in the name under which models/results are saved).
model_specifics = {
    # global post embedding; options: SBERT, BERT_cls, BERT_mean, BERT_max
    "global_embedding_tp": 'SBERT',
    # dimensionality reduction method; options: ppapca, ppapcappa, umap
    "dimensionality_reduction_tp": 'umap',
    # number of reduced components; any int between 1 and the embedding dimension
    "dimensionality_reduction_components": 15,
    # whether the reduced embeddings are used for the history path; options: True, False
    "dimensionality_reduction": True,
    # time injection into the history; options: timestamp, None
    "time_injection_history_tp": None,
    # time injection for the current post; options: timestamp, None
    "time_injection_post_tp": 'timestamp',
    # signature dimensions; any int larger than 1
    "signature_dimensions": 3,
    # embedding used for the current post; options: sentence, reduced
    "post_embedding_tp": 'sentence',
    # how features are combined; options: concatenation, gated_addition
    "feature_combination_method": 'concatenation',
    # signature type; options: log, sig
    "signature_tp": 'log',
    # augmentation type; options: Conv1d, CNN
    "augmentation_tp": 'Conv1d',
    # loss function; options: focal, cbfocal
    "loss_function": 'focal',
    # reduced network components; any integer greater than 1
    "reduced_network_components": 13,
    # classifier tag used in file names, e.g. 'Conv1d3kernel10channelSigFFN2hidden';
    # options: FFN2hidden (any future classifiers added)
    "classifier_name": 'Conv1d3kernel13channelSigLSTMSigLSTMSigFFN2hidden',
    # label scheme; options: 3class (5class to be added in the future)
    "classes_num": '3class',
}

Post Embeddings, Dimensionality Reduction

In [4]:
# Post embeddings: one vector per post, produced by the encoder named in model_specifics.
from embeddings import Representations

embedding_kind = model_specifics['global_embedding_tp']
rep = Representations(type=embedding_kind)
embeddings_sentence = rep.get_embeddings()

print(embeddings_sentence.shape)

# Dimensionality reduction: fit the configured reducer on the full embeddings and keep
# the reduced matrix next to the original one.
from dimensionality_reduction import DimensionalityReduction

reduction = DimensionalityReduction(
    method=model_specifics['dimensionality_reduction_tp'],
    components=model_specifics['dimensionality_reduction_components'],
)
embeddings_reduced = reduction.fit_transform(embeddings_sentence)

print(embeddings_reduced.shape)

(18604, 384)
(18604, 15)


Time features

In [5]:
# Build the modelling dataframe from the annotations and both embedding matrices
# (full sentence embeddings and their reduced counterpart).
from dataset import get_modeling_dataframe
df = get_modeling_dataframe(annotations, embeddings_sentence, embeddings_reduced)

# Add the time features to df. Later cells read a 'time_encoding' column from df --
# presumably it is created here; TODO confirm against timeinjection.TimeFeatures.
from timeinjection import TimeFeatures, Padding
tf = TimeFeatures()
df = tf.get_time_features(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['time_diff'][i] = (df['datetime'][i] - df['datetime'][i-1] ).total_seconds() / 60
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['timeline_index'][first_index:last_index] = np.arange(t_id_len)


Padding

In [6]:
# Build one fixed-length history window per post: the post itself preceded by the earlier
# posts of the same timeline, followed by padding rows up to a common length.
# With k_last the window is capped at the last k posts (k=5), which avoids the issue of
# variable lengths in the signature; otherwise windows are padded to time_n, the length
# of the longest timeline.
from time import time  # not used by this cell any more; kept so the name stays defined

# Name pattern of the embedding columns that go into the window: columns starting with
# "d" when the reduced embeddings are used, columns starting with "e" otherwise.
# Raw strings, because "\w" inside a plain string literal is an invalid escape sequence.
if (model_specifics['dimensionality_reduction'] == True):
    emb_str = r"^d\w*[0-9]"
else:
    emb_str = r"^e\w*[0-9]"

id_counts = df.groupby(['timeline_id'])['timeline_id'].count()
time_n = id_counts.max()  # number of posts in the longest timeline

# Columns: timeline_id, label, time_encoding, then the selected embedding columns.
df_new = np.array(df[['timeline_id','label','time_encoding']+[c for c in df.columns if re.match(emb_str, c)]])

# Window bounds [start_i, end_i) are positions into df_new.
start_i = 0
end_i = 0
dims = df_new.shape[1]
# One padding row (timeline_id is dropped from the windows, hence dims-1 entries):
# label 100 marks padding, time_encoding and all embedding values are 0.
zeros = np.concatenate(( np.array([100]), np.repeat(0,dims-2) ),axis=0)
sample_list = []
zero_padding = True   # False: pad by repeating the current post's row instead of zeros
k_last = True         # True: keep only the last k posts of the history
k = 5

# Read the timeline ids once, by position, so they line up with the rows of df_new
# (the loop index i is used as a position into df_new below). This also avoids a
# chained column/label lookup on the dataframe for every row.
# NOTE(review): assumes df has a default 0..n-1 index, as the original
# df['timeline_id'][i] lookup did -- confirm.
timeline_ids = df['timeline_id'].values

for i in range(df.shape[0]):
    # the first row has no predecessor, so it is compared with itself
    i_prev = max(i - 1, 0)

    if timeline_ids[i] == timeline_ids[i_prev]:
        # same timeline: extend the window by the current post ...
        end_i += 1
        # ... and drop the oldest posts once it exceeds k
        if k_last and (end_i - start_i) > k:
            start_i = end_i - k
    else:
        # new timeline: restart the window at the current post
        start_i = i
        end_i = i + 1

    #data point with history (timeline_id column removed), shape (1, window_len, dims-1)
    df_add = df_new[start_i:end_i, 1:][np.newaxis, :, :]

    #padding length
    if k_last:
        padding_n = k - (end_i - start_i)
    else:
        padding_n = time_n - (end_i - start_i)

    #create padding
    if zero_padding:
        zeros_tile = np.tile(zeros, (padding_n, 1))[np.newaxis, :, :]
    else:
        zeros_tile = np.tile(df_new[end_i-1, 1:], (padding_n, 1))[np.newaxis, :, :]

    #append the padding after the real posts and store the sample
    df_padi = np.concatenate((df_add, zeros_tile), axis=1)
    sample_list.append(df_padi)

# (n_posts, window_len, 2 + n_embedding_columns): label, time_encoding, embeddings
df_padded = np.concatenate(sample_list)
print(df_padded.shape)

(18604, 5, 17)


Preparation of Data for Deep Signature Network

In [7]:
# Assemble the model input x_data from the padded windows.
# df_padded[:, :, 0] is the label, [:, :, 1] the time encoding, [:, :, 2:] the embeddings.

#torch conversion and removal of label and time dimensions for now
path = torch.from_numpy(df_padded[: , : , 2:].astype(float))

if (model_specifics['time_injection_post_tp'] == 'timestamp'):
    time_values = df_padded[: , : , 1]
    # statistics over the non-zero entries only (zero padding rows carry a 0 here)
    non_zero_times = time_values[time_values != 0]
    mean = non_zero_times.mean()
    std = non_zero_times.std()
    # standardise and add a channel axis: (n, window) -> (n, 1, window)
    time_feature = (torch.from_numpy(time_values.astype(float)).unsqueeze(1) - mean) / std
    # Reset extreme negative values to 0.
    # NOTE(review): presumably this targets the zero-padded positions, which become
    # very negative after standardisation; it relies on the -100 threshold -- confirm
    # that real posts never standardise below it.
    time_feature[time_feature < -100] = 0
else:
    time_feature = None

# Embedding of the current post, repeated k times along a new last axis so it can be
# concatenated with the windowed features. Raw strings, because "\w" inside a plain
# string literal is an invalid escape sequence.
post_embedding_patterns = {'sentence': r"^e\w*[0-9]", 'reduced': r"^d\w*[0-9]"}
post_embedding_pattern = post_embedding_patterns.get(model_specifics['post_embedding_tp'])
if post_embedding_pattern is None:
    bert_embeddings = None
else:
    post_embedding_columns = [c for c in df.columns if re.match(post_embedding_pattern, c)]
    bert_embeddings = torch.tensor(df[post_embedding_columns].values).unsqueeze(2).repeat(1, 1, k)

# channels first: (n, window, features) -> (n, features, window)
x_data = torch.transpose(path, 1,2)

# `is not None` rather than `!= None`: an identity test never goes through the tensor's
# comparison operator.
if time_feature is not None:
    x_data = torch.cat((x_data, time_feature), dim=1)
if bert_embeddings is not None:
    x_data = torch.cat((x_data, bert_embeddings), dim=1)

x_data.shape

torch.Size([18604, 400, 5])

K-fold training with repetition over random seeds and hyperparameter tuning

In [10]:
# K-fold training, repeated over several random seeds, for every combination of the
# hyper-parameter lists defined below. For each (seed, fold) a fresh StackedDeepSigNet is
# trained with early stopping on the validation macro F1; whenever the validation F1
# improves, the model is evaluated on the test fold and (if save_results) the results
# and the model weights are written to disk.
from sklearn import metrics
import random
from datetime import date
import math
import itertools

from classification_utils import Folds, set_seed, validation, training, testing
from deepsignatureffn import DeepSigNet, StackedDeepSigNet, FocalLoss, ClassBalanced_FocalLoss

# ================================
save_results = True
# ================================

#GLOBAL MODEL PARAMETERS
# List-valued settings are grid-searched; the commented values are earlier candidates.
augmentation_tp = model_specifics["augmentation_tp"]
input_channels = path.shape[2]
output_channels =  [model_specifics["reduced_network_components"]] #13#[10,13]
augmentation_layers = () #[(32, 16, 10)] #(50, 20, output_channels) #
BiLSTM = False
# read from the configuration (value 3) so it cannot drift from the
# signature_dimensions that is embedded in model_code_name below
sig_d = model_specifics["signature_dimensions"]
blocks = 3
# channels of x_data that are not part of the history path
post_dim = x_data.shape[1]- input_channels
hidden_dim_lstm =  [(12, 8)] #12 [10,12]
hidden_dim = [32] #32 [32,64]
output_dim = 3
loss = model_specifics["loss_function"] #'focal' #cbfocal
dropout_rate = [0.25]  #0.25 [0.25,0.35]
add_time = (model_specifics['time_injection_history_tp'] == 'timestamp')

# Fail fast: with any other value no criterion would be built inside the loop.
if loss not in ('focal', 'cbfocal'):
    raise ValueError("Unsupported loss_function %r (expected 'focal' or 'cbfocal')" % (loss,))

# ================================
num_epochs = 100
learning_rate =  [0.0003] #0.0003 [0.0003, 0.0005]
gamma = [2] #2 [2,3]
beta = 0.999
BATCH_SIZE = 64
NUM_folds = 5
patience = 2
weight_decay_adam = 0.0001
RANDOM_SEED_list = [0, 1, 12, 123, 1234]

# ================================
# Name under which models/results are stored: the configuration values joined by "_".
# The reduction method + component count is only part of the name when dimensionality
# reduction is switched on.
name_parts = [model_specifics["global_embedding_tp"]]
if (model_specifics['dimensionality_reduction'] == True):
    name_parts.append(str(model_specifics['dimensionality_reduction_tp']) + str(model_specifics['dimensionality_reduction_components']))
name_parts += [
    str(model_specifics['time_injection_history_tp']) + str(model_specifics['time_injection_post_tp']),
    str(model_specifics['post_embedding_tp']),
    str(model_specifics['feature_combination_method']),
    str(model_specifics['signature_tp']),
    str(model_specifics['signature_dimensions']),
    str(model_specifics['classifier_name']),
    str(model_specifics["loss_function"]),
    str(model_specifics['classes_num']),
]
model_code_name = "_".join(name_parts)


FOLDER_models = '/storage/ttseriotou/pathbert/models/v2/'
FOLDER_results = '/storage/ttseriotou/pathbert/results/v2/'

# ================================
KFolds = Folds(num_folds=NUM_folds)
y_data = KFolds.get_labels(df)
# ================================
#K FOLD RUNS
ft_i = 0 #run number
# Same iteration order as nesting the six loops in this order (last list varies fastest).
hyperparameter_grid = itertools.product(
    output_channels, learning_rate, gamma, dropout_rate, hidden_dim, hidden_dim_lstm
)
for out_ch, lr, g, dp, h_dim, lstm_dim in hyperparameter_grid:
    str_version = 'tuning' + str(ft_i)
    # log the current out_ch (not the whole candidate list)
    print('lr=',lr, ' g=',g,' dp=',dp, ' h_dim=',h_dim, ' lstm_dim=', lstm_dim, 'conv output layers=',out_ch)
    ft_i+=1

    # stored with every result file so a run can be identified later
    classifier_params = {"augmentation_tp": augmentation_tp,
    "input_channels": input_channels,
    "output_channels": out_ch,
    "augmentation_layers": augmentation_layers,
    "sig_d": sig_d,
    "post_dim": post_dim,
    "hidden_dim_lstm": lstm_dim,
    "hidden_dim": h_dim,
    "output_dim": output_dim,
    "dropout_rate": dp,
    "num_epochs": num_epochs,
    "learning_rate": lr,
    "BiLSTM": BiLSTM,
    "blocks": blocks,
    "gamma": g,
    "BATCH_SIZE": BATCH_SIZE,
    "NUM_folds": NUM_folds,
    "patience": patience,
    "weight_decay_adam": weight_decay_adam,
    "RANDOM_SEED_list": RANDOM_SEED_list,
    }

    for my_ran_seed in RANDOM_SEED_list:
        set_seed(my_ran_seed)
        # NOTE(review): this generator is seeded but never passed to a DataLoader, so the
        # shuffling below does not use it -- confirm whether set_seed alone is intended
        # to make the runs reproducible.
        myGenerator = torch.Generator()
        myGenerator.manual_seed(my_ran_seed)
        for test_fold in range(NUM_folds):

            print('Starting random seed #',my_ran_seed, ' and fold #', test_fold)
            #get ith-fold data
            x_test, y_test, x_valid, y_valid, x_train , y_train, test_tl_ids, test_pids = KFolds.get_splits(df, x_data, y_data, test_fold= test_fold)

            #data loaders with batches
            train = torch.utils.data.TensorDataset( x_train, y_train)
            valid = torch.utils.data.TensorDataset( x_valid, y_valid)
            # the test inputs carry the post ids as extra channels (repeated k times to
            # match the window axis) so predictions can be traced back after shuffling
            test = torch.utils.data.TensorDataset( torch.cat((x_test,test_pids.unsqueeze(2).repeat(1, 1, k)),1) , y_test)

            train_loader = torch.utils.data.DataLoader(dataset=train, batch_size = BATCH_SIZE, shuffle = True)
            valid_loader = torch.utils.data.DataLoader(dataset=valid, batch_size = BATCH_SIZE, shuffle = True)
            test_loader = torch.utils.data.DataLoader(dataset=test, batch_size = BATCH_SIZE, shuffle = True)

            #early stopping params
            last_metric = 0
            trigger_times = 0
            best_metric = 0

            #model definitions
            model = StackedDeepSigNet(input_channels, out_ch, sig_d, lstm_dim, post_dim, h_dim, output_dim, dp, add_time, augmentation_tp, augmentation_layers, BiLSTM, comb_method=model_specifics['feature_combination_method'], blocks=blocks)

            #loss function
            if (loss=='focal'):
                # per-class weight: sqrt(1 / fraction of training samples in the class)
                class_fractions = [y_train[y_train==c].shape[0]/y_train.shape[0] for c in range(output_dim)]
                alpha_values = torch.Tensor([math.sqrt(1/fraction) for fraction in class_fractions])
                criterion = FocalLoss(gamma = g, alpha = alpha_values)
            elif (loss == 'cbfocal'):
                classifier_params["beta"] = beta
                samples_count = torch.Tensor([y_train[y_train==c].shape[0] for c in range(output_dim)])
                criterion = ClassBalanced_FocalLoss(gamma = g, beta = beta, no_of_classes=output_dim, samples_per_cls=samples_count)
            optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay= weight_decay_adam)

            #model train/validation per epoch
            for epoch in range(num_epochs):

                training(model, train_loader, criterion, optimizer, epoch, num_epochs)

                # Early stopping is driven by the validation macro F1
                _ , f1_v, labels_val, predicted_val = validation(model, valid_loader, criterion)
                print('Current Macro F1:', f1_v)

                if f1_v > best_metric :
                    best_metric = f1_v

                    #test and save so far best model
                    predicted_test, labels_test, pids_test = testing(model, test_loader)

                    results = {
                        "model_code_name": model_code_name,
                        "model_specifics": model_specifics,
                        "classifier_params": classifier_params,
                        "date_run": date.today().strftime("%d/%m/%Y"),
                        "test_tl_ids": test_tl_ids,
                        "test_pids": pids_test, #test_pids,
                        "labels": labels_test,
                        "predictions": predicted_test,
                        "labels_val": labels_val,
                        "predicted_val": predicted_val,
                        "test_fold": test_fold,
                        "random_seed": my_ran_seed,
                        "epoch": epoch,
                    }

                    if save_results:
                        # NOTE(review): the file name does not include str_version, so with
                        # more than one hyper-parameter combination each run overwrites the
                        # previous one. For tuning runs append  "_" + str_version  to run_tag.
                        run_tag = model_code_name + "_" + str(my_ran_seed) + "seed" + "_" + str(test_fold) + "fold"
                        file_name_results = FOLDER_results + run_tag + '.pkl'
                        file_name_model = FOLDER_models + run_tag + '.pkl'
                        # context manager so the results file is closed (and flushed) right away
                        with open(file_name_results, 'wb') as results_file:
                            pickle.dump(results, results_file)
                        torch.save(model.state_dict(), file_name_model)

                # count consecutive epochs whose F1 dropped below the previous epoch's F1
                if f1_v < last_metric:
                    trigger_times += 1
                    print('Trigger Times:', trigger_times)

                    if trigger_times >= patience:
                        print('Early stopping!')
                        break

                else:
                    print('Trigger Times: 0')
                    trigger_times = 0

                last_metric = f1_v
                            
                

lr= 0.005  g= 2  dp= 0.25  h_dim= 32  lstm_dim= (12, 10) conv output layers= [13]
Starting random seed # 0  and fold # 0
The size of train/valid/test timelines are:  268 132 100
Samples in test set:  3773
[0/100, 0/158] loss: 0.82982844
[0/100, 100/158] loss: 0.52820849
Current Macro F1: 54.14672170430298
Trigger Times: 0
[1/100, 0/158] loss: 0.38461044
[1/100, 100/158] loss: 0.32350436
Current Macro F1: 54.884949114920644
Trigger Times: 0
[2/100, 0/158] loss: 0.36699501
[2/100, 100/158] loss: 0.34093463
Current Macro F1: 47.574964731943176
Trigger Times: 1
[3/100, 0/158] loss: 0.26555252
[3/100, 100/158] loss: 0.2768907
Current Macro F1: 54.08805368098921
Trigger Times: 0
[4/100, 0/158] loss: 0.38603261
[4/100, 100/158] loss: 0.31638068


KeyboardInterrupt: 

In [3]:
#BEST K MODELS - VALIDATION LOOP
# Rank the hyper-parameter tuning runs by their validation macro F1 (pooled over all
# result files of a run) and print the classification report of the k best runs.
from os import listdir
from os.path import isfile, join
from sklearn import metrics
from collections import Counter

k=5  # number of best tuning runs to report
FOLDER_results = '/storage/ttseriotou/pathbert/results/v2/' 
# must match the model_code_name under which the tuning results were saved
model_code_name = 'SBERT_umap15_Nonetimestamp_sentence_concatenation_log_3_Conv1d3kernelSigLSTMSigLSTMSigFFN2hidden_focal_3class'
metrics_overall = pd.DataFrame(0, index = ['O', 'IE', 'IS', 'accuracy', 'macro avg', 'weighted avg'], columns = ['precision', 'recall', 'f1-score', 'support'])


def _tuning_index(file_name):
    """Return the run number N of a result file named ``..._tuningN.<ext>``.

    The part of the name before the first "." is split on "_" and the number is
    taken from its last token.

    Raises:
        ValueError: if the name contains no "." or its last token is not ``tuningN``.
    """
    stem = file_name[:file_name.index(".")]
    return int(stem.split("_")[-1].replace('tuning', ''))


def _validation_report(result_files):
    """Pool the validation labels/predictions of ``result_files`` into one report.

    Args:
        result_files: names (relative to FOLDER_results) of the pickled result
            dictionaries that belong to one tuning run.

    Returns:
        A tuple ``(metrics_tab, params)``: the classification report as a DataFrame
        (one row per class/average) and the ``classifier_params`` stored in the last
        file read (``None`` if ``result_files`` is empty).
    """
    labels_final = torch.empty((0))
    predicted_final = torch.empty((0))
    params = None

    for sf in result_files:
        # NOTE: pickle.load executes arbitrary code from the file; only point
        # FOLDER_results at result files produced by this project.
        with open(FOLDER_results+sf, 'rb') as fin:
            results = pickle.load(fin)

        labels_final = torch.cat([labels_final, results['labels_val']])
        predicted_final = torch.cat([predicted_final, results['predicted_val']])
        params = results['classifier_params']

    report = metrics.classification_report(labels_final, predicted_final, target_names = ['O','IE','IS'], output_dict=True)
    return pd.DataFrame(report).transpose(), params


#get all tuning files
per_model_files = [f for f in listdir(FOLDER_results) if 'tuning' in f and model_code_name in f]

#group the files by their exact tuning index
# (a substring test such as 'tuning1' in f would also match tuning10, tuning11, ...
#  and mix the results of different runs)
files_by_run = {}
for f in per_model_files:
    files_by_run.setdefault(_tuning_index(f), []).append(f)
files_ind = sorted(files_by_run)

dict_f1 = {}
reports = {}  # run index -> (metrics_tab, params), so no file is read twice

print(files_ind)
for t in files_ind:
    metrics_tab, params = _validation_report(files_by_run[t])
    reports[t] = (metrics_tab, params)
    dict_f1[t] = metrics_tab.loc['macro avg', 'f1-score']

dict_f1 = Counter(dict_f1)

#report the k runs with the highest validation macro F1
for t, f1 in dict_f1.most_common(k):
    metrics_tab, params = reports[t]
    print('lr=',params['learning_rate'], 'g=',params['gamma'],'dp=',params['dropout_rate'], 'h_dim=',params['hidden_dim'], 'lstm_dim=', params['hidden_dim_lstm'], 'conv out=',params['output_channels'])
    print(metrics_tab)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]
lr= 0.0003 g= 2 dp= 0.25 h_dim= 32 lstm_dim= (12, 8) conv out= 13
              precision    recall  f1-score       support
O              0.909072  0.870078  0.889148  20189.000000
IE             0.438231  0.589409  0.502700   2606.000000
IS             0.309544  0.301003  0.305214   1196.000000
accuracy       0.811221  0.811221  0.811221      0.811221
macro avg      0.552282  0.586830  0.565687  23991.000000
weighted avg   0.828040  0.811221  0.818060  23991.000000
lr= 0.0003 g= 2 dp= 0.25 h_dim= 64 lstm_dim= (12, 8) conv out= 13
              precision    recall  f1-score       support
O              0.905679  0.878449  0.891856  20189.000000
IE             0.464736  0.566385  0.510550   2606.000000
IS             0.286294  0.295151  0.2906