___
___
# M908 NLP - miniProject: **Stress Detection**
___
___
### Kylafi Christina-Theano (Theatina)
LT1200012
___

___
___
# Feature Engineering Experiments
___

## Emotion Detection

In [None]:
from transformers import pipeline

#---------- emotion dictionary ---------------------------------------------------------------------------------------------------------------------------------
def add_emotion_dictionary(df):
    """Store per-text emotion scores in a new ``emotion_dict`` column.

    Each entry of ``df['text']`` is run through the Hugging Face
    ``bhadresh-savani/distilbert-base-uncased-emotion`` text-classification
    pipeline and the label/score pairs it yields are kept as a
    ``{label: score}`` dictionary.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        The same DataFrame (modified in place) with the ``emotion_dict`` column.
    """
    classifier = pipeline(
        "text-classification",
        model='bhadresh-savani/distilbert-base-uncased-emotion',
        return_all_scores=True,
    )

    scores_per_text = []
    for text in df['text']:
        # The first element of the pipeline output is read as the list of
        # label/score entries for this text.
        label_entries = classifier(text)[0]
        scores_per_text.append({entry['label']: entry['score'] for entry in label_entries})

    df['emotion_dict'] = scores_per_text
    return df

def add_emotionNscore(df):
    """Add the highest-scoring emotion and its score as two new columns.

    For every row the ``emotion_dict`` mapping is read; the key with the
    largest value goes to ``emotion`` and that largest value goes to
    ``emotion_score``. A tqdm progress bar is advanced once per row.

    Args:
        df: DataFrame whose ``emotion_dict`` column holds ``{label: score}`` dicts.

    Returns:
        The same DataFrame (modified in place).
    """
    dominant_labels, dominant_scores = [], []
    progress_bar = tqdm.tqdm(range(df.shape[0]))
    for _, record in df.iterrows():
        scores = record['emotion_dict']
        dominant_labels.append(max(scores, key=scores.get))
        dominant_scores.append(max(scores.values()))
        progress_bar.update(1)

    df['emotion'] = dominant_labels
    df['emotion_score'] = dominant_scores
    return df


# add_emotion_dictionary(train_df)
# add_emotion_dictionary(test_df)

# add_emotionNscore(train_df)
# add_emotionNscore(test_df)

# train_df.to_csv(os.path.join(file_store_dir,"stressTrain.csv"),  index = False, header=True)
# test_df.to_csv(os.path.join(file_store_dir,"stressTest.csv"),  index = False, header=True)


#---------- depression dictionary ------------------------------------------------------------------------------------------------------------------------
def add_depression_dictionary(df):
    """Store per-text depression scores in a new ``depression_dict`` column.

    Each entry of ``df['text']`` is run through the Hugging Face
    ``paulagarciaserrano/roberta-depression-detection`` text-classification
    pipeline and the label/score pairs it yields are kept as a
    ``{label: score}`` dictionary.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        The same DataFrame (modified in place), mirroring
        ``add_emotion_dictionary``.
    """
    classifier = pipeline("text-classification", model="paulagarciaserrano/roberta-depression-detection", return_all_scores=True)

    # The first element of the pipeline output is read as the list of
    # label/score entries for the text.
    depression_preds = [ { pred['label']:pred['score'] for pred in classifier( t )[0] } for t in df['text'] ]
    df['depression_dict'] = depression_preds

    return df



def add_depressionNscore(df):
    """Add the highest-scoring depression label and its score as new columns.

    For every row the ``depression_dict`` mapping is read; the key with the
    largest value goes to ``depression`` and that largest value goes to
    ``depression_score``. A tqdm progress bar is advanced once per row.

    Args:
        df: DataFrame whose ``depression_dict`` column holds
            ``{label: score}`` dicts.

    Returns:
        The same DataFrame (modified in place), mirroring ``add_emotionNscore``.
    """
    depression_dom = []
    depression_score = []
    progress_bar = tqdm.tqdm(range(df.shape[0]))
    for _, row in df.iterrows():
        depr_dict = row['depression_dict']
        depression_dom.append(max(depr_dict, key=depr_dict.get))
        depression_score.append(max(depr_dict.values()))
        progress_bar.update(1)

    df['depression'] = depression_dom
    df['depression_score'] = depression_score
    return df


# add_depression_dictionary(train_df)
# add_depressionNscore(train_df)

# add_depression_dictionary(test_df)
# add_depressionNscore(test_df)

# train_df.to_csv(os.path.join(file_store_dir,"stressTrainEmDepr.csv"),  index = False, header=True)
# test_df.to_csv(os.path.join(file_store_dir,"stressTestEmDepr.csv"),  index = False, header=True)


#---------- data visualization ------------------------------------------------------------------------------------------------------------------------
def data_vis(file_store_dir, train_df, test_df):
    """Draw exploratory plots of the training data and save most of them as PNGs.

    Group-size bar charts are written to ``<file_store_dir>/figures`` (created
    if missing). Two scatter plots are only displayed with ``plt.show()``. The
    two per-label subreddit/emotion charts are written to the current working
    directory.

    Args:
        file_store_dir: Directory under which the ``figures`` folder is created.
        train_df: DataFrame providing the columns ``subreddit``, ``label``,
            ``emotion``, ``depression``, ``emotion_score``,
            ``lex_liwc_negemo`` and ``lex_liwc_posemo``.
        test_df: Accepted but not used by this function.
    """
    plot_dir = os.path.join(file_store_dir, "figures")
    os.makedirs(plot_dir, exist_ok=True)

    # Number of rows per subreddit.
    df = train_df.groupby(['subreddit']).size()
    df.plot(kind='bar', figsize=(15, 8))
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(plot_dir, "subreddits.png"), dpi=300)

    # Rows per subreddit, split by label.
    # Legend text assumes the label columns come out as 0 then 1 - TODO confirm.
    df = train_df.groupby(['subreddit', "label"]).size()
    df = df.unstack()
    df.plot(kind='bar', figsize=(15, 8))
    plt.legend(["No stress", "Stress"])
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(plot_dir, "subredditsLabel.png"), dpi=300)

    # Rows per emotion, split by label.
    df = train_df.groupby(['emotion', "label"]).size()
    df = df.unstack()
    df.plot(kind='bar', figsize=(15, 8))
    plt.legend(["No stress", "Stress"])
    plt.xticks(rotation=0)
    plt.savefig(os.path.join(plot_dir, "emotionLabel.png"), dpi=300)

    # Rows per subreddit, split by emotion.
    df = train_df.groupby(['subreddit', "emotion"]).size()
    df = df.unstack()
    df.plot(kind='bar', figsize=(15, 8))
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(plot_dir, "subredditEmotion.png"), dpi=300)

    # Rows per label, split by emotion.
    df = train_df.groupby(['label', "emotion"]).size()
    df = df.unstack()
    df.plot(kind='bar', figsize=(15, 8))
    plt.xticks([0, 1], ["no stress", "stress"], rotation=0)
    plt.savefig(os.path.join(plot_dir, "labelEmotion.png"), dpi=300)

    # Rows per depression state, split by emotion.
    # Tick labels assume exactly three groups in this order - TODO confirm.
    df = train_df.groupby(['depression', "emotion"]).size()
    df = df.unstack()
    df.plot(kind='bar', figsize=(15, 8))
    plt.xticks([0, 1, 2], ["moderate", "not depression", "severe"], rotation=0)
    plt.savefig(os.path.join(plot_dir, "depressionEmotion"), dpi=300)

    # Emotion score per emotion, coloured by label (displayed only, not saved).
    plt.clf()
    df1 = train_df[train_df['label'] == 0]
    df2 = train_df[train_df['label'] == 1]

    ax = df1.plot(x='emotion', y='emotion_score', kind='scatter', c='g', label='Not stressed', figsize=(15, 12))
    df2.plot(x='emotion', y='emotion_score', kind='scatter', ax=ax, c='r', label='Stressed')
    # Dashed reference line at a score of 0.5, annotated as "threshold".
    ax.hlines(.5, -0.2, 5.2, linestyles='dashed')
    ax.annotate('threshold', (-0.4, 0.51))
    plt.show()

    # LIWC negative vs. positive emotion, one colour per detected emotion
    # (displayed only, not saved).
    df1 = train_df[train_df['emotion'] == "joy"]
    df2 = train_df[train_df['emotion'] == "love"]
    df3 = train_df[train_df['emotion'] == "anger"]
    df4 = train_df[train_df['emotion'] == "sadness"]
    df5 = train_df[train_df['emotion'] == "fear"]
    df6 = train_df[train_df['emotion'] == "surprise"]

    x_col = 'lex_liwc_negemo'
    y_col = 'lex_liwc_posemo'

    ax = df1.plot(x=x_col, y=y_col, kind='scatter', c='g', label='joy', figsize=(15, 12))
    df2.plot(x=x_col, y=y_col, kind='scatter', ax=ax, c='m', label='love')
    df3.plot(x=x_col, y=y_col, kind='scatter', ax=ax, c='r', label='anger')
    df4.plot(x=x_col, y=y_col, kind='scatter', ax=ax, c='b', label='sadness')
    df5.plot(x=x_col, y=y_col, kind='scatter', ax=ax, c='c', label='fear')
    df6.plot(x=x_col, y=y_col, kind='scatter', ax=ax, c='y', label='surprise')

    plt.show()

    t_df = train_df

    # Subreddit/emotion counts for label 0.
    # The title is set BEFORE saving so that it appears in the written PNG.
    # NOTE(review): these two figures go to the working directory rather than
    # plot_dir - confirm whether that is intended.
    df = t_df[t_df["label"] == 0].groupby(['subreddit', "emotion"]).size()
    df = df.unstack()
    df.plot(kind='bar', figsize=(15, 8))
    plt.xticks(rotation=45)
    plt.title("Not stressed")
    plt.savefig(os.path.join("./subredditEmotion_label_0.png"), dpi=300)

    # Subreddit/emotion counts for label 1.
    df = t_df[t_df["label"] == 1].groupby(['subreddit', "emotion"]).size()
    df = df.unstack()
    df.plot(kind='bar', figsize=(15, 8))
    plt.xticks(rotation=45)
    plt.title("Stressed")
    plt.savefig(os.path.join("./subredditEmotion_label_1.png"), dpi=300)



# columns_to_file(train_df,file_store_dir)
# !cat ./results/columns.txt

In [9]:
# Locations of the train/test CSV files read below.
# NOTE(review): the file names suggest they already contain emotion/depression
# features plus BERT and Word2Vec text embeddings - confirm.
dataset_train_embs = "/kaggle/working/stressTrainEmDep_BERTnWord2Vec_text.csv"
dataset_test_embs = "/kaggle/working/stressTestEmDep_BERTnWord2Vec_text.csv"

# Download only when the train file is missing. Only the train path is checked;
# the test file is assumed to come and go together with it.
# NOTE(review): the two ids are presumably the files above, and gdown is
# presumably run from /kaggle/working - verify.
if not os.path.exists(dataset_train_embs):
    !gdown 19ZMTk5ZwmyL_eVVhXGC_kAdFTReu25Nm 
    !gdown 1XcaYJ78vXov-yhyBFDUWx2rH7DbeKx5k

# Load both datasets into the notebook-level train_df / test_df variables.
train_df = pd.read_csv(dataset_train_embs)
test_df = pd.read_csv(dataset_test_embs)



___
___
# **Stress Detection Task** 
___
___

In [1]:
#---------- RESULTS FILE DIRECTORY ---------------------------------------------------------------------- 

file_store_dir="/kaggle/working/results"

In [2]:
!pip install -U huggingface_hub
!pip install gensim
!pip install nltk
!pip install gdown

# modules
import pandas as pd
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import tqdm
import numpy as np
from numpy import mean
from numpy import std
from datasets import load_metric
import random
from IPython.display import FileLink, FileLinks

import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam, SGD
from transformers import pipeline

import transformers
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoTokenizer, AutoModel, BertTokenizer, BertModel

from keras import backend as K
from keras import Model
from keras.models import Sequential
from keras.layers import Input, Embedding, LSTM, Dense, Dropout, concatenate, Bidirectional, Conv1D, Conv2D, Flatten, MaxPooling1D, MaxPooling2D, GlobalMaxPool1D, GRU, CuDNNGRU
from keras.preprocessing import sequence
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from keras.callbacks import CSVLogger

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, auc

import warnings
# Silence UserWarning and FutureWarning messages for the rest of the notebook.
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

# Install GPUtil only when "GPUtil" does not appear in the `pip freeze` output
# (IPython `!` syntax: this cell is not plain Python).
piplist = !pip freeze
if not ( "GPUtil" in " ".join([ p for p in piplist ]) ):
    !pip install GPUtil
    print("\n\n")
# gpu_usage is the GPUtil utilization printer used by gpu_clean().
from GPUtil import showUtilization as gpu_usage
import gc
from numba import cuda


# output dir: create the results directory if it does not exist yet
if not os.path.exists(file_store_dir):
    os.makedirs(file_store_dir)



# Importing libraries
# from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec, KeyedVectors

___
___
## Functions 
___

In [3]:
#---------- DATA PRE PROCESSING ------------------------------------------------------------------------------

def stamp_to_date(timestamp):
    """Convert a POSIX timestamp into a naive local-time ``datetime``."""
    converted = datetime.fromtimestamp(timestamp)
    return converted

def load_data(dataset_path):
    """Read the CSV at ``dataset_path`` into a pandas DataFrame (default options)."""
    frame = pd.read_csv(dataset_path)
    return frame

def columns_to_dict(df):
    """Map every column name of ``df`` to a list of its distinct values.

    The values pass through a ``set``, so their order in each list is
    not guaranteed.
    """
    return {column: list(set(df[column])) for column in df.columns}

def add_tokens_col(df):
    """Add a ``tokens`` column with the filtered tweet-tokens of each text.

    Every entry of ``df['text']`` is split with NLTK's ``TweetTokenizer``;
    tokens are kept only when they are alphanumeric and not in NLTK's English
    stopword list.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        The same DataFrame (modified in place) with the ``tokens`` column.
    """
    # Build the tokenizer and the stopword lookup once. The stopword list was
    # previously reloaded for every single token.
    tokenizer = TweetTokenizer()
    english_stopwords = set(stopwords.words('english'))

    df["tokens"] = [
        [
            token
            for token in tokenizer.tokenize(text)
            if token.isalnum() and token not in english_stopwords
        ]
        for text in df['text']
    ]
    return df

def df_to_csv(df, filedir, filename):
    """Write ``df`` as a tab-separated, UTF-8 file inside ``filedir``.

    ``.csv`` is appended to ``filename`` unless the name already contains
    the substring ``.csv``.
    """
    target_name = filename if ".csv" in filename else filename + ".csv"
    destination = os.path.join(filedir, target_name)
    df.to_csv(destination, sep='\t', encoding='utf-8')

def text_tokenizer(tokenizer, texts):
    """Turn each text into a list of token ids using ``tokenizer``.

    Args:
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        texts: Iterable of texts; wrapped in a tqdm progress bar.

    Returns:
        A list with one id-list per input text, in input order.
    """
    def _to_ids(text):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

    return [_to_ids(text) for text in tqdm.tqdm(texts, desc = 'Tokenizing texts')]

def alpha_to_numerical(alpha_feats_df):
    """Label-encode ``alpha_feats_df`` with a fresh sklearn ``LabelEncoder``.

    Returns:
        ``(encoded_values, fitted_encoder)``.
    """
    encoder = LabelEncoder()
    encoded = encoder.fit_transform(alpha_feats_df)
    return encoded, encoder

def columns_to_file(df, file_store_dir, filename):
    """Write a numbered column listing plus a short dataset summary to a text file.

    The file ``<file_store_dir>/<filename>.txt`` receives one ``"<i>. <name>"``
    line per column, followed by the distinct subreddits, the emotion and
    depression label names (the keys of the first row's ``emotion_dict`` /
    ``depression_dict``) and the ``social_timestamp`` date range.

    Args:
        df: DataFrame containing at least ``subreddit``, ``emotion_dict``,
            ``depression_dict`` and ``social_timestamp``.
        file_store_dir: Existing directory to write into.
        filename: File name without the ``.txt`` extension.

    Raises:
        KeyError: If one of the required columns is missing. This is checked
            up front so no partial file is left behind.
    """
    import ast  # local import: only needed to parse stringified dictionaries

    required = ("subreddit", "emotion_dict", "depression_dict", "social_timestamp")
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise KeyError(f"columns_to_file: missing required column(s) {missing}")

    columns_file = os.path.join(file_store_dir, f"{filename}.txt")
    columns_dict = {}
    with open(columns_file, "w+", encoding="utf-8") as writer:
        for i, c in enumerate(df.columns):
            if c in ("emotion_dict", "depression_dict"):
                # Positional access: the index need not contain the label 0.
                first_value = df[c].iloc[0]
                if isinstance(first_value, str):
                    # Dict columns read back from CSV are strings. They are
                    # parsed with literal_eval rather than eval so that file
                    # content can never execute code.
                    first_value = ast.literal_eval(first_value)
                columns_dict[c] = list(first_value.keys())
            else:
                columns_dict[c] = df[c]
            writer.write(f"{i}. {c}\n")

        writer.write(20*"_")
        writer.write(f"\n\n subreddits: {list(set(columns_dict['subreddit']))}\n")
        writer.write(f"\n emotions: {columns_dict['emotion_dict']}\n")
        writer.write(f"\n depression states: {columns_dict['depression_dict']}\n")
        writer.write(f"\n social_timestamp range: [ {stamp_to_date(min(columns_dict['social_timestamp']))}, {stamp_to_date(max(columns_dict['social_timestamp']))} ]\n\n")


def datasets_sortNclean(train_df,test_df):
    """Reorder three columns and drop four bookkeeping columns, in place.

    For ``train_df`` and then ``test_df``: ``social_upvote_ratio``,
    ``social_num_comments`` and ``syntax_fk_grade`` are moved to positions
    9, 10 and 12, then ``post_id``, ``id``, ``sentence_range`` and
    ``confidence`` are removed. Finally the 1-based column listing of
    ``test_df`` is printed.
    """
    relocations = (
        (9, 'social_upvote_ratio'),
        (10, 'social_num_comments'),
        (12, 'syntax_fk_grade'),
    )
    discarded = ("post_id", "id", "sentence_range", "confidence")

    for frame in (train_df, test_df):
        # dataset rearrangement: pop the column, then re-insert it at its slot
        for position, column in relocations:
            frame.insert(position, column, frame.pop(column))
        for column in discarded:
            frame.pop(column)

    for i,c in enumerate(test_df.columns):
        print (i+1,c)


def columNames_to_file(df, file_store_dir, data_type):
    """Write the numbered column names of ``df`` to ``<data_type>_columns.txt``.

    The file is created inside ``file_store_dir`` and holds one
    ``"<index>. <column>"`` line per column, numbered from 0.
    """
    columns_file = os.path.join(file_store_dir,f"{data_type}_columns.txt")
    numbered_lines = [f"{i}. {c}\n" for i, c in enumerate(df.columns)]
    with open(columns_file, "w+", encoding="utf-8") as writer:
        writer.writelines(numbered_lines)


def df_to_csv_PlusColumnNames(trainX_df, trainY_df, testX_df, testY_df, file_store_dir, data_type):
    """Save the four train/test splits as CSV plus the train feature names.

    The column names of ``trainX_df`` are written with ``columNames_to_file``;
    each split is then written to ``file_store_dir`` with a header row and
    without the index.
    """
    columNames_to_file(trainX_df,file_store_dir,data_type)

    exports = (
        (trainX_df, f"stressTrainX_{data_type}.csv"),
        (trainY_df, f"stressTrainY_{data_type}.csv"),
        (testX_df, f"stressTestX_{data_type}.csv"),
        (testY_df, f"stressTestY_{data_type}.csv"),
    )
    for frame, csv_name in exports:
        frame.to_csv(os.path.join(file_store_dir, csv_name), index = False, header=True)


def _repeated_cv_scores(model, X, y, scoring):
    """Score ``model`` with 10-fold stratified CV repeated 4 times (seed 1)."""
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=4, random_state=1)
    # error_score='raise' surfaces fit/score failures instead of recording NaN.
    return cross_val_score(model, X, y, scoring=scoring, cv=cv, n_jobs=-1, error_score='raise')


def evaluate_model_precision(model, X, y):
    """Return the per-fold precision scores of ``model`` on ``(X, y)``."""
    return _repeated_cv_scores(model, X, y, 'precision')


def evaluate_model_recall(model, X, y):
    """Return the per-fold recall scores of ``model`` on ``(X, y)``."""
    return _repeated_cv_scores(model, X, y, 'recall')


def evaluate_model_f1(model, X, y):
    """Return the per-fold F1 scores of ``model`` on ``(X, y)``."""
    return _repeated_cv_scores(model, X, y, 'f1')


def compute_metrics(eval_pred):
    """Score argmax predictions against the reference labels.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Whatever ``metric.compute(predictions=..., references=...)`` returns.
        ``metric`` is a module-level object that is not defined in this
        function.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)


def categorical_encoding(labels):
    """One-hot encode ``labels``.

    The labels are first mapped to integer codes with a ``LabelEncoder``
    fitted on the same values, then expanded with
    ``np_utils.to_categorical``.
    """
    encoder = LabelEncoder()
    integer_codes = encoder.fit(labels).transform(labels)
    # convert integers to one hot encoding
    one_hot = np_utils.to_categorical(integer_codes)
    return one_hot



#---------- BERT SENTENCE EMBEDDINGS ------------------------------------------------------------------------------
# Extract sentence embeddings from BERT based on the article "BERT Word Embeddings Tutorial"
# (https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial)

BERT_EMB_OUT = 768

def BERT_tok_paddedXY(train_df,test_df, bert_tok_name='bert-base-uncased', max_length=-9):
    """Tokenize the ``text`` columns with a BERT tokenizer and post-pad them.

    Args:
        train_df: DataFrame with a ``text`` column (training texts).
        test_df: DataFrame with a ``text`` column (test texts).
        bert_tok_name: Name passed to ``BertTokenizer.from_pretrained``.
        max_length: Sequence length to pad/truncate to. The sentinel ``-9``
            means "use the longest tokenized training text".

    Returns:
        ``(trainX_tok_padded, testX_tok_padded, max_length)`` where both
        padded arrays have ``max_length`` columns.
    """
    print("\nBERT tokenizer running..\n")
    bert_tokenizer = transformers.BertTokenizer.from_pretrained(bert_tok_name)

    trainX_tok = text_tokenizer(bert_tokenizer, train_df['text'])
    testX_tok = text_tokenizer(bert_tokenizer, test_df['text'])

    if max_length == -9:
        # No length requested: pad to the longest training text and reuse
        # that length for the test set.
        trainX_tok_padded = sequence.pad_sequences(trainX_tok, padding='post')
        max_length = len(trainX_tok_padded[0])
    else:
        # An explicit length applies to the training set too. Previously only
        # the test set honoured it, so the two arrays could differ in width.
        trainX_tok_padded = sequence.pad_sequences(trainX_tok, padding='post', maxlen=max_length)
    testX_tok_padded = sequence.pad_sequences(testX_tok, padding='post', maxlen=max_length )

    return trainX_tok_padded, testX_tok_padded, max_length

def batch_encodings(encodings, batch_size):
    """Shuffle tokenizer encodings and cut them into consecutive batches.

    Args:
        encodings: Mapping with the keys ``input_ids``, ``token_type_ids``
            and ``attention_mask``; the three values are zipped sample by
            sample.
        batch_size: Maximum number of samples per batch (must be >= 1).

    Returns:
        A list of ``[input_ids, token_type_ids, attention_mask]`` triples,
        each element a list slice of at most ``batch_size`` samples. The last
        batch may be smaller. An empty input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (this used to loop
            forever).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Randomize the samples while keeping the three fields aligned.
    num = len(encodings["input_ids"])
    samples = list(zip(encodings["input_ids"], encodings["token_type_ids"], encodings["attention_mask"]))
    shuffled = random.sample(samples, num)
    if not shuffled:
        return []
    in_tokens, type_tokens, masks = (list(field) for field in zip(*shuffled))

    enc_batches = []
    total = len(in_tokens)
    progress_bar = tqdm.tqdm(range(total), desc ="Batching Encodings")
    for start in range(0, total, batch_size):
        end = min(start + batch_size, total)
        # The full `masks` list is sliced under a separate name. Rebinding it
        # to the current slice corrupted every batch after the first one.
        enc_batches.append([in_tokens[start:end], type_tokens[start:end], masks[start:end]])
        progress_bar.update(end - start)

    return enc_batches


def gpu_clean():
    """Run the garbage collector, empty torch's CUDA cache and print GPU usage."""
    # Collect unreachable Python objects first, then empty the CUDA cache.
    gc.collect()
    torch.cuda.empty_cache()
    # gpu_usage is GPUtil's showUtilization (imported under that alias).
    gpu_usage()

#     cuda.select_device(0)
#     cuda.close()
#     cuda.select_device(0)

def _bert_mean_embeddings(model, encoded_input, device, batch_size, desc):
    """Return one averaged hidden-state vector per encoded text, in input order."""
    dataset = TensorDataset(encoded_input["input_ids"], encoded_input["token_type_ids"], encoded_input["attention_mask"])
    # shuffle=False keeps row i of the output aligned with text i of the input.
    # A random sampler here would decouple the embeddings from their rows.
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    sentence_embeddings = []
    progress_bar = tqdm.tqdm(range(len(dataloader)), desc=desc)
    with torch.no_grad():
        for batch_data in dataloader:
            input_ids, token_type_ids, attention_mask = tuple(t.to(device) for t in batch_data)
            model_output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            # Index 2 of the output holds the per-layer hidden states.
            hid_states = model_output[2]
            # Take the second-to-last entry and average over the token axis.
            # NOTE(review): padding positions are included in this average.
            tok_vecs = hid_states[-2]
            # Move results to the CPU so they do not pin GPU memory.
            sentence_embeddings.extend(torch.mean(tok_vecs, dim=1).cpu())
            progress_bar.update(1)
    return sentence_embeddings


def BERT_sentence_emb(train_df_text, test_df_text, bert_tok_name='bert-base-uncased'):
    """Compute BERT sentence embeddings for the train and test texts.

    Each text is tokenized (padded and truncated), run through ``BertModel``
    in batches of 150, and represented by the token-average of the
    second-to-last hidden-state entry.

    Args:
        train_df_text: Object with a ``.values`` sequence of training texts.
        test_df_text: Object with a ``.values`` sequence of test texts.
        bert_tok_name: Model name for tokenizer and model. Lower-casing is
            enabled when the name contains ``"uncased"``.

    Returns:
        ``(train_sentence_embeddings, test_sentence_embeddings)``: two lists
        of 1-D CPU tensors, each in the same order as its input texts.
    """
    gpu_clean()
    # Sentences we want sentence embeddings for
    sentences_train = list(train_df_text.values)
    sentences_test = list(test_df_text.values)

    # Load model from HuggingFace Hub
    lower_text = "uncased" in bert_tok_name
    tokenizer = transformers.BertTokenizer.from_pretrained(bert_tok_name,  do_lower_case=lower_text)
    model = transformers.BertModel.from_pretrained(bert_tok_name, output_hidden_states=True)

    # Tokenize sentences
    encoded_input_train = tokenizer(sentences_train, padding=True, truncation=True, return_tensors='pt')
    encoded_input_test = tokenizer(sentences_test, padding=True, truncation=True, return_tensors='pt')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"\nRunning on {device}..\n")
    model = model.to(device)
    model.eval()

    print(f"\nCreating Text Embeddings\n")
    batch_size=150

    train_sentence_embeddings = _bert_mean_embeddings(
        model, encoded_input_train, device, batch_size, "Training Sentence Embeddings")
    # clear gpu & ram
    del encoded_input_train
    gpu_clean()

    # The test embeddings are computed from the TEST encodings. The loader
    # used to be built over the training dataset by mistake.
    test_sentence_embeddings = _bert_mean_embeddings(
        model, encoded_input_test, device, batch_size, "Test Sentence Embeddings")
    # clean gpu & ram
    del encoded_input_test
    gpu_clean()

    return train_sentence_embeddings, test_sentence_embeddings

# test_embs_test, test_embs_train = BERT_sentence_emb(train_df["text"],test_df["text"], bert_tok_name='bert-base-uncased')

def get_sentence_embs(train_sent_embs, test_sent_embs):
    """Convert two sequences of embedding vectors into DataFrames.

    Each vector is turned into a plain list via ``.tolist()``. The columns of
    both frames are named ``"0"``, ``"1"``, ... after the positions of the
    first training vector.

    Returns:
        ``(train_embs, test_embs)`` DataFrames.
    """
    def _as_rows(vectors):
        return [vector.tolist() for vector in vectors]

    train_rows = _as_rows(train_sent_embs)
    test_rows = _as_rows(test_sent_embs)

    column_names = list(map(str, range(len(train_rows[0]))))
    train_embs = pd.DataFrame(train_rows, columns=column_names)
    test_embs = pd.DataFrame(test_rows, columns=column_names)
    return train_embs, test_embs

#     train_embs.to_csv(os.path.join(file_store_dir,"stressTrain_embs.csv"), index = False, header=True)
#     test_embs.to_csv(os.path.join(file_store_dir,"stressTest_embs.csv"), index = False, header=True)


#---------- Word2Vec SENTENCE EMBEDDINGS ------------------------------------------------------------------------------
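# NOTE(review): the reference below appears to have been copied from the BERT section above;
# it describes BERT embeddings, not Word2Vec - replace it with the Word2Vec source that was used.
# Extract sentence embeddings from BERT based on the article "BERT Word Embeddings Tutorial"
# (https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial)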

WORD2VEC_EMB_OUT = 300




#---------- FEATURE SELECTION -----------------------------------------------------------------------------------------
def join_embs_feats(embeddings,features):
    """Return ``embeddings`` joined with ``features`` via ``DataFrame.join`` defaults."""
    combined = embeddings.join(features)
    return combined

def _features_and_labels(df):
    """Return ``(features, labels)`` from a copy of ``df`` without text/meta columns."""
    # Work on an explicit copy so the caller's frame is never modified.
    feats = df.copy()
    feats.pop("text")
    # Raw embedding columns are optional.
    for optional in ("BERT", "Word2Vec"):
        if optional in feats.columns:
            feats.pop(optional)
    # These columns are required; a missing one raises KeyError.
    for required in ("id", "emotion_dict", "depression_dict", "post_id", "sentence_range", "social_timestamp"):
        feats.pop(required)
    labels = feats.pop("label")
    return feats, labels


def df_allFeats_split(train_df, test_df):
    """Split the train and test frames into numeric features and labels.

    Text, id and dictionary columns are dropped. ``subreddit``, ``emotion``
    and ``depression`` are label-encoded to integers. The ``label`` column is
    returned separately.

    Args:
        train_df: Training DataFrame (left unmodified).
        test_df: Test DataFrame (left unmodified).

    Returns:
        ``(trainX_allFeats, trainY_allFeats, testX_allFeats, testY_allFeats)``.

    Raises:
        KeyError: If a required column is missing from either frame.
    """
    trainX_allFeats, trainY_allFeats = _features_and_labels(train_df)
    testX_allFeats, testY_allFeats = _features_and_labels(test_df)

    for column in ("subreddit", "emotion", "depression"):
        # One encoder per column, fitted on the values of BOTH sets, so that a
        # category gets the same integer code in train and test. Separate
        # encoders disagree whenever the two sets hold different categories.
        encoder = LabelEncoder()
        encoder.fit(pd.concat([trainX_allFeats[column], testX_allFeats[column]]))
        trainX_allFeats[column] = encoder.transform(trainX_allFeats[column])
        testX_allFeats[column] = encoder.transform(testX_allFeats[column])

    return trainX_allFeats, trainY_allFeats, testX_allFeats, testY_allFeats


def df_textEmbedsFeats_split(train_df,test_df, embedding_dim=BERT_EMB_OUT):
    """Return only the text-embedding columns as features.

    The embedding columns are those named ``"0"`` .. ``str(embedding_dim - 1)``
    in the output of ``df_allFeats_split``.

    Returns:
        ``(trainX, trainY, testX, testY)``.
    """
    trainX_allFeats, trainY, testX_allFeats, testY = df_allFeats_split(train_df, test_df)
    feats_to_keep = [ str(i) for i in range(embedding_dim)]

    trainX_textEmbedsFeats = trainX_allFeats[feats_to_keep]
    testX_textEmbedsFeats = testX_allFeats[feats_to_keep]

    return trainX_textEmbedsFeats, trainY, testX_textEmbedsFeats, testY

def df_lexFeats_split(train_df,test_df,embedding_dim=BERT_EMB_OUT):
    """Keep the embedding columns, every column containing ``"lex"`` and ``sentiment``.

    Returns:
        ``(trainX, trainY, testX, testY)`` built from ``df_allFeats_split``.
    """
    X_train, y_train, X_test, y_test = df_allFeats_split(train_df, test_df)

    embedding_cols = list(map(str, range(embedding_dim)))
    lexical_cols = [name for name in X_train.columns if "lex" in name]
    selected = embedding_cols + lexical_cols + ["sentiment"]

    return X_train[selected], y_train, X_test[selected], y_test

def df_liwcFeats_split(train_df,test_df,embedding_dim=BERT_EMB_OUT):
    """Keep the embedding columns plus every column containing ``"liwc"``.

    Returns:
        ``(trainX, trainY, testX, testY)`` built from ``df_allFeats_split``.
    """
    X_train, y_train, X_test, y_test = df_allFeats_split(train_df, test_df)

    embedding_cols = list(map(str, range(embedding_dim)))
    liwc_cols = [name for name in X_train.columns if "liwc" in name]
    selected = embedding_cols + liwc_cols

    return X_train[selected], y_train, X_test[selected], y_test

def df_dalFeats_split(train_df,test_df,embedding_dim=BERT_EMB_OUT):
    """Keep the embedding columns plus the ``emotion`` column (when present).

    Returns:
        ``(trainX, trainY, testX, testY)`` built from ``df_allFeats_split``.
    """
    X_train, y_train, X_test, y_test = df_allFeats_split(train_df, test_df)

    embedding_cols = list(map(str, range(embedding_dim)))
    emotion_cols = [name for name in X_train.columns if name == "emotion"]
    selected = embedding_cols + emotion_cols

    return X_train[selected], y_train, X_test[selected], y_test

def df_onlyFeats_split(train_df,test_df,embedding_dim=BERT_EMB_OUT):
    """Keep every feature column except the text-embedding columns.

    The embedding columns are those named ``"0"`` .. ``str(embedding_dim - 1)``.

    Returns:
        ``(trainX, trainY, testX, testY)`` built from ``df_allFeats_split``.
    """
    trainX_allFeats, trainY, testX_allFeats, testY = df_allFeats_split(train_df, test_df)

    # Build the embedding-name lookup once. It used to be rebuilt as a list
    # for every column inside the comprehension.
    embedding_cols = {str(i) for i in range(embedding_dim)}
    feats_to_keep = [c for c in trainX_allFeats.columns if c not in embedding_cols]

    trainX_onlyFeats = trainX_allFeats[feats_to_keep]
    testX_onlyFeats = testX_allFeats[feats_to_keep]

    return trainX_onlyFeats, trainY, testX_onlyFeats, testY
    
def df_emDepFeats_split(train_df,test_df,embedding_dim=BERT_EMB_OUT):
    """Keep the embedding columns plus the emotion/depression related columns.

    A column is selected when its name contains any of the substrings
    listed in ``markers`` below.

    Returns:
        ``(trainX, trainY, testX, testY)`` built from ``df_allFeats_split``.
    """
    X_train, y_train, X_test, y_test = df_allFeats_split(train_df, test_df)

    markers = ["sad", "anger", "anx", "negemo", "posemo", "emotion", "depression", "sentiment"]
    embedding_cols = list(map(str, range(embedding_dim)))
    affect_cols = [name for name in X_train.columns if any(marker in name for marker in markers)]
    selected = embedding_cols + affect_cols

    return X_train[selected], y_train, X_test[selected], y_test

def df_onlylexFeats_split(train_df,test_df,embedding_dim=BERT_EMB_OUT):
    """Keep only the columns containing ``"lex"`` plus ``sentiment``.

    ``embedding_dim`` is accepted but not used.

    Returns:
        ``(trainX, trainY, testX, testY)`` built from ``df_allFeats_split``.
    """
    X_train, y_train, X_test, y_test = df_allFeats_split(train_df, test_df)

    selected = [name for name in X_train.columns if "lex" in name] + ["sentiment"]

    return X_train[selected], y_train, X_test[selected], y_test

def df_onlyliwcFeats_split(train_df,test_df,embedding_dim=BERT_EMB_OUT):
    """Keep only the columns containing ``"liwc"``.

    ``embedding_dim`` is accepted but not used.

    Returns:
        ``(trainX, trainY, testX, testY)`` built from ``df_allFeats_split``.
    """
    X_train, y_train, X_test, y_test = df_allFeats_split(train_df, test_df)

    selected = [name for name in X_train.columns if "liwc" in name]

    return X_train[selected], y_train, X_test[selected], y_test

def df_onlydalFeats_split(train_df,test_df,embedding_dim=BERT_EMB_OUT):
    """Keep only the ``emotion`` column (when present).

    ``embedding_dim`` is accepted but not used.

    Returns:
        ``(trainX, trainY, testX, testY)`` built from ``df_allFeats_split``.
    """
    X_train, y_train, X_test, y_test = df_allFeats_split(train_df, test_df)

    selected = [name for name in X_train.columns if name == "emotion"]

    return X_train[selected], y_train, X_test[selected], y_test

def df_onlyemDepFeats_split(train_df,test_df,embedding_dim=BERT_EMB_OUT):
    """Keep only the emotion/depression related columns.

    A column is selected when its name contains any of the substrings
    listed in ``markers`` below. ``embedding_dim`` is accepted but not used.

    Returns:
        ``(trainX, trainY, testX, testY)`` built from ``df_allFeats_split``.
    """
    X_train, y_train, X_test, y_test = df_allFeats_split(train_df, test_df)

    markers = ["sad", "anger", "anx", "negemo", "posemo", "emotion", "depression", "sentiment"]
    selected = [name for name in X_train.columns if any(marker in name for marker in markers)]

    return X_train[selected], y_train, X_test[selected], y_test



#---------- MODEL EVALUATION -----------------------------------------------------------------------------------------
def precision_s(y_true, y_pred):
    """Keras-backend precision: rounded true positives / rounded predicted positives.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / (predicted_positives + K.epsilon())

def recall_s(y_true, y_pred):
    """Keras-backend recall: rounded true positives / rounded actual positives.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return true_positives / (actual_positives + K.epsilon())

def f1_s(y_true, y_pred):
    """Keras-backend F1: harmonic mean of ``precision_s`` and ``recall_s``.

    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    precision = precision_s(y_true, y_pred)
    recall = recall_s(y_true, y_pred)
    harmonic_ratio = (precision*recall)/(precision+recall+K.epsilon())
    return 2*harmonic_ratio

# def precision_2(y_true,y_pred):
#     y_true = np.array(y_true).reshape(-1)
#     y_pred = np.array(y_pred).reshape(-1)
#     return precision_score(y_true,y_pred)

# def recall_2(y_true,y_pred):
#     y_true = np.array(y_true).reshape(-1)
#     y_pred = np.array(y_pred).reshape(-1)
#     return recall_score(y_true,y_pred)

# def f1_2(y_true,y_pred):
#     y_true = np.array(y_true).reshape(-1)
#     y_pred = np.array(y_pred).reshape(-1)
#     return f1_score(y_true,y_pred)


def plot_learning_curves(history, string, model_path, feats, model_type):
    """Plot the train and validation curve of one metric, save it and show it.

    Args:
        history: Object whose ``.history`` dict holds ``string`` and
            ``"val_" + string`` series.
        string: Metric key. When one of its last three characters is ``"_"``,
            only the part before the first ``"_"`` is used for labels and the
            file name.
        model_path: Directory to save into. Its last ``/``-separated component
            is used as the model name in the file name.
        feats: Feature-set description used in the title and file name.
        model_type: Model description used in the title.
    """
    model_name = model_path.split("/")[-1]

    # Training curve first, validation curve second.
    for series_key in (string, 'val_'+string):
        plt.plot(history.history[series_key])

    string = string.split("_")[0] if "_" in string[-3:] else string

    plt.xlabel("Epochs")
    plt.ylabel(string.capitalize())
    plt.legend(["train_"+string, 'val_'+string])
    plt.title(f"{model_type}\nMetric: {string.capitalize()}\n({feats})")

    figure_path = os.path.join(model_path,f"{model_name}_{string}_{feats}.png")
    plt.savefig(figure_path, dpi=300)
    plt.show()


def plot_history(history,model_path,feats,model_type):
    """Plot learning curves for every recorded metric whose key lacks ``"val"``.

    Each selected key is passed to ``plot_learning_curves``, which draws the
    matching ``val_`` series alongside it.
    """
    training_keys = [key for key in history.history.keys() if "val" not in key]
    for key in training_keys:
        plot_learning_curves(history, key, model_path, feats, model_type)
#         plot_learning_curves(history, 'loss', model_path)
#         plot_learning_curves(history, 'precision_s', model_path)    
#         plot_learning_curves(history, 'recall_s', model_path)
#         plot_learning_curves(history, 'f1_s', model_path)


from sklearn.metrics import confusion_matrix, classification_report
def _split_report(model, X, Y, set_name):
    """Build the report text (accuracy, loss, classification report, confusion matrix) for one split."""
    scores = model.evaluate(X, Y, verbose=1)
    # Round the model outputs to hard 0/1 labels and flatten them to 1-D.
    predictions = np.round(model.predict(X)).reshape(-1).astype(float)
    y_flat = Y.reshape(-1)

    labels = ['Not stressed', 'Stressed']
    report = classification_report(y_flat, predictions)
    matrix = pd.DataFrame(confusion_matrix(y_flat, predictions), index=labels, columns=labels)

    # scores[0] is the loss; scores[1] is reported as accuracy, i.e. the model is
    # assumed to have been compiled with accuracy as its first metric.
    return (
        f"\n\n{set_name} Set Classification Report"
        + f"\nAccuracy: {scores[1]*100:.2f}%\nLoss: {scores[0]:.3f}\n"
        + report + "\n" + matrix.to_string()
    )


def class_report(model,trainX,trainY,testX,testY):
    """Return a text report for the test set followed by the training set.

    For each split the model is evaluated, its predictions are rounded to hard
    labels, and a sklearn classification report plus a labelled confusion
    matrix are appended.

    Args:
        model: compiled model exposing ``evaluate`` and ``predict``.
        trainX, trainY, testX, testY: arrays for the two splits; the label
            arrays must support ``reshape``.

    Returns:
        str: test-set report, blank lines, train-set report, blank lines.
    """
    scores_str_test = _split_report(model, testX, testY, "Test")
    scores_str_train = _split_report(model, trainX, trainY, "Train")

    return scores_str_test+"\n\n"+scores_str_train+"\n\n"

# scores_str = class_report(model_1,trainX,trainY,testX,testY)

# Grid Search
def BiLSTM_to_optimizeRecall(lr, dropout_last_Dense, dense2_num, lstm1, lstm2, dropout_lstm):
    """Build and compile a BiLSTM model that tracks recall (model factory for grid search).

    Args:
        lr: learning rate for the Adam optimizer.
        dropout_last_Dense, dense2_num, lstm1, lstm2, dropout_lstm: forwarded
            positionally to ``model_biLSTM``.

    Returns:
        The compiled model (binary cross-entropy loss, Adam, Recall metric).
    """
    loss_func = 'binary_crossentropy'
    opt = Adam(learning_rate=lr)
    model = model_biLSTM(dropout_last_Dense, dense2_num,  lstm1, lstm2, dropout_lstm)

    # `metrics` is documented as a list; a bare Metric object is not accepted
    # by every Keras version, so it is wrapped explicitly.
    model.compile(loss=loss_func, optimizer=opt, metrics=[tf.keras.metrics.Recall()])

    return model


def CNN_to_optimizeRecall(filters1, k_s1, filters2, k_s2, dropout_last_Dense, denseLast_num, lr=5e-5):
    """Build and compile a CNN model that tracks recall (model factory for grid search).

    Args:
        filters1, k_s1, filters2, k_s2, dropout_last_Dense, denseLast_num:
            forwarded positionally to ``model_CNN``.
        lr: learning rate for the Adam optimizer. Previously the body read an
            undefined/global ``lr``; it is now an explicit keyword whose
            default matches the rate used in ``CNN_classification``.

    Returns:
        The compiled model (binary cross-entropy loss, Adam, Recall metric).
    """
    loss_func = 'binary_crossentropy'
    opt = Adam(learning_rate=lr)
    model = model_CNN(filters1, k_s1, filters2, k_s2, dropout_last_Dense, denseLast_num)

    # `metrics` is documented as a list; a bare Metric object is not accepted
    # by every Keras version, so it is wrapped explicitly.
    model.compile(loss=loss_func, optimizer=opt, metrics=[tf.keras.metrics.Recall()])

    return model



def ROC_curves_ALL_NNs(model_list,model_type_list,X,y,folder_path,feats,class_num):
    """Draw the ROC curves of several neural networks on one figure and save it.

    Args:
        model_list: trained models exposing ``predict``.
        model_type_list: label per model; "CNN" selects the
            (samples, features, -1) input layout, anything else the
            (-1, 1, features) layout.
        X: 2-D feature array (``shape[0]`` and ``shape[1]`` are read).
        y: true labels passed to ``roc_curve``.
        folder_path: directory receiving ``ROC_<feats>.png``.
        feats: feature-set tag used in the title and file name.
        class_num: accepted for call compatibility; not used in the body.
    """
    plt.figure(figsize=(18,12))
    for net, net_type in zip(model_list, model_type_list):
        if net_type == "CNN":
            target_shape = (X.shape[0], X.shape[1], -1)
        else:
            target_shape = (-1, 1, X.shape[1])

        scores = net.predict(X.reshape(*target_shape)).ravel()
        fpr_keras, tpr_keras, _ = roc_curve(y, scores)
        area = auc(fpr_keras, tpr_keras)
        plt.plot(fpr_keras, tpr_keras, label=f'{net_type} (area = {area:.2f})')

    # chance-level diagonal
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver operating characteristic\n({feats})')
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(folder_path,f'ROC_{feats}.png'), dpi=300)
    plt.show()


def ROC_curves_NNsAlgos(model_list,model_type_list,X_nn,y_nn,X_algo,y_algo,folder_path,feats,class_num):
    """Plot ROC curves for neural networks and classic classifiers; return the best by AUC.

    Every model is scored with continuous outputs (network predictions or
    ``predict_proba`` of the positive class) and its AUC is the area under the
    curve that is actually drawn, so all models are compared on the same
    footing.

    Args:
        model_list: trained models.
        model_type_list: label per model; "CNN", "BiLSTM" and "BiGRU" are
            treated as neural networks, everything else as a classifier
            exposing ``predict_proba``.
        X_nn, y_nn: 2-D features / labels used for the neural networks.
        X_algo, y_algo: features / labels used for the classic classifiers.
        folder_path: directory receiving ``ROC_<feats>.png``.
        feats: feature-set tag used in the title and file name.
        class_num: accepted for call compatibility; not used in the body.

    Returns:
        (best_model, best_model_type); both ``None`` if no model has AUC > 0.
    """
    best_model = None
    best_model_type = None
    max_auc = 0
    plt.figure(figsize=(18,12))
    for m,t in zip(model_list,model_type_list):
        if t in ["CNN", "BiLSTM", "BiGRU"]:
            if t=="CNN":
                X_set = X_nn.reshape(X_nn.shape[0], X_nn.shape[1], -1)
            else:
                X_set = X_nn.reshape(-1, 1, X_nn.shape[1])
            # Bind the labels for every network type (previously only set in
            # the recurrent branch, so a CNN listed first hit an unbound name).
            y_true = y_nn
            y_score = m.predict(X_set).ravel()
        else:
            y_true = y_algo
            # Positive-class probabilities, not hard labels: AUC of thresholded
            # predictions does not match the plotted curve.
            y_score = m.predict_proba(X_algo)[:,1]

        fpr, tpr, _ = roc_curve(y_true, y_score)
        model_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{t} (area = {model_auc:.2f})')

        if model_auc>max_auc:
            max_auc=model_auc
            best_model=m
            best_model_type=t

    # chance-level diagonal
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver operating characteristic\n({feats})')
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(folder_path,f'ROC_{feats}.png'), dpi=300)
    plt.show()

    return best_model,best_model_type


# ROC Curves
def ROC_curve(model,model_type,X,y,folder_path):
    """Plot and save the ROC curve of a single classifier.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        model_type: label used in the legend and in the output file name.
        X, y: evaluation features and true labels.
        folder_path: directory receiving ``<model_type>_ROC.png``.
    """
    # Use the positive-class probabilities for both the curve and the AUC so
    # the number in the legend is the area under the plotted curve (scoring
    # hard predict() labels yields a different, threshold-dependent value).
    y_score = model.predict_proba(X)[:,1]
    roc_auc = roc_auc_score(y, y_score)
    fpr, tpr, _ = roc_curve(y, y_score)

    plt.figure()
    plt.plot(fpr, tpr, label=f'{model_type} (area = {roc_auc:0.2f})')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(folder_path,f'{model_type}_ROC.png'),dpi=300)
    plt.show()

def ROC_curves_ALL(model_list,model_type_list,X,y,folder_path,feats):
    """Plot the ROC curves of several classifiers on one figure and save it.

    Args:
        model_list: fitted classifiers exposing ``predict_proba``.
        model_type_list: legend label per classifier.
        X, y: evaluation features and true labels shared by all classifiers.
        folder_path: directory receiving ``ROC_<feats>.png``.
        feats: feature-set tag used in the title and file name.
    """
    plt.figure(figsize=(18,12))
    for m,t in zip(model_list,model_type_list):
        # Score with positive-class probabilities so the reported AUC is the
        # area under the plotted curve rather than that of hard labels.
        y_score = m.predict_proba(X)[:,1]
        roc_auc = roc_auc_score(y, y_score)
        fpr, tpr, _ = roc_curve(y, y_score)
        plt.plot(fpr, tpr, label=f'{t} (area = {roc_auc:0.2f})')

    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'Receiver operating characteristic\n({feats})')
    plt.legend(loc="lower right")
    plt.savefig(os.path.join(folder_path,f'ROC_{feats}.png'), dpi=300)
    plt.show()


# class_num =1
# feats="allFeats"
# ROC_curves_ALL_NNs([model_1, model_2, model_3],["BiLSTM", "CNN", "BiGRU"],testX_orig,testY_orig,file_store_dir,feats,class_num=class_num)

___
___
## Models 
___

In [4]:

#---------- CLASSIFICATION ALGORITHMS -----------------------------------------------------------------------------------------

def LR_classification(file_store_dir, trainX, trainY, testX, testY, feats):
    """Fit a logistic regression, log its scores, and return the fitted model.

    The score summary, the sklearn classification report on the test set and
    the estimator parameters are printed and written to
    ``<file_store_dir>/LogisticRegression/LR_<scaling>_<feats>.txt``.

    The ``evaluate_model_*`` helpers are defined elsewhere; their results are
    summarised with ``mean``/``std`` (presumably per-fold scores - confirm).
    """
    # NOTE(review): an earlier comment credited 0.001 as the best C, but the
    # value actually used is 5e-5 - confirm which one is intended.
    C=5e-5
    # Features are used unscaled. Disabled alternatives wrapped the estimator
    # in a StandardScaler (scaling="Standard") or MinMaxScaler (scaling="Normal")
    # pipeline.
    scaling="noScaling"
    LR = LogisticRegression(max_iter=1000, solver="lbfgs", C=C)
    LR.fit(trainX, trainY)

    precision_scores = evaluate_model_precision(LR, testX, testY)
    recall_scores = evaluate_model_recall(LR, testX, testY)
    f1_scores = evaluate_model_f1(LR, testX, testY)

    scores = (
        f'\n> Logistic Regression (C: {C})'
        f'\nmean F1: {mean(f1_scores):.6f} | mean Precision: {mean(precision_scores):.6f} | mean Recall: {mean(recall_scores):.6f}'
        f'\nstd F1: {std(f1_scores):.6f} | std Precision: {std(precision_scores):.6f} | std Recall: {std(recall_scores):.6f}'
    )

    report = classification_report(testY, LR.predict(testX))
    log_str = "\n\n".join([scores, report, str(LR.get_params())])
    print(log_str)

    log_dir = os.path.join(file_store_dir,"LogisticRegression")
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    with open(os.path.join(log_dir,f"LR_{scaling}_{feats}.txt"),"w",encoding="utf-8") as log_writer:
        log_writer.write(log_str)

    return LR

def KNN_classification(file_store_dir, trainX, trainY, testX, testY, feats):
    """Fit a k-nearest-neighbours classifier, log its scores, and return it.

    The score summary, the sklearn classification report on the test set and
    the estimator parameters are printed and written to
    ``<file_store_dir>/KNearestNeighbors/KNN_<scaling>_<feats>.txt``.

    The ``evaluate_model_*`` helpers are defined elsewhere; their results are
    summarised with ``mean``/``std`` (presumably per-fold scores - confirm).
    """
    # 5 neighbours gave the best results among the values tried.
    k_n=5
    # Features are used unscaled. Disabled alternatives wrapped the estimator
    # in a StandardScaler (scaling="Standard") or MinMaxScaler (scaling="Normal")
    # pipeline.
    scaling="noScaling"
    KNN = KNeighborsClassifier(n_neighbors=k_n)
    KNN.fit(trainX, trainY)

    precision_scores = evaluate_model_precision(KNN, testX, testY)
    recall_scores = evaluate_model_recall(KNN, testX, testY)
    f1_scores = evaluate_model_f1(KNN, testX, testY)

    scores = (
        f'\n> k Nearest Neighbor (n_neighbors:{k_n}) '
        f'\nmean F1: {mean(f1_scores):.6f} | mean Precision: {mean(precision_scores):.6f} | mean Recall: {mean(recall_scores):.6f}'
        f'\nstd F1: {std(f1_scores):.6f} | std Precision: {std(precision_scores):.6f} | std Recall: {std(recall_scores):.6f}'
    )

    report = classification_report(testY, KNN.predict(testX))
    log_str = "\n\n".join([scores, report, str(KNN.get_params())])
    print(log_str)

    log_dir = os.path.join(file_store_dir,"KNearestNeighbors")
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    with open(os.path.join(log_dir,f"KNN_{scaling}_{feats}.txt"),"w",encoding="utf-8") as log_writer:
        log_writer.write(log_str)

    return KNN

def SVM_classification(file_store_dir, trainX, trainY, testX, testY, feats):
    """Fit an RBF-kernel SVM with probability estimates, log its scores, and return it.

    The score summary, the sklearn classification report on the test set and
    the estimator parameters are printed and written to
    ``<file_store_dir>/SupportVectorMachine/SVM_<scaling>_<feats>.txt``.

    The ``evaluate_model_*`` helpers are defined elsewhere; their results are
    summarised with ``mean``/``std`` (presumably per-fold scores - confirm).
    """
    C=1.0
    kernel="rbf"
    # Features are used unscaled. Disabled alternatives wrapped the estimator
    # in a StandardScaler (scaling="Standard") or MinMaxScaler (scaling="Normal")
    # pipeline.
    scaling="NoScaling"
    # probability=True makes predict_proba available on the fitted model.
    SVM = SVC(kernel=kernel, C=C, probability=True)
    SVM.fit(trainX, trainY)

    precision_scores = evaluate_model_precision(SVM, testX, testY)
    recall_scores = evaluate_model_recall(SVM, testX, testY)
    f1_scores = evaluate_model_f1(SVM, testX, testY)

    scores = (
        f'\n> Support Vector Machine (C: {C}) '
        f'\nmean F1: {mean(f1_scores):.6f} | mean Precision: {mean(precision_scores):.6f} | mean Recall: {mean(recall_scores):.6f}'
        f'\nstd F1: {std(f1_scores):.6f} | std Precision: {std(precision_scores):.6f} | std Recall: {std(recall_scores):.6f}'
    )

    report = classification_report(testY, SVM.predict(testX))
    log_str = "\n\n".join([scores, report, str(SVM.get_params())])
    print(log_str)

    log_dir = os.path.join(file_store_dir,"SupportVectorMachine")
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    with open(os.path.join(log_dir,f"SVM_{scaling}_{feats}.txt"),"w",encoding="utf-8") as log_writer:
        log_writer.write(log_str)

    return SVM

def RF_classification(file_store_dir, trainX, trainY, testX, testY, feats):
    """Fit a StandardScaler + random-forest pipeline, log its scores, and return it.

    The score summary, the sklearn classification report on the test set and
    the pipeline parameters are printed and written to
    ``<file_store_dir>/RandomForest/RF_<scaling>_<feats>.txt``.

    The ``evaluate_model_*`` helpers are defined elsewhere; their results are
    summarised with ``mean``/``std`` (presumably per-fold scores - confirm).
    """
    n_estimators = 100
    # Standard scaling is the active setup. Disabled alternatives: a
    # MinMaxScaler pipeline (scaling="Normal") or the bare forest
    # (scaling="NoScaling").
    scaling="Standard"
    forest = RandomForestClassifier(n_estimators=n_estimators)
    RF = make_pipeline(StandardScaler(), forest)
    RF.fit(trainX, trainY)

    precision_scores = evaluate_model_precision(RF, testX, testY)
    recall_scores = evaluate_model_recall(RF, testX, testY)
    f1_scores = evaluate_model_f1(RF, testX, testY)

    scores = (
        f'\n> Random Forest '
        f'\nmean F1: {mean(f1_scores):.6f} | mean Precision: {mean(precision_scores):.6f} | mean Recall: {mean(recall_scores):.6f}'
        f'\nstd F1: {std(f1_scores):.6f} | std Precision: {std(precision_scores):.6f} | std Recall: {std(recall_scores):.6f}'
    )

    report = classification_report(testY, RF.predict(testX))
    log_str = "\n\n".join([scores, report, str(RF.get_params())])
    print(log_str)

    log_dir = os.path.join(file_store_dir,"RandomForest")
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    with open(os.path.join(log_dir,f"RF_{scaling}_{feats}.txt"),"w",encoding="utf-8") as log_writer:
        log_writer.write(log_str)

    return RF



#---------- ARTIFICIAL NEURAL NETWORKS -----------------------------------------------------------------------------------------


def model_training(model,loss_func,opt,trainX,trainY,batch_size,epochs,testX,testY, csv_logger, metrics):
    """Compile ``model`` and fit it, validating on the test split each epoch.

    Args:
        model: model exposing ``compile`` and ``fit``.
        loss_func, opt, metrics: forwarded to ``compile``.
        trainX, trainY: training data.
        batch_size, epochs: forwarded to ``fit``.
        testX, testY: passed to ``fit`` as ``validation_data``.
        csv_logger: the single callback handed to ``fit``.

    Returns:
        (model, history): the same model object and whatever ``fit`` returned.
    """
    model.compile(optimizer=opt, loss=loss_func, metrics=metrics)

    fit_options = dict(
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(testX, testY),
        verbose=1,
        shuffle=True,
        callbacks=[csv_logger],
    )
    history = model.fit(trainX, trainY, **fit_options)

    return model, history
    
# model 1
def model_biLSTM(dropout2_drp=0.2, dense2_num=8, lstm1_num=32, lstm2_num=16, dropout_lstm=0.2):
    """Build the (uncompiled) BiLSTM network.

    Stack: Dense(32, relu) -> Bidirectional LSTM (sequences returned) ->
    Dense(dense2_num, relu) -> Dropout -> Dense(1, sigmoid).

    Args:
        dropout2_drp: dropout rate before the output layer.
        dense2_num: units of the dense layer after the LSTM.
        lstm1_num: units of the bidirectional LSTM.
        lstm2_num: accepted for call compatibility; no second LSTM is added.
        dropout_lstm: dropout applied inside the LSTM.
    """
    # One l1_l2 regularizer instance is shared by the first Dense and the LSTM.
    shared_reg = regularizers.l1_l2()

    model = Sequential()
    stack = [
        Dense(32, activation="relu", kernel_regularizer=shared_reg),
        Bidirectional(LSTM(lstm1_num, dropout=dropout_lstm, return_sequences=True, kernel_regularizer=shared_reg)),
        Dense(dense2_num, activation="relu"),
        Dropout(dropout2_drp),
        Dense(1, activation="sigmoid"),
    ]
    for layer in stack:
        model.add(layer)

    return model


# model 2
def model_CNN(f1=16, k1_size=2, f2=8, k2_size=2, dropout2_drp=0.15, dense2_num=8):
    """Build the (uncompiled) 1-D CNN.

    Stack: two Conv1D(relu, "same" padding) + MaxPooling1D(2) stages ->
    Flatten -> Dense(dense2_num, relu) -> Dropout -> Dense(1, sigmoid).

    Args:
        f1, k1_size: filters / kernel size of the first convolution.
        f2, k2_size: filters / kernel size of the second convolution.
        dropout2_drp: dropout rate before the output layer.
        dense2_num: units of the dense layer after flattening.
    """
    model = Sequential()
    stack = [
        Conv1D(filters=f1, kernel_size=k1_size, padding="same", activation='relu'),
        MaxPooling1D(pool_size=2),
        Conv1D(filters=f2, kernel_size=k2_size, padding="same", activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(dense2_num, activation='relu'),
        Dropout(dropout2_drp),
        Dense(1, activation="sigmoid"),
    ]
    for layer in stack:
        model.add(layer)

    return model

# model 3
def model_BiGRU():
    """Build the (uncompiled) bidirectional GRU network.

    Stack: Bidirectional CuDNNGRU(32, sequences returned) -> Dropout(0.2) ->
    Dense(64, relu) -> Dropout(0.1) -> Dense(32, relu) -> Dropout(0.2) ->
    Dense(1, sigmoid).
    """
    gru_units = 32

    model = Sequential()
    stack = [
        Bidirectional(CuDNNGRU(gru_units, return_sequences=True)),
        Dropout(0.2),
        Dense(gru_units*2, activation='relu'),
        Dropout(0.1),
        Dense(gru_units, activation='relu'),
        Dropout(0.2),
        Dense(1, activation="sigmoid"),
    ]
    for layer in stack:
        model.add(layer)

    return model


# model 1
def BiLSTM_classification(file_store_dir, trainX_orig, trainY_orig, testX_orig, testY_orig, epochs, feats):
    """Train the BiLSTM network, write its evaluation files, and return it with its history.

    Features are reshaped to (samples, 1, features) and labels to
    (samples, 1, 1). The per-epoch CSV log and an evaluation text file are
    written to ``<file_store_dir>/modelBiLSTM``; the classification report is
    also printed.

    Args:
        file_store_dir: parent directory for the model's output folder.
        trainX_orig, testX_orig: 2-D feature arrays.
        trainY_orig, testY_orig: label arrays (must support ``reshape``).
        epochs: number of training epochs.
        feats: feature-set tag used in the output file names.

    Returns:
        (model, history): the trained model and the object returned by ``fit``.
    """
    model_name="modelBiLSTM"
    model_path=os.path.join(file_store_dir,model_name)
    os.makedirs(model_path, exist_ok=True)

    classes_num=1
    # data set reshape: a single time step per sample
    trainX = trainX_orig.reshape(-1, 1, trainX_orig.shape[1])
    testX = testX_orig.reshape(-1, 1, testX_orig.shape[1])
    trainY = trainY_orig.reshape(-1, 1, classes_num)
    testY = testY_orig.reshape(-1, 1, classes_num)

    # model_biLSTM() ends in a sigmoid layer, so the network outputs
    # probabilities; from_logits=True would declare them to be raw logits.
    loss_fun = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    lr = 5e-5
    opt = Adam(learning_rate=lr)
    batch_size = 24

    csv_logger = CSVLogger(os.path.join(model_path,f'log_{model_name}_{feats}.csv'), separator=',' )
    # evaluate() returns the loss followed by these metrics, in this order.
    metrics= [ "accuracy", tfa.metrics.F1Score(num_classes=2, average="micro", threshold=0.5), tf.keras.metrics.Precision(), tf.keras.metrics.Recall() ]
    model_1 = model_biLSTM()
    print(f"\n\nBiLSTM\n")
    model_1, history_1 = model_training(model_1,loss_fun,opt,trainX,trainY,batch_size,epochs,testX,testY, csv_logger, metrics)

    loss, accuracy, f1, precision, recall = model_1.evaluate(trainX, trainY, verbose=0)
    eval_str_training = f"\nBiLSTM\n\nTraining Set\nLoss: {loss: .6f}\nAccuracy: {accuracy:.6f}\nPrecision: {precision:.6f}\nRecall: {recall:.6f}\nF1: {f1:.6f}\n\n"
    loss, accuracy, f1, precision, recall = model_1.evaluate(testX, testY, verbose=0)
    eval_str_test = f"\n\nTest Set\nLoss: {loss: .6f}\nAccuracy: {accuracy:.6f}\nPrecision: {precision:.6f}\nRecall: {recall:.6f}\nF1: {f1:.6f}\n\n"

    # Build the report once and reuse it for the file and the console; each
    # class_report call re-runs evaluate/predict on both splits.
    report = class_report(model_1,trainX,trainY,testX,testY)
    with open(os.path.join(model_path,f"{model_name}_evaluationScore_{feats}.txt"),"w",encoding="utf-8") as evaluation_file:
        evaluation_file.write(report)
        evaluation_file.write("\n\n"+eval_str_training+eval_str_test)

    print(f"\n{report}" )

    return model_1, history_1

## model 2
def CNN_classification(file_store_dir, trainX_orig, trainY_orig, testX_orig, testY_orig, epochs, feats):
    """Train the CNN, write its evaluation files, and return it with its history.

    Features are reshaped to (samples, features, -1) and labels to
    (samples, 1). The per-epoch CSV log and an evaluation text file are
    written to ``<file_store_dir>/modelCNN``; the classification report is
    also printed.

    Args:
        file_store_dir: parent directory for the model's output folder.
        trainX_orig, testX_orig: 2-D feature arrays.
        trainY_orig, testY_orig: label arrays (must support ``reshape``).
        epochs: number of training epochs.
        feats: feature-set tag used in the output file names.

    Returns:
        (model, history): the trained model and the object returned by ``fit``.
    """
    model_name="modelCNN"
    model_path=os.path.join(file_store_dir,model_name)
    os.makedirs(model_path, exist_ok=True)

    classes_num=1
    # data set reshape: features become the convolution axis
    trainX = trainX_orig.reshape(trainX_orig.shape[0], trainX_orig.shape[1], -1)
    testX = testX_orig.reshape(testX_orig.shape[0], testX_orig.shape[1], -1)
    trainY = trainY_orig.reshape(-1, classes_num)
    testY = testY_orig.reshape(-1, classes_num)

    # model_CNN() ends in a sigmoid layer, so the network outputs
    # probabilities; from_logits=True would declare them to be raw logits.
    loss_fun = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    lr = 5e-5
    opt = Adam(learning_rate=lr)
    batch_size = 24

    csv_logger = CSVLogger(os.path.join(model_path,f'log_{model_name}_{feats}.csv'), separator=',')
    # evaluate() returns the loss followed by these metrics, in this order.
    metrics= [ "accuracy", tfa.metrics.F1Score(num_classes=2, average="micro", threshold=0.5), tf.keras.metrics.Precision(), tf.keras.metrics.Recall() ]

    model_2= model_CNN()
    print(f"\n\nCNN\n")
    model_2,history_2 = model_training(model_2,loss_fun,opt,trainX,trainY,batch_size,epochs,testX,testY, csv_logger, metrics)

    loss, accuracy, f1, precision, recall = model_2.evaluate(trainX, trainY, verbose=0)
    eval_str_training = f"\nCNN\n\nTraining Set\nLoss: {loss: .6f}\nAccuracy: {accuracy:.6f}\nPrecision: {precision:.6f}\nRecall: {recall:.6f}\nF1: {f1:.6f}\n\n"
    loss, accuracy, f1, precision, recall = model_2.evaluate(testX, testY, verbose=0)
    eval_str_test = f"\n\nTest Set\nLoss: {loss: .6f}\nAccuracy: {accuracy:.6f}\nPrecision: {precision:.6f}\nRecall: {recall:.6f}\nF1: {f1:.6f}\n\n"

    # Build the report once and reuse it for the file and the console; each
    # class_report call re-runs evaluate/predict on both splits.
    report = class_report(model_2,trainX,trainY,testX,testY)
    with open(os.path.join(model_path,f"{model_name}_evaluationScore_{feats}.txt"),"w",encoding="utf-8") as evaluation_file:
        evaluation_file.write(report)
        evaluation_file.write("\n\n"+eval_str_training+eval_str_test)

    print(f"\n{report}" )

    return model_2, history_2

## model 3
def BiGRU_classification(file_store_dir, trainX_orig, trainY_orig, testX_orig, testY_orig, epochs, feats):
    """Train the BiGRU network, write its evaluation files, and return it with its history.

    Features are reshaped to (samples, 1, features) and labels to
    (samples, 1, 1). The per-epoch CSV log and an evaluation text file are
    written to ``<file_store_dir>/modelBiGRU``; the classification report is
    also printed.

    Args:
        file_store_dir: parent directory for the model's output folder.
        trainX_orig, testX_orig: 2-D feature arrays.
        trainY_orig, testY_orig: label arrays (must support ``reshape``).
        epochs: number of training epochs.
        feats: feature-set tag used in the output file names.

    Returns:
        (model, history): the trained model and the object returned by ``fit``.
    """
    model_name="modelBiGRU"
    model_path=os.path.join(file_store_dir,model_name)
    os.makedirs(model_path, exist_ok=True)

    classes_num=1
    # data set reshape: a single time step per sample
    trainX = trainX_orig.reshape(-1, 1, trainX_orig.shape[1])
    testX = testX_orig.reshape(-1, 1, testX_orig.shape[1])
    trainY = trainY_orig.reshape(-1, 1, classes_num)
    testY = testY_orig.reshape(-1, 1, classes_num)

    # model_BiGRU() ends in a sigmoid layer, so the network outputs
    # probabilities; from_logits=True would declare them to be raw logits.
    loss_fun = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    lr = 5e-5
    opt = Adam(learning_rate=lr)
    batch_size = 24

    csv_logger = CSVLogger(os.path.join(model_path,f'log_{model_name}_{feats}.csv'), separator=',')
    # evaluate() returns the loss followed by these metrics, in this order.
    metrics= [ "accuracy", tfa.metrics.F1Score(num_classes=2, average="micro", threshold=0.5), tf.keras.metrics.Precision(), tf.keras.metrics.Recall() ]

    model_3= model_BiGRU()
    print(f"\n\nBiGRU\n")
    model_3,history_3 = model_training(model_3,loss_fun,opt,trainX,trainY,batch_size,epochs,testX,testY, csv_logger, metrics)

    loss, accuracy, f1, precision, recall = model_3.evaluate(trainX, trainY, verbose=0)
    eval_str_training = f"\nBiGRU\n\nTraining Set\nLoss: {loss: .6f}\nAccuracy: {accuracy:.6f}\nPrecision: {precision:.6f}\nRecall: {recall:.6f}\nF1: {f1:.6f}\n\n"
    loss, accuracy, f1, precision, recall = model_3.evaluate(testX, testY, verbose=0)
    eval_str_test = f"\n\nTest Set\nLoss: {loss: .6f}\nAccuracy: {accuracy:.6f}\nPrecision: {precision:.6f}\nRecall: {recall:.6f}\nF1: {f1:.6f}\n\n"

    # Build the report once and reuse it for the file and the console; each
    # class_report call re-runs evaluate/predict on both splits.
    report = class_report(model_3,trainX,trainY,testX,testY)
    with open(os.path.join(model_path,f"{model_name}_evaluationScore_{feats}.txt"),"w",encoding="utf-8") as evaluation_file:
        evaluation_file.write(report)
        evaluation_file.write("\n\n"+eval_str_training+eval_str_test)

    print(f"\n{report}" )

    return model_3, history_3


___
### Dataset Modifications/Additions
___

In [5]:
def emb_feats_df(df,embs,dim):
    """Expand per-row embeddings into ``dim`` columns and join ``df`` onto them.

    Args:
        df: DataFrame with the remaining features; joined on the index.
        embs: one embedding per row, either a sequence of numbers or its
            string form (e.g. "[0.1, 0.2]" as read back from a CSV file).
        dim: embedding length; columns are named "0" ... str(dim - 1).

    Returns:
        DataFrame with the embedding columns first, followed by ``df``'s columns.

    Raises:
        ValueError / SyntaxError: if a string entry is not a plain Python literal.
    """
    import ast  # local import keeps the block self-contained

    # Strings are parsed with literal_eval instead of eval: it accepts list
    # literals but never executes code embedded in the data file. Entries that
    # are already sequences (list, tuple, array) are used as they are.
    parsed = [ast.literal_eval(enc) if isinstance(enc, str) else enc for enc in embs]

    text_embed_names = [str(i) for i in range(dim)]
    embs_df = pd.DataFrame(parsed, columns=text_embed_names)

    return embs_df.join(df)

def token_WVs(text_list, w2v_model):
    """Map every token of ``text_list`` to its word vector.

    Tokens missing from ``w2v_model`` get a zero vector shaped like the
    vector of "the" (which must therefore exist in the model). An "<UNK>"
    entry holding a zero vector is always added last.

    Args:
        text_list: iterable of token lists.
        w2v_model: mapping supporting ``in`` and ``[]`` lookups.

    Returns:
        (token_dict, count_unknown): the token -> vector dict and the number
        of unknown token occurrences (repeated tokens are counted each time).
    """
    zero_vector = np.zeros(w2v_model["the"].shape)
    token_dict = {}
    count_unknown = 0

    for tokens in text_list:
        for tok in tokens:
            if tok in w2v_model:
                token_dict[tok] = w2v_model[tok]
            else:
                # all unknown tokens share the same zero-vector object
                token_dict[tok] = zero_vector
                count_unknown += 1

    token_dict["<UNK>"] = np.zeros(w2v_model["the"].shape)

    return token_dict, count_unknown


def text_WVs(text_list, token_vectors):
    """Return one vector per text: the mean of its token vectors.

    Tokens absent from ``token_vectors`` use the "<UNK>" vector; a text with
    no tokens is represented by the "<UNK>" vector itself.

    Args:
        text_list: iterable of token lists.
        token_vectors: token -> vector mapping containing "<UNK>".

    Returns:
        list with one vector per entry of ``text_list``.
    """
    sentence_vects = []
    for s_tokens in text_list:
        if len(s_tokens) > 0:
            vectors = []
            for token in s_tokens:
                if token in token_vectors:
                    vectors.append(token_vectors[token])
                else:
                    vectors.append(token_vectors["<UNK>"])
            sentence_vects.append(np.array(vectors).mean(axis=0))
        else:
            sentence_vects.append(token_vectors["<UNK>"])

    return sentence_vects


# Directory that holds the pre-trained GoogleNews word2vec vectors.
Word2Vec_path = "/kaggle/working/Word2Vec/"
if not os.path.exists(Word2Vec_path):
    os.makedirs(Word2Vec_path)

# Path of the compressed archive; dropping the last three characters (".gz")
# gives the path of the extracted .bin file.
Word2Vec_news = os.path.join(Word2Vec_path,"GoogleNews-vectors-negative300.bin.gz")

# Download and unpack only when the extracted file is not there yet.
# The lines starting with % and ! are IPython magics / shell commands, so this
# cell only runs inside a notebook.
# NOTE(review): the gdown argument is presumably a Google Drive file id - confirm.
if not os.path.exists(Word2Vec_news[:-3]):
    %cd {Word2Vec_path}
    !gdown 0B7XkCwpI5KDYNlNUTTlSS21pQmM
    !gunzip GoogleNews-vectors-negative300.bin.gz

# Go back to the working directory and record the path of the extracted vectors.
%cd /kaggle/working
Word2Vec_news_emb_file = os.path.join(Word2Vec_path,"GoogleNews-vectors-negative300.bin")



In [6]:
def create_sentence_embeddings():
    """Add BERT and Word2Vec sentence embeddings to the SED datasets and save them.

    BERT sentence embeddings are computed from the ``text`` column of the
    original Dreaddit CSVs; Word2Vec sentence vectors are computed from the
    tokens of the stress-emotion-depression CSVs. Both are stored as list
    columns ("BERT", "Word2Vec") and the frames are written to
    ``file_store_dir``. All paths are hard-coded Kaggle locations.
    """
    # gpu_clean()
    # load original dataset
    dataset_train = "/kaggle/input/stress-analysis-in-social-media/dreaddit-train.csv"
    dataset_test = "/kaggle/input/stress-analysis-in-social-media/dreaddit-test.csv"
    train_df = pd.read_csv(dataset_train)
    test_df = pd.read_csv(dataset_test)

    # create embeddings
    train_sent_embs,test_sent_embs = BERT_sentence_emb(train_df["text"],test_df["text"], bert_tok_name='bert-base-uncased')
    # NOTE(review): the result of get_sentence_embs is not used below; the call
    # is kept in case the helper has side effects - confirm and drop if not.
    get_sentence_embs(train_sent_embs, test_sent_embs)

    # load stress-emotion-depression dataset
    dataset_train_SED_text = "/kaggle/input/sed-text-dataset/stressTrainEmDepr.csv"
    dataset_test_SED_text = "/kaggle/input/sed-text-dataset/stressTestEmDepr.csv"
    train_df_SED_text = pd.read_csv(dataset_train_SED_text)
    test_df_SED_text = pd.read_csv(dataset_test_SED_text)

    google_news_vectors = KeyedVectors.load_word2vec_format(Word2Vec_news_emb_file, binary=True)

    def word2vec_sentence_vectors(df, set_name):
        """Tokenize ``df`` in place and return one mean Word2Vec vector per row."""
        add_tokens_col(df)
        text_list = list(df["tokens"].values)
        token_vectors, unk_words = token_WVs(text_list, google_news_vectors)
        # unk_words counts unknown token occurrences, so the percentage is
        # taken over all token occurrences (not over the vocabulary size,
        # which could yield values above 100%).
        total_tokens = sum(len(tokens) for tokens in text_list)
        unk_pct = unk_words / total_tokens * 100 if total_tokens else 0.0
        print(f"\n{set_name} Set unknown Words: {unk_words} ({unk_pct:.2f}%)")
        return text_WVs(text_list, token_vectors)

    sentence_vectors_train = word2vec_sentence_vectors(train_df_SED_text, "Training")
    sentence_vectors_test = word2vec_sentence_vectors(test_df_SED_text, "Test")

    # Store every embedding as a plain Python list inside an object column.
    bert_train = pd.Series([vect.tolist() for vect in train_sent_embs])
    bert_test = pd.Series([vect.tolist() for vect in test_sent_embs])
    Word2Vec_train = pd.Series([vect.tolist() for vect in sentence_vectors_train])
    Word2Vec_test = pd.Series([vect.tolist() for vect in sentence_vectors_test])

    train_df_SED_text["BERT"] = bert_train.values
    test_df_SED_text["BERT"] = bert_test.values
    train_df_SED_text["Word2Vec"] = Word2Vec_train.values
    test_df_SED_text["Word2Vec"] = Word2Vec_test.values

    # save dataframes
    train_df_SED_text.to_csv(os.path.join(file_store_dir,"stressTrainEmDep_BERTnWord2Vec_text.csv"),  index = False, header=True)
    test_df_SED_text.to_csv(os.path.join(file_store_dir,"stressTestEmDep_BERTnWord2Vec_text.csv"),  index = False, header=True)

    # NOTE(review): the value of this expression is discarded inside a
    # function, so presumably no links are rendered - confirm.
    FileLinks(".")

___
___

# Experiments Wrapped Up !
___
## **Stress Detection**

#### Application of text classification tasks based on the paper **["Dreaddit: A Reddit Dataset for Stress Analysis in Social Media"](https://arxiv.org/pdf/1911.00133v1.pdf)**

___


In [None]:
def run_all(file_store_dir, trainX, trainY, testX, testY, feats, text_emb_type="BERT", epochs=12):
    """Train and evaluate every classifier and neural network on one feature set.

    Runs the four classic classifiers (LR, KNN, SVM, RF) and the three
    networks (BiLSTM, CNN, BiGRU), plots learning and ROC curves, zips the
    output directory, and returns the model with the highest ROC AUC.

    Args:
        file_store_dir: root output directory for this feature set.
        trainX, trainY, testX, testY: objects exposing ``.copy()`` and
            ``.values`` (presumably pandas DataFrames/Series - confirm).
        feats: feature-set tag used in log/plot file names.
        text_emb_type: only used in the name of the results zip file.
        epochs: number of training epochs for the neural networks.

    Returns:
        (best_model, "<best_model_type>_<feats>").
    """
    print(f"\n\n\n--> {feats} <--\n")
    # create log file directories
    file_store_dir_classAlgos = os.path.join(file_store_dir,"Algorithms")
    file_store_dir_classNNs = os.path.join(file_store_dir,"NNs")
    for d in [file_store_dir_classAlgos, file_store_dir_classNNs]:
        if not os.path.exists(d):
            os.makedirs(d)
    
    # Directories that receive the learning-curve plots of each network.
    # NOTE(review): these live directly under file_store_dir, while the
    # *_classification functions write their logs under file_store_dir_classNNs
    # - confirm the split is intended.
    model_names=["modelBiLSTM","modelCNN","modelBiGRU"]
    model_paths = []
    for n in model_names:
        model_path=os.path.join(file_store_dir,f"{n}")
        model_paths.append(model_path)
        if not os.path.exists(model_path):
            os.makedirs(model_path)


    # Classifiers
    print("\n\nClassification Algorithms")
    # Algorithms
    # The classic classifiers work on copies of the inputs.
    trainX_algo, trainY_algo, testX_algo, testY_algo = trainX.copy(), trainY.copy(), testX.copy(), testY.copy()
    LR = LR_classification(file_store_dir_classAlgos, trainX_algo, trainY_algo, testX_algo, testY_algo, feats)
    KNN = KNN_classification(file_store_dir_classAlgos, trainX_algo, trainY_algo, testX_algo, testY_algo, feats)
    SVM = SVM_classification(file_store_dir_classAlgos, trainX_algo, trainY_algo, testX_algo, testY_algo, feats)
    RF = RF_classification(file_store_dir_classAlgos, trainX_algo, trainY_algo, testX_algo, testY_algo, feats)

    print("\n\nArtificial NNs")
    # Artificial NNs
    # The networks receive the underlying arrays (labels cast to float); the
    # arrays are re-extracted before each network.
    trainX_orig, trainY_orig, testX_orig, testY_orig = trainX.values, trainY.values.astype(float), testX.values, testY.values.astype(float)
    modelBiLSTM, historyBiLSTM = BiLSTM_classification(file_store_dir_classNNs, trainX_orig, trainY_orig, testX_orig, testY_orig, epochs, feats)
    plot_history(historyBiLSTM, model_paths[0], feats, "BiLSTM")
    
    trainX_orig, trainY_orig, testX_orig, testY_orig = trainX.values, trainY.values.astype(float), testX.values, testY.values.astype(float)
    modelCNN, historyCNN = CNN_classification(file_store_dir_classNNs, trainX_orig, trainY_orig, testX_orig, testY_orig, epochs, feats)
    plot_history(historyCNN, model_paths[1], feats, "CNN")
    
    trainX_orig, trainY_orig, testX_orig, testY_orig = trainX.values, trainY.values.astype(float), testX.values, testY.values.astype(float)
    modelBiGRU, historyBiGRU = BiGRU_classification(file_store_dir_classNNs, trainX_orig, trainY_orig, testX_orig, testY_orig, epochs, feats)
    plot_history(historyBiGRU, model_paths[2], feats, "BiGRU")

    # Evaluation
    # One ROC figure per model family, then a combined figure that also picks
    # the best model.
    class_num=1
    ROC_curves_ALL([LR,KNN,SVM,RF],["LogReg","kNearestN","SVM","RandomForest"],testX_algo,testY_algo,file_store_dir_classAlgos,feats)
    ROC_curves_ALL_NNs([modelBiLSTM,modelCNN,modelBiGRU],["BiLSTM","CNN","BiGRU"],testX_orig,testY_orig,file_store_dir_classNNs,feats,class_num=class_num)

    model_list = [ LR, KNN, SVM, RF, modelBiLSTM, modelCNN, modelBiGRU ]
    # NOTE(review): "KNearestNeihbor" looks like a typo of "KNearestNeighbor";
    # it is a runtime label (legend / returned model type), so it is left as is.
    model_type_list = [ "LogisticRegression", "KNearestNeihbor", "SupportVectorMachine", "RandomForest", "BiLSTM", "CNN", "BiGRU" ]
    best_model, best_model_type = ROC_curves_NNsAlgos(model_list,model_type_list,testX_orig,testY_orig,testX_algo,testY_algo,file_store_dir,feats,class_num)


    # Save all results to a .zip file
    # The zip command is an IPython shell escape, so it only runs in a notebook;
    # the archive is created inside the directory being zipped.
    zipname=os.path.join(file_store_dir,f"{text_emb_type}_{feats}_Results.zip")
    !zip -r {zipname} {file_store_dir}
    
    return best_model, f"{best_model_type}_{feats}"


def run_all_feats(file_store_dir,train_df, test_df, text_emb_type="BERT"):
    """Run `run_all` once per feature subset and collect each run's winner.

    Every experiment below splits `train_df` / `test_df` with its own
    `df_*_split` helper, then hands the split to `run_all` together with a
    per-experiment sub-directory `<file_store_dir>/<feats>`.

    Parameters
    ----------
    file_store_dir : str
        Root directory; one sub-directory path per feature subset is derived
        from it and passed to `run_all`.
    train_df, test_df
        Train / test frames forwarded unchanged to the split helpers.
    text_emb_type : str
        "Word2Vec" selects `Word2Vec_EMB_OUT` as the embedding width; any
        other value (default "BERT") selects `BERT_EMB_OUT`. The value is also
        forwarded to `run_all`.

    Returns
    -------
    (best_models, best_model_types)
        Two parallel lists holding, in experiment order, the two values
        returned by each `run_all` call.
    """
    emb_dim = Word2Vec_EMB_OUT if text_emb_type == "Word2Vec" else BERT_EMB_OUT

    # Ordered experiment table: (label, split thunk, extra keyword args for run_all).
    # The split calls are wrapped in lambdas so each helper is only resolved and
    # executed when its experiment is reached, exactly one split at a time.
    experiments = [
        # Only text embeddings: the baseline used to judge what the features add
        ("onlyTextEmbeds", lambda: df_textEmbedsFeats_split(train_df, test_df, emb_dim), {}),
        # All features selected
        ("allFeats", lambda: df_allFeats_split(train_df, test_df), {"epochs": 20}),
        # Linguistic Inquiry and Word Count features
        ("LIWC", lambda: df_liwcFeats_split(train_df, test_df, emb_dim), {}),
        # Dictionary of Affect in Language features
        ("DAL", lambda: df_dalFeats_split(train_df, test_df, emb_dim), {}),
        # Lexical features (LIWC + DAL + sentiment)
        ("Lex", lambda: df_lexFeats_split(train_df, test_df, emb_dim), {}),
        # Emotion / depression features on top of the lexical ones
        ("EmDep", lambda: df_emDepFeats_split(train_df, test_df, emb_dim), {}),

        # ---------- Feature-contribution experiments ----------
        # Only the features are selected
        ("onlyFeats", lambda: df_onlyFeats_split(train_df, test_df), {"epochs": 20}),
        # LIWC features via the "only" split helper
        ("onlyLIWC", lambda: df_onlyliwcFeats_split(train_df, test_df, emb_dim), {}),
        # DAL features via the "only" split helper (disabled by the author)
        # ("onlyDAL", lambda: df_onlydalFeats_split(train_df, test_df, emb_dim), {}),
        # Lexical features via the "only" split helper
        ("onlyLex", lambda: df_onlylexFeats_split(train_df, test_df, emb_dim), {}),
        # Emotion / depression features via the "only" split helper
        ("onlyEmDep", lambda: df_onlyemDepFeats_split(train_df, test_df, emb_dim), {}),
    ]

    best_models = []
    best_model_types = []
    for feats, make_split, extra_run_args in experiments:
        trainX, trainY, testX, testY = make_split()
        feats_dir = os.path.join(file_store_dir, feats)
        winner, winner_label = run_all(
            feats_dir, trainX, trainY, testX, testY, feats, text_emb_type, **extra_run_args
        )
        best_models.append(winner)
        best_model_types.append(winner_label)

    return best_models, best_model_types


In [None]:
# Prerequisite: run the import and function-definition cells above first.
# Output directories: a common root plus one sub-directory per embedding type.
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
file_store_dir = "/kaggle/working/results"
os.makedirs(file_store_dir, exist_ok=True)

file_store_dir_BERT = "/kaggle/working/results/BERT"
os.makedirs(file_store_dir_BERT, exist_ok=True)

file_store_dir_Word2Vec = "/kaggle/working/results/Word2Vec"
os.makedirs(file_store_dir_Word2Vec, exist_ok=True)
    
#---- Data ---- 
# Download the saved datasets
dataset_train_embs = "/kaggle/working/stressTrainEmDep_BERTnWord2Vec_text.csv"
dataset_test_embs = "/kaggle/working/stressTestEmDep_BERTnWord2Vec_text.csv"

if not os.path.exists(dataset_train_embs):
    !gdown 1IcrTBMXnXkcrBo543NDjGH7UT3QLYNa-
    !gdown 1mtp-OP249SJsj-LYTEFbHpArcRwpOqte


# BERT
train_df_embs = pd.read_csv(dataset_train_embs)
test_df_embs = pd.read_csv(dataset_test_embs)
BERT_feats_df_train = emb_feats_df(train_df_embs,train_df_embs["BERT"],BERT_EMB_OUT)
BERT_feats_df_test = emb_feats_df(test_df_embs, test_df_embs["BERT"],BERT_EMB_OUT)

# Word2Vec
train_df_embs = pd.read_csv(dataset_train_embs)
test_df_embs = pd.read_csv(dataset_test_embs)
Word2Vec_feats_df_train = emb_feats_df(train_df_embs,train_df_embs["Word2Vec"],Word2Vec_EMB_OUT)
Word2Vec_feats_df_test = emb_feats_df(train_df_embs, test_df_embs["Word2Vec"],Word2Vec_EMB_OUT)

# BERT run (currently disabled): uncomment the lines below to repeat the
# experiments on the BERT sentence embeddings.
# NOTE(review): the ROC_curves_NNsAlgos calls use testX_orig / testY_orig /
# testX_algo / testY_algo / class_num, which this cell never defines — confirm
# they exist at notebook level before re-enabling them.
# print("\n\n\nBERT Sentence Embeddings\n")
# best_models_BERT,best_model_types_BERT = run_all_feats(file_store_dir_BERT, BERT_feats_df_train, BERT_feats_df_test, "BERT")
# ROC_curves_NNsAlgos(best_models_BERT,best_model_types_BERT,testX_orig,testY_orig,testX_algo,testY_algo,file_store_dir_BERT,"BERT",class_num)

# Word2Vec run: every feature-subset experiment on the Word2Vec frames
print("\n\n\nWord2Vec Google News Sentence Embeddings\n")
best_models_Word2Vec, best_model_types_Word2Vec = run_all_feats(
    file_store_dir_Word2Vec,
    train_df=Word2Vec_feats_df_train,
    test_df=Word2Vec_feats_df_test,
    text_emb_type="Word2Vec",
)
# ROC_curves_NNsAlgos(best_models_Word2Vec,best_model_types_Word2Vec,testX_orig,testY_orig,testX_algo,testY_algo,file_store_dir_Word2Vec,"Word2Vec",class_num)


# Show download links for the files under the current directory
FileLinks(".")


In [None]:
# Save all results to a .zip file
zipname=os.path.join("/kaggle/working",f"Results_final.zip")
!zip -r {zipname} /kaggle/working
    
# Display all file links to download
FileLinks(".")   