In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
PATH = '/content/drive/MyDrive/project/'

In [None]:
import pandas as pd

# the archive holds a single CSV; read_csv infers the zip compression from the extension
data = pd.read_csv(PATH + 'suicide-watch.zip')

In [None]:
data['class'].value_counts()

suicide        116037
non-suicide    116037
Name: class, dtype: int64

In [None]:
data.text

0         wholesome Mom Moment step mother amazon prime ...
1         Omg guy , say yes ! ask weird - ass weeb , say...
2         point . reason go . hate . evident way people ...
3         official school finally block Reddit . study h...
4         guess . hi , new sub . not " verge " kill , ge...
                                ...                        
116032    final Exit networkdoe know legitimate ? unders...
116033         watch Batzorig Vaanchig , awesome memify pls
116034    rational realization prefer not exist . honest...
116035    wait push edgei not right . life slowly near c...
116036    , Life give Lemons - Vine Remix [ https://www....
Name: text, Length: 116037, dtype: object

In [None]:
import spacy

The first step after data gathering is data preprocessing. I wrote the following function to preprocess the text for the classifiers.

In [None]:
import re

nlp = spacy.load('en_core_web_sm')
# remove "not" from the stopwords as it might be important for the task at hand.
# discard() instead of remove(): remove() raises KeyError when this cell is re-run
# and "not" has already been taken out of the set.
nlp.Defaults.stop_words.discard("not")
nlp.vocab["not"].is_stop = False

from nltk.corpus import stopwords

def clean(text):
    """Normalise one raw post for the classifiers.

    Strips URLs, digits and non-word characters, lowercases the text, then
    runs it through the module-level spaCy pipeline ``nlp`` and keeps the
    lemma of every token that is neither a stop word nor punctuation.

    Returns the kept lemmas joined by single spaces.
    """
    without_links = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    without_digits = re.sub(r'\d+', '', without_links)
    words_only = re.sub(r'\W+', ' ', without_digits)

    lemmas = []
    for token in nlp(words_only.lower()):
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

This code removes links, numbers, and punctuation, lowercases the text, lemmatizes it, and removes stopwords except the word "not", which is useful in this scenario. After this step, the labels have to be encoded appropriately so that the model can read them.

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['class'] = le.fit_transform(data['class'])

In [None]:
# cut the data to meet deadlines: keep only the first quarter of the rows
X = data['text'][:len(data)//4]
y = data['class'][:len(data)//4]

I have also taken only a part of the dataset, in order to meet the deadline,
since training on the whole dataset takes a lot of time even on GPU, and with
Colab’s usage limits the task would not have been carried out successfully. The
sample used is 58k posts with preserved class balance. The text is then cleaned:

In [None]:
# cleaning
from tqdm import tqdm
tqdm.pandas()
text_cleaned = X.progress_apply(clean)

100%|██████████| 58018/58018 [26:28<00:00, 36.52it/s]


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(text_cleaned, y, test_size=0.2, random_state=42)

After that, Naive Bayes is applied and evaluated on generated Tf-Idf vector
representations.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tf-idf",TfidfVectorizer()),
    ("clf",MultinomialNB())
])

model_0.fit(X = X_train, y = y_train)

In [None]:
model_0.score(X = X_test, y = y_test)

0.8771113409169252

Then, there is a vocabulary class I previously wrote that generates a tok2idx
dictionary. It is necessary for creating IDs for each token for their retrieval in the embedding layers later on.

In [None]:
import re
from collections import Counter
import numpy as np
from tqdm import tqdm
import pickle
import spacy, re


MAX_LEN = 55

class Vocab:
    """Token-to-index vocabulary with fixed-length numericalisation.

    Index 0 is reserved for '<PAD>' and index 1 for '<UNK>'; every other
    token gets the next free index in order of first appearance. Texts are
    expected to be cleaned already: tokenisation is a plain whitespace split.
    The vocabulary is cached as a pickle under the module-level ``PATH``.
    """

    def __init__(self, max_len):
        """Create an empty vocabulary.

        max_len: number of token ids kept per sentence by nums_from_text
                 (longer sentences are truncated, shorter ones padded).
        """
        self.tok2idx = {'<PAD>': 0, '<UNK>' : 1}
        self.max_len = max_len
        # token frequencies, filled by build_vocab when the vocabulary is built from a corpus
        self.freqs = Counter()

    def __len__(self):
        """Number of entries in the vocabulary, including '<PAD>' and '<UNK>'."""
        return len(self.tok2idx)

    def tokenize(self, text):
        """Split ``text`` on whitespace and return the list of tokens."""
        return text.split()

    def build_vocab(self, text_list=None):
        """Load the cached vocabulary, or build it from ``text_list`` and cache it.

        If the pickle written by save_vocab can be read, it is used as-is and
        ``text_list`` is ignored. NOTE(review): the cache is not checked against
        the corpus, so a stale file silently wins - delete it to force a rebuild.

        text_list: iterable of cleaned strings; required when no cache exists.
                   Non-string entries (e.g. NaN for empty texts) are skipped.
        """
        try:
            self.load_vocab()
            return
        except (OSError, EOFError, pickle.UnpicklingError):
            # no usable cache: fall through and build from the corpus.
            # Anything else (e.g. PATH not defined) is a real error and propagates.
            pass

        self.freqs = Counter()
        for sent in tqdm(text_list):
            if not isinstance(sent, str):
                continue
            # tokenize as we go and count every token
            self.freqs.update(self.tokenize(sent))

        for token in self.freqs:
            # setdefault keeps the reserved ids of '<PAD>'/'<UNK>' even if those
            # literal strings occur in the corpus; new tokens get the next free id (2, 3, ...)
            self.tok2idx.setdefault(token, len(self.tok2idx))

        self.save_vocab()

    def nums_from_text(self, text_list):
        """Turn texts into a (len(text_list), max_len) int64 array of token ids.

        Each row holds the ids of the first ``max_len`` tokens of one text,
        right-padded with 0 ('<PAD>'); tokens missing from the vocabulary map
        to the '<UNK>' id. Non-string entries (NaN/None) produce an all-padding row.
        """
        unk_idx = self.tok2idx['<UNK>']

        # zeros, not np.empty: rows of skipped entries must be padding, not uninitialised memory
        text_array = np.zeros((len(text_list), self.max_len), dtype=np.int64)

        for array_idx, sent in tqdm(enumerate(text_list), total=len(text_list)):
            # skip missing sentences; an isinstance check also catches NaN objects
            # other than the np.nan singleton, and None
            if not isinstance(sent, str):
                continue

            # truncate if the sentence exceeds max_len
            tokens = self.tokenize(sent)[:self.max_len]
            if tokens:
                text_array[array_idx, :len(tokens)] = [self.tok2idx.get(token, unk_idx) for token in tokens]

        return text_array

    def save_vocab(self):
        """Pickle tok2idx to PATH + 'tok2idx1.pkl'."""
        with open(PATH + 'tok2idx1.pkl', 'wb') as f:
            pickle.dump(self.tok2idx, f)

    def load_vocab(self):
        """Replace tok2idx with the mapping pickled at PATH + 'tok2idx1.pkl'."""
        # NOTE: pickle.load executes arbitrary code from the file - only load caches you wrote yourself
        with open(PATH + 'tok2idx1.pkl', 'rb') as f:
            self.tok2idx = pickle.load(f)

In [None]:
# NOTE(review): VOCAB is only created in the following cell, so on a fresh kernel
# this cell has to be run after it.
print(VOCAB.tok2idx)

{'<PAD>': 0, '<UNK>': 1}


In [None]:
# # X = pd.concat([X_train, X_test], axis=0)
VOCAB = Vocab(MAX_LEN)
VOCAB.build_vocab(text_cleaned)

In [None]:
text_arr = VOCAB.nums_from_text(text_cleaned)

58018it [00:01, 34122.21it/s]


In [None]:
# split the numericalised texts together with `y`, the label slice taken alongside `X`:
# data['class'] covers the whole, uncut dataset and so has more rows than text_arr
X_train, X_test, y_train, y_test = train_test_split(text_arr, y, test_size=0.2, random_state=42)

In [None]:
# hold out the last 20% of the training split for validation (first 80% stays for training)
X_train_, X_val_ = X_train[:int(len(X_train)*0.8)], X_train[int(len(X_train)*0.8):]
y_train_, y_val_ = y_train[:int(len(y_train)*0.8)], y_train[int(len(X_train)*0.8):]

In [None]:
X_train

array([[  529,    65,    15, ...,     0,     0,     0],
       [  304, 22702,    33, ...,   304,   101,   959],
       [  238,  1439,   334, ...,     0,     0,     0],
       ...,
       [  564,   265,   266, ...,     0,     0,     0],
       [  140,   615,   205, ...,     0,     0,     0],
       [  478,    99,    55, ...,     0,     0,     0]])

In [None]:
NUM_CLASSES = 2

Then the dataset is prepared to be fed into a Keras LSTM with the help of
TensorFlow `tf.data` batch loaders.

In [None]:
import tensorflow.keras as tfk

# one-hot encode the integer labels for the softmax / categorical_crossentropy models;
# y_train_ and y_val_ keep the integer labels.
# NOTE(review): y_test is overwritten with its own encoding, so re-running this cell
# would encode already one-hot labels a second time.
y_train = tfk.utils.to_categorical(y_train_)
y_val = tfk.utils.to_categorical(y_val_)
y_test = tfk.utils.to_categorical(y_test)

In [None]:
import tensorflow as tf

tf_train = tf.data.Dataset.from_tensor_slices((X_train_, y_train))
tf_val = tf.data.Dataset.from_tensor_slices((X_val_, y_val))
tf_test = tf.data.Dataset.from_tensor_slices((X_test, y_test))

tf_train = tf_train.batch(64).prefetch(tf.data.AUTOTUNE)
tf_val = tf_val.batch(64).prefetch(tf.data.AUTOTUNE)
tf_test = tf_test.batch(64).prefetch(tf.data.AUTOTUNE)

After that, I wrote a network constructor class to build networks more easily
given the parameter grid:

In [None]:
import tensorflow.keras as tfk
from keras.regularizers import l2 as f_l2

# CNN constructor
def build_conv(num_layers=1, emb_dim=128, filters=32, kernel_size=5, drop_rate=0.0, l2=None, dense_neurons=None, add_batch_norm=False, summary=False):
    """Build and compile a 1D-convolutional text classifier.

    Architecture: Embedding (vocabulary size from the global VOCAB, sequence
    length MAX_LEN) -> ``num_layers`` x [Conv1D(relu, 'same' padding) -> Dropout
    -> optional BatchNormalization] -> GlobalAveragePooling1D -> optional
    Dense(dense_neurons) + Dropout -> Dense(NUM_CLASSES, softmax).

    l2 is handed to the Keras L2 regularizer of every Conv1D kernel and of the
    optional Dense kernel.
    NOTE(review): with the default l2=None the regularizer is still created;
    Keras presumably falls back to its own default factor rather than
    disabling regularisation - confirm.

    The model is compiled with Adam, categorical cross-entropy and accuracy.
    If ``summary`` is true the layer summary is printed. Returns the model.
    """
    layers = tfk.layers
    net = tfk.Sequential()

    net.add(layers.Embedding(input_dim=len(VOCAB.tok2idx), output_dim=emb_dim, input_length=MAX_LEN))

    for _ in range(num_layers):
        conv = layers.Conv1D(
            filters,
            kernel_size=kernel_size,
            padding='same',
            activation='relu',
            kernel_regularizer=f_l2(l2),
        )
        net.add(conv)
        net.add(layers.Dropout(drop_rate))
        if add_batch_norm:
            net.add(layers.BatchNormalization())

    # collapse the time axis before the classification head
    net.add(layers.GlobalAveragePooling1D())

    if dense_neurons is not None:
        net.add(layers.Dense(dense_neurons, kernel_regularizer=f_l2(l2)))
        net.add(layers.Dropout(drop_rate))

    net.add(layers.Dense(NUM_CLASSES, activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])

    if summary:
        net.summary()

    return net

In [None]:
import tensorflow.keras as tfk
from keras.regularizers import l2 as f_l2

def build_lstm(num_layers=1, emb_dim=256, neurons=64, bidir=False, drop_rate=0.0, l2=None, summary=False):
    """Build and compile a (optionally bidirectional) stacked-LSTM classifier.

    Architecture: Embedding (vocabulary size from the global VOCAB, sequence
    length MAX_LEN) -> ``num_layers`` LSTM layers of ``neurons`` units, each
    wrapped in Bidirectional when ``bidir`` is true -> Dense(NUM_CLASSES, softmax).

    A single Keras L2 regularizer built from ``l2`` is shared by all LSTM
    layers as their recurrent regularizer.
    NOTE(review): with l2=None Keras presumably uses its default factor
    instead of switching regularisation off - confirm.

    The model is compiled with Adam, categorical cross-entropy and accuracy.
    If ``summary`` is true the layer summary is printed. Returns the model.
    """
    recurrent_reg = f_l2(l2)

    net = tfk.Sequential()
    net.add(tfk.layers.Embedding(input_dim=len(VOCAB.tok2idx), output_dim=int(emb_dim), input_length=MAX_LEN))

    for depth in range(num_layers):
        # every LSTM except the last must emit the full sequence for the next one
        is_last = depth == num_layers - 1
        lstm = tfk.layers.LSTM(
            neurons,
            dropout=drop_rate,
            return_sequences=not is_last,
            recurrent_regularizer=recurrent_reg,
        )
        net.add(tfk.layers.Bidirectional(lstm) if bidir else lstm)

    net.add(tfk.layers.Dense(NUM_CLASSES, activation='softmax'))

    net.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['accuracy'])

    if summary:
        net.summary()

    return net

Then there is also a general build function, which I used in another project.
It helps in building different architectures from multiple constructors.

In [None]:
def build(**kwargs):
    """Dispatch to the constructor selected by the 'model_type' keyword.

    'LSTM' -> build_lstm, 'CONV' -> build_conv; every other keyword is passed
    through to the chosen constructor. Returns None for any other model type.
    Raises KeyError when 'model_type' is not given.
    """
    model_type = kwargs.pop('model_type')

    if model_type == 'LSTM':
        return build_lstm(**kwargs)
    if model_type == 'CONV':
        return build_conv(**kwargs)
    return None

After that, there is a random-search class I previously wrote for hyperparameter
optimization over a grid. It samples each hyperparameter from the grid uniformly
at random.

In [None]:
import random
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from itertools import product
import pickle
random.seed(50)

MAX_EVALS = 5

class KerasRandomHpOpt:
    """Random search over one or more hyper-parameter grids for Keras models.

    ``params`` is a list of grids; each grid maps a hyper-parameter name to the
    list of values it may take. For every grid, ``max_evals`` configurations are
    drawn uniformly at random, built with the module-level ``build``, trained
    on ``train`` for ``epochs`` epochs and scored on ``val`` against ``y_val``.
    Histories, models and the running result table are written under the
    module-level ``PATH``.
    """

    def __init__(self, train, val, params, epochs, y_val, max_evals = MAX_EVALS):
        """Store the datasets, the list of grids and the search budget.

        train, val: datasets handed to model.fit / model.predict.
        params:     list of dicts {hyper-parameter name: list of candidate values}.
        epochs:     training epochs per configuration.
        y_val:      true labels of ``val`` used for scoring the predicted class ids.
        max_evals:  number of random configurations tried per grid.
        """
        self.train = train
        self.val = val
        self.y_val = y_val
        self.epochs = epochs
        self.params = params
        self.max_evals = max_evals


    def random_search(self):
        """Random search for hyperparameter optimization.

        Returns a DataFrame with one row per evaluated configuration: the four
        metrics, every hyper-parameter of any grid (NaN where a grid lacks it)
        and 'iteration', the trial number within its grid. The table is also
        rewritten to PATH + 'record.xlsx' after every trial so that an
        interrupted run keeps its finished trials.
        """
        all_params = self.get_all_params()
        columns = ['acc', 'precision', 'recall', 'f1', *all_params, 'iteration']

        # rows are collected as records and the frame is rebuilt from them, so its size
        # follows self.max_evals instead of a preallocated index
        records = []
        results = pd.DataFrame(columns = columns)
        for grid in self.params:
            for i in range(self.max_evals):
                # Choose random hyperparameters
                parameters = {k: random.choice(v) for k, v in grid.items()}
                print('-'*150)
                print(f"Currently trying out {parameters}...")
                print('-'*150)
                # Evaluate randomly selected hyperparameters
                eval_results = self.objective(parameters)
                records.append({**eval_results, **parameters, 'iteration': i})
                results = pd.DataFrame(records, columns = columns)
                results.to_excel(PATH + 'record.xlsx')

        return results


    def objective(self, params):
        """Build, train and score one configuration; save its history and model.

        Returns the metric dict produced by ``eval``.
        Raises ValueError when ``build`` does not return a model for ``params``.
        """
        model = build(**params)
        if model is None:
            raise ValueError(f"build() returned no model for parameters {params}")

        hist = model.fit(self.train, epochs = self.epochs, validation_data = self.val)

        # class id = position of the largest predicted probability
        probs = model.predict(self.val)
        preds = tf.argmax(probs, axis=1)
        perf = KerasRandomHpOpt.eval(y_true = self.y_val, y_pred = preds)

        fname = KerasRandomHpOpt.generate_fname(params)

        print('-'*150)
        print('Saving history in ' + fname + '.hist.pkl...')
        KerasRandomHpOpt.save_history(hist.history, fname + '.hist.pkl')

        print('Saving model in ' + fname + '.h5...')
        KerasRandomHpOpt.save_model(model, fname +'.h5')

        return perf

    def get_all_params(self):
        """Return every hyper-parameter name used by any grid, in order of first appearance."""
        all_params = []
        for params in self.params:
            for param in params:
                if param not in all_params:
                    all_params.append(param)

        return all_params

    @staticmethod
    def eval(y_true,y_pred):
        """Score predictions: accuracy in percent plus weighted precision, recall and F1."""
        # Calculate model accuracy
        accuracy = accuracy_score(y_true,y_pred)*100
        # Calculate model precision, recall and f1 score using "weighted" average
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
        results = {'acc': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

        return results

    @staticmethod
    def generate_fname(params):
        """Encode a configuration as 'name-value.name-value..._1' for use as a file stem."""
        return '.'.join(f'{name}-{value}' for name, value in params.items()) + '_1'

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` if it does not exist yet and return it."""
        import os
        os.makedirs(path, exist_ok=True)
        return path

    @staticmethod
    def save_history(hist, fname):
        """Pickle a training history dict to PATH + 'hist/' + fname."""
        with open(KerasRandomHpOpt._ensure_dir(PATH + 'hist/') + fname, 'wb') as f:
            pickle.dump(hist, f)

    @staticmethod
    def save_model(model, fname):
        """Save the model to PATH + 'models/' + fname."""
        model.save(KerasRandomHpOpt._ensure_dir(PATH + 'models/') + fname)

    @staticmethod
    def save_architechture(model, fname):
        """Write the model architecture as JSON to PATH + 'architectures/' + fname."""
        # to_json() returns the JSON string; it does not take a target path
        with open(KerasRandomHpOpt._ensure_dir(PATH + 'architectures/') + fname, 'w') as f:
            f.write(model.to_json())

Define parameter grid and run:

In [None]:
import random
random.seed(50)

param_grid = [{
    'model_type': ['LSTM'],
    'num_layers': [1],
    'emb_dim': [64, 128, 256],
    'neurons': [32, 64, 128],
    'drop_rate': [0.0, 0.2],
     'l2': [0.01, 0.02],
     'bidir': [True, False]
     }]



In [None]:
parameters = {k: random.choice(v) for k, v in param_grid[0].items()}
parameters

{'model_type': 'CONV',
 'num_layers': 2,
 'emb_dim': 128,
 'filters': 16,
 'kernel_size': 5,
 'drop_rate': 0.0,
 'l2': 0.01,
 'dense_neurons': None,
 'add_batch_norm': False}

In [None]:
model = build(**parameters, summary=True)

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 55, 128)           256       
                                                                 
 conv1d (Conv1D)             (None, 55, 16)            10256     
                                                                 
 dropout (Dropout)           (None, 55, 16)            0         
                                                                 
 conv1d_1 (Conv1D)           (None, 55, 16)            1296      
                                                                 
 dropout_1 (Dropout)         (None, 55, 16)            0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                     

In [None]:
tuner = KerasRandomHpOpt(tf_train, tf_val, param_grid, 5, y_val_)

In [None]:
tuner.random_search()

------------------------------------------------------------------------------------------------------------------------------------------------------
Currently trying out {'model_type': 'LSTM', 'num_layers': 1, 'emb_dim': 128, 'neurons': 128, 'drop_rate': 0.0, 'l2': 0.02, 'bidir': False}...
------------------------------------------------------------------------------------------------------------------------------------------------------
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------------------------------------------------------------------------------------
Saving history in model_type-LSTM.num_layers-1.emb_dim-128.neurons-128.drop_rate-0.0.l2-0.02.bidir-False_1.hist.pkl...
Saving model in model_type-LSTM.num_layers-1.emb_dim-128.neurons-128.drop_rate-0.0.l2-0.02.bidir-False_1.h5...
-----------------------------------------------------------------------------------------------------------------------------

Function for loading the saved models

In [None]:
import keras
import os
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import tensorflow as tf

def load_data_and_test(PATH, X_val, y_val):
    """Load every saved model under PATH/models and score it on the validation data.

    PATH:  project directory containing the 'models' sub-directory.
    X_val: validation inputs handed to model.predict.
    y_val: true integer class labels of the validation inputs.

    Returns (models, df): the loaded models in file-name order, and a DataFrame
    with one row per model holding accuracy, recall, precision and F1 in percent.
    """
    # imported locally under private aliases: another cell of this notebook defines a
    # Keras metric that is also called f1_score and would otherwise shadow sklearn's
    from sklearn.metrics import accuracy_score as _accuracy
    from sklearn.metrics import f1_score as _f1
    from sklearn.metrics import precision_score as _precision
    from sklearn.metrics import recall_score as _recall

    models = []
    metrics = {'acc': [], 'rec': [], 'prec': [], 'f1': []}
    models_dir = os.path.join(PATH, 'models')

    # sorted: os.listdir gives no ordering guarantee, and rows should be reproducible
    for fname in sorted(os.listdir(models_dir)):
        model = keras.models.load_model(os.path.join(models_dir, fname))
        models.append(model)

        # class id = position of the largest predicted probability
        probs = model.predict(X_val)
        preds = tf.argmax(probs, axis=1).numpy()

        metrics['acc'].append(_accuracy(y_val, preds)*100)
        metrics['prec'].append(_precision(y_val, preds)*100)
        metrics['f1'].append(_f1(y_val, preds)*100)
        metrics['rec'].append(_recall(y_val, preds)*100)

    df = pd.DataFrame(data=metrics)

    return models, df

In [None]:
models, df = load_data_and_test(PATH, tf_val, y_val_)



In [None]:
df

Unnamed: 0,acc,rec,prec,f1
0,90.929656,93.757962,88.960516,91.296258
1,91.446731,92.993631,90.42114,91.689345
2,91.468275,91.762208,91.451545,91.606613
3,90.778843,94.076433,88.478435,91.191603
4,90.908112,89.808917,92.076622,90.928633


Then for SBERT embeddings, I load the SBERT model from HuggingFace
repository, encode my samples, split the data into sets, and prepare it for the
keras model.

In [None]:
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer('paraphrase-MiniLM-L6-v2')

Downloading (…)001fa/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)bb8001fa/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)001fa/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b8001fa/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# load saved cleaned data
df = pd.read_csv(PATH + 'cleaned_watch.csv')

In [None]:
# pass a plain list of strings: texts emptied by the cleaning step may be read back
# from the CSV as NaN, so they are replaced by '' before encoding
embeddings = sbert.encode(df.text.fillna('').astype(str).tolist())

In [None]:
import numpy as np
embeddings = np.array(embeddings)

In [None]:
np.save(PATH + 'sent_embs.npy', embeddings)

In [None]:
# import numpy as np
# embeddings = np.load(PATH + 'sent_embs.npy')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# fixed seed, as in the earlier splits, so the SBERT train/test partition is reproducible
emb_tens_train, emb_tens_test, lables_train, lables_test  = train_test_split(embeddings, df['class'], test_size=0.2, random_state=42)

In [None]:
# carve the validation set out of the training part (fixed seed for reproducibility).
# NOTE: this overwrites emb_tens_train / lables_train, so re-running the cell shrinks them again.
emb_tens_train, emb_tens_val, lables_train, lables_val  = train_test_split(emb_tens_train, lables_train, test_size=0.15, random_state=42)

In [None]:
import tensorflow as tf
# batch_size = 128

# convert every split to float32 tensors: first the SBERT embeddings, then their labels
embs_tens_train, embs_tens_val, embs_tens_test = (
    tf.convert_to_tensor(split, dtype=tf.float32)
    for split in (emb_tens_train, emb_tens_val, emb_tens_test)
)
lables_tens_train, lables_tens_val, lables_tens_test = (
    tf.convert_to_tensor(split, dtype=tf.float32)
    for split in (lables_train, lables_val, lables_test)
)

In [None]:
tf_train = tf.data.Dataset.from_tensor_slices((embs_tens_train, lables_tens_train))
tf_val = tf.data.Dataset.from_tensor_slices((embs_tens_val, lables_tens_val))
tf_test = tf.data.Dataset.from_tensor_slices((embs_tens_test, lables_tens_test))



Then I use a library called hyperopt to efficiently select hyperparameters for
the SBERT-CNN architecture using Bayesian Optimization instead of just making
random guesses. The objective function creates a network based on the selected
parameters and computes the objective. Then Hyperopt, based on the previous
results, selects the next set of hyperparameters.

In [None]:
import numpy as np
from hyperopt import Trials, STATUS_OK, tpe, fmin, hp
from tensorflow.keras.datasets import cifar10
import tensorflow.keras as tfk
from tensorflow.keras import Sequential
import pickle

best_f1 = 0.0
results = []
from keras import backend as K

# custom F1 metric so Keras can report it during training
def f1_score(y_true, y_pred):
    """F1 of the given tensors, computed with Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded, so predictions count as
    positive once they round to 1. Epsilon terms guard against division by zero.
    """
    eps = K.epsilon()
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = hits / (flagged_pos + eps)
    recall = hits / (actual_pos + eps)
    return 2*(precision*recall)/(precision+recall+eps)



def objective(params):
    """Hyperopt objective: train one SBERT-CNN configuration and score it.

    Builds batched datasets from the global embedding/label tensors, stacks
    ``params['num_layers']`` blocks of Conv1D -> pooling -> Dropout over the
    embedding (treated as a sequence of length ``params['emb_dim']`` with one
    channel), flattens, and ends in a sigmoid Dense layer of
    ``params['out_dim']`` units. The model is trained for 10 epochs with the
    optimizer named by ``params['optimizer']`` at learning rate ``params['lr']``.

    Side effects: pickles the training history under PATH, appends a record to
    the global ``results`` list, and saves the model to PATH + 'best_hybrid.h5'
    whenever its validation F1 beats the global ``best_f1``.
    NOTE(review): the score is the best F1 over all epochs, while the saved
    model holds the weights of the last epoch.

    Returns {'loss': -best validation F1, 'status': STATUS_OK}.
    """
    global best_f1

    batch_size = params['batch_size']
    tf_train = tf.data.Dataset.from_tensor_slices((embs_tens_train, lables_tens_train))
    tf_val = tf.data.Dataset.from_tensor_slices((embs_tens_val, lables_tens_val))
    tf_train = tf_train.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    tf_val = tf_val.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    pool = tfk.layers.MaxPooling1D if params['pooling'] == 'max' else tfk.layers.AveragePooling1D
    # the same filter count and kernel size are used for every convolutional block
    filters = params['filters']
    f_sizes = params['f_sizes']

    model = Sequential()
    for i in range(params['num_layers']):
        if i == 0:
            # only the first layer declares the input shape
            model.add(tfk.layers.Conv1D(filters, f_sizes, activation='relu', input_shape=(params['emb_dim'], 1)))
        else:
            model.add(tfk.layers.Conv1D(filters, f_sizes, activation='relu'))
        model.add(pool(pool_size=2))
        model.add(tfk.layers.Dropout(params['dropouts']))

    model.add(tfk.layers.Flatten())
    model.add(tfk.layers.Dense(params['out_dim'], activation='sigmoid'))

    if params['optimizer'] == 'adam':
        opt = tfk.optimizers.Adam(learning_rate=params['lr'])
    elif params['optimizer'] == 'rmsprop':
        opt = tfk.optimizers.RMSprop(learning_rate=params['lr'])
    else:
        opt = tfk.optimizers.SGD(learning_rate=params['lr'])

    # pass the configured optimizer object: compiling with the bare name would
    # create a default optimizer and silently ignore the sampled learning rate
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[f1_score, 'accuracy'])

    history = model.fit(tf_train, validation_data=tf_val,
                        epochs=10,
                        verbose=0)

    val_f1 = max(history.history['val_f1_score'])
    val_acc = max(history.history['val_accuracy'])

    with open(PATH + f'hist_{val_f1}.pkl', 'wb') as f:
        pickle.dump(history.history, f)

    if val_f1 > best_f1:
        best_f1 = val_f1
        model.save(PATH + 'best_hybrid.h5')

    results.append({
        'f1': val_f1,
        'acc': val_acc,
        'params': params
    })

    # hyperopt minimises the loss, hence the negated F1
    return {'loss': -val_f1, 'status': STATUS_OK}




# Search space
# Constant entries ('emb_dim', 'out_dim') are passed to the objective unchanged;
# every hp.* entry is sampled anew for each trial.
space = {
    # width of one SBERT embedding; used by the objective as the Conv1D input length
    'emb_dim': embeddings.shape[1],
    'num_layers': hp.choice('num_layers', [2, 3, 4]),
    'filters': hp.choice('filters', [128, 64, 32]),
    'f_sizes': hp.choice('f_sizes', [3,4,5]),
    'dropouts': hp.choice('dropouts', [0.1,0.2, 0.3]),
    # NOTE(review): the objective reads params['lr'] and never reads 'learning_rate',
    # so this dimension is sampled and logged but has no effect on training
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-5), np.log(1e-2)),
    'pooling': hp.choice('pooling', ['max', 'avg']),
    'optimizer': hp.choice('optimizer', ['adam', 'rmsprop', 'sgd']),
    'lr': hp.choice('lr', [1e-3, 3e-5, 1e-4]),
    'batch_size': hp.choice('batch_size', [32, 64, 128, 256]),
    # a single sigmoid output unit for the binary task
    'out_dim': 1
}




In [None]:
from hyperopt import space_eval

# Trials to store optimization results
trials = Trials()

# Optimization
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials
)

# fmin reports the *index* picked for every hp.choice dimension, not the value itself;
# space_eval maps those indices back onto the search space so the printed configuration is readable
print(space_eval(space, best))

100%|██████████| 20/20 [14:26<00:00, 43.34s/trial, best loss: -0.911880373954773]
{'batch_size': 1, 'dropouts': 2, 'f_sizes': 2, 'filters': 0, 'learning_rate': 0.00013736081859817189, 'lr': 2, 'num_layers': 0, 'optimizer': 0, 'pooling': 1}


In [None]:
results

[{'f1': 0.8898702263832092,
  'acc': 0.893005907535553,
  'params': {'batch_size': 32,
   'dropouts': 0.1,
   'emb_dim': 384,
   'f_sizes': 5,
   'filters': 64,
   'learning_rate': 0.004106615988146594,
   'lr': 0.001,
   'num_layers': 4,
   'optimizer': 'sgd',
   'out_dim': 1,
   'pooling': 'max'}},
 {'f1': 0.8828210234642029,
  'acc': 0.8859686851501465,
  'params': {'batch_size': 128,
   'dropouts': 0.2,
   'emb_dim': 384,
   'f_sizes': 5,
   'filters': 64,
   'learning_rate': 0.006666517066504972,
   'lr': 0.0001,
   'num_layers': 2,
   'optimizer': 'sgd',
   'out_dim': 1,
   'pooling': 'avg'}},
 {'f1': 0.910651683807373,
  'acc': 0.9128249287605286,
  'params': {'batch_size': 64,
   'dropouts': 0.3,
   'emb_dim': 384,
   'f_sizes': 3,
   'filters': 128,
   'learning_rate': 0.0018501376609838399,
   'lr': 0.001,
   'num_layers': 2,
   'optimizer': 'rmsprop',
   'out_dim': 1,
   'pooling': 'avg'}},
 {'f1': 0.9017953872680664,
  'acc': 0.904926061630249,
  'params': {'batch_size': 64

In [None]:
# flatten each record: the metric fields plus the hyper-parameters stored under 'params'.
# result['params'] is read instead of pop()-ed, so `results` stays intact and the cell can be re-run.
results_unpack = [{**result, **result['params']} for result in results]

In [None]:
results_ = pd.DataFrame(results_unpack)

In [None]:
# errors='ignore' keeps this cell re-runnable once the 'params' column is already gone
results_ = results_.drop(columns=['params'], errors='ignore')

In [None]:
results_ = results_.sort_values('f1', ascending=False)

In [None]:
results_.to_excel(PATH + 'hybrid_results.xlsx')