In [1]:
from matplotlib import pyplot as plt
import math, os, re, time, random, string
import numpy as np, pandas as pd, seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sns

from collections import defaultdict
import wordcloud

import warnings
warnings.filterwarnings('ignore')

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.layers import *
from tensorflow.keras import Input, Model
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import tensorflow_addons as tfa

from sklearn.metrics import *
from sklearn.preprocessing import StandardScaler
import optuna

In [2]:
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')
#train = pd.read_csv('train_cleaned_lite.csv')
#test = pd.read_csv('test_cleaned_lite.csv')

In [3]:
train['text'] = train['text'].astype(str)
test['text'] = test['text'].astype(str)

labels = train['target']
print('targets shape', labels.shape)

targets shape (7613,)


In [4]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train, labels, test_size = 0.2, stratify=labels)
X_train.shape
X_val.shape
y_train.shape
y_val.shape

(6090, 9)

(1523, 9)

(6090,)

(1523,)

#### Tokenizing

In [5]:
for i in [X_train, X_val, test]:
    i['text'] = i['text'].astype(str)


X_train_text = [i for i in X_train['text']]
X_val_text = [i for i in X_val['text']]
test_text = [i for i in test['text']]

In [6]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(X_train_text)

print(len(tokenizer.word_index), 'unique tokens')

X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_val_seq = tokenizer.texts_to_sequences(X_val_text)
test_seq = tokenizer.texts_to_sequences(test_text)

11060 unique tokens


In [7]:
X_train_data = pad_sequences(X_train_seq)
X_val_data = pad_sequences(X_val_seq, X_train_data.shape[1])
test_data = pad_sequences(test_seq, X_train_data.shape[1])

In [8]:
X_train_data.shape
X_val_data.shape
test_data.shape

(6090, 23)

(1523, 23)

(3263, 23)

### Meta-feature scaling

In [9]:
scaler = StandardScaler()
scaler.fit(X_train.iloc[:, 2:])

meta_train = scaler.transform(X_train.iloc[:, 2:])
meta_val = scaler.transform(X_val.iloc[:, 2:])
meta_test = scaler.transform(test.iloc[:, 1:])

StandardScaler()

In [10]:
meta_train.shape
meta_val.shape
meta_test.shape

(6090, 7)

(1523, 7)

(3263, 7)

#### Functions

In [11]:
def loss_acc_plot(history):
    scores = pd.DataFrame(history.history)
    scores[['loss', 'val_loss']].plot();
    scores[['accuracy', 'val_accuracy']].plot();
    #scores[['f1_score', 'val_f1_score']].plot();

answers = pd.read_csv('answer key.csv')
y_true = answers['target']

def kaggle(model, filename=''):
    y_pred = (model.predict([test_data, meta_test]) > 0.5).astype(int)
    print('\nf1 score is:', f1_score(y_true, y_pred, average='macro'))
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true, y_pred))
    
    if filename != '':
        submission=pd.read_csv('sample_submission.csv')
        submission['target']=y_pred
        submission.to_csv(filename+'.csv', index=False)

In [12]:
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', 
                              factor = 0.5, 
                              verbose = 1, 
                              patience = 4,                        
                              min_lr = 0.0001)

def model_process(model, n_epochs=20, model_name='', filename=''):
    model.summary()
    
    checkpointer = ModelCheckpoint(filepath = model_name+'.hdf5', 
                                   save_best_only=True,
                                   save_weights_only=True,
                                   monitor='val_loss')

    history = model.fit(x=[X_train_data, meta_train],
                        y=y_train,
                        epochs = n_epochs,
                        verbose=1,
                        callbacks=[checkpointer, reduce_lr],
                        validation_data = ([X_val_data, meta_val], y_val)
                       )
    
    loss_acc_plot(history)
    
    model.load_weights(model_name+'.hdf5')
    
    loss, acc = model.evaluate([X_val_data, meta_val], y_val)
    print('\nModel accuracy on validation set = ', acc)
    print('\nModel loss on validation set = ', loss)
    
    y_val_pred = (model.predict([X_val_data, meta_val]) > 0.5).astype(int)
    print('\nModel F1 on validation set = ', f1_score(y_val, y_val_pred, average='macro'))
    
    kaggle(model, filename)

In [41]:
embeddings_index = {}
with open('glove.6B/glove.6B.200d.txt','r', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
f.close()

EMBEDDING_DIM = 200  
word_index = tokenizer.word_index

#initialize embedding matrix with zeros
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

#add glove word encodings to our library
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        
        #words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
print("Our embedded matrix is of dimension", embedding_matrix.shape)

embedding = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights = [embedding_matrix],
                     input_length = 23, trainable = False)  

400001it [00:20, 19771.88it/s]
100%|██████████| 11060/11060 [00:00<00:00, 460427.61it/s]

Our embedded matrix is of dimension (11061, 200)





# Optuna

In [49]:
def objective(trial):
    LSTM_dropout = trial.suggest_discrete_uniform('LSTM_dropout', 0.0, 0.8, 0.1)
    dropout = trial.suggest_discrete_uniform('dropout', 0.0, 0.8, 0.1)
    spatial_dropout = trial.suggest_discrete_uniform('spatial_dropout', 0.0, 0.8, 0.1)
    
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = trial.suggest_categorical("optimizer", [Adam, RMSprop])
    
    LSTM1 = trial.suggest_int('units1', 10, 100, step=10)
    LSTM2 = trial.suggest_int('units2', 10, 100, step=10)
    dense_units = trial.suggest_int('dense_units', 10, 100, step=10)
    
        
    #model
    nlp_input = Input(shape = (23,), name = 'nlp_input')
    meta_input_train = Input(shape = (7, ), name = 'meta_train')
    
    emb = embedding(nlp_input)
    emb = SpatialDropout1D(spatial_dropout)(emb)

    nlp_out = Bidirectional(LSTM(LSTM1, dropout=LSTM_dropout, return_sequences=True))(emb)  
    nlp_out = SpatialDropout1D(spatial_dropout)(nlp_out)
    
    nlp_out = Bidirectional(LSTM(LSTM2, dropout=LSTM_dropout))(emb)   
     
    #add meta data    
    x = Concatenate()([nlp_out, meta_input_train])
    
    #add second hidden layer
    x = Dropout(dropout)(x)
    x = (Dense(dense_units, 
               activation = 'relu'
              ))(x)
    
    #add output layer
    x = Dropout(dropout)(x)
    preds = Dense(1, 
                  activation='sigmoid'
                 )(x)
    
    #compile model
    model = Model(inputs=[nlp_input , meta_input_train], outputs = preds)

    model.compile(loss = 'binary_crossentropy', optimizer = optimizer(lr), metrics = ['accuracy'])
    
    reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', 
                              factor = 0.5, 
                              verbose = False, 
                              patience = 4,                        
                              min_lr = 0.0001)
    
    pruning = optuna.integration.TFKerasPruningCallback(trial, 'val_accuracy')
    
    model.fit(x=[X_train_data, meta_train],
                        y=y_train,
                        epochs = 15,
                        verbose=False,
                        callbacks=[reduce_lr, pruning],
                        validation_data = ([X_val_data, meta_val], y_val),
                        shuffle=True
                       )
    y_pred = (model.predict([test_data, meta_test]) > 0.5).astype(int)
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1

In [50]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2021-08-22 22:23:21,725][0m A new study created in memory with name: no-name-77338653-bad4-4b57-a8f6-63a45eae2c99[0m
[32m[I 2021-08-22 22:23:41,527][0m Trial 0 finished with value: 0.7162570973172621 and parameters: {'LSTM_dropout': 0.7000000000000001, 'dropout': 0.6000000000000001, 'spatial_dropout': 0.6000000000000001, 'lr': 0.018086759576374796, 'optimizer': <class 'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop'>, 'units1': 60, 'units2': 30, 'dense_units': 100}. Best is trial 0 with value: 0.7162570973172621.[0m
[32m[I 2021-08-22 22:24:00,722][0m Trial 1 finished with value: 0.7612147339524218 and parameters: {'LSTM_dropout': 0.7000000000000001, 'dropout': 0.5, 'spatial_dropout': 0.0, 'lr': 0.0011488686702275726, 'optimizer': <class 'tensorflow.python.keras.optimizer_v2.rmsprop.RMSprop'>, 'units1': 40, 'units2': 30, 'dense_units': 30}. Best is trial 1 with value: 0.7612147339524218.[0m
[32m[I 2021-08-22 22:24:19,606][0m Trial 2 finished with value: 0.6211113

[32m[I 2021-08-22 22:31:24,475][0m Trial 38 pruned. Trial was pruned at epoch 0.[0m
[32m[I 2021-08-22 22:31:30,361][0m Trial 39 pruned. Trial was pruned at epoch 0.[0m
[32m[I 2021-08-22 22:31:36,059][0m Trial 40 pruned. Trial was pruned at epoch 0.[0m
[32m[I 2021-08-22 22:31:56,747][0m Trial 41 finished with value: 0.7706593496953655 and parameters: {'LSTM_dropout': 0.1, 'dropout': 0.7000000000000001, 'spatial_dropout': 0.2, 'lr': 0.002890177381116407, 'optimizer': <class 'tensorflow.python.keras.optimizer_v2.adam.Adam'>, 'units1': 90, 'units2': 40, 'dense_units': 90}. Best is trial 10 with value: 0.7769143044442431.[0m
[32m[I 2021-08-22 22:32:02,213][0m Trial 42 pruned. Trial was pruned at epoch 0.[0m
[32m[I 2021-08-22 22:32:22,990][0m Trial 43 finished with value: 0.7732009650772109 and parameters: {'LSTM_dropout': 0.1, 'dropout': 0.7000000000000001, 'spatial_dropout': 0.30000000000000004, 'lr': 0.0040816444746307285, 'optimizer': <class 'tensorflow.python.keras.opti

In [51]:
trial = study.best_trial
trial.params

{'LSTM_dropout': 0.4,
 'dropout': 0.6000000000000001,
 'spatial_dropout': 0.1,
 'lr': 0.005733554663300928,
 'optimizer': tensorflow.python.keras.optimizer_v2.adam.Adam,
 'units1': 70,
 'units2': 50,
 'dense_units': 90}

# LSTM with GloVe

In [None]:
embeddings_index = {}
with open('glove.6B/glove.6B.200d.txt','r', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
f.close()

In [None]:
EMBEDDING_DIM = 200  
word_index = tokenizer.word_index

#initialize embedding matrix with zeros
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

#add glove word encodings to our library
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        
        #words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
print("Our embedded matrix is of dimension", embedding_matrix.shape)

In [None]:
embedding = Embedding(len(word_index) + 1, EMBEDDING_DIM, weights = [embedding_matrix],
                     input_length = 23, trainable = False)  

In [None]:
def m5_glove(dropout_val):
            
    #define inputs
    nlp_input = Input(shape = (40,), name = 'nlp_input')
    meta_input_train = Input(shape = (7, ), name = 'meta_train')
    
    emb = embedding(nlp_input)
    emb = SpatialDropout1D(dropout_val)(emb)

    nlp_out = Bidirectional(LSTM(100, 
                                 dropout=dropout_val, 
                                 ))(emb)     
     
    #add meta data    
    x = Concatenate()([nlp_out, meta_input_train])
    
    #add second hidden layer
#    x = Dropout(dropout_val)(x)
#    x = (Dense(50, 
#               activation = LeakyReLU(alpha=0.01)
#              ))(x)
    
    #add output layer
    x = Dropout(dropout_val)(x)
    preds = Dense(1, 
                  activation='sigmoid'
                 )(x)
    
    #compile model
    model = Model(inputs=[nlp_input , meta_input_train], outputs = preds)
    model.compile(loss = 'binary_crossentropy', optimizer = Adam(0.001), metrics = ['accuracy'])
    
    return model

In [None]:
model5 = m5_glove(0.4)
model_process(model5, 50, '05 GloVe LSTM')

# GloVe dual LSTM

In [None]:
def m6_glove2(dropout_val):
            
    #define inputs
    nlp_input = Input(shape = (40,), name = 'nlp_input')
    meta_input_train = Input(shape = (7, ), name = 'meta_train')
    

    emb = embedding(nlp_input)
    emb = SpatialDropout1D(dropout_val)(emb)

    
    nlp_out = Bidirectional(LSTM(130, 
                                 dropout=dropout_val,
                                 return_sequences=True))(emb)    
    
    nlp_out = SpatialDropout1D(dropout_val)(nlp_out)
    
    nlp_out = Bidirectional(LSTM(110, 
                                 dropout=dropout_val))(emb)    
    
     
    #add meta data    
    x = Concatenate()([nlp_out, meta_input_train])
    
    #add second hidden layer
    x = Dropout(dropout_val)(x)
    x = (Dense(90, 
               activation = 'relu'
              ))(x)
    
    #add output layer
    x = Dropout(dropout_val)(x)
    preds = Dense(1, 
                  activation='sigmoid'
                 )(x)
    
    #compile model
    model = Model(inputs=[nlp_input , meta_input_train], outputs = preds)
    model.compile(loss = 'binary_crossentropy', optimizer = Adam(0.002), metrics = ['accuracy'])
    
    return model

In [None]:
model6 = m6_glove2(0.4)
model_process(model6, 50, '06 GloVe dual lstm', '06 GloVe dual lstm')

# model tuning

In [None]:
import optuna

In [None]:
def objective(trial):
    LSTM_dropout = trial.suggest_discrete_uniform('dropout', 0.0, 0.8, 0.1)
    dropout = trial.suggest_discrete_uniform('dropout', 0.0, 0.8, 0.1)
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = trial.suggest_categorical("optimizer", [Adam, RMSprop])
    
    LSTM1 = trial.suggest_int('units1', 20, 200, step=10)
    LSTM2 = trial.suggest_int('units2', 20, 200, step=10)
    dense_units = trial.suggest_int('units3', 20, 200, step=10)
    
    
    
    nlp_input = Input(shape = (40,), name = 'nlp_input')
    meta_input_train = Input(shape = (7, ), name = 'meta_train')
    
    emb = embedding(nlp_input)
    emb = SpatialDropout1D(dropout)(emb)

    nlp_out = Bidirectional(LSTM(LSTM1, 
                                 dropout=LSTM_dropout,
                                 return_sequences=True))(emb)    
    
    nlp_out = SpatialDropout1D(dropout)(nlp_out)
    
    nlp_out = Bidirectional(LSTM(LSTM2, 
                                 dropout=LSTM_dropout))(emb)    
    
     
    #add meta data    
    x = Concatenate()([nlp_out, meta_input_train])
    
    #add second hidden layer
    x = Dropout(dropout)(x)
    x = (Dense(dense_units, 
               activation = 'relu'
              ))(x)
    
    #add output layer
    x = Dropout(dropout)(x)
    preds = Dense(1, 
                  activation='sigmoid'
                 )(x)
    
    #compile model
    model = Model(inputs=[nlp_input , meta_input_train], outputs = preds)

    model.compile(loss = 'binary_crossentropy', optimizer = optimizer(lr), metrics = ['accuracy'])
    
    reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', 
                              factor = 0.5, 
                              verbose = False, 
                              patience = 4,                        
                              min_lr = 0.0001)
    
    model.fit(x=[X_train_data, meta_train],
                        y=y_train,
                        epochs = 30,
                        verbose=False,
                        callbacks=[reduce_lr],
                        validation_data = ([X_val_data, meta_val], y_val),
                        shuffle=True
                       )
    y_pred = (model.predict([test_data, meta_test]) > 0.5).astype(int)
    f1 = f1_score(y_true, y_pred, average='macro')
    return f1

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

In [None]:
trial = study.best_trial
trial.params

In [None]:
trial = study.best_trial
trial.params

In [64]:
glove_dual = pd.read_csv('glove single lstm.csv')

In [65]:
f1_score(y_true, glove_dual['target'], average='weighted')
f1_score(y_true, glove_dual['target'], average='macro')
f1_score(y_true, glove_dual['target'], average='micro')
f1_score(y_true, glove_dual['target'])

0.7940573057197762

0.7869388717154875

0.7989580140974564

0.7363344051446945

In [62]:
df=glove_dual.drop(columns = ['prob'], axis=1)

In [63]:
df.to_csv('08 glove dual lstm.csv', index=False)