# Preprocess

In [None]:
# Import modules
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
np.random.seed(42)

import matplotlib.pyplot as plt
import seaborn as sns
import os, json, random, pickle
random.seed(42)
# Suppress all warnings (this also hides the sklearn deprecation warnings)

import warnings
warnings.filterwarnings("ignore")

# All project folders are resolved relative to the parent of the current
# working directory.
project_root = os.path.dirname(os.getcwd())

# Data directory and its 'interim' sub-folder
data_dir = os.path.join(project_root, 'Data')
interim_dir = os.path.join(data_dir, 'interim')

# Model directory
model_dir = os.path.join(project_root, 'Model')

# Input files
train_path = os.path.join(data_dir, 'train.csv')
train_processed_path = os.path.join(interim_dir, 'train_preprocessed.txt')
meta_feat_path = os.path.join(interim_dir, 'meta_feat.txt')

# Load the three input tables from the paths configured above.
# NOTE: the two interim files carry a .txt extension but are parsed as JSON.
# presumably they were written by an earlier preprocessing step -- TODO confirm
train = pd.read_csv(train_path)
train_processed = pd.read_json(train_processed_path)
meta_feat = pd.read_json(meta_feat_path)

In [None]:
# Imports
import tensorflow as tf
import tensorflow.keras.backend as K
tf.set_random_seed(42)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils.np_utils import to_categorical
from keras.callbacks import Callback

from keras_tqdm import TQDMNotebookCallback
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
# Define balanced class weights and a support function that turns one-hot
# encoded predictions back into plain labels
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

# One weight per distinct sentiment class, inversely proportional to how often
# that class occurs. `compute_class_weight` is the function whose arguments
# are (class_weight, classes, y); `compute_sample_weight` takes
# (class_weight, y, indices) and would misread the class list as the labels.
sentiment_classes = np.unique(train_processed.sentiment)
balanced_weights = compute_class_weight('balanced',
                                        classes=sentiment_classes,
                                        y=train_processed.sentiment.values)

# Keras' `class_weight` argument of fit() expects a {class: weight} mapping,
# so expose the weights as a dict keyed by plain Python scalars.
class_weights = dict(zip(sentiment_classes.tolist(), balanced_weights.tolist()))

def get_label(row):
    """
    Convert a one-hot encoded row back into a plain label.

    Checks positions 0, 1 and 2 in order and returns the first position
    whose value equals 1. Returns None when none of them equals 1.
    """
    return next((position for position in (0, 1, 2) if row[position] == 1), None)

# Bi-Directional LSTM

# Set hyper-parameters
# These values feed the tokenizer, the network layers and model.fit below.

vocab_size = 5000     # passed to Tokenizer(num_words=...) and as Embedding's first argument
input_length = 120    # maxlen given to pad_sequences
embed_dim = 100       # second argument of the Embedding layer
lstm_out = 100        # first argument of the LSTM layer
batch_size = 32       # batch_size given to model.fit
num_epochs = 5        # epochs given to model.fit

# Tokenization and build model input

corpus = train_processed['text'].values

# Fit the word index on the corpus, keeping at most vocab_size words
tokenizer = Tokenizer(num_words=vocab_size, split=' ')
tokenizer.fit_on_texts(corpus)

# Texts -> integer id sequences -> matrix with input_length columns
X = pad_sequences(tokenizer.texts_to_sequences(corpus), maxlen=input_length)

# Labels are kept as plain class ids here; they are one-hot encoded per fold
# inside the cross-validation loop.
y = train_processed['sentiment'].values

# Hold-out split with a fixed seed
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)
print(Xtrain.shape,ytrain.shape)
print(Xtest.shape,ytest.shape)

# Collector for the per-fold scores, plus a shuffled, seeded, stratified
# 3-fold splitter
cvscores = list()
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

%%time
# Cross-validation trains one network per fold, so it is switched off by
# default. Set RUN_CROSS_VALIDATION to True to run it.
RUN_CROSS_VALIDATION = False

if RUN_CROSS_VALIDATION:
    # Start from an empty list so that re-running this code does not mix in
    # the scores of an earlier run.
    cvscores = []

    # train_idx / valid_idx are index arrays into Xtrain / ytrain. They are
    # deliberately not named `train`, which would overwrite the `train`
    # DataFrame loaded at the top of the notebook.
    for train_idx, valid_idx in kfold.split(Xtrain, ytrain):
        # One hot encode label
        training_label = to_categorical(ytrain[train_idx])
        validation_label = to_categorical(ytrain[valid_idx])

        # Build Neural Network architecture
        # NOTE(review): `dropout=` on Embedding and `dropout_U=` / `dropout_W=`
        # on LSTM are Keras 1 argument names, while `epochs=` in fit() is the
        # Keras 2 spelling -- confirm the installed Keras still applies these
        # dropout settings.
        model = Sequential()
        model.add(Embedding(vocab_size,
                            embed_dim,
                            input_length = X.shape[1],
                            dropout=0.2))

        model.add(Bidirectional(LSTM(lstm_out,
                               dropout_U=0.2,
                               dropout_W=0.2)))
        model.add(Dense(3,
                        activation='softmax'))
        model.compile(loss = 'categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        # Fit model
        model.fit(Xtrain[train_idx], training_label,
                  batch_size=batch_size,
                  epochs=num_epochs,
                  class_weight = class_weights,
                  verbose = 0,
                  callbacks = [TQDMNotebookCallback()])

        # Get model results
        scores = model.evaluate(Xtrain[valid_idx], validation_label, verbose = 0)
        ypred = model.predict(Xtrain[valid_idx])

        # Predicted label = column with the highest predicted probability
        # (argmax returns the first such column on ties).
        ypred_label = np.argmax(ypred, axis = 1)
        yvalid_label = ytrain[valid_idx]

        # Get confusion matrix
        cm = confusion_matrix(yvalid_label, ypred_label)
        print(cm, "\n\n")

        f1 = f1_score( yvalid_label, ypred_label, average = 'macro')

        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        print("f1: %.2f" % f1)
        cvscores.append(f1)

    # Mean and standard deviation of the macro F1 over the folds
    print("%.2f (+- %.2f)" % (np.mean(cvscores), np.std(cvscores)))

# Persist the most recently built model: architecture as JSON, weights as .h5.
# `model` is only defined once the cross-validation loop has actually run, so
# skip saving (instead of failing with a NameError) when it has not.
if 'model' in globals():
    # Make sure the target directory exists before writing into it
    os.makedirs(model_dir, exist_ok=True)

    model_json = model.to_json()
    with open(os.path.join(model_dir, "LSTM_120inputlen_32bsize_5epoch.json"), 'w') as json_file:
        json_file.write(model_json)

    # NOTE(review): the weights are written to the current working directory,
    # not to model_dir like the JSON file -- confirm this is intended.
    model.save_weights("LSTM_120inputlen_32bsize_5epoch.h5")
else:
    print("No trained model in memory; nothing was saved.")

# Bi-Directional LSTM with Word2Vec embeddings

In [None]:
# Set hyper-parameters
# These values feed the tokenizer, the network layers and model.fit below.

vocab_size = 5000     # passed to Tokenizer(num_words=...) and as Embedding's first argument
input_length = 120    # maxlen given to pad_sequences
embed_dim = 100       # second argument of the Embedding layer
lstm_out = 100        # first argument of the LSTM layer
batch_size = 32       # batch_size given to model.fit
num_epochs = 5        # epochs given to model.fit

In [None]:
# Tokenization and build model input

corpus = train_processed['text'].values

# Fit the word index on the corpus, keeping at most vocab_size words
tokenizer = Tokenizer(num_words=vocab_size, split=' ')
tokenizer.fit_on_texts(corpus)

# Texts -> integer id sequences -> matrix with input_length columns
X = pad_sequences(tokenizer.texts_to_sequences(corpus), maxlen=input_length)

# Labels are kept as plain class ids here; they are one-hot encoded per fold
# inside the cross-validation loop.
y = train_processed['sentiment'].values

# Hold-out split with a fixed seed
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=42)
print(Xtrain.shape,ytrain.shape)
print(Xtest.shape,ytest.shape)

In [None]:
# Collector for the per-fold scores, plus a shuffled, seeded, stratified
# 3-fold splitter
cvscores = list()
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [None]:
%%time
# Cross-validation trains one network per fold, so it is switched off by
# default. Set RUN_CROSS_VALIDATION to True to run it.
RUN_CROSS_VALIDATION = False

if RUN_CROSS_VALIDATION:
    # Start from an empty list so that re-running this cell does not mix in
    # the scores of an earlier run.
    cvscores = []

    # train_idx / valid_idx are index arrays into Xtrain / ytrain. They are
    # deliberately not named `train`, which would overwrite the `train`
    # DataFrame loaded at the top of the notebook.
    for train_idx, valid_idx in kfold.split(Xtrain, ytrain):
        # One hot encode label
        training_label = to_categorical(ytrain[train_idx])
        validation_label = to_categorical(ytrain[valid_idx])

        # Build Neural Network architecture
        # NOTE(review): `dropout=` on Embedding and `dropout_U=` / `dropout_W=`
        # on LSTM are Keras 1 argument names, while `epochs=` in fit() is the
        # Keras 2 spelling -- confirm the installed Keras still applies these
        # dropout settings.
        model = Sequential()
        model.add(Embedding(vocab_size,
                            embed_dim,
                            input_length = X.shape[1],
                            dropout=0.2))

        model.add(Bidirectional(LSTM(lstm_out,
                               dropout_U=0.2,
                               dropout_W=0.2)))
        model.add(Dense(3,
                        activation='softmax'))
        model.compile(loss = 'categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        # Fit model
        model.fit(Xtrain[train_idx], training_label,
                  batch_size=batch_size,
                  epochs=num_epochs,
                  class_weight = class_weights,
                  verbose = 0,
                  callbacks = [TQDMNotebookCallback()])

        # Get model results
        scores = model.evaluate(Xtrain[valid_idx], validation_label, verbose = 0)
        ypred = model.predict(Xtrain[valid_idx])

        # Predicted label = column with the highest predicted probability
        # (argmax returns the first such column on ties).
        ypred_label = np.argmax(ypred, axis = 1)
        yvalid_label = ytrain[valid_idx]

        # Get confusion matrix
        cm = confusion_matrix(yvalid_label, ypred_label)
        print(cm, "\n\n")

        f1 = f1_score( yvalid_label, ypred_label, average = 'macro')

        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        print("f1: %.2f" % f1)
        cvscores.append(f1)

    # Mean and standard deviation of the macro F1 over the folds
    print("%.2f (+- %.2f)" % (np.mean(cvscores), np.std(cvscores)))

In [None]:
# Persist the most recently built model: architecture as JSON, weights as .h5.
# `model` is only defined once the cross-validation cell has actually run, so
# skip saving (instead of failing with a NameError) when it has not.
if 'model' in globals():
    # Make sure the target directory exists before writing into it
    os.makedirs(model_dir, exist_ok=True)

    model_json = model.to_json()
    with open(os.path.join(model_dir, "LSTM_120inputlen_32bsize_5epoch.json"), 'w') as json_file:
        json_file.write(model_json)

    # NOTE(review): the weights are written to the current working directory,
    # not to model_dir like the JSON file -- confirm this is intended.
    model.save_weights("LSTM_120inputlen_32bsize_5epoch.h5")
else:
    print("No trained model in memory; nothing was saved.")