# Imports

In [12]:
# Import modules
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import random
import pickle

# ignore deprecation warnings in sklearn

import warnings
warnings.filterwarnings("ignore")

# Project layout: Data/ and Model/ live in the parent of the current working directory

project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, 'Data')
model_dir = os.path.join(project_root, 'Model')

# Input files: the raw training set plus the interim artefacts

interim_dir = os.path.join(data_dir, 'interim')
train_path = os.path.join(data_dir, 'train.csv')
train_processed_path = os.path.join(interim_dir, 'train_preprocessed.txt')
meta_feat_path = os.path.join(interim_dir, 'meta_feat.txt')

# Load everything into DataFrames (the two interim .txt files are parsed as JSON)

train = pd.read_csv(train_path)
train_processed = pd.read_json(train_processed_path)
dense_feat = pd.read_json(meta_feat_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import tensorflow.keras.backend as K
import re
from keras_tqdm import TQDMNotebookCallback

Using TensorFlow backend.


In [14]:
def f1(y_true, y_pred):
    """Keras-compatible macro F1 metric.

    Predictions are rounded to 0/1, true-positive / false-positive /
    false-negative counts are summed along axis 0 (one count per class),
    and the resulting per-class F1 scores are averaged into a scalar.

    Args:
        y_true: Tensor of ground-truth indicators; assumed to be one-hot with
            shape (batch, n_classes), as produced by ``to_categorical``.
        y_pred: Tensor of predicted scores with the same shape as ``y_true``.

    Returns:
        Scalar tensor: mean of the per-class F1 scores.
    """
    y_pred = K.round(y_pred)
    tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

    # K.epsilon() keeps the denominators non-zero for classes with no
    # predicted / actual positives.
    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    f1_per_class = 2*p*r / (p+r+K.epsilon())

    # `tf.is_nan` only exists in TensorFlow 1.x; prefer the `tf.math` location
    # and fall back to the legacy alias on releases that lack it.
    try:
        is_nan = tf.math.is_nan
    except AttributeError:
        is_nan = tf.is_nan
    f1_per_class = tf.where(is_nan(f1_per_class),
                            tf.zeros_like(f1_per_class),
                            f1_per_class)
    return K.mean(f1_per_class)

# RNN-LSTM Grid Search

## Max Length grid search

In [None]:
# Grid search over the padded sequence length (max_length) of the LSTM input.
# For every candidate length a fresh model is trained on a small class-balanced
# sample and its macro F1 on the held-out split is recorded in f1_grid.
max_length_dim = np.arange(10,300,10)
sample_size = 250
num_classes = 3
f1_grid = {}
history_grid = {}

# The sample, the fitted tokenizer, the integer sequences and the labels do
# not depend on max_length, so they are built once rather than per iteration.
c0 = train_processed[train_processed.sentiment == 0][0:sample_size]
c1 = train_processed[train_processed.sentiment == 1][0:sample_size]
c2 = train_processed[train_processed.sentiment == 2][0:sample_size]
train_sample = pd.concat([c0, c1, c2])

max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train_sample['text'].values)
sequences = tokenizer.texts_to_sequences(train_sample['text'].values)
# Fix the label width so it always matches the Dense(num_classes) output layer
y = to_categorical(train_sample['sentiment'].values, num_classes=num_classes)

for max_length in max_length_dim:

    # Build model input: only the padding / truncation length varies
    X = pad_sequences(sequences, maxlen = max_length)

    # Derive X and y
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, random_state = 42)
    print(Xtrain.shape,ytrain.shape)
    print(Xtest.shape,ytest.shape)

    # Build NN Layers
    embed_dim = 32
    lstm_out = 100
    model = Sequential()
    # Keras 2 ignores a `dropout` argument on Embedding (it only warns), so it
    # is not passed here; add a SpatialDropout1D layer after the embedding if
    # input dropout is wanted.
    model.add(Embedding(max_features,
                        embed_dim,
                        input_length = X.shape[1]))
    # `dropout` / `recurrent_dropout` are the Keras 2 names of the legacy
    # `dropout_W` / `dropout_U` arguments.
    model.add(LSTM(lstm_out,
                   dropout=0.2,
                   recurrent_dropout=0.2))
    model.add(Dense(num_classes,
                    activation='softmax'))
    model.compile(loss = 'categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', f1])
    model.summary()  # prints by itself; wrapping it in print() also prints "None"

    # Training: the first batch_size rows of the training split are held out
    # as a (very small) validation set
    batch_size = 8
    num_epochs = 5
    Xvalid, yvalid = Xtrain[:batch_size], ytrain[:batch_size]
    Xtrain1, ytrain1 = Xtrain[batch_size:], ytrain[batch_size:]
    model.fit(Xtrain1,
              ytrain1,
              validation_data=(Xvalid, yvalid),
              batch_size=batch_size,
              epochs=num_epochs,
              verbose = 0,
              callbacks = [TQDMNotebookCallback()])

    # Visualizing
    scores = model.evaluate(Xtest, ytest, verbose=0)
    ypred = model.predict(Xtest)
    # One-hot encode the arg-max class on the array itself. Writing into the
    # rows yielded by DataFrame.iterrows() is not guaranteed to update the
    # frame, which could leave raw probabilities for f1_score below.
    ypred_onehot = np.zeros_like(ypred)
    ypred_onehot[np.arange(len(ypred)), ypred.argmax(axis=1)] = 1
    ypred_df = pd.DataFrame(ypred_onehot)
    print(ypred_df.head())
    # Number of test samples predicted for each class
    plt.bar(x = [0,1,2],
            height = [ypred_df[label].sum() for label in (0, 1, 2)])
    plt.xticks([0,1,2])
    plt.show()

    # Evaluating
    f1_s = f1_score(ytest, ypred_df, average = 'macro')
    f1_grid[max_length] = f1_s
    history_grid[max_length] = model.history
    print("F1-SCORE for max length of %d is %f"  % (max_length, f1_s))

f1_by_max_length = pd.DataFrame.from_dict(f1_grid, orient = 'index')
f1_by_max_length.plot(legend = None)
plt.title('F1_score across maximum length')
plt.xlabel('Maximum length')
plt.ylabel('F1 Score')
plt.show()

filename = 'f1_vs_maxlen_250samplesize_5epoch_8batchsize.sav'
f1_by_max_length.to_pickle(os.path.join(model_dir, 'rnn', filename))

# Plot previously saved results (presumably from a sample-size search that is
# not part of this section -- TODO confirm the file's origin).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
filename = 'f1_vs_samplesizes_50maxlen_20epoch.sav'
with open(os.path.join(model_dir, 'rnn', filename), 'rb') as saved_results:
    gs = pickle.load(saved_results)
gs.plot()

## Number of epochs grid search

In [None]:
# Grid search over the number of training epochs at a fixed sequence length.
# For every candidate epoch count a fresh model is trained on the same small
# class-balanced sample and its macro F1 on the held-out split is recorded.
num_epoch_dim = [5,10,15,20,25]
max_length = 90
sample_size = 250
num_classes = 3
f1_grid = {}
history_grid = {}

# Nothing in the data pipeline depends on the epoch count, so the sample,
# the model input and the splits are built once instead of on every iteration.

# Get samples
c0 = train_processed[train_processed.sentiment == 0][0:sample_size]
c1 = train_processed[train_processed.sentiment == 1][0:sample_size]
c2 = train_processed[train_processed.sentiment == 2][0:sample_size]
train_sample = pd.concat([c0, c1, c2])

# Build model input
max_features = 2000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(train_sample['text'].values)

X = tokenizer.texts_to_sequences(train_sample['text'].values)
X = pad_sequences(X, maxlen = max_length)
# Fix the label width so it always matches the Dense(num_classes) output layer
y = to_categorical(train_sample['sentiment'].values, num_classes=num_classes)

# Derive X and y
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y, random_state = 42)
print(Xtrain.shape,ytrain.shape)
print(Xtest.shape,ytest.shape)

# The first batch_size rows of the training split are held out as a
# (very small) validation set
batch_size = 8
Xvalid, yvalid = Xtrain[:batch_size], ytrain[:batch_size]
Xtrain1, ytrain1 = Xtrain[batch_size:], ytrain[batch_size:]

for num_epoch in num_epoch_dim:

    # Build NN Layers
    embed_dim = 32
    lstm_out = 100
    model = Sequential()
    # Keras 2 ignores a `dropout` argument on Embedding (it only warns), so it
    # is not passed here; add a SpatialDropout1D layer after the embedding if
    # input dropout is wanted.
    model.add(Embedding(max_features,
                        embed_dim,
                        input_length = X.shape[1]))
    # `dropout` / `recurrent_dropout` are the Keras 2 names of the legacy
    # `dropout_W` / `dropout_U` arguments.
    model.add(LSTM(lstm_out,
                   dropout=0.2,
                   recurrent_dropout=0.2))
    model.add(Dense(num_classes,
                    activation='softmax'))
    model.compile(loss = 'categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', f1])
    model.summary()  # prints by itself; wrapping it in print() also prints "None"

    # Training
    num_epochs = num_epoch
    model.fit(Xtrain1,
              ytrain1,
              validation_data=(Xvalid, yvalid),
              batch_size=batch_size,
              epochs=num_epochs,
              verbose = 0,
              callbacks = [TQDMNotebookCallback()])

    # Visualizing
    scores = model.evaluate(Xtest, ytest, verbose=0)
    ypred = model.predict(Xtest)
    # One-hot encode the arg-max class on the array itself. Writing into the
    # rows yielded by DataFrame.iterrows() is not guaranteed to update the
    # frame, which could leave raw probabilities for f1_score below.
    ypred_onehot = np.zeros_like(ypred)
    ypred_onehot[np.arange(len(ypred)), ypred.argmax(axis=1)] = 1
    ypred_df = pd.DataFrame(ypred_onehot)
    print(ypred_df.head())
    # Number of test samples predicted for each class
    plt.bar(x = [0,1,2],
            height = [ypred_df[label].sum() for label in (0, 1, 2)])
    plt.xticks([0,1,2])
    plt.show()

    # Evaluating
    f1_s = f1_score(ytest, ypred_df, average = 'macro')
    f1_grid[num_epoch] = f1_s
    history_grid[num_epoch] = model.history
    print("F1-SCORE for num epoch of %d is %f"  % (num_epoch, f1_s))

In [None]:
# Plot and persist the F1 scores held in f1_grid. This cell expects f1_grid to
# contain the results of the number-of-epochs search (keys = epoch counts),
# so run it right after that search.
f1_by_epoch = pd.DataFrame.from_dict(f1_grid, orient = 'index')
f1_by_epoch.plot(legend = None)
plt.title('F1_score across number of epochs')
plt.xlabel('Number of epochs')
plt.ylabel('F1 Score')
plt.show()

filename = 'f1_vs_nepoch_90maxlen_250samplesize.sav'
f1_by_epoch.to_pickle(os.path.join(model_dir, 'rnn', filename))