In [94]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.callbacks import ReduceLROnPlateau

In [3]:
df = pd.read_csv('train.csv', sep = ',', header = 0)

In [5]:
df = df[['title', 'label']]

In [7]:
df.isnull().sum().sum()

558

In [8]:
df = df.dropna(axis = 0)

In [11]:
df = df.reset_index(drop = True)

In [64]:
X = df['title']
y = df['label']

In [17]:
def text_preprocess(text):
    review = re.sub('/s+', ' ', text)
    review = re.sub('/d', '', review)
    review = review.lower()
    review = re.sub(',', '', review)
    
    ps = PorterStemmer()
    tokens = nltk.word_tokenize(review)
    tokens = [ps.stem(word) for word in tokens if word not in stopwords.words('english')]

    return ' '.join(tokens)

In [20]:
X = X.apply(text_preprocess)

In [25]:
vocab_size = 5000
sent_len = 20
features_num = 10

In [44]:
def vectorizer(text):
    
    one_hot_rep = [one_hot(sent, vocab_size) for sent in text]
    result = pad_sequences(one_hot_rep, maxlen = sent_len, padding = 'pre')
    return result

In [45]:
one_hot_rep = vectorizer(X)

In [129]:
y = np.array(y)

In [130]:
X_train, X_test, y_train, y_test = train_test_split(one_hot_rep, y, test_size = 0.2, random_state = 0)

In [155]:
def model_LSTM(X_train, y_train, X_test, y_test):
    
    X_train = X_train.reshape(-1,20,1)
    X_test = X_test.reshape(-1,20,1)
    
    reducer = ReduceLROnPlateau(monitor = 'accuracy', factor = 0.2, patience = 2, min_lr = 0.0001)
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, output_dim = features_num, input_length = sent_len)) 
    model.add(LSTM(units = 100, return_sequences = True)) 
    model.add(Dropout(rate = 0.1))
    model.add(LSTM(units = 50))
    model.add(Dropout(rate = 0.1))
    model.add(Dense(units = 1, activation = 'sigmoid'))
    
    model.compile(optimizer = Adam(lr = 0.01), loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(X_train, y_train, validation_data = (X_test, y_test),
              batch_size = 64, epochs = 8, callbacks = [reducer])
    
    return model
    
              

In [156]:
LSTM_model = model_LSTM(X_train, y_train, X_test, y_test)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [161]:
def Bidirectional_LSTM(X_train, y_train, X_test, y_test):
    
    X_train = X_train.reshape(-1,20,1)
    X_test = X_test.reshape(-1,20,1)
    
    reducer = ReduceLROnPlateau(monitor = 'accuracy', factor = 0.2, patience = 2, verbose = 1,
                               mode = 'auto', min_lr = 0.0001)
    
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, output_dim = features_num, input_length = sent_len))
    model.add(Bidirectional(LSTM(units = 100, return_sequences = True)))
    model.add(Dropout(rate = 0.1))
    model.add(Bidirectional(LSTM(units = 50)))
    model.add(Dropout(rate = 0.1))
    model.add(Dense(units = 1, activation = 'sigmoid'))
    
    model.compile(optimizer = Adam(lr = 0.01), loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(X_train, y_train, validation_data = (X_test, y_test),
              batch_size = 64, epochs = 8, callbacks = [reducer])
    
    return model

In [162]:
model_Bid_LSTM = Bidirectional_LSTM(X_train, y_train, X_test, y_test)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [188]:
def validation(X_train, y_train, X_test, y_test, model, model_name):
    data = {'train': [X_train, y_train], 'test': [X_test, y_test]}
    
    for key in data.keys():
        y_pred = model.predict(data[key][0])
        
        for i in range(len(y_pred)):
            if y_pred[i] < 0.5:
                y_pred[i] = 0
            else:
                y_pred[i] = 1
                
        accuracy = accuracy_score(data[key][1], y_pred) * 100
        recall = recall_score(data[key][1], y_pred) * 100
        precision = precision_score(data[key][1], y_pred) * 100
        
        print('______________________{0}______________________'.format(model_name))
        print('Accuracy for {0} set = {1}'.format(key, round(accuracy, ndigits = 2)))
        print('Recall for {0} set = {1}'.format(key, round(recall, ndigits = 2)))
        print('Precision for {0} set = {1}'.format(key, round(precision, ndigits = 2)))

In [189]:
validation(X_train, y_train, X_test, y_test, LSTM_model, 'LSTM_model')
validation(X_train, y_train, X_test, y_test, model_Bid_LSTM, 'model_Bid_LSTM')

______________________LSTM_model______________________
Accuracy for train set = 99.9
Recall for train set = 99.91
Precision for train set = 99.87
______________________LSTM_model______________________
Accuracy for test set = 93.23
Recall for test set = 94.5
Precision for test set = 91.9
______________________model_Bid_LSTM______________________
Accuracy for train set = 99.72
Recall for train set = 99.72
Precision for train set = 99.7
______________________model_Bid_LSTM______________________
Accuracy for test set = 93.01
Recall for test set = 92.99
Precision for test set = 92.75
