## Introduction
This notebook will build a simple RNN model using keras to solve classification sentiment problem for movie reviews.

### Import libraries,  import custom scripts and define constants  

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, SimpleRNN, SpatialDropout1D, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow.keras.backend as K

from sklearn.model_selection import train_test_split

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import re


##### **Mount** the google drive.

In [20]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import all our functions
import os,sys
sys.path.insert(0,'/content/drive/My Drive/Colab Notebooks/ml_training/app_predict/')
from src import preprocessing


In [None]:
#definition constants
RANDOM_STATE = 11
TEST_SIZE = 0.15


###  Loading the data and applying the preprocessing

In [25]:
# import & display data
data = pd.read_csv('/content/drive/My Drive/Colab Notebooks/ml_training/data/IMDB_Dataset.csv')
data['sentiment'] = data['sentiment'].replace({'positive' : 1, 'negative' : 0})
data = data.drop_duplicates()
data['review'] = data['review'].apply(lambda x: preprocessing.preprocessing_text(x))
data.head()


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming t...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


Split the data for the training, the testing and the validation datasets

In [None]:
X = data.review
y = data.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=TEST_SIZE, 
                                                    random_state=RANDOM_STATE, 
                                                    stratify = y)

X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                    y_train,
                                                    test_size=TEST_SIZE, 
                                                    random_state=RANDOM_STATE, 
                                                    stratify = y_train)

### Preprocessing Data

In [None]:
MAX_FEATURES = 50000
MAX_LEN = 500


tokenizer = Tokenizer(num_words=MAX_FEATURES, oov_token='unk')

# only fit on train
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_val_pad = pad_sequences(X_val_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

### Create model
For this step let's try to use the keras Functional API

In [None]:
#function for f1 metric
def get_f1(y_true, y_pred): 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [30]:
EMBED_DIM = 512
RNN_OUT = 32
BATCH_SIZE = 32
EPOCHS = 50

np.random.seed(0)


main_input = Input(shape=(MAX_LEN,), dtype='int32', name='main_input')

x = Embedding(output_dim=EMBED_DIM, input_dim=MAX_FEATURES, input_length=MAX_LEN)(main_input)
x = SpatialDropout1D(0.3)(x)
x = SimpleRNN(RNN_OUT)(x)


# We stack a deep densely-connected network on top
x = Dense(256, activation='relu')(x)


main_output = Dense(1, activation='sigmoid', name='main_output')(x)

model = Model(inputs=[main_input], outputs=[main_output])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[get_f1])

early_stop = EarlyStopping(monitor='val_loss', 
                           patience=3)
cp_callback = ModelCheckpoint(filepath='model_rnn.hdf5',
                              save_best_only=True,
                              verbose=1)
callbacks = [cp_callback, early_stop]

model.fit(X_train_pad, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,  
          verbose=2,                   
          callbacks=callbacks,                    
          validation_data=(X_val_pad, y_val),                     
          shuffle=True)


Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.49410, saving model to model_rnn.hdf5
1120/1120 - 921s - loss: 0.5268 - get_f1: 0.7248 - val_loss: 0.4941 - val_get_f1: 0.7611
Epoch 2/50

Epoch 00002: val_loss improved from 0.49410 to 0.43275, saving model to model_rnn.hdf5
1120/1120 - 978s - loss: 0.4024 - get_f1: 0.8197 - val_loss: 0.4328 - val_get_f1: 0.7821
Epoch 3/50

Epoch 00003: val_loss did not improve from 0.43275
1120/1120 - 891s - loss: 0.3975 - get_f1: 0.8204 - val_loss: 0.5430 - val_get_f1: 0.7318
Epoch 4/50

Epoch 00004: val_loss did not improve from 0.43275
1120/1120 - 858s - loss: 0.2412 - get_f1: 0.9012 - val_loss: 0.5568 - val_get_f1: 0.7634
Epoch 5/50

Epoch 00005: val_loss did not improve from 0.43275
1120/1120 - 831s - loss: 0.1502 - get_f1: 0.9418 - val_loss: 0.5923 - val_get_f1: 0.7838


<tensorflow.python.keras.callbacks.History at 0x7ff1547d8710>

In [33]:
np.random.seed(0)


main_input = Input(shape=(MAX_LEN,), dtype='int32', name='main_input')

x = Embedding(output_dim=EMBED_DIM, input_dim=MAX_FEATURES, input_length=MAX_LEN)(main_input)
x = SpatialDropout1D(0.3)(x)
x = SimpleRNN(RNN_OUT)(x)


# We stack a deep densely-connected network on top
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)

main_output = Dense(1, activation='sigmoid', name='main_output')(x)

model = Model(inputs=[main_input], outputs=[main_output])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[get_f1])

early_stop = EarlyStopping(monitor='val_loss', 
                           patience=3)
cp_callback = ModelCheckpoint(filepath='model_rnn.hdf5',
                              save_best_only=True,
                              verbose=1)
callbacks = [cp_callback, early_stop]

model.fit(X_train_pad, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,  
          verbose=2,                   
          callbacks=callbacks,                    
          validation_data=(X_val_pad, y_val),                     
          shuffle=True)


Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.51726, saving model to model_rnn.hdf5
1120/1120 - 854s - loss: 0.6190 - get_f1: 0.6282 - val_loss: 0.5173 - val_get_f1: 0.7284
Epoch 2/50

Epoch 00002: val_loss improved from 0.51726 to 0.41309, saving model to model_rnn.hdf5
1120/1120 - 866s - loss: 0.3944 - get_f1: 0.8267 - val_loss: 0.4131 - val_get_f1: 0.8209
Epoch 3/50

Epoch 00003: val_loss did not improve from 0.41309
1120/1120 - 860s - loss: 0.2170 - get_f1: 0.9126 - val_loss: 0.5261 - val_get_f1: 0.8155
Epoch 4/50

Epoch 00004: val_loss did not improve from 0.41309
1120/1120 - 867s - loss: 0.1126 - get_f1: 0.9582 - val_loss: 0.6042 - val_get_f1: 0.8079
Epoch 5/50

Epoch 00005: val_loss did not improve from 0.41309
1120/1120 - 858s - loss: 0.0798 - get_f1: 0.9719 - val_loss: 0.6682 - val_get_f1: 0.7933


<tensorflow.python.keras.callbacks.History at 0x7ff152da0278>