In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


### Introduction
This notebook will build a Bidirectional LSTM network using keras to solve classification sentiment problem for movie reviews.


Import **libraries**, import **custom scripts** and define **constants**

In [2]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout, Input, GlobalMaxPool1D 
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow.keras.backend as K

from sklearn.model_selection import train_test_split

import re


In [3]:
# import all our functions
import os,sys,inspect
currentdir=os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir=os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
from usr.lib.preprocessing import preprocessing


/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [4]:
#definition constants
RANDOM_STATE = 11
TEST_SIZE = 0.15


#### Loading the data and applying the preprocessing

In [5]:
# import & display data
data = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
data['sentiment'] = data['sentiment'].replace({'positive' : 1, 'negative' : 0})
data = data.drop_duplicates()
data['review'] = data['review'].apply(lambda x: preprocessing.preprocessing_text(x))
data.head()


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming t...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


#### Split the data for the training, the testing and the validation datasets

In [6]:
X = data.review
y = data.sentiment
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=TEST_SIZE, 
                                                    random_state=RANDOM_STATE, 
                                                    stratify = y)

X_train, X_val, y_train, y_val = train_test_split(X_train, 
                                                    y_train,
                                                    test_size=TEST_SIZE, 
                                                    random_state=RANDOM_STATE, 
                                                    stratify = y_train)


### Preprocessing Data

In [7]:
max_features = 20000
max_len = 500


tokenizer = Tokenizer(num_words=max_features, oov_token='unk')

# only fit on train
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)


### Let's define the function for building and training model

In [8]:
#function for f1 metric
def get_f1(y_true, y_pred): 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val


def build_model(embed_dim=128, lstm_out=64, dropout=0.5, optimizer='adam', activation='relu', units=32 ):
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=max_len))
    model.add(Bidirectional(LSTM(lstm_out)))
    model.add(Dropout(dropout))
    model.add(Dense(units, activation=activation))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer, 'binary_crossentropy', metrics=[get_f1])
    return model


def train_model(embed_dim=128, 
                lstm_out=64, 
                dropout=0.5, 
                optimizer='adam',
                batch_size=32,
                epochs=20,
                activation='relu', 
                units=32, 
                log_dir='logs/bilstm',
                filepath='model.hdf5'):
    tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
    early_stop = EarlyStopping(monitor='val_loss', 
                               patience=3)
    cp_callback = ModelCheckpoint(filepath=filepath,
                                  save_best_only=True,
                                  verbose=1)
    
    model = build_model(embed_dim, lstm_out, dropout, optimizer)
    model.summary()
    history = model.fit(X_train_pad, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=2, 
                        callbacks=[tensorboard_callback, early_stop, cp_callback],                    
                        validation_data=(X_val_pad, y_val))
    return history
    

    

### Train different models

Result of exploring let's save to Dataframe

In [9]:
models_research_df = pd.DataFrame(columns=['Model',
                                           'loss',
                                           'f1_score',
                                           'val_loss',
                                           'val_f1'], )
results = {}
# add data to result research dataframe
def add_data_df(table_df_for_add, model_name, history_research):
    index = history_research['val_loss'].index(min(history_research['val_loss']))
    model_research = (model_name, history_research['loss'][index], history_research['get_f1'][index], history_research['val_loss'][index], history_research['val_get_f1'][index])
    if any([list(row.values ) == list(model_research) for _, row in table_df_for_add.iterrows()]):
        return
    table_df_for_add.loc[len(table_df_for_add)] = model_research


### Model_1
The model with default params for this model with `dropout=0`

In [10]:
history_1 = train_model(filepath='Model_1_{epoch:02d}-{val_get_f1:.2f}.hdf5',
           log_dir='logs/bilstm/model_1',
           dropout=0)
results['model_1'] = history_1

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 128)          2560000   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               98816     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,662,977
Trainable params: 2,662,977
Non-trainable params: 0
______________________________________________

### Model_2
The model with `dropout=0.2`

In [11]:
history_2 = train_model(filepath='Model_2_{epoch:02d}-{val_get_f1:.2f}.hdf5',
           log_dir='logs/bilstm/model_2',
           dropout=0.2)
results['model_2'] = history_2

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 128)          2560000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 2,662,977
Trainable params: 2,662,977
Non-trainable params: 0
____________________________________________

### Model_3
The model with `dropout=0.2` and `activation=tanh` 

In [12]:
history_3 = train_model(filepath='Model_3_{epoch:02d}-{val_get_f1:.2f}.hdf5',
           log_dir='logs/bilstm/model_3', 
           dropout=0.2,
           activation='tanh')
results['model_3'] = history_3

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 128)          2560000   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 2,662,977
Trainable params: 2,662,977
Non-trainable params: 0
____________________________________________

### Model_4
The model with `dropout=0.2` and `activation=tanh` and `units=64`


In [13]:
history_4 = train_model(filepath='Model_4_{epoch:02d}-{val_get_f1:.2f}.hdf5',
                        log_dir='logs/bilstm/model_4',
                        dropout=0.2,
                        units=64,
                       activation='tanh')
results['model_4'] = history_4

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 128)          2560000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 2,662,977
Trainable params: 2,662,977
Non-trainable params: 0
____________________________________________

### Model_5
The model with 
* `dropout=0.2` 
* `activation=relu`
* `units=64`

In [14]:
history_5 = train_model(filepath='Model_5_{epoch:02d}-{val_get_f1:.2f}.hdf5',
                        log_dir='logs/bilstm/model_5',
                        dropout=0.2,
                        units=64)
results['model_5'] = history_5

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 128)          2560000   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 32)                4128      
_________________________________________________________________
dropout_9 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 33        
Total params: 2,662,977
Trainable params: 2,662,977
Non-trainable params: 0
____________________________________________

### Model_6
The model with 
* `dropout=0.2` 
* `activation=relu`
* `units=64`
* `lstm_out=128`

In [15]:
history_6 = train_model(filepath='Model_6_{epoch:02d}-{val_get_f1:.2f}.hdf5',
                        log_dir='logs/bilstm/model_6',
                        dropout=0.2,
                       units=64,
                       lstm_out=128)
results['model_6'] = history_6

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 128)          2560000   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 256)               263168    
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 32)                8224      
_________________________________________________________________
dropout_11 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 33        
Total params: 2,831,425
Trainable params: 2,831,425
Non-trainable params: 0
____________________________________________

### Model_7
The model with 
* `dropout=0.2` 
* `activation=tanh`
* `units=64`
* `lstm_out=128`

In [16]:
history_7 = train_model(filepath='Model_7_{epoch:02d}-{val_get_f1:.2f}.hdf5',
           log_dir='logs/bilstm/model_7',
           dropout=0.2,
                       units=64,
                       lstm_out=128,
                       activation='tanh')
results['model_7'] = history_7

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 128)          2560000   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 256)               263168    
_________________________________________________________________
dropout_12 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 32)                8224      
_________________________________________________________________
dropout_13 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 33        
Total params: 2,831,425
Trainable params: 2,831,425
Non-trainable params: 0
____________________________________________

### Result of exploring models

In [17]:
for result in results:
    add_data_df(models_research_df, result, results.get(result).history)

In [18]:
models_research_df

Unnamed: 0,Model,loss,f1_score,val_loss,val_f1
0,model_1,0.181231,0.930693,0.300055,0.893155
1,model_2,0.240558,0.897326,0.328663,0.873913
2,model_3,0.327967,0.860688,0.305403,0.872537
3,model_4,0.165357,0.939424,0.294052,0.877311
4,model_5,0.430557,0.79056,0.284407,0.883737
5,model_6,0.266093,0.896062,0.274701,0.894998
6,model_7,0.200573,0.922255,0.247807,0.89997
