In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


### Introduction
This notebook builds an RNN with a GRU layer, using Keras, to solve a sentiment classification problem on movie reviews.


Import **libraries**, import **custom scripts** and define **constants**

In [2]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Dropout, GRU
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard, ModelCheckpoint

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow.keras.backend as K

from sklearn.model_selection import train_test_split

import re


In [3]:
# Make the custom preprocessing helpers importable.
# The directory of the file reported for the currently executing frame is
# resolved to an absolute path, its parent directory is prepended to sys.path,
# and `preprocessing` is then imported from the `usr.lib.preprocessing` package.
# NOTE(review): inside a notebook, inspect.getfile() on the current frame may not
# name a real file, so `currentdir` presumably ends up being derived from the
# working directory — confirm on the target environment.
# NOTE(review): `usr.lib.preprocessing` is presumably a utility script located
# under `parentdir`; verify where it actually lives.
import os,sys,inspect
currentdir=os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir=os.path.dirname(currentdir)
sys.path.insert(0,parentdir)
from usr.lib.preprocessing import preprocessing


/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


In [4]:
# Constants shared by both train_test_split calls below.
TEST_SIZE = 0.15     # fraction of rows held out at each split
RANDOM_STATE = 11    # seed passed to train_test_split so the splits are repeatable


#### Loading the data and applying the preprocessing

In [5]:
# Read the raw reviews, encode the sentiment labels as 1/0, drop duplicated rows,
# then run the custom text preprocessing on every review. The last line displays
# the first rows of the resulting frame.
data = (
    pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
    .assign(sentiment=lambda frame: frame['sentiment'].replace({'positive' : 1, 'negative' : 0}))
    .drop_duplicates()
    .assign(review=lambda frame: frame['review'].apply(preprocessing.preprocessing_text))
)
data.head()


Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production. the filming t...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically there's a family where a little boy ...,0
4,"petter mattei's ""love in the time of money"" is...",1


#### Split the data into training, validation and test datasets

In [6]:
# Features are the review texts, labels are the encoded sentiment.
X = data['review']
y = data['sentiment']

# First hold out the test set, stratified on the label ...
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# ... then carve the validation set out of what is left, again stratified.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y_trainval,
)


### Preprocessing Data

In [7]:
# Vocabulary cap and fixed sequence length; both are also read by build_model
# when it creates the Embedding layer.
max_features = 50000
max_len = 500

# Fit the vocabulary on the training texts only, so nothing from the validation
# or test splits influences it. 'unk' is the out-of-vocabulary placeholder token.
tokenizer = Tokenizer(oov_token='unk', num_words=max_features)
tokenizer.fit_on_texts(X_train)

# Turn every split into lists of integer token ids.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)

# Bring every sequence to exactly max_len entries.
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len)
    for seqs in (X_train_seq, X_val_seq, X_test_seq)
)


In [8]:
def build_model(embed_dim=128, gru_out=64, dropout=0.5, optimizer='adam', units=64, activation='relu'):
    """Build and compile the GRU-based binary sentiment classifier.

    The network is: Embedding -> GRU -> Dropout -> Dense -> Dropout -> Dense(1, sigmoid).
    The vocabulary size and input length of the Embedding layer come from the
    notebook-level ``max_features`` and ``max_len``.

    Args:
        embed_dim: Size of the embedding vectors.
        gru_out: Number of units in the GRU layer.
        dropout: Dropout rate applied after the GRU layer and after the hidden
            Dense layer.
        optimizer: Optimizer passed to ``model.compile``.
        units: Number of units in the hidden Dense layer. Numeric strings such
            as ``'64'`` (the former default) are still accepted and converted
            to ``int``.
        activation: Activation of the hidden Dense layer.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, accuracy
        metric).
    """
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=max_len))
    model.add(GRU(gru_out))
    model.add(Dropout(dropout))
    # `activation` used to be ignored in favour of a hard-coded 'relu'; it is
    # now honoured. int() keeps callers that pass a numeric string working.
    model.add(Dense(int(units), activation=activation))
    model.add(Dropout(dropout))
    # Single sigmoid unit: the output is the probability of the positive class.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


def train_model(embed_dim=128, 
                gru_out=64, 
                dropout=0.5, 
                optimizer='adam', 
                epochs=5,
                batch_size=32):
    """Build the GRU classifier, fit it on the training split and return the result.

    The model is created with ``build_model``, its summary is printed, and it is
    fitted on the notebook-level ``X_train_pad`` / ``y_train`` while being
    validated on ``X_val_pad`` / ``y_val`` after every epoch.

    Args:
        embed_dim: Size of the embedding vectors (forwarded to ``build_model``).
        gru_out: Number of GRU units (forwarded to ``build_model``).
        dropout: Dropout rate (forwarded to ``build_model``).
        optimizer: Optimizer (forwarded to ``build_model``).
        epochs: Number of training epochs.
        batch_size: Mini-batch size used by ``model.fit``.

    Returns:
        tuple: ``(model, history)`` - the fitted model and the object returned
        by ``model.fit``.
    """
    model = build_model(embed_dim, gru_out, dropout, optimizer)
    model.summary()
    history = model.fit(X_train_pad, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=2,
                        validation_data=(X_val_pad, y_val))
    # Hand both objects back to the caller: previously they were dropped when
    # the function returned, so the trained model could not be evaluated or
    # reused and the training curves could not be inspected.
    return model, history
    

    

In [9]:
# Train with the default hyper-parameters from train_model's signature;
# train_model prints the model summary before fitting.
train_model()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 128)          6400000   
_________________________________________________________________
gru (GRU)                    (None, 64)                37248     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 6,441,473
Trainable params: 6,441,473
Non-trainable params: 0
______________________________________________