In [1]:
import numpy as np
import pandas as pd

## Data preparation and preprocessing

In [2]:
data = pd.read_csv('data/reviews.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data.iloc[3][0]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [4]:
data.shape

(50000, 2)

In [5]:
X = data['review']
y = data['sentiment']
X = X.str.lower()

##### We want to distribute reviews results to 2 classes (1 and 0) for positive and negative

In [6]:
y = y.replace(to_replace='positive', value=1)
y = y.replace(to_replace='negative', value=0)

In [7]:
y.head(7)

0    1
1    1
2    1
3    0
4    1
5    1
6    1
Name: sentiment, dtype: int64

In [8]:
vocab_size = 10000
oov_tok = "<OOV>"
max_length = 120 #the maximum length of review text

In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(X)
padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [10]:
tokenizer.vocab_size

AttributeError: 'Tokenizer' object has no attribute 'vocab_size'

### Now let's split our data to train and test without cross-validation

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded, y, test_size=0.2, random_state=42)
y_train, y_test = y_train.values, y_test.values

In [24]:
y_test.shape

(10000,)

### We are ready to train our model. Lets see how it will perform on validation.

In [14]:
#hyperparameters
embedding_dim = 100
num_epochs = 10

In [15]:
import tensorflow as tf

In [16]:
total_words = len(tokenizer.word_index)
total_words

124253

In [17]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size+1, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          640064    
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               66048     
_________________________________________________________________
dense (Dense)                (None, 64)                8256      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 714,433
Trainable params: 714,433
Non-trainable params: 0
_________________________________________________________________


In [25]:
history = model.fit(X_train, y_train, epochs=num_epochs)

Train on 40000 samples
Epoch 1/50
 2208/40000 [>.............................] - ETA: 6:47 - loss: 0.6924 - accuracy: 0.5290

KeyboardInterrupt: 