In [1]:
%cd drive/MyDrive/Github/Natural-Language-Processing/Sentiment\ Analysis/IMDB movie review sentiment/

/content/drive/MyDrive/Github/Natural-Language-Processing/Sentiment Analysis/IMDB movie review sentiment


## Importing necessary modules

In [2]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Loading dataset

In [3]:
# Read the three pre-split CSV files from the current working directory.
train, valid, test = (
    pd.read_csv(csv_name) for csv_name in ("Train.csv", "Valid.csv", "Test.csv")
)

# Report the number of rows in each split, then preview the training frame.
print('train size:', train.shape[0])
print('valid size:', valid.shape[0])
print('test size:', test.shape[0])
train.head()

train size: 40000
valid size: 5000
test size: 5000


Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## Data preprocessing

In [4]:
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Raw string literal: the character class contains regex escapes (\[ \] \|)
# that are not valid Python string escapes, so a plain '...' literal triggers
# an invalid-escape warning on recent interpreters. The compiled pattern is
# unchanged.
unwanted_symbols = re.compile(r'[/{}\[\]\|@,;]')


def review_tokenizer(review):
    """Convert a raw review string into a list of stemmed tokens.

    Steps, in order: lower-case the text, replace every character matched by
    ``unwanted_symbols`` with a space, split with NLTK's ``word_tokenize``,
    then stem each token with the module-level Porter ``stemmer``.

    Args:
        review: The review text as a ``str``.

    Returns:
        A list of stemmed token strings (empty if the text yields no tokens).
    """
    review = review.lower()
    review = unwanted_symbols.sub(' ', review)  # unwanted symbols -> space
    tokenized_review = word_tokenize(review)
    return [stemmer.stem(word) for word in tokenized_review]

## Tokenizing dataset

In [5]:
# Tokenize every (whitespace-stripped) review of each split and collect the
# matching labels as plain lists.
X_train = [review_tokenizer(text.strip()) for text in train.text]
y_train = list(train.label)

X_valid = [review_tokenizer(text.strip()) for text in valid.text]
y_valid = list(valid.label)

X_test = [review_tokenizer(text.strip()) for text in test.text]
y_test = list(test.label)

## Creating vocabulary

In [6]:
# Reserved entries occupy the first three ids. Id 0 is the padding id (the
# batching code pads sequences with 0) and '<UNK>' is the fallback used for
# out-of-vocabulary words. '</e>' is reserved but not referenced elsewhere in
# this notebook as far as visible.
# Fix: the padding key was misspelled '<PAD' (missing the closing '>').
vocab = {'<PAD>': 0, '</e>': 1, '<UNK>': 2}

# Assign the next free id to every token seen in the training split only, so
# validation/test words that never occur in training map to '<UNK>'.
for review in X_train:
    for word in review:
        if word not in vocab:
            vocab[word] = len(vocab)

print("vocab size",len(vocab))

vocab size 115487


## Converting tokenized data to lists of vocabulary indices

In [140]:
def review_to_indices(review, vocab, unk_token='<UNK>'):
    """Map each token of a tokenized review to its vocabulary id.

    Args:
        review: Iterable of token strings.
        vocab: Mapping from token string to integer id; must contain
            ``unk_token``.
        unk_token: Key whose id replaces tokens that are missing from
            ``vocab``.

    Returns:
        A list of ids, one per token, in the original order.

    Raises:
        KeyError: If ``unk_token`` itself is not a key of ``vocab``.
    """
    # Looked up eagerly so a missing unk_token fails even for an empty review.
    fallback_id = vocab[unk_token]
    return [vocab.get(token, fallback_id) for token in review]

def data_to_tensor(data):
    """Convert a collection of tokenized reviews into lists of vocabulary ids.

    Uses the module-level ``vocab`` and maps unknown tokens to ``'<UNK>'``.
    Despite the name, the result is a plain list of Python lists (one per
    review), not a tensor.
    """
    return [
        review_to_indices(tokens, vocab, unk_token='<UNK>')
        for tokens in data
    ]


In [141]:
# Encode each tokenized split as lists of vocabulary ids.
X_train_indices, X_valid_indices, X_test_indices = (
    data_to_tensor(split) for split in (X_train, X_valid, X_test)
)


## Converting targets to one-hot vectors

In [84]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the binary labels: class 0 = negative, class 1 = positive.
y_train_hot, y_valid_hot, y_test_hot = (
    to_categorical(labels, num_classes=2)
    for labels in (y_train, y_valid, y_test)
)

## Bucketing dataset by length

In [86]:
import tensorflow as tf

# Length key used when bucketing examples
def element_length_fn(x, y):
    """Return the size of ``x`` along its first dimension; ``y`` is unused."""
    first_dim = tf.shape(x)[0]
    return first_dim

# Builds the bucketed, padded tf.data pipeline for one split
def data_generator(inputs, targets):
    """Wrap paired inputs/targets in a length-bucketed ``tf.data.Dataset``.

    Args:
        inputs: Sequence of variable-length lists of token ids.
        targets: Sequence of length-2 target vectors aligned with ``inputs``.

    Returns:
        A dataset yielding padded ``(inputs, targets)`` batches, where examples
        are grouped by ``element_length_fn`` into the buckets defined below.
    """
    def pair_stream():
        # Re-created on every iteration of the dataset, so it can be replayed.
        for x, y in zip(inputs, targets):
            yield x, y

    # Upper length limits of the buckets; there is one more batch size than
    # boundaries because the final bucket holds everything past the last one.
    length_boundaries = [32, 64, 128, 256, 512, 1024, 2048]
    batch_size_per_bucket = [512, 512, 256, 128, 128, 64, 32, 16]

    examples = tf.data.Dataset.from_generator(
        generator=pair_stream,
        output_shapes=([None], [2,]),
        output_types=(tf.int32, tf.int32),
    )

    return examples.bucket_by_sequence_length(
        element_length_func=element_length_fn,
        bucket_batch_sizes=batch_size_per_bucket,
        bucket_boundaries=length_boundaries,
        padding_values=(0, 0),  # pad both components with 0
    )


In [88]:
# Build one bucketed dataset per split from the encoded reviews and one-hot labels.
train_generator, valid_generator, test_generator = (
    data_generator(features, labels)
    for features, labels in (
        (X_train_indices, y_train_hot),
        (X_valid_indices, y_valid_hot),
        (X_test_indices, y_test_hot),
    )
)

## Defining model

In [101]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, Lambda
from tensorflow.keras import backend as K

In [130]:
embedding_dim = 64
vocab_size = len(vocab)+1


def mean(x):
    """Average ``x`` over axis 1 (the sequence axis of the embedded batch)."""
    return K.mean(x, axis=1)


# Embedding -> per-token Dense -> mean over tokens -> 2-way softmax.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Dense(32, activation='relu'),
    Lambda(mean),
    Dense(2, activation='softmax'),  # one unit per class: negative, positive
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_23"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_23 (Embedding)     (None, None, 64)          7391232   
_________________________________________________________________
dense_31 (Dense)             (None, None, 32)          2080      
_________________________________________________________________
lambda_7 (Lambda)            (None, 32)                0         
_________________________________________________________________
dense_32 (Dense)             (None, 2)                 66        
Total params: 7,393,378
Trainable params: 7,393,378
Non-trainable params: 0
_________________________________________________________________


## Training

In [131]:
# Train for five passes over the bucketed training set, validating after each.
model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=5,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f97cf8d9f90>

## Evaluating model on test set

In [135]:
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(test_generator)
loss, accuracy = test_metrics
print("test loss:", loss, "\n test accuracy:", accuracy)

test loss: 0.2691759467124939 
test accuracy: 0.90420001745224


## Predicting custom review sentiment

In [155]:
def predict_review(review, model):
    """Print the predicted sentiment of a single raw review.

    The review is tokenized with ``review_tokenizer``, encoded with the
    module-level ``vocab``, wrapped into a batch of one and passed to
    ``model.predict``. The class with the larger score decides the label.

    Args:
        review: Raw review text.
        model: Trained model whose ``predict`` returns one row of two class
            scores (index 0 = negative, index 1 = positive).

    Raises:
        ValueError: If the review produces no tokens. Previously such input
            was sent to the model as a zero-length sequence and ended up
            reported as "Negative review".
    """
    tokenized_review = review_tokenizer(review)
    if not tokenized_review:
        # Mean-pooling over zero tokens is undefined; fail loudly instead.
        raise ValueError("review contains no tokens to classify")

    review_indices = review_to_indices(tokenized_review, vocab)
    # Add a leading batch axis: shape (sequence_length,) -> (1, sequence_length).
    batch = np.expand_dims(np.array(review_indices), axis=0)
    pred = model.predict(batch)

    is_positive = pred[0][1] > pred[0][0]
    if is_positive:
        print("Positive review")
    else:
        print("Negative review")

In [156]:
# Hand-written sample review (about the film Inception) used to try the
# trained classifier on text that is not part of the dataset.
review = "Wow! That's my reaction to pretty much the entire film that writer-director Christopher Nolan has made called Inception. Since this is the first movie I've seen of his that isn't part of the Batman series (correction, I did see The Prestige and review it back in 2006), I just marvel at how creatively compelling his work truly is when he literally uses his imagination to the fullest of his ability. To make a more apt comparison: What Lost is to television, Inception is to the movies-the ability to truly take you to places you've never been before. Plus, what a great cast: Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen Page, Tom Hardy, Ken Watanabe, Dileep Rao, Cillian Murphy, Tom Berenger, Marion Cotillard, Pete Postlethwaite, and Michael Caine among others. All deserve the kudos they're getting and more. No wonder it's been No. 1 at the box office these past three weeks! This movie isn't very easy to describe so I won't even try. I'll just highly recommend this to anyone who wants their minds challenged to the fullest extent of their ability and just leave it at that."
# Prints either "Positive review" or "Negative review" for the sample text.
predict_review(review, model)

Positive review
