In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
import tensorflow as tf
import pandas as pd
import numpy as np
import re

In [2]:
reviews_df=pd.read_csv('drive/My Drive/IMDB Dataset.csv')
reviews = np.array(reviews_df['review'])
sentiment = np.array(reviews_df['sentiment'])
print(reviews[:3])

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

### **Text Normalization & Preprocessing**

In [3]:
import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer


stops = set(stopwords.words("english"))
lemma = WordNetLemmatizer()


def preprocess_review(review_list):
    """
    param: a list of string
    return: a list of string
    preprocess reviews by changing all alphabets to lowercase, removing punctuations, and cleaning html leftover code
    """
    for i, review in enumerate(review_list):
        review = review.replace('<br /><br />', ' ')
        review = review.lower()
        review = re.sub(r"[^A-Za-z0-9' ]+", '', review)
        review = re.sub(r'[^\w\s]', '', review)  # remove punctuations
        review = review.replace('  ', ' ')
        review_list[i] = review
    return review_list


def remove_stops(string_list):
  """
  param: a list of strings
  return: a list of strings without stopwords
  remove all stopwords from a list
  """
  # tokenize words and lemmatize some words
  for i, string in enumerate(string_list):
    pre_filtered = word_tokenize(string)
    filtered = [lemma.lemmatize(word, pos = "v") for word in pre_filtered]
    filtered = [lemma.lemmatize(word, pos = "n") for word in filtered]
    filtered = [word for word in filtered if not word in stops]
    filtered_string = ' '.join(filtered)
    string_list[i] = filtered_string 
  return string_list


preprocessed_reviews = preprocess_review(reviews)
preprocessed_reviews = remove_stops(preprocessed_reviews)

In [4]:
print(preprocessed_reviews[:3])

['one reviewer mention watch 1 oz episode youll hook right exactly happen first thing strike oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word call oz nickname give oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home manyaryans muslim gangsta latino christian italian irish moreso scuffle death star dodgy deal shady agreement never far away would say main appeal show due fact go show wouldnt dare forget pretty picture paint mainstream audience forget charm forget romanceoz doesnt mess around first episode ever saw strike nasty surreal couldnt say ready watch develop taste oz get accustom high level graphic violence violence injustice crook guard wholl sell nickel inmate wholl kill order get away well mannered middle class inmate turn prison bitch due lack street skill prison ex

### **Building a Sequential Model**


In [5]:
# to divide train & test sets
test_sample_size = int(0.1*len(preprocessed_reviews))  # 10% of data as the validation set

# for sentiment
sentiment = [1 if x=='positive' else 0 for x in sentiment]

# separate data to train & test sets
X_test, X_train = (np.array(preprocessed_reviews[:test_sample_size]), 
                   np.array(preprocessed_reviews[test_sample_size:])
)

y_test, y_train = (np.array(sentiment[:test_sample_size]), 
                   np.array(sentiment[test_sample_size:])
)


tokenizer = Tokenizer(oov_token='<OOV>')  # for the unknown words
tokenizer.fit_on_texts(X_train)

vocab_count = len(tokenizer.word_index) + 1  # +1 is for padding

# create padded sequences
training_sequences = tokenizer.texts_to_sequences(X_train)  # tokenizer.word_index to see indexes
training_padded = pad_sequences(training_sequences, padding='post')  # pad sequences with 0s

testing_sequences = tokenizer.texts_to_sequences(X_test)  # tokenizer.word_index to see indexes
testing_padded = pad_sequences(testing_sequences, padding='post')  # pad sequences with 0s

input_length = len(testing_padded[0])  # length of all sequences

In [6]:
# build a model
model = keras.models.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_count,
                                 output_dim=4,
                                 input_length=input_length,
                                 mask_zero = True)
)
model.add(keras.layers.GlobalAveragePooling1D())  # find the average of vectors to get sentiment
model.add(keras.layers.Dense(128, activation='relu'))  # hidden layer
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(1, activation='sigmoid'))  # output layer

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 921, 4)            546404    
_________________________________________________________________
global_average_pooling1d (Gl (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 128)               640       
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 547,173
Trainable params: 547,173
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
model.fit(training_padded, y_train, epochs=6, batch_size=512,
          validation_data=(testing_padded, y_test)
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<tensorflow.python.keras.callbacks.History at 0x7f40f939e910>

In [8]:
y_prediction = model.predict(testing_padded)
y_prediction = np.array([0 if x <= 0.5 else 1 for x in y_prediction])
index_list = []
for i, (y_real, y_predict) in enumerate(zip(y_test, y_prediction)):
  if y_real != y_predict:
    index_list.append(i)
print(f'Predicted {len(index_list)} wrong of {len(y_prediction)}')

Predicted 476 wrong of 5000


In [9]:
for i, item in enumerate(index_list):
    if i <= 5:
        print(X_test[item])
        print(f'Prediction: {y_prediction[item]}')
        print(f'Sentiment: {y_test[item]}')
        print('============================')

sure would like see resurrection date seahunt series tech today would bring back kid excitement mei grow black white tv seahunt gunsmoke hero every weekyou vote comeback new sea huntwe need change pace tv would work world water adventureoh way thank outlet like view many viewpoint tv many moviesso ole way believe ive get wan na saywould nice read plus point sea huntif rhyme would 10 line would let submitor leave doubt quitif must go let
Prediction: 0
Sentiment: 1
cast play shakespeare shakespeare lose appreciate try bring shakespeare mass ruin something good scottish play favorite shakespeare know know certain rev bowdler hence bowdlerization try something similar victorian era word improve perfection write write least ten line text english composition never forte keep go say movie say go cut
Prediction: 1
Sentiment: 0
film simply remake one bad film fail capture flavor terror 1963 film title liam neeson excellent always cast hold exception owen wilson bring right feel character luke m

The model made good assumptions for most of them, but it got easily confused by noise


### **Trying a different model: LSTM**


In [10]:
# y_test = [0 if x == 'negative' else 1 for x in y_test]

# the model takes numpy arrays as inputs
y_test = np.asarray(y_test).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)

for i, (train_item, test_item) in enumerate(zip(training_padded, testing_padded)):
    training_padded[i] = np.asarray(train_item).astype(np.float32)
    testing_padded[i] = np.asarray(test_item).astype(np.float32)

In [11]:
from tensorflow.keras.layers import SpatialDropout1D, Embedding, LSTM, Bidirectional, Dense, Dropout

new_model = keras.models.Sequential()
new_model.add(Embedding(input_dim=vocab_count,
                        output_dim=4,
                        input_length=input_length))
new_model.add(Bidirectional(tf.keras.layers.LSTM(32, dropout=0.2)))
new_model.add(Dense(32, activation='relu'))
new_model.add(Dropout(0.2))
new_model.add(Dense(1, activation='sigmoid'))
new_model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

print(new_model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 921, 4)            546404    
_________________________________________________________________
bidirectional (Bidirectional (None, 64)                9472      
_________________________________________________________________
dense_2 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 557,989
Trainable params: 557,989
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
new_model.fit(training_padded, y_train, epochs=5, batch_size=512,  # after 4 epochs, it starts overfitting
              validation_data=(testing_padded, y_test)
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f40ac1cda10>

In [13]:
new_index_list = []
new_predictions = new_model.predict(testing_padded)
new_predictions = ['negative' if x <= 0.5 else 'positive' for x in new_predictions]
y_test = ['negative' if x <= 0.5 else 'positive' for x in y_test]
for i, (prediction, truth) in enumerate(zip(new_predictions, y_test)):
    if prediction != truth:
        new_index_list.append(i)

print(f'The model has predicted {len(new_index_list)} wrong of {len(new_predictions)}')

The model has predicted 521 wrong of 5000


Our previous model has performed better.

I would like to reduce validation loss even more, but this is the furthest I could achieve