In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
import tensorflow as tf
import pandas as pd
import numpy as np
import re

In [2]:
reviews_df = pd.read_csv(r'C:\Users\taewoo\Desktop\Datasets\IMDB Dataset.csv')
reviews = np.array(reviews_df['review'])
sentiment = np.array(reviews_df['sentiment'])
print(reviews[:3])

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

In [3]:
def preprocess_review(review_list):
    """
    param: a list of string
    return: a list of string
    preprocess reviews by changing all alphabets to lowercase, removing punctuations, and cleaning html leftover code
    """
    for i, review in enumerate(review_list):
        review = review.replace('<br /><br />', ' ')
        review = review.lower()
        review = re.sub(r"[^A-Za-z0-9'. ]+", '', review)
        review = review.replace('  ', ' ')
        review_list[i] = review
    return review_list

preprocessed_reviews = preprocess_review(reviews)
print(preprocessed_reviews[:3])

["one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked. they are right as this is exactly what happened with me. the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go. trust me this is not a show for the faint hearted or timid. this show pulls no punches with regards to drugs sex or violence. its is hardcore in the classic use of the word. it is called oz as that is the nickname given to the oswald maximum security state penitentary. it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda. em city is home to many..aryans muslims gangstas latinos christians italians irish and more....so scuffles death stares dodgy dealings and shady agreements are never far away. i would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare

In [4]:
# to divide train & test sets
test_sample_size = int(0.1*len(preprocessed_reviews))  # 10% of data as the validation set

# for sentiment
sentiment = [1 if x=='positive' else 0 for x in sentiment]

# separate data to train & test sets
X_test, X_train = (np.array(preprocessed_reviews[:test_sample_size]), 
                   np.array(preprocessed_reviews[test_sample_size:])
)

y_test, y_train = (np.array(sentiment[:test_sample_size]), 
                   np.array(sentiment[test_sample_size:])
)

tokenizer = Tokenizer(oov_token='<OOV>')  # for the unknown words
tokenizer.fit_on_texts(X_train)

vocab_count = len(tokenizer.word_index) + 1  # +1 is for padding

# create padded sequences
training_sequences = tokenizer.texts_to_sequences(X_train)  # tokenizer.word_index to see indexes
training_padded = pad_sequences(training_sequences, padding='post')  # pad sequences with 0s 

testing_sequences = tokenizer.texts_to_sequences(X_test)  # tokenizer.word_index to see indexes
testing_padded = pad_sequences(testing_sequences, padding='post')  # pad sequences with 0s 

input_length = len(testing_padded[0])  # length of all sequences


# build a model
model = keras.models.Sequential()
model.add(keras.layers.Embedding(input_dim=vocab_count,
                                 output_dim=6,
                                 input_length=input_length))
model.add(keras.layers.GlobalAveragePooling1D())  # find the average of vectors to get sentiment
model.add(keras.layers.Dense(128, activation='relu'))  # hidden layer
model.add(keras.layers.Dense(16, activation='relu'))  # hidden layer
model.add(keras.layers.Dense(1, activation='sigmoid'))  # output layer

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1708, 6)           948912    
_________________________________________________________________
global_average_pooling1d (Gl (None, 6)                 0         
_________________________________________________________________
dense (Dense)                (None, 128)               896       
_________________________________________________________________
dense_1 (Dense)              (None, 16)                2064      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 951,889
Trainable params: 951,889
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
print(y_test[:5], y_train[:5])

['positive', 'positive', 'positive', 'negative', 'positive'] [1, 1, 1, 0, 1]


In [13]:
print(np.array(sentiment[:test_sample_size]))

[1 1 1 ... 1 0 0]


In [5]:
model.fit(training_padded, y_train, epochs=1, batch_size=512,
          validation_data=(testing_padded, y_test)
)



<tensorflow.python.keras.callbacks.History at 0x22b8da05460>

In [6]:
index_list = []
predictions = model.predict(testing_padded)
predictions = ['negative' if x <= 0.5 else 'positive' for x in predictions]
y_test = ['negative' if x <= 0.5 else 'positive' for x in y_test]
for i, (prediction, truth) in enumerate(zip(predictions, y_test)):
    if prediction != truth:
        index_list.append(i)

print(f'The model has predicted {len(index_list)} wrong')

The model has predicted 2532 wrong


Let's take a look at some of the faulty predictions

In [7]:
for i, item in enumerate(index_list):
    if i <= 5:
        print(X_test[item])
        print(f'Prediction: {predictions[item]}')
        print(f'Sentiment: {y_test[item]}')
        print('============================')

basically there's a family where a little boy jake thinks there's a zombie in his closet his parents are fighting all the time. this movie is slower than a soap opera... and suddenly jake decides to become rambo and kill the zombie. ok first of all when you're going to make a film you must decide if its a thriller or a drama as a drama the movie is watchable. parents are divorcing arguing like in real life. and then we have jake with his closet which totally ruins all the film i expected to see a boogeyman similar movie and instead i watched a drama with some meaningless thriller spots. 3 out of 10 just for the well playing parents descent dialogs. as for the shots with jake just ignore them.
Prediction: positive
Sentiment: negative
this show was an amazing fresh innovative idea in the 70's when it first aired. the first 7 or 8 years were brilliant but things dropped off after that. by 1990 the show was not really funny anymore and it's continued its decline further to the complete was

The model made good assumptions for most of them, but it got easily confused by words like 'good', 'bad', 'well', etc. 

### LSTM Model

In [8]:
new_model = keras.models.Sequential()
new_model.add(keras.layers.Embedding(input_dim=vocab_count,
                                 output_dim=6,
                                 input_length=input_length)
)
new_model.add(keras.layers.LSTM(16))
new_model.add(keras.layers.Dense(1, activation='sigmoid'))
new_model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy']
)

print(new_model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1708, 6)           948912    
_________________________________________________________________
lstm (LSTM)                  (None, 16)                1472      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 950,401
Trainable params: 950,401
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# y_train = list(y_train)

# new_model.fit(training_sequences, y_train, epochs=5, batch_size=256,
#               validation_data=(testing_sequences, y_test))

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {'(<class \'list\'> containing values of types {"<class \'int\'>"})'}), (<class 'list'> containing values of types {"<class 'numpy.int32'>"})

In [10]:
print(type(training_sequences))
print(type(y_test))
print(type(y_train))
print(type(testing_sequences))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


In [11]:
print(y_test[:5])
print(y_train[:5])
print()

['positive', 'positive', 'positive', 'negative', 'positive']
[1, 1, 1, 0, 1]
