## Imports

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding
import matplotlib.pyplot as plt
import os

## GRU RNN - Pranav

## LSTM - Teresa

In [4]:
movieRev = pd.read_csv("IMDB Dataset.csv")
movieRev

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [5]:
# Equal amounts, so there is no need to normalize
movieRev['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

### Preprocessing

In [6]:
# Find the review with the highest number of words
def maxWords(reviewList):
    words = 5000        # Arbitrary value for max words
    for review in reviewList:
        if (len(review.split()) > words):
            words = len(review.split())
            
    return words

In [8]:
reviewList = movieRev['review'].tolist()
words = maxWords(reviewList)
print(words)

5000


In [9]:
# Convert categorical values to numeric using factorize()
sentiment_label = movieRev['sentiment'].factorize()
print(sentiment_label[0])

[0 0 0 ... 1 1 1]


In [12]:
text = movieRev['review'].values

# Tokenize the words in text and fit to associate the words and labels
tokenizer = Tokenizer(num_words=words)
tokenizer.fit_on_texts(text)

# Identify the size of vocabulary
vocab_size = len(tokenizer.word_index) + 1

# Replace words with their assigned numbers using text_to_sequence()
encoding = tokenizer.texts_to_sequences(text)

# Add padding so sentences can have equal length
pad_sequence = pad_sequences(encoding, maxlen=200)

### LSTM Classifier

In [14]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=200))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(128, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 32)           3976096   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 200, 32)          0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 128)               82432     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 4,058,657
Trainable params: 4,058,657
Non-trainable params: 0
____________________________________________

### Train Model

**[CAUTION]** May take over 10 minutes to run. <br>
Base Model: 5 epochs, batch_size=32, validation_split=0.2

In [None]:
# Train for 5 epochs with batch size 32 and validation split 20%
history = model.fit(pad_sequence,sentiment_label[0],validation_split=0.2, epochs=5, batch_size=32)

Epoch 1/5

### Results

In [None]:
plt.plot(history.history['accuracy'], label='acc')
plt.plot(history.history['val_accuracy'], label='val_acc')
plt.legend()
plt.show()

In [None]:
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.legend()
plt.show()

### Predict using Model

In [None]:
def predict_sentiment(text):
    tw = tokenizer.texts_to_sequences([text])
    tw = pad_sequences(tw,maxlen=200)
    prediction = int(model.predict(tw).round().item())
#     print("Predicted label: ", sentiment_label[1][prediction])
    return sentiment_label[1][prediction]

test_sentence1 = "I enjoyed this movie. The actors were very inspirational."
print(predict_sentiment(test_sentence1))

test_sentence2 = "This is the worst movie I've ever seen in my life!"
print(predict_sentiment(test_sentence2))

In [None]:
# Check Positive and Negative Testing Examples
print("Number of Positive Examples:", len(os.listdir("aclImdb/test/pos/")))
print("Number of Negative Examples:", len(os.listdir("aclImdb/test/neg/")))

## Transformer-based Model - Tri