In [1]:
import pandas as pd    # to store and preprocess dataset
import numpy as np     # for mathematical equations
from nltk.corpus import stopwords   # to get a wworking set of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # for padding and truncation
from tensorflow.keras.models import Sequential     # for importing model
from tensorflow.keras.layers import Embedding, LSTM, Dense # define lsyers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [2]:
# load and view data	
data = pd.read_csv('/Users/suvirsingh/SentimentAnalysis/IMDB Dataset.csv')

print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [3]:
from nltk.corpus import stopwords   # to get a wworking set of stopwords
english_stops = set(stopwords.words('english'))

In [4]:

def load_dataset():
    df = pd.read_csv('/Users/suvirsingh/SentimentAnalysis/IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


  y_data = y_data.replace('negative', 0)


In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
15390    [this, simply, good, ole, fashioned, western, ...
24180    [this, long, lost, horror, gem, starring, sydn...
29946    [this, one, favorite, james, bond, games, the,...
47318    [bon, voyage, fun, audience, combines, requisi...
45993    [while, i, would, say, i, enjoy, show, i, expe...
                               ...                        
29329    [about, year, ago, i, finally, gave, american,...
11305    [this, must, one, chaplin, ambitious, projects...
14183    [this, documentary, amateurish, it, could, mad...
14606    [spoiler, nothing, but, spoileri, add, name, l...
17490    [even, i, never, seen, heard, georgetown, co, ...
Name: review, Length: 40000, dtype: object 

847      [stmd, terrible, movie, is, quite, forgettable...
9607     [more, eeriness, dark, secrets, released, fina...
45193    [comments, subject, rated, film, worth, watchi...
15579    [hi, john, would, like, tell, dog, film, baiti...
31333    [when, attempt, made, assassinate, emir, ohtar...
 

In [6]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [7]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   8  237    9 ... 1317 1567 1368]
 [   8  102  331 ...  419  126  613]
 [   8    5  433 ...    0    0    0]
 ...
 [   8  540 2174 ...    0    0    0]
 [1299   76   30 ...    0    0    0]
 [  11    1   41 ...    0    0    0]] 

Encoded X Test
 [[40609   282     3 ...     0     0     0]
 [ 1659 19543   361 ...     0     0     0]
 [  679   695  1078 ...  1143   250    65]
 ...
 [ 4739    29   243 ...     0     0     0]
 [    8 10071   911 ...  1580     2  2617]
 [   39    14    19 ...   151    19  2300]] 

Maximum review length:  130


In [12]:
from keras.layers import Bidirectional, Dropout

# ARCHITECTURE
EMBED_DIM = 64
LSTM_OUT = 256

# Modify the architecture by adding another LSTM layer, Bidirectional LSTM, and Dropout
model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model.add(Bidirectional(LSTM(LSTM_OUT, return_sequences=True)))  # Bidirectional LSTM
model.add(Dropout(0.3))  # Dropout to prevent overfitting
model.add(LSTM(LSTM_OUT))  # Second LSTM layer for added complexity
model.add(Dense(1, activation='sigmoid'))

# Compile the model with a lower learning rate
from keras.optimizers import Adam
optimizer = Adam(learning_rate=0.0005)  # Reduce learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Implement ModelCheckpoint and EarlyStopping
from keras.callbacks import EarlyStopping

checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='val_accuracy',  # Monitor validation accuracy for better generalization
    save_best_only=True,
    verbose=1
)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,  # Stop after 3 epochs of no improvement
    verbose=1,
    restore_best_weights=True
)

# Train the model with more epochs and a smaller batch size
history = model.fit(
    x_train, y_train,
    validation_split=0.2,  # Add validation split to monitor validation accuracy
    batch_size=64,  # Reduce batch size for finer weight updates
    epochs=20,  # Increase number of epochs for more learning
    callbacks=[checkpoint, early_stopping],
    verbose=1
)

2024-09-08 15:49:07.603908: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-09-08 15:49:07.604720: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-09-08 15:49:07.605299: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 1/20


2024-09-08 15:49:07.910712: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-09-08 15:49:07.911353: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-09-08 15:49:07.912124: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-09-08 15:53:43.087796: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-09-08 15:53:43.088782: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-09-08 15:53:43.089461: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus


Epoch 1: val_accuracy improved from -inf to 0.84288, saving model to models/LSTM.h5
Epoch 2/20
Epoch 2: val_accuracy improved from 0.84288 to 0.87037, saving model to models/LSTM.h5
Epoch 3/20
Epoch 3: val_accuracy did not improve from 0.87037
Epoch 4/20
Epoch 4: val_accuracy did not improve from 0.87037
Epoch 5/20
Epoch 5: val_accuracy did not improve from 0.87037
Restoring model weights from the end of the best epoch: 2.
Epoch 5: early stopping


In [13]:
# Predict class probabilities
y_pred_prob = model.predict(x_test, batch_size=128)

# Convert probabilities to binary class labels (0 or 1)
y_pred = (y_pred_prob > 0.5).astype(int)  # Since it's binary classification, threshold at 0.5

# Initialize a counter for correct predictions
true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

# Calculate accuracy
accuracy = true / len(y_pred) * 100

# Print results
print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {:.2f}%'.format(accuracy))

2024-09-08 16:18:40.804112: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-09-08 16:18:40.805516: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-09-08 16:18:40.806204: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Correct Prediction: 8807
Wrong Prediction: 1193
Accuracy: 88.07%


In [14]:
loaded_model = load_model('models/LSTM.h5')

2024-09-08 16:20:54.179250: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-09-08 16:20:54.180649: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-09-08 16:20:54.181741: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [20]:
review = str(input('Movie Review: Mystery of the Midnight Woods is a suspenseful thriller about friends who discover a hidden town, leading to eerie and dangerous events. While some twists are predictable, the creepy atmosphere and satisfying ending make it worth watching for fans of the genre.'))

In [22]:
import re
from nltk.corpus import stopwords

# Example review
review = "I watched *Mystery of the Midnight Woods*, and it turned out to be a pretty good thriller!"

# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')  # Keeping only letters and spaces
cleaned_review = regex.sub('', review)
print('Cleaned:', cleaned_review)

# Stopword removal
english_stops = set(stopwords.words('english'))  # Ensure you have the stopwords downloaded

words = cleaned_review.split(' ')
filtered = [w for w in words if w.lower() not in english_stops and w.strip() != '']  # Remove empty words
filtered_review = ' '.join(filtered)  # Join words back into a sentence

# Lowercase the final filtered sentence
filtered_review = filtered_review.lower()

print('Filtered:', filtered_review)

Cleaned: I watched Mystery of the Midnight Woods and it turned out to be a pretty good thriller
Filtered: watched mystery midnight woods turned pretty good thriller


In [23]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[193   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   0   0]
 ...
 [ 90   0   0 ...   0   0   0]
 [  9   0   0 ...   0   0   0]
 [593   0   0 ...   0   0   0]]


In [26]:
# Assuming `tokenize_words` is the preprocessed input (e.g., a tokenized review)
result = loaded_model.predict(tokenize_words)

# Extracting the predicted value (since it's returning an array)
predicted_value = result[0][0]  # This extracts the first (and only) value

print(predicted_value)

0.7659284


In [28]:
if predicted_value >= 0.7:
    print('positive')
else:
    print('negative')

positive
