In [19]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model

In [20]:
data = pd.read_csv("IMDB Dataset.csv", on_bad_lines="skip")
print(data.shape)


(50000, 2)


In [21]:
import nltk
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from nltk.corpus import stopwords
english_stops = set(stopwords.words("english"))


In [23]:
def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)


Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


  y_data = y_data.replace('negative', 0)


In [24]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)


Train Set
11163    [now, either, like, mr, carrey, humour, me, my...
19644    [this, super, creepy, southern, gothic, melodr...
47772    [there, much, anyone, say, flick, plot, quite,...
48600    [well, better, plan, sitting, amateurish, blan...
32378    [this, somewhat, attractive, fans, bad, movie,...
                               ...                        
46847    [the, reason, i, knew, midnight, cowboy, afi, ...
8069     [this, movie, pretty, absurd, there, few, funn...
4994     [joel, schumacher, made, heck, choice, decided...
16443    [sometimes, changes, novels, made, films, nece...
45613    [a, hilarious, insightful, perspective, dating...
Name: review, Length: 40000, dtype: object 

35041    [i, seen, lot, movies, life, many, bad, it, mo...
25047    [a, man, arrives, strange, beautiful, sterile,...
12518    [this, movie, even, though, years, old, still,...
24876    [you, admire, brad, sykes, even, particularly,...
4951     [i, made, big, mistake, actually, watching, wh...
 

In [25]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))


In [26]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)


Encoded X Train
 [[  377   256     6 ... 20870  2298  1121]
 [    8  1041   868 ...   659    12   218]
 [   50    17   152 ...   221   260   237]
 ...
 [ 4801  8574    24 ...     0     0     0]
 [  433  1240  2748 ...     0     0     0]
 [   39   478  5348 ...  3403  1121  2448]] 

Encoded X Test
 [[    1    38    81 ...     0     0     0]
 [   39    52  2711 ...     0     0     0]
 [    8     3    11 ...     0     0     0]
 ...
 [  171     1   121 ...     0     0     0]
 [   95    12    13 ...    49    26 10492]
 [   39  2269  1280 ...     0     0     0]] 

Maximum review length:  130


In [27]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())



None


In [28]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [29]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 0.5272 - loss: 0.6911
Epoch 1: accuracy improved from -inf to 0.55750, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 124ms/step - accuracy: 0.5273 - loss: 0.6911
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.7755 - loss: 0.5301
Epoch 2: accuracy improved from 0.55750 to 0.76535, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 123ms/step - accuracy: 0.7755 - loss: 0.5301
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.7549 - loss: 0.5384
Epoch 3: accuracy did not improve from 0.76535
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 123ms/step - accuracy: 0.7548 - loss: 0.5385
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.7318 - loss: 0.5483
Epoch 4: accuracy did not improve from 0.76535
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 125ms/step - accuracy: 0.7319 - loss: 0.5482
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step - accuracy: 0.7632 - loss: 0.5157
Epoch 5: accuracy improved from 0.76535 to 0.77618, saving model to models/LSTM.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 125ms/step - accuracy: 0.7632 - loss: 0.5157


<keras.src.callbacks.history.History at 0x7e3914f36600>

In [30]:
# Get prediction probabilities
y_probs = model.predict(x_test, batch_size=128)

# Convert probabilities → class labels
y_pred = np.argmax(y_probs, axis=1)

# Now compare with true labels
true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {:.2f}%'.format(true / len(y_pred) * 100))

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step
Correct Prediction: 4999
Wrong Prediction: 5001
Accuracy: 49.99%


In [31]:
loaded_model = load_model('models/LSTM.h5')




In [33]:
review = "This movie was amazing! The story and acting were fantastic."

In [36]:
import re

# Remove non-letters
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)

print("Cleaned:", review)

# Tokenize and remove stopwords
words = review.lower().split()
filtered = [w for w in words if w not in english_stops]

# Back to string
filtered_text = " ".join(filtered)

print("Filtered:", filtered_text)


Cleaned: This movie was amazing The story and acting were fantastic
Filtered: movie amazing story acting fantastic


In [40]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assume `filtered_text` is your cleaned review string
tokenize_words = token.texts_to_sequences([filtered_text])  # wrap in list

tokenize_words = pad_sequences(tokenize_words, maxlen=max_length,
                               padding='post', truncating='post')

print("Tokenized:", tokenize_words)


Tokenized: [[  3 397  14  43 708   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0]]


In [41]:
result = loaded_model.predict(tokenize_words)
score = result[0][0]   # get scalar

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step


In [42]:
print("Raw model output:", score)


Raw model output: 0.8973522


In [43]:
if score >= 0.7:
    print("positive")
else:
    print("negative")

positive
