In [None]:
%pip install nltk

import nltk
nltk.download('stopwords')


In [11]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import ModelCheckpoint

# Download NLTK stopwords (only needed once)
nltk.download('stopwords')

# Load IMDB dataset
data = pd.read_csv('IMDB Dataset.csv')
print(data.head())

# Set English stopwords
english_stops = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\huzai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [12]:
def preprocess_reviews(df):
    x_data = df['review']
    y_data = df['sentiment']

    # Remove HTML tags
    x_data = x_data.replace({'<.*?>': ''}, regex=True)
    # Remove non-alphabetic characters
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex=True)
    # Lowercase + remove stopwords
    x_data = x_data.apply(lambda review: [w.lower() for w in review.split() if w.lower() not in english_stops])

    # Encode sentiment: positive=1, negative=0
    y_data = y_data.replace({'positive': 1, 'negative': 0})

    return x_data, y_data

x_data, y_data = preprocess_reviews(data)
print('Sample preprocessed review:', x_data[0])
print('Sample sentiment:', y_data[0])


Sample preprocessed review: ['one', 'reviewers', 'mentioned', 'watching', 'oz', 'episode', 'hooked', 'right', 'exactly', 'happened', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scenes', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'show', 'pulls', 'punches', 'regards', 'drugs', 'sex', 'violence', 'hardcore', 'classic', 'use', 'word', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focuses', 'mainly', 'emerald', 'city', 'experimental', 'section', 'prison', 'cells', 'glass', 'fronts', 'face', 'inwards', 'privacy', 'high', 'agenda', 'em', 'city', 'home', 'many', 'aryans', 'muslims', 'gangstas', 'latinos', 'christians', 'italians', 'irish', 'scuffles', 'death', 'stares', 'dodgy', 'dealings', 'shady', 'agreements', 'never', 'far', 'away', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goes', 'shows', 'dare', 'forget', 'pretty', 'pictures', 'painted', 'mainstream', 'audienc

  y_data = y_data.replace({'positive': 1, 'negative': 0})


In [13]:
# Split dataset
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

# Tokenizer
token = Tokenizer(lower=False)  # already lowercased
token.fit_on_texts(x_train)

x_train_seq = token.texts_to_sequences(x_train)
x_test_seq = token.texts_to_sequences(x_test)

# Pad sequences
max_length = int(np.ceil(np.mean([len(x) for x in x_train_seq])))
x_train_pad = pad_sequences(x_train_seq, maxlen=max_length, padding='post', truncating='post')
x_test_pad = pad_sequences(x_test_seq, maxlen=max_length, padding='post', truncating='post')

# Total words in vocabulary
total_words = len(token.word_index) + 1

print("Max review length:", max_length)
print("Vocabulary size:", total_words)


Max review length: 119
Vocabulary size: 92394


In [14]:
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length=max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

# Checkpoint to save best model
checkpoint = ModelCheckpoint('LSTM_IMDB.h5', monitor='accuracy', save_best_only=True, verbose=1)

# Train model
model.fit(x_train_pad, y_train, batch_size=128, epochs=5, callbacks=[checkpoint])




None
Epoch 1/5
[1m312/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 53ms/step - accuracy: 0.5643 - loss: 0.6695
Epoch 1: accuracy improved from None to 0.60583, saving model to LSTM_IMDB.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 53ms/step - accuracy: 0.6058 - loss: 0.6518
Epoch 2/5
[1m312/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 51ms/step - accuracy: 0.5237 - loss: 0.6895
Epoch 2: accuracy did not improve from 0.60583
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 51ms/step - accuracy: 0.5299 - loss: 0.6882
Epoch 3/5
[1m312/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 51ms/step - accuracy: 0.5945 - loss: 0.6639
Epoch 3: accuracy improved from 0.60583 to 0.63042, saving model to LSTM_IMDB.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 51ms/step - accuracy: 0.6304 - loss: 0.6387
Epoch 4/5
[1m312/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 51ms/step - accuracy: 0.6617 - loss: 0.6162
Epoch 4: accuracy improved from 0.63042 to 0.63850, saving model to LSTM_IMDB.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 51ms/step - accuracy: 0.6385 - loss: 0.6394
Epoch 5/5
[1m312/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 52ms/step - accuracy: 0.7126 - loss: 0.5438
Epoch 5: accuracy improved from 0.63850 to 0.79158, saving model to LSTM_IMDB.h5




[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 52ms/step - accuracy: 0.7916 - loss: 0.4719


<keras.src.callbacks.history.History at 0x28315f74440>

In [15]:
# Predict on test set
y_pred_prob = model.predict(x_test_pad, batch_size=128)
y_pred = (y_pred_prob >= 0.5).astype(int).reshape(-1)

correct = np.sum(y_test.values == y_pred)
total = len(y_pred)
print(f"Correct Predictions: {correct}")
print(f"Wrong Predictions: {total - correct}")
print(f"Accuracy: {correct/total*100:.2f}%")


[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step
Correct Predictions: 8323
Wrong Predictions: 1677
Accuracy: 83.23%


In [18]:
# Load model
loaded_model = load_model('LSTM_IMDB.h5')

# Example review
review = input("Movie Review: ")

# Preprocess review
review = re.sub(r'[^a-zA-Z\s]', '', review)
words = [w.lower() for w in review.split() if w.lower() not in english_stops]

# Tokenize and pad
seq = token.texts_to_sequences([words])
seq_pad = pad_sequences(seq, maxlen=max_length, padding='post', truncating='post')

# Predict
result = loaded_model.predict(seq_pad)
print("Positive" if result >= 0.7 else "Negative")








[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
Negative
