In [6]:
# 1. Import libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# 2. Load and filter dataset
df = pd.read_csv("imdb_master.csv", encoding="latin-1")
df = df[df['label'].isin(['pos', 'neg'])]  # Remove 'unsup' rows
df = df[['review', 'label']]  # Keep only needed columns
df = shuffle(df).reset_index(drop=True)

In [8]:
# 3. Preprocess text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df['clean_review'] = df['review'].apply(preprocess_text)

In [9]:
# 4. Encode labels (pos=1, neg=0)
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])  # positive=1, negative=0

In [10]:
# 5. Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_review'], df['label'], test_size=0.2, random_state=42
)

In [11]:
# 6. TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)  # increased features
X_traintf = tfidf.fit_transform(X_train).toarray()
X_testtf = tfidf.transform(X_test).toarray()

In [12]:
# 7. Build a deeper, regularized model
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_traintf.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # sigmoid for binary classification


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# 8. Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [16]:
# 9. Train the model with validation split and early stopping
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(
    X_traintf, y_train,
    epochs=20,  # reduce from 100
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 29ms/step - accuracy: 0.9182 - loss: 0.2120 - val_accuracy: 0.8895 - val_loss: 0.2785
Epoch 2/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 28ms/step - accuracy: 0.9510 - loss: 0.1387 - val_accuracy: 0.8820 - val_loss: 0.3099
Epoch 3/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 28ms/step - accuracy: 0.9745 - loss: 0.0790 - val_accuracy: 0.8813 - val_loss: 0.3714
Epoch 4/20
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 28ms/step - accuracy: 0.9856 - loss: 0.0450 - val_accuracy: 0.8835 - val_loss: 0.4128


<keras.src.callbacks.history.History at 0x1b987b5c610>

In [17]:
# 10. Evaluate the model
loss, accuracy = model.evaluate(X_testtf, y_test)
print("Test Accuracy:", accuracy)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8808 - loss: 0.3004
Test Accuracy: 0.8798999786376953


In [18]:
# 11. Predict on new review
def predict_sentiment(review_text):
    # Preprocess the text
    clean_text = preprocess_text(review_text)
   
    # Convert to TF-IDF features (same vectorizer used during training)
    vectorized_text = tfidf.transform([clean_text]).toarray()
   
    # Predict
    prediction = model.predict(vectorized_text)[0][0]  # sigmoid output
   
    # Interpret prediction
    sentiment = "Positive" if prediction >= 0.5 else "Negative"
    print(f"Review: {review_text}")
    print(f"Predicted Sentiment: {sentiment} (Confidence: {prediction:.2f})")
    return sentiment

# Example usage
example_review = "The movie was absolutely fantastic, I loved it!"
predict_sentiment(example_review)

example_review2 = "It was a good film, I enjoyed it."
predict_sentiment(example_review2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step
Review: The movie was absolutely fantastic, I loved it!
Predicted Sentiment: Positive (Confidence: 1.00)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Review: It was a good film, I enjoyed it.
Predicted Sentiment: Positive (Confidence: 0.99)


'Positive'