In [13]:
import numpy as np
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [14]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Try reading with the Python engine and explicit quoting options
from pandas.errors import ParserError # Import ParserError
try:
    data = pd.read_csv("/content/IMDB Dataset.csv", engine='python', quotechar='"', doublequote=True)
except ParserError as e:
    print(f"Error reading CSV with default parameters and python engine: {e}")
    # If the above still fails, try reading only a few rows to pinpoint the issue
    try:
        data = pd.read_csv("/content/IMDB Dataset.csv", engine='python', quotechar='"', doublequote=True, nrows=6300) # Read slightly more than the error row
        print("Successfully read the first 6300 rows with python engine.")
        # You might need to inspect the rows around 6271 manually in the file
    except Exception as e_small:
         print(f"Still failed to read even a small portion with python engine: {e_small}")

In [16]:
data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})  # <-- MAKE SURE THIS LINE IS EXACTLY THIS


In [17]:
def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower().split()
    return " ".join([word for word in text if word not in stop_words])

data['review'] = data['review'].apply(clean_text)


In [18]:
# This cell (ipython-input-8-74ec06b280e8) processes the data and prepares X and y
vocab_size = 15000
max_len = 250

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data['review'])
X = tokenizer.texts_to_sequences(data['review'])
X = pad_sequences(X, maxlen=max_len)
y = data['sentiment'].values

In [19]:
# This cell (ipython-input-9-74ec06b280e8) splits the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
# This cell (ipython-input-10-74ec06b280e8) defines and compiles the Keras model
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=max_len))
model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [21]:
# file ipython-input-6-c4d4f3e01691
from tensorflow.keras.models import Sequential # This import was already present, but ensure it runs before the model definition

In [24]:
# This cell (ipython-input-12-74ec06b280e8) trains the model using the defined 'model' variable
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.2)

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m499s[0m 996ms/step - accuracy: 0.9471 - loss: 0.1522 - val_accuracy: 0.8671 - val_loss: 0.3666
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m484s[0m 962ms/step - accuracy: 0.9614 - loss: 0.1152 - val_accuracy: 0.8660 - val_loss: 0.4696
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m491s[0m 940ms/step - accuracy: 0.9709 - loss: 0.0875 - val_accuracy: 0.8465 - val_loss: 0.4527
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m504s[0m 946ms/step - accuracy: 0.9740 - loss: 0.0778 - val_accuracy: 0.8660 - val_loss: 0.5146
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m446s[0m 892ms/step - accuracy: 0.9838 - loss: 0.0527 - val_accuracy: 0.8585 - val_loss: 0.5771
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m509s[0m 906ms/step - accuracy: 0.9795 - loss: 0.0611 - val_accuracy: 0.8596 - val_loss: 0.5854
Epoc

NameError: name 'model' is not defined

In [25]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f"\nTest Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 164ms/step

Test Accuracy: 0.8620

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86      4961
           1       0.86      0.86      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [26]:
def predict_review(review_text):
    cleaned = clean_text(review_text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len)
    prediction = model.predict(padded)[0][0]
    sentiment = "Positive" if prediction >= 0.5 else "Negative"  # <-- ✅ CORRECT LOGIC
    print(f"\nReview Sentiment: {sentiment} (Confidence: {prediction:.2f})")

In [27]:
user_input = input("\nEnter your movie review: ")
predict_review(user_input)


Enter your movie review: MOVIE IS GOOD
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 166ms/step

Review Sentiment: Positive (Confidence: 1.00)
