In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score



In [4]:
# **Step 1: Load Dataset from Local Directory**
file_path = "IMDB Dataset1.csv"  # 👉 Replace this with your actual file path
df = pd.read_csv(file_path)

In [5]:
# **Step 1: Handling Null Values**
df.dropna(inplace=True)  # Remove missing values

In [6]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
# **Step 3: Text Preprocessing (Cleaning)**
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

In [8]:
df['review'] = df['review'].apply(clean_text)

In [9]:
# **Step 4: Convert Sentiments to Binary**
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [10]:
# **Tokenization & Padding**
vocab_size = 6000  # ✅ Reduced vocab size for faster training
max_length = 150  # ✅ Reduced sequence length
embedding_dim = 64  # ✅ Smaller embedding size
batch_size = 64 # ✅ Lower batch size for memory efficiency

In [11]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [12]:
# **Train-Test Split**
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['sentiment'], test_size=0.2, random_state=42)

In [13]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),  # Convolutional Layer
    tf.keras.layers.GlobalMaxPooling1D(),  # Pooling Layer
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.4),  # Dropout for Regularization
    tf.keras.layers.Dense(1, activation='sigmoid')  # Output Layer
])



In [14]:
# **Compile Model**
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
# **Train Model (Fewer Epochs)**
history = model.fit(X_train, y_train, epochs=4, batch_size=batch_size, validation_data=(X_test, y_test))  

Epoch 1/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.7009 - loss: 0.5360 - val_accuracy: 0.8751 - val_loss: 0.2884
Epoch 2/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9108 - loss: 0.2239 - val_accuracy: 0.8813 - val_loss: 0.2827
Epoch 3/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9643 - loss: 0.1116 - val_accuracy: 0.8828 - val_loss: 0.3082
Epoch 4/4
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9896 - loss: 0.0414 - val_accuracy: 0.8812 - val_loss: 0.3823


In [16]:
# **Evaluate Model**
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"\n✅ Test Accuracy: {test_acc:.2f}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8838 - loss: 0.3647

✅ Test Accuracy: 0.88


In [17]:
# **Predictions & Metrics**
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype("int32").flatten()


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [18]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"\n✅ Performance Metrics:")
print(f"🔹 Accuracy: {accuracy * 100:.2f}%")
print(f"🔹 Precision: {precision:.2f}")
print(f"🔹 Recall: {recall:.2f}")
print(f"🔹 F1-Score: {f1:.2f}")


✅ Performance Metrics:
🔹 Accuracy: 88.12%
🔹 Precision: 0.88
🔹 Recall: 0.89
🔹 F1-Score: 0.88


In [20]:
# **Step 14: Misclassified Samples**
misclassified_indices = np.where(y_pred != y_test.to_numpy())[0]
print(f"\n🔹 Number of Misclassified Samples: {len(misclassified_indices)}")


🔹 Number of Misclassified Samples: 1188


In [None]:
# Get some misclassified sample indices
num_samples_to_display = 5  # Change this number if you want to see more examples
misclassified_samples = misclassified_indices[:num_samples_to_display]

print("\n🔹 Sample Misclassified Reviews:")
for idx in misclassified_samples:
    print(f"\n🔹 Review: {df.iloc[idx]['review'][:300]}...")  # Displaying first 300 characters
    print(f"   ✅ Actual Sentiment: {'Positive' if y_test.iloc[idx] == 1 else 'Negative'}")
    print(f"   ❌ Predicted Sentiment: {'Positive' if y_pred[idx] == 1 else 'Negative'}")


In [25]:
# Suppose your Tokenizer is called 'tokenizer'
# Suppose your trained model is called 'model'

# New input text
new_review = input("Enter your review: ")

# Preprocess the new input (tokenize and pad)
new_seq = tokenizer.texts_to_sequences([new_review])
new_padded = pad_sequences(new_seq, maxlen=max_length, padding='post', truncating='post')

# Predict
predicted_prob = model.predict(new_padded)[0][0]

# Convert probability to class label
predicted_sentiment = 'Positive' if predicted_prob >= 0.5 else 'Negative'

# Output
print(f"Predicted Sentiment: {predicted_sentiment}")


Enter your review: great
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
Predicted Sentiment: Positive
