In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import SMOTE
import re
import nltk
from nltk.corpus import stopwords
import numpy as np

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Load dataset
df = pd.read_csv("sentimentdataset.csv")
df = df[["Text", "Sentiment"]]
df["Sentiment"] = df["Sentiment"].str.strip()
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])

# Remove rare classes (classes with only 1 sample)
class_counts = df["SentimentEncoded"].value_counts()
rare_classes = class_counts[class_counts < 2].index
df = df[~df["SentimentEncoded"].isin(rare_classes)]

# Split dataset
X = df["ProcessedText"]
y = label_encoder.fit_transform(df["Sentiment"])  # Re-encode after removing rare classes

# Convert text to numerical features
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)  # Increase features for better representation
X_tfidf = vectorizer.fit_transform(X).toarray()

# Apply SMOTE to balance classes
from imblearn.over_sampling import BorderlineSMOTE
smote = BorderlineSMOTE(random_state=42, k_neighbors=1)
X_tfidf, y = smote.fit_resample(X_tfidf, y)

# Split after SMOTE
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# Convert labels to categorical (AFTER splitting)
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test, num_classes=y_train_categorical.shape[1])  # Match train classes

# Train MLP Classifier with hyperparameter tuning
mlp_model = MLPClassifier(
    hidden_layer_sizes=(512, 256, 128),  # Deeper network
    activation='relu',
    solver='adam',
    alpha=0.0005,  # Lower regularization
    learning_rate='adaptive',
    max_iter=1500,  # More iterations for convergence
    early_stopping=True,
    random_state=42
)
mlp_model.fit(X_train, y_train)

# Predict with MLP
y_pred_mlp = mlp_model.predict(X_test)
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
print(f"MLP Classifier Accuracy: {mlp_accuracy:.4f}")
print("MLP Classification Report:\n", classification_report(y_test, y_pred_mlp))

# Train Deep Neural Network with improved regularization
num_classes = y_train_categorical.shape[1]  # Ensure correct output shape
model = Sequential([
    Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Compile DNN model with SGD + Momentum
model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])

# Train DNN model with more epochs
model.fit(X_train, y_train_categorical, epochs=30, batch_size=64, validation_data=(X_test, y_test_categorical))

# Evaluate DNN model
loss, accuracy = model.evaluate(X_test, y_test_categorical)
print(f"Deep Neural Network Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


MLP Classifier Accuracy: 0.9546


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


MLP Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00         9
           3       1.00      1.00      1.00         9
           4       0.00      0.00      0.00         1
           5       1.00      1.00      1.00         9
           6       1.00      1.00      1.00         9
           7       1.00      1.00      1.00         9
           8       0.00      0.00      0.00         1
           9       1.00      1.00      1.00         9
          10       1.00      1.00      1.00         9
          11       1.00      1.00      1.00         9
          12       1.00      1.00      1.00         9
          13       1.00      1.00      1.00         9
          14       1.00      0.89      0.94         9
          15       1.00      1.00      1.00         9
          16       1.00      1.00      1.00         9