In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import BorderlineSMOTE
import re
import nltk
from nltk.corpus import stopwords
import numpy as np

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Load dataset
df = pd.read_csv("sentimentdataset.csv")
df = df[["Text", "Sentiment"]]
df["Sentiment"] = df["Sentiment"].str.strip()
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])

# Remove rare classes (classes with only 1 sample)
class_counts = df["SentimentEncoded"].value_counts()
rare_classes = class_counts[class_counts < 2].index
df = df[~df["SentimentEncoded"].isin(rare_classes)]

# Split dataset
X = df["ProcessedText"]
y = label_encoder.fit_transform(df["Sentiment"])  # Re-encode after removing rare classes

# Convert text to numerical features
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)  # Increase features for better representation
X_tfidf = vectorizer.fit_transform(X).toarray()

# Split BEFORE applying SMOTE
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE only on training data
smote = BorderlineSMOTE(random_state=42, k_neighbors=1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Convert labels to categorical (AFTER applying SMOTE)
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test, num_classes=y_train_categorical.shape[1])  # Match train classes

# Train MLP Classifier with hyperparameter tuning
mlp_model = MLPClassifier(
    hidden_layer_sizes=(512, 256, 128),  # Deeper network
    activation='relu',
    solver='adam',
    alpha=0.0005,  # Lower regularization
    learning_rate='adaptive',
    max_iter=1500,  # More iterations for convergence
    early_stopping=True,
    random_state=42
)
mlp_model.fit(X_train, y_train)

# Predict with MLP
y_pred_mlp = mlp_model.predict(X_test)
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
print(f"MLP Classifier Accuracy: {mlp_accuracy:.4f}")
print("MLP Classification Report:\n", classification_report(y_test, y_pred_mlp))

# Train Deep Neural Network with improved regularization
num_classes = y_train_categorical.shape[1]  # Ensure correct output shape
model = Sequential([
    Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dense(num_classes, activation='softmax')
])

# Compile DNN model with SGD + Momentum
model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])

# Train DNN model with more epochs
model.fit(X_train, y_train_categorical, epochs=30, batch_size=64, validation_data=(X_test, y_test_categorical))

# Evaluate DNN model
loss, accuracy = model.evaluate(X_test, y_test_categorical)
print(f"Deep Neural Network Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package stopwords to C:\Users\Valmik
[nltk_data]     Belgaonkar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


MLP Classifier Accuracy: 0.3053
MLP Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           6       1.00      1.00      1.00         1
           7       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          12       1.00      1.00      1.00         1
          13       1.00      0.50      0.67         2
          14       0.00      0.00      0.00         1
          15       1.00      1.00      1.00         1
          16       1.00      1.00      1.00         1
          17       0.00      0.00      0.00         1
          18       0.00      0.00      0.00         1
          19       0.00      0.00      0.00         1
          21       1.00      1.00      1.00         1
          22       0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 31ms/step - accuracy: 0.0128 - loss: 4.7159 - val_accuracy: 0.0000e+00 - val_loss: 4.7119
Epoch 2/30
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.0239 - loss: 4.6960 - val_accuracy: 0.0076 - val_loss: 4.7026
Epoch 3/30
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.0357 - loss: 4.6699 - val_accuracy: 0.0076 - val_loss: 4.6931
Epoch 4/30
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.0515 - loss: 4.6377 - val_accuracy: 0.0076 - val_loss: 4.6829
Epoch 5/30
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.0686 - loss: 4.6031 - val_accuracy: 0.0076 - val_loss: 4.6752
Epoch 6/30
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.0781 - loss: 4.5613 - val_accuracy: 0.0000e+00 - val_loss: 4.6659
Epoch 7/30
[1m53/53[0m [32m━━━━━━━

POSITIVE, NEGATIVE AND NEUTRAL WITH SMOTE

In [5]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.metrics.pairwise import cosine_similarity

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Define sentiment categories mapping
sentiment_mapping = {
    "positive": ["happy", "joyful", "excellent", "great", "amazing", "good", "love"],
    "negative": ["bad", "terrible", "awful", "hate", "worst", "sad", "angry"],
    "neutral": ["okay", "fine", "average", "neutral", "fair", "moderate"]
}

# Convert words to TF-IDF vectors for similarity comparison
vectorizer = TfidfVectorizer()
all_words = list(set(sum(sentiment_mapping.values(), [])))
word_vectors = vectorizer.fit_transform(all_words)

# Function to classify sentiment labels using cosine similarity
def classify_sentiment(text):
    text_vector = vectorizer.transform([text])
    similarities = {category: np.mean(cosine_similarity(text_vector, 
                                vectorizer.transform(words))) for category, words in sentiment_mapping.items()}
    return max(similarities, key=similarities.get)  # Return category with highest similarity

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Load dataset
df = pd.read_csv("sentimentdataset.csv")
df = df[["Text", "Sentiment"]]
df["Sentiment"] = df["Sentiment"].str.strip()
df["ProcessedText"] = df["Text"].apply(preprocess_text)

df["SentimentCategory"] = df["Sentiment"].apply(classify_sentiment)

# Encode labels
label_encoder = LabelEncoder()
df["SentimentEncoded"] = label_encoder.fit_transform(df["SentimentCategory"])

# Split dataset
X = df["ProcessedText"]
y = df["SentimentEncoded"]

# Convert text to numerical features
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_tfidf = vectorizer.fit_transform(X).toarray()

# Split BEFORE applying SMOTE
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE only on training data
smote = BorderlineSMOTE(random_state=42, k_neighbors=1)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Convert labels to categorical
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test, num_classes=y_train_categorical.shape[1])

# Train MLP Classifier
mlp_model = MLPClassifier(
    hidden_layer_sizes=(512, 256, 128),
    activation='relu',
    solver='adam',
    alpha=0.0005,
    learning_rate='adaptive',
    max_iter=1500,
    early_stopping=True,
    random_state=42
)
mlp_model.fit(X_train, y_train)

y_pred_mlp = mlp_model.predict(X_test)
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
print(f"MLP Classifier Accuracy: {mlp_accuracy:.4f}")
print("MLP Classification Report:\n", classification_report(y_test, y_pred_mlp))

# Train Deep Neural Network
num_classes = y_train_categorical.shape[1]
model = Sequential([
    Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train_categorical, epochs=30, batch_size=64, validation_data=(X_test, y_test_categorical))

loss, accuracy = model.evaluate(X_test, y_test_categorical)
print(f"Deep Neural Network Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package stopwords to C:\Users\Valmik
[nltk_data]     Belgaonkar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


MLP Classifier Accuracy: 0.9388
MLP Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         4
           2       0.95      0.99      0.97       139

    accuracy                           0.94       147
   macro avg       0.32      0.33      0.32       147
weighted avg       0.89      0.94      0.92       147

Epoch 1/30


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - accuracy: 0.4054 - loss: 1.0968 - val_accuracy: 0.8844 - val_loss: 1.0848
Epoch 2/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.6939 - loss: 1.0797 - val_accuracy: 0.8844 - val_loss: 1.0763
Epoch 3/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step - accuracy: 0.8707 - loss: 1.0438 - val_accuracy: 0.9456 - val_loss: 1.0393
Epoch 4/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step - accuracy: 0.9434 - loss: 0.9678 - val_accuracy: 0.9456 - val_loss: 0.9734
Epoch 5/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - accuracy: 0.9791 - loss: 0.7899 - val_accuracy: 0.9388 - val_loss: 0.8349
Epoch 6/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.9845 - loss: 0.4744 - val_accuracy: 0.9456 - val_loss: 0.4384
Epoch 7/30
[1m26/26[0m [32m━━━━━━━━━━━━━━━

POSITIVE, NEGATIVE OR NEUTRAL WITHOUT SMOTE

In [6]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from sklearn.metrics.pairwise import cosine_similarity

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Define sentiment categories mapping
sentiment_mapping = {
    "positive": ["happy", "joyful", "excellent", "great", "amazing", "good", "love"],
    "negative": ["bad", "terrible", "awful", "hate", "worst", "sad", "angry"],
    "neutral": ["okay", "fine", "average", "neutral", "fair", "moderate"]
}

# Convert words to TF-IDF vectors for similarity comparison
vectorizer = TfidfVectorizer()
all_words = list(set(sum(sentiment_mapping.values(), [])))
word_vectors = vectorizer.fit_transform(all_words)

# Function to classify sentiment labels using cosine similarity
def classify_sentiment(text):
    text_vector = vectorizer.transform([text])
    similarities = {category: np.mean(cosine_similarity(text_vector, 
                                vectorizer.transform(words))) for category, words in sentiment_mapping.items()}
    return max(similarities, key=similarities.get)  # Return category with highest similarity

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Load dataset
df = pd.read_csv("sentimentdataset.csv")
df = df[["Text", "Sentiment"]]
df["Sentiment"] = df["Sentiment"].str.strip()
df["ProcessedText"] = df["Text"].apply(preprocess_text)

df["SentimentCategory"] = df["Sentiment"].apply(classify_sentiment)

# Encode labels
label_encoder = LabelEncoder()
df["SentimentEncoded"] = label_encoder.fit_transform(df["SentimentCategory"])

# Split dataset
X = df["ProcessedText"]
y = df["SentimentEncoded"]

# Convert text to numerical features
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_tfidf = vectorizer.fit_transform(X).toarray()

# Split dataset WITHOUT applying SMOTE
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

# Convert labels to categorical
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test, num_classes=y_train_categorical.shape[1])

# Train MLP Classifier
mlp_model = MLPClassifier(
    hidden_layer_sizes=(512, 256, 128),
    activation='relu',
    solver='adam',
    alpha=0.0005,
    learning_rate='adaptive',
    max_iter=1500,
    early_stopping=True,
    random_state=42
)
mlp_model.fit(X_train, y_train)

y_pred_mlp = mlp_model.predict(X_test)
mlp_accuracy = accuracy_score(y_test, y_pred_mlp)
print(f"MLP Classifier Accuracy: {mlp_accuracy:.4f}")
print("MLP Classification Report:\n", classification_report(y_test, y_pred_mlp))

# Train Deep Neural Network
num_classes = y_train_categorical.shape[1]
model = Sequential([
    Dense(1024, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(512, activation='relu'),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train_categorical, epochs=30, batch_size=64, validation_data=(X_test, y_test_categorical))

loss, accuracy = model.evaluate(X_test, y_test_categorical)
print(f"Deep Neural Network Accuracy: {accuracy:.4f}")

[nltk_data] Downloading package stopwords to C:\Users\Valmik
[nltk_data]     Belgaonkar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


MLP Classifier Accuracy: 0.9456
MLP Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         4
           2       0.95      1.00      0.97       139

    accuracy                           0.95       147
   macro avg       0.32      0.33      0.32       147
weighted avg       0.89      0.95      0.92       147

Epoch 1/30


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.8616 - loss: 1.0330 - val_accuracy: 0.9456 - val_loss: 0.7168
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.9433 - loss: 0.5992 - val_accuracy: 0.9456 - val_loss: 0.3159
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.9460 - loss: 0.2849 - val_accuracy: 0.9456 - val_loss: 0.2499
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.9417 - loss: 0.2658 - val_accuracy: 0.9456 - val_loss: 0.2579
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.9521 - loss: 0.2314 - val_accuracy: 0.9456 - val_loss: 0.2570
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.9466 - loss: 0.2520 - val_accuracy: 0.9456 - val_loss: 0.2532
Epoch 7/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━