
# Step 1: Understand the Dataset

## 1. Load the Dataset
We load the sentiment analysis dataset using pandas. The dataset contains two columns: **Text** and **Sentiment**. Each row represents a sentence (or review) and its corresponding sentiment label.


In [8]:

import pandas as pd

# Load dataset: read the CSV, keep the two columns used downstream, drop rows
# with a missing value in either, and trim surrounding whitespace from labels
df = (
    pd.read_csv("sentimentdataset.csv")
    .loc[:, ["Text", "Sentiment"]]
    .dropna()
    .assign(Sentiment=lambda frame: frame["Sentiment"].str.strip())
)

# Show the raw label vocabulary before it is collapsed into categories
print("Unique sentiment values before categorization:", df["Sentiment"].unique())


Unique sentiment values before categorization: ['Positive' 'Negative' 'Neutral' 'Anger' 'Fear' 'Sadness' 'Disgust'
 'Happiness' 'Joy' 'Love' 'Amusement' 'Enjoyment' 'Admiration' 'Affection'
 'Awe' 'Disappointed' 'Surprise' 'Acceptance' 'Adoration' 'Anticipation'
 'Bitter' 'Calmness' 'Confusion' 'Excitement' 'Kind' 'Pride' 'Shame'
 'Elation' 'Euphoria' 'Contentment' 'Serenity' 'Gratitude' 'Hope'
 'Empowerment' 'Compassion' 'Tenderness' 'Arousal' 'Enthusiasm'
 'Fulfillment' 'Reverence' 'Despair' 'Grief' 'Loneliness' 'Jealousy'
 'Resentment' 'Frustration' 'Boredom' 'Anxiety' 'Intimidation'
 'Helplessness' 'Envy' 'Regret' 'Curiosity' 'Indifference' 'Numbness'
 'Melancholy' 'Nostalgia' 'Ambivalence' 'Determination' 'Zest' 'Hopeful'
 'Proud' 'Grateful' 'Empathetic' 'Compassionate' 'Playful' 'Free-spirited'
 'Inspired' 'Confident' 'Bitterness' 'Yearning' 'Fearful' 'Apprehensive'
 'Overwhelmed' 'Jealous' 'Devastated' 'Frustrated' 'Envious' 'Dismissive'
 'Thrill' 'Bittersweet' 'Overjoyed' 'Insp


# Step 2: Preprocessing

## 1. Text Cleaning and Normalization
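We preprocess the text data by converting it to lowercase, removing every character that is not a letter or whitespace (punctuation, digits, emoji), dropping English stopwords, and then applying Porter stemming followed by WordNet lemmatization to each remaining word.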


In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the two NLTK corpora this notebook relies on
nltk.download('stopwords')
nltk.download('wordnet')

# Module-level helpers shared by every preprocess_text call
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Text Preprocessing Function
def preprocess_text(text):
    """Normalise one raw text value into a space-separated string of tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, split on whitespace, drop tokens found in ``stop_words``, then
    Porter-stem each remaining token and pass the stem through the WordNet
    lemmatizer.

    Args:
        text: Raw cell value. Missing values (``None``/``NaN``) produce ``""``;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned tokens joined by single spaces (possibly ``""``).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        # Non-string cells (e.g. numbers) have no .lower(); coerce instead of crashing
        text = str(text)
    text = text.lower()
    # Non-letters are deleted rather than replaced by a space, so "don't" becomes "dont"
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    kept = [word for word in text.split() if word not in stop_words]
    return ' '.join(lemmatizer.lemmatize(stemmer.stem(word)) for word in kept).strip()

# Clean every row's text and store the result alongside the original column
df["ProcessedText"] = df["Text"].map(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\krish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



## 2. Sentiment Categorization with spaCy Embeddings
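We use spaCy's pre-trained word embeddings to map raw sentiment labels into one of three standardized sentiment categories: **Positive**, **Negative**, and **Neutral**. Each raw label is assigned to the category whose averaged reference-word vector has the highest cosine similarity to the label's own vector. This step requires the `en_core_web_md` model, which is installed separately from spaCy itself (`python -m spacy download en_core_web_md`).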


In [10]:

import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy model
# The model is a separately installed package; when it is missing spacy.load
# raises OSError [E050]. Re-raise with the install command so the failure is
# actionable, keeping the original error chained for other causes.
try:
    nlp = spacy.load("en_core_web_md")
except OSError as exc:
    raise OSError(
        "Could not load spaCy model 'en_core_web_md'. If it is not installed, "
        "run `python -m spacy download en_core_web_md`, restart the kernel, "
        "and re-run this cell."
    ) from exc

# Seed vocabulary per target category; each list is averaged into one
# reference vector further down
ref_words = dict([
    ("Positive", ["happy", "great", "good", "excellent", "amazing", "positive"]),
    ("Negative", ["sad", "bad", "terrible", "horrible", "negative", "awful"]),
    ("Neutral", ["okay", "neutral", "average", "moderate", "indifferent"]),
])

# Compute average vector for each category
def compute_average_vector(words):
    """Return the mean spaCy vector of ``words``.

    Each word is run through ``nlp`` exactly once; words whose ``has_vector``
    is false are left out of the mean.

    Args:
        words: Iterable of strings.

    Returns:
        A 1-D numpy array: the element-wise mean of the kept vectors, or an
        all-zero vector as wide as ``nlp("positive").vector`` when no word has
        a vector.
    """
    # Parse once per word and reuse the Doc for both the check and the vector
    docs = [nlp(word) for word in words]
    vectors = [doc.vector for doc in docs if doc.has_vector]
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros((nlp("positive").vector.shape[0],))

ref_vectors = {category: compute_average_vector(words) for category, words in ref_words.items()}

# Normalize reference vectors to unit length.
# compute_average_vector returns an all-zero vector when none of a category's
# words has an embedding; dividing that by its zero norm would fill it with
# NaNs, so such a vector is left as it is.
for category in ref_vectors:
    norm = np.linalg.norm(ref_vectors[category])
    if norm > 0:
        ref_vectors[category] = ref_vectors[category] / norm

# Assign sentiment based on cosine similarity
def assign_sentiment_category(sentiment):
    """Map one raw sentiment label to the closest reference category.

    The label is embedded with ``nlp``, scaled to unit length, and compared by
    cosine similarity with every vector in ``ref_vectors``; the key of the
    most similar reference vector is returned.

    Args:
        sentiment: Raw label text.

    Returns:
        A key of ``ref_vectors``, or ``"Neutral"`` when the label has no
        vector or its vector has zero length.
    """
    doc = nlp(sentiment)
    if not doc.has_vector:
        return "Neutral"
    norm = np.linalg.norm(doc.vector)
    if not norm > 0:
        # A zero-length vector (presumably empty or out-of-vocabulary text)
        # cannot be normalised; dividing by 0 would feed NaNs to cosine_similarity
        return "Neutral"
    word_vector = doc.vector / norm
    similarities = {
        category: cosine_similarity([word_vector], [ref_vec])[0][0]
        for category, ref_vec in ref_vectors.items()
    }
    return max(similarities, key=similarities.get)

# Apply sentiment classification
# Rows can share a raw label, so classify each distinct label once and map the
# result back instead of running the spaCy pipeline for every row.
label_to_category = {
    label: assign_sentiment_category(label) for label in df["Sentiment"].unique()
}
df["Sentiment"] = df["Sentiment"].map(label_to_category)
print("Unique sentiment values after categorization:", df["Sentiment"].unique())


OSError: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.


# Step 3: Define Neural Network Architecture

## Label Encoding and Vectorization
We encode sentiment labels and convert the processed text into TF-IDF features.


In [None]:

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Encode labels: learn the category -> integer mapping, then apply it
label_encoder = LabelEncoder()
label_encoder.fit(df["Sentiment"])
df["SentimentEncoded"] = label_encoder.transform(df["Sentiment"])

# Split dataset: 20% held out, stratified on the encoded label, fixed seed
X, y = df["ProcessedText"], df["SentimentEncoded"]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF Vectorization: the vectorizer is fitted on the training split only
# and then reused, unchanged, on the test split
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    max_features=30000,
    min_df=3,
    stop_words="english",
    sublinear_tf=True,
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



## Choice of Architecture
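We use scikit-learn's **MLPClassifier** with two hidden layers (100 neurons each) and ReLU activation, an architecture chosen to balance model capacity against generalization. It is regularized with an L2 penalty (`alpha=0.001`) and minimizes log-loss (cross-entropy). The Keras DNN trained in Step 5 instead uses dropout for regularization and the **categorical cross-entropy** loss, which is suitable for multi-class classification.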



# Step 4: Train and Validate the Model

## Handle Class Imbalance with SMOTE
We apply SMOTE (Synthetic Minority Oversampling Technique) to balance classes in training data.


In [None]:

from imblearn.over_sampling import SMOTE

# Apply SMOTE
from collections import Counter

# Size of the rarest class in the training labels
min_class_size = sorted(Counter(y_train).values())[0]

# Use up to 5 neighbours, but never more than (rarest class size - 1).
# NOTE(review): with a single-sample class this falls back to 1 neighbour,
# which SMOTE may still reject — confirm against the imbalanced-learn docs.
if min_class_size > 1:
    k_neighbors = min(5, min_class_size - 1)
else:
    k_neighbors = 1

smote = SMOTE(k_neighbors=k_neighbors, random_state=42)
X_train_tfidf, y_train = smote.fit_resample(X_train_tfidf, y_train)



## Train MLP Classifier
We use the Adam optimizer with a constant learning rate. Training runs for at most 500 iterations (`max_iter=500`) and stops earlier if the optimizer converges.


In [None]:

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train MLP Classifier: two 100-unit ReLU layers, Adam, fixed seed
best_mlp = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    activation='relu',
    solver='adam',
    learning_rate='constant',
    alpha=0.001,
    batch_size=32,
    max_iter=500,
    random_state=42,
).fit(X_train_tfidf, y_train)

# Predict on the held-out split and report accuracy plus per-class metrics
y_pred = best_mlp.predict(X_test_tfidf)
print(f"MLP Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("MLP Classification Report:\n", classification_report(y_test, y_pred))



# Step 5: Evaluate the Model

## Train Deep Neural Network (DNN)
We train a DNN with multiple dense layers and dropout. The model is compiled using the Adam optimizer and evaluated with accuracy and classification report.


In [None]:

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import numpy as np

# Seed TensorFlow so weight initialisation and dropout masks start from the
# same state on every run
tf.random.set_seed(42)

# Prepare labels
# Take the class count from the fitted encoder instead of hard-coding 3, so the
# one-hot width and the output layer always match the labels actually present
num_classes = len(label_encoder.classes_)
y_train_dnn = to_categorical(y_train, num_classes=num_classes)
y_test_dnn = to_categorical(y_test, num_classes=num_classes)

# Keras takes the validation_split from the LAST rows of the arrays, before any
# shuffling. The training rows come straight from SMOTE, so the tail is
# presumably synthetic samples only (TODO confirm ordering); shuffle once with
# a fixed seed so the validation slice is a random sample of the training set.
X_train_dense = X_train_tfidf.toarray()
shuffle_idx = np.random.default_rng(42).permutation(X_train_dense.shape[0])
X_train_dense = X_train_dense[shuffle_idx]
y_train_dnn = y_train_dnn[shuffle_idx]

# Build DNN: four ReLU layers of decreasing width, dropout after the first
# three, softmax over the classes
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax')
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train DNN (10% of the shuffled training rows are held out for validation)
model.fit(X_train_dense, y_train_dnn, epochs=10, batch_size=32, validation_split=0.1)

# Evaluate DNN: the arg-max of the softmax output is the predicted class id
y_pred_dnn = np.argmax(model.predict(X_test_tfidf.toarray()), axis=1)
print("DNN Accuracy:", accuracy_score(y_test, y_pred_dnn))
print("DNN Classification Report:\n", classification_report(y_test, y_pred_dnn))
