In [4]:
import pandas as pd
import re
import nltk
import numpy as np
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics.pairwise import cosine_similarity

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Load spaCy model
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    print("Downloading spaCy model...")
    import spacy.cli
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")

import spacy
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load spaCy model
nlp = spacy.load("en_core_web_md")

# Reference words for sentiment categories
ref_words = {
    "Positive": ["happy", "great", "good", "excellent", "amazing", "positive"],
    "Negative": ["sad", "bad", "terrible", "horrible", "negative", "awful"],
    "Neutral": ["okay", "neutral", "average", "moderate", "indifferent"]
}

# Compute average vector for each category
def compute_average_vector(words):
    vectors = [nlp(word).vector for word in words if nlp(word).has_vector]
    return np.mean(vectors, axis=0) if vectors else np.zeros((nlp("positive").vector.shape[0],))

ref_vectors = {category: compute_average_vector(words) for category, words in ref_words.items()}

# Normalize reference vectors
for category in ref_vectors:
    ref_vectors[category] /= np.linalg.norm(ref_vectors[category])  # Normalize to unit length

def assign_sentiment_category(sentiment):
    doc = nlp(sentiment)

    if not doc.has_vector:  # Handle OOV words
        return "Neutral"

    word_vector = doc.vector
    word_vector /= np.linalg.norm(word_vector)  # Normalize input vector

    similarities = {category: cosine_similarity([word_vector], [ref_vec])[0][0] for category, ref_vec in ref_vectors.items()}

    return max(similarities, key=similarities.get)



# df['Sentiment'] = df['Sentiment'].apply(assign_sentiment_category)

# print(df['Sentiment'].value_counts())
# Text Preprocessing Function
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    words = [lemmatizer.lemmatize(stemmer.stem(word)) for word in words if word not in stop_words]
    return ' '.join(words).strip()

# Load dataset
df = pd.read_csv("sentimentdataset.csv")
df = df[["Text", "Sentiment"]].dropna()
df["Sentiment"] = df["Sentiment"].str.strip()

# Check unique sentiment classes before categorization
print("Unique sentiment values before categorization:", df["Sentiment"].unique())

# Apply preprocessing
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Apply sentiment classification using spaCy word embeddings
df["Sentiment"] = df["Sentiment"].apply(assign_sentiment_category)

# Check unique sentiment values after categorization
print("Unique sentiment values after categorization:", df["Sentiment"].unique())

# Encode labels
label_encoder = LabelEncoder()
df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])

# Check class distribution before splitting
print("Class distribution before splitting:", df["SentimentEncoded"].value_counts())

# Ensure multiple classes exist
if df["SentimentEncoded"].nunique() < 2:
    print("Warning: Dataset contains only one class after preprocessing. Consider checking dataset integrity.")

# Split dataset
X = df["ProcessedText"]
y = df["SentimentEncoded"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert text to numerical features using TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=30000, min_df=3, stop_words="english", sublinear_tf=True)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Check class distribution after split
print("Class distribution in training set before SMOTE:", pd.Series(y_train).value_counts())

# Apply SMOTE only if there are at least 2 classes
if y_train.nunique() > 1:
    min_class_size = y_train.value_counts().min()
    k_neighbors = min(5, min_class_size - 1) if min_class_size > 1 else 1
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_train_tfidf, y_train = smote.fit_resample(X_train_tfidf, y_train)

# Check class distribution after SMOTE
print("Class distribution in training set after SMOTE:", pd.Series(y_train).value_counts())

# Train MLP Classifier
best_mlp = MLPClassifier(
    activation='relu',
    alpha=0.001,
    batch_size=32,
    hidden_layer_sizes=(100, 100),
    learning_rate='constant',
    max_iter=500,
    solver='adam',
    random_state=42
)
best_mlp.fit(X_train_tfidf, y_train)

# Predict with MLP
y_pred = best_mlp.predict(X_test_tfidf)
print(f"MLP Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("MLP Classification Report:\n", classification_report(y_test, y_pred))

# Define DNN Model
def build_dnn_model(input_dim, num_classes):
    model = Sequential([
        Dense(512, activation='relu', input_shape=(input_dim,)),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Check again before DNN training
if y_train.nunique() < 2 or y_test.nunique() < 2:
    print("Warning: Dataset contains only one class after preprocessing. Model training may be invalid.")
else:
    y_train_dnn = to_categorical(y_train, num_classes=3)
    y_test_dnn = to_categorical(y_test, num_classes=3)
    input_dim = X_train_tfidf.shape[1]
    dnn_model = build_dnn_model(input_dim, 3)
    dnn_model.fit(X_train_tfidf.toarray(), y_train_dnn, epochs=10, batch_size=32, validation_split=0.1)
    y_pred_dnn = np.argmax(dnn_model.predict(X_test_tfidf.toarray()), axis=1)
    print("DNN Accuracy:", accuracy_score(y_test, y_pred_dnn))
    print("DNN Classification Report:\n", classification_report(y_test, y_pred_dnn))

[nltk_data] Downloading package stopwords to /home/anurag/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/anurag/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unique sentiment values before categorization: ['Positive' 'Negative' 'Neutral' 'Anger' 'Fear' 'Sadness' 'Disgust'
 'Happiness' 'Joy' 'Love' 'Amusement' 'Enjoyment' 'Admiration' 'Affection'
 'Awe' 'Disappointed' 'Surprise' 'Acceptance' 'Adoration' 'Anticipation'
 'Bitter' 'Calmness' 'Confusion' 'Excitement' 'Kind' 'Pride' 'Shame'
 'Elation' 'Euphoria' 'Contentment' 'Serenity' 'Gratitude' 'Hope'
 'Empowerment' 'Compassion' 'Tenderness' 'Arousal' 'Enthusiasm'
 'Fulfillment' 'Reverence' 'Despair' 'Grief' 'Loneliness' 'Jealousy'
 'Resentment' 'Frustration' 'Boredom' 'Anxiety' 'Intimidation'
 'Helplessness' 'Envy' 'Regret' 'Curiosity' 'Indifference' 'Numbness'
 'Melancholy' 'Nostalgia' 'Ambivalence' 'Determination' 'Zest' 'Hopeful'
 'Proud' 'Grateful' 'Empathetic' 'Compassionate' 'Playful' 'Free-spirited'
 'Inspired' 'Confident' 'Bitterness' 'Yearning' 'Fearful' 'Apprehensive'
 'Overwhelmed' 'Jealous' 'Devastated' 'Frustrated' 'Envious' 'Dismissive'
 'Thrill' 'Bittersweet' 'Overjoyed' 'Insp

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-03-17 21:59:03.299238: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.3242 - loss: 1.1000 - val_accuracy: 0.0417 - val_loss: 1.2041
Epoch 2/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4271 - loss: 1.0702 - val_accuracy: 0.2778 - val_loss: 1.1053
Epoch 3/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5846 - loss: 0.9354 - val_accuracy: 0.9861 - val_loss: 0.4226
Epoch 4/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8346 - loss: 0.5247 - val_accuracy: 0.9306 - val_loss: 0.2361
Epoch 5/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9142 - loss: 0.2115 - val_accuracy: 0.8889 - val_loss: 0.2478
Epoch 6/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9660 - loss: 0.1175 - val_accuracy: 0.9861 - val_loss: 0.0693
Epoch 7/10
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━