In [4]:
# Install required libraries
# !pip install nltk scikit-learn joblib vaderSentiment

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy.sparse import hstack
import os

# Fetch the NLTK English stopword corpus used to build STOPWORDS further down.
nltk.download('stopwords')

# -----------------------------
# Load dataset
# -----------------------------
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
df = pd.read_csv(r"D:\mental-health-ai\data\Emotion_final.csv")
# Normalise header names so the 'text' / 'emotion' lookups below are not
# broken by stray whitespace or capitalisation in the CSV header.
df.columns = df.columns.str.strip().str.lower()
print(df.columns)  # should include 'text' and 'emotion'

# Fail early with a clear message instead of a bare KeyError later on.
missing_columns = {'text', 'emotion'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"Dataset is missing required column(s): {sorted(missing_columns)}"
    )

# -----------------------------
# Preprocessing
# -----------------------------
# Negation words that must survive stopword removal: dropping them would
# flip the meaning of a sentence ("not happy" -> "happy").
NEGATIONS = {
    "no", "not", "never", "cannot",
    "n't",
    "can't", "couldn't",
    "don't", "didn't",
    "isn't", "aren't", "wasn't",
    "won't",
}

def expand_contractions(text):
    """Expand common English contractions in ``text``.

    Handles the negation suffix ``n't`` plus ``'re``, ``'s``, ``'ve`` and
    ``'ll``. The irregular negations "won't", "can't" and "shan't" are
    rewritten explicitly, because the generic ``n't`` rule would otherwise
    leave the non-words "wo", "ca" and "sha". Typographic apostrophes
    (U+2019) are treated like the ASCII apostrophe.

    Args:
        text: Input string.

    Returns:
        The string with the supported contractions expanded. ``'s`` is
        always expanded to " is", so possessives are expanded as well.
    """
    # Curly apostrophes would otherwise slip past every pattern below.
    t = text.replace("\u2019", "'")
    # Irregular forms must run before the generic "n't" rule. Their
    # replacements are always lowercase.
    t = re.sub(r"\bwon't\b", "will not", t, flags=re.IGNORECASE)
    t = re.sub(r"\bcan't\b", "can not", t, flags=re.IGNORECASE)
    t = re.sub(r"\bshan't\b", "shall not", t, flags=re.IGNORECASE)
    t = re.sub(r"n't", " not", t)
    t = re.sub(r"'re", " are", t)
    t = re.sub(r"'s", " is", t)
    t = re.sub(r"'ve", " have", t)
    t = re.sub(r"'ll", " will", t)
    return t

# English stopwords to drop during cleaning, with the negation words
# removed from the list so they are kept in the cleaned text.
STOPWORDS = set(stopwords.words('english')).difference(NEGATIONS)

def clean_text_keep_neg(text):
    """Normalise one raw sample into a space-separated token string.

    Steps, in order: coerce to ``str`` and lowercase, expand contractions,
    delete ``http...`` URLs, replace every non-letter character with a
    space, then drop every token that is in ``STOPWORDS``.
    """
    lowered = str(text).lower()
    expanded = expand_contractions(lowered)
    without_urls = re.sub(r'http\S+', '', expanded)
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', without_urls)

    kept_tokens = []
    for token in letters_only.split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Cleaned text column; this is what the TF-IDF vectorizer is fitted on.
df['clean_text'] = df['text'].map(clean_text_keep_neg)

# -----------------------------
# Add VADER compound feature
# -----------------------------
# VADER is scored on the raw text, not on `clean_text`.
analyzer = SentimentIntensityAnalyzer()
compound_scores = [
    analyzer.polarity_scores(str(raw_text))['compound']
    for raw_text in df['text']
]
df['vader_compound'] = compound_scores

# -----------------------------
# Train/Test split
# -----------------------------
# Inputs: the cleaned text, the VADER score as a single (n, 1) column,
# and the emotion labels.
X_text = df['clean_text']
X_sent = df['vader_compound'].to_numpy().reshape(-1, 1)
y = df['emotion']

# 80/20 split with a fixed seed, stratified on the label.
split_parts = train_test_split(
    X_text,
    X_sent,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train_text, X_test_text, X_train_sent, X_test_sent, y_train, y_test = split_parts

# -----------------------------
# TF-IDF Vectorizer
# -----------------------------
# Unigrams and bigrams, capped at 8000 features. The vocabulary is fitted
# on the training split only and then reused for the test split.
tfidf = TfidfVectorizer(max_features=8000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Combine TF-IDF + VADER: the VADER score becomes the last feature column.
# hstack defaults to COO, which cannot be row-indexed; request CSR once
# here so the matrices can be sliced and need no later conversion.
X_train_final = hstack([X_train_tfidf, X_train_sent], format="csr")
X_test_final = hstack([X_test_tfidf, X_test_sent], format="csr")

# -----------------------------
# Train Classifier
# -----------------------------
# class_weight='balanced' compensates for the uneven emotion frequencies.
# The 'saga' solver shuffles the data, so random_state is fixed (same seed
# as the train/test split) to make training reproducible between runs.
clf = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='saga',
    random_state=42,
)
clf.fit(X_train_final, y_train)

# -----------------------------
# Evaluate
# -----------------------------
# Score the held-out split: overall accuracy, per-class report, and the
# confusion matrix.
y_pred = clf.predict(X_test_final)

test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(per_class_report)
print("Confusion Matrix:\n", conf_matrix)

# -----------------------------
# Save Model and Vectorizer
# -----------------------------
# The classifier was fitted on TF-IDF features plus one VADER column, so
# inference code has to rebuild both before calling the saved model.
os.makedirs("model", exist_ok=True)
for artifact, target_path in (
    (clf, "model/emotion_model_final.pkl"),
    (tfidf, "model/tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, target_path)
print("✅ Model and vectorizer saved successfully!")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lucky\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Index(['text', 'emotion'], dtype='object')
Accuracy: 0.8602050326188257
              precision    recall  f1-score   support

       anger       0.85      0.88      0.87       599
        fear       0.86      0.81      0.83       530
       happy       0.89      0.87      0.88      1406
        love       0.75      0.91      0.82       328
     sadness       0.90      0.86      0.88      1253
    surprise       0.67      0.81      0.74       176

    accuracy                           0.86      4292
   macro avg       0.82      0.86      0.84      4292
weighted avg       0.86      0.86      0.86      4292

Confusion Matrix:
 [[ 526   19   21    8   23    2]
 [  17  428   19    4   17   45]
 [  23    8 1223   73   66   13]
 [   3    4   14  299    8    0]
 [  44   28   85   14 1073    9]
 [   4   12    8    1    8  143]]
✅ Model and vectorizer saved successfully!
