# **Emotion Detection: A Text Classification Model**

In [1]:
import numpy as np
import pandas as pd
# Read the labelled tweets into a DataFrame (path is relative to the notebook's
# working directory). The bare name on the last line makes the notebook render
# a preview of the frame.
data = pd.read_csv(filepath_or_buffer="../tweet_emotions.csv")
data

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...
...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor
39996,1753919001,love,Happy Mothers Day All my love
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [2]:
# Count rows that are exact repeats of an earlier row (the first occurrence is
# not counted).
# NOTE(review): every column, including tweet_id, takes part in the comparison,
# so the same tweet text stored under a different id would not be counted here
# -- confirm that is the intended check.
data.duplicated(keep="first").sum()

0

In [3]:
# Per-column count of missing values.
data.isna().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

In [4]:
# Model input is the tweet text; the target is its emotion label.
x, y = data["content"], data["sentiment"]

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
x_vec = vect.fit_transform(x)


In [6]:
# Echo the document-term matrix so the notebook shows its repr (shape, dtype
# and number of stored elements).
x_vec

<40000x48212 sparse matrix of type '<class 'numpy.int64'>'
	with 475946 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Hold out 20% of the vectorized tweets for evaluation; the fixed seed keeps
# the partition identical from run to run.
# NOTE(review): `x_vec` was produced by fitting the vectorizer on the full
# corpus, so the vocabulary already includes words from the held-out tweets.
# Consider splitting the raw text first and fitting the vectorizer on the
# training part only.
X_train, X_test, y_train, y_test = train_test_split(
    x_vec,
    y,
    test_size=0.2,
    random_state=42,
)

## ML Models

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, all trained on the same bag-of-words features.
# The tree-based estimators are randomised internally, so they get a fixed
# seed (same value as the train/test split); otherwise their scores differ on
# every run and the printed results cannot be reproduced.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "Naive Bayes": MultinomialNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}

# Per-model test-set scores, keyed by the display name used in `models`.
model_accuracies = {}
model_f1_scores = {}

# Fit every candidate on the training split and score it on the held-out split.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    model_accuracies[model_name] = accuracy
    # Macro-averaged F1 gives every emotion class equal weight regardless of
    # its frequency; zero_division=0 scores an undefined per-class value as 0
    # instead of emitting a warning.
    model_f1_scores[model_name] = f1_score(y_test, y_pred, average='macro', zero_division=0)
    print(f"{model_name}: Accuracy = {accuracy:.4f}")
    print(f"{model_name}: F1 Score = {model_f1_scores[model_name]:.4f}")

# The winner is the model with the highest macro-F1 (not the highest accuracy);
# its accuracy is looked up only for reporting.
best_model_name = max(model_f1_scores, key=model_f1_scores.get)
best_accuracy = model_accuracies[best_model_name]
base_f1_score = model_f1_scores[best_model_name]

print("\nBest Model:")
print(f"{best_model_name}: Accuracy = {best_accuracy:.4f}, F1 Score = {base_f1_score:.4f}")

Logistic Regression: Accuracy = 0.3360
Logistic Regression: F1 Score = 0.1791
Random Forest: Accuracy = 0.3294
Random Forest: F1 Score = 0.1515
Support Vector Machine: Accuracy = 0.3483
Support Vector Machine: F1 Score = 0.1595
Naive Bayes: Accuracy = 0.3079
Naive Bayes: F1 Score = 0.1232
K-Nearest Neighbors: Accuracy = 0.2442
K-Nearest Neighbors: F1 Score = 0.1127
Decision Tree: Accuracy = 0.2671
Decision Tree: F1 Score = 0.1559

Best Model:
Logistic Regression: Accuracy = 0.3360, F1 Score = 0.1791
