# 0. Environment Setup

In [1]:
import pandas as pd
import nltk
import os
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score
)
from sklearn.preprocessing import label_binarize
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1. Load dataset

In [2]:
# Read the processed training split, report its row count, and preview the first rows.
TRAIN_CSV_PATH = 'dataset/processed/train.csv'
train_df = pd.read_csv(TRAIN_CSV_PATH)
print('Number of samples: ', train_df.shape[0])
train_df.head()

Number of samples:  14691


Unnamed: 0,Comment,Sentiment
0,it’s so adorable that he says “baap” for up an...,positive
1,sir i have no words to describe your teaching ...,positive
2,the reason they said large and open space inst...,neutral
3,for ur information this is an fact that jrntr ...,neutral
4,you can really tell the progress awesome espec...,positive


In [3]:
# Read the processed test split, report its row count, and preview the first rows.
TEST_CSV_PATH = 'dataset/processed/test.csv'
test_df = pd.read_csv(TEST_CSV_PATH)
print('Number of samples: ', test_df.shape[0])
test_df.head()

Number of samples:  3673


Unnamed: 0,Comment,Sentiment
0,“oh my god guys there’s an octopus eating a cr...,negative
1,my daughter will be starting her 8th grade che...,positive
2,for some future video you should definitely bu...,neutral
3,i’m chronically ill and very frequently find i...,positive
4,the pizza planet pizza being awful is just dis...,negative


In [4]:
# Define label mapping: sentiment string -> integer class id
label_map = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# convert label
def convert_label(df):
    """Return a new frame with an integer ``label`` column derived from ``Sentiment``.

    Each value of ``df['Sentiment']`` is looked up in ``label_map``. Rows whose
    sentiment is not a key of ``label_map`` (e.g., 'Other') are dropped.

    The input frame is left untouched: the result is built on an explicit copy,
    which also avoids pandas' SettingWithCopyWarning that assigning into the
    result of ``dropna`` can trigger.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Sentiment`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of the rows with a known sentiment, original index preserved,
        plus an integer ``label`` column.
    """
    labels = df['Sentiment'].map(label_map)
    known = labels.notna()  # unmapped sentiments come back as NaN
    out = df.loc[known].copy()
    out['label'] = labels[known].astype(int)
    return out

# Attach the integer `label` column to both splits.
train_df, test_df = convert_label(train_df), convert_label(test_df)

# 2. Remove stopwords

In [5]:
_ENGLISH_STOP_WORDS = None  # filled lazily by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not installed yet (fresh environment): fetch it once.
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _ENGLISH_STOP_WORDS = frozenset(words)
    return _ENGLISH_STOP_WORDS


def remove_stopwords(text, stop_words=None):
    """Remove stop words from a whitespace-separated string.

    The text is split on whitespace, every token whose lowercase form is in
    the stop-word set is dropped, and the survivors are re-joined with single
    spaces. The original casing of kept tokens is preserved.

    Parameters
    ----------
    text : str
        Input text. Any non-string value (e.g. NaN or None from an empty CSV
        cell) is treated as having no words and yields ``''``.
    stop_words : collection of str, optional
        Lowercase words to drop. Defaults to NLTK's English stop-word list,
        which is loaded once and reused across calls instead of being rebuilt
        for every row.

    Returns
    -------
    str
        The filtered text.
    """
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

# Strip stop words from the comment text of both splits.
for split_df in (train_df, test_df):
    split_df['Comment'] = split_df['Comment'].apply(remove_stopwords)


# 3. Split off a validation set and convert the training text into numerical features using TF-IDF Vectorizer

In [6]:
# Features are the comment strings; targets are the integer sentiment labels.
X_train, y_train = train_df['Comment'], train_df['label']

In [7]:
# Stratified 80/20 train/validation split with a fixed seed.
# The inputs are read straight from train_df rather than from X_train / y_train:
# this cell overwrites those two names, so feeding them back in would make a
# second run of the cell split the already-reduced subset again.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['Comment'], train_df['label'],
    test_size=0.2,
    random_state=42,
    stratify=train_df['label']
)

In [8]:
# Fit the TF-IDF vectorizer on the training split only and encode that split.
TFIDF_MAX_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
X_train_tfidf = vectorizer.fit_transform(X_train)

# 4. Fit Multinomial Naive Bayes classifier

In [9]:
# Train a Multinomial Naive Bayes classifier with its default settings on the
# TF-IDF features, then display the fitted estimator.
model = MultinomialNB().fit(X_train_tfidf, y_train)
model

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


# 5. Evaluate the model on the test set

In [10]:
# Pull the test features and targets out of the prepared frame.
X_test, y_test = test_df['Comment'], test_df['label']

# Encode with the vectorizer fitted on the training split (transform only, no refit).
X_test_tfidf = vectorizer.transform(X_test)

# One indicator column per class, in 0 / 1 / 2 order.
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2])

# Keep the row index and the comment text for the prediction export.
test_index, test_text = X_test.index, X_test.values

In [11]:
# Hard class predictions for the test set.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy.
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

# The report is computed twice: once as a dict (kept in `report`) and once as
# formatted text for display.
report_options = {"digits": 4}
report = classification_report(y_test, y_pred, output_dict=True, **report_options)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, **report_options))


Accuracy: 0.6768

Classification Report:
              precision    recall  f1-score   support

           0     0.8824    0.1285    0.2243       467
           1     0.6818    0.2432    0.3586       925
           2     0.6721    0.9649    0.7923      2281

    accuracy                         0.6768      3673
   macro avg     0.7454    0.4456    0.4584      3673
weighted avg     0.7013    0.6768    0.6108      3673



In [12]:
# One-vs-Rest AUC
# Use class probabilities when the estimator offers them; otherwise fall back
# to decision_function scores.
if not hasattr(model, "predict_proba"):
    y_score = model.decision_function(X_test_tfidf)
    if y_score.ndim == 1:
        # 1-D scores: expand to two columns, negated score first.
        y_score = np.vstack([-y_score, y_score]).T
else:
    y_score = model.predict_proba(X_test_tfidf)

class_names = ["Negative (0)", "Neutral (1)", "Positive (2)"]
auc_per_class = {}
print("\nOne-vs-Rest AUC:")
# Column i of the indicator matrix and of the score matrix is paired with class_names[i].
for i in range(len(class_names)):
    cname = class_names[i]
    auc = roc_auc_score(y_test_binarized[:, i], y_score[:, i])
    auc_per_class[cname] = auc
    print(f"AUC for {cname}: {auc:.4f}")
auc_dict = auc_per_class



One-vs-Rest AUC:
AUC for Negative (0): 0.8463
AUC for Neutral (1): 0.8193
AUC for Positive (2): 0.8585


In [13]:
# Confusion matrix of true vs. predicted labels; `cm_dict` is a second name for
# the same object.
cm = cm_dict = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix (rows = true, cols = predicted):", cm, sep="\n")



Confusion Matrix (rows = true, cols = predicted):
[[  60   28  379]
 [   5  225  695]
 [   3   77 2201]]


In [14]:
# One row per test comment: frame index, comment text, true label, predicted label.
prediction_columns = {
    "id": test_index,
    "comment": test_text,
    "true_label": y_test.values,
    "pred_label": y_pred,
}
pred_df = pd.DataFrame(prediction_columns)

# Make sure the target folder exists, then write the CSV without the frame index.
filename = "outputs/naive-bayes/test_predictions.csv"
output_dir = os.path.dirname(filename)
os.makedirs(output_dir, exist_ok=True)
pred_df.to_csv(filename, index=False)
print(f"\nSaved predictions to: {filename}")


Saved predictions to: outputs/naive-bayes/test_predictions.csv
