In [23]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [25]:
# Load labeled dataset
FILE_PATH = "IndianFinancialNews_with_sentiment.csv"
df = pd.read_csv(FILE_PATH)

In [27]:
# ------------------------- #
# 🔹 Text Preprocessing
# ------------------------- #
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df["Cleaned_Text"] = df["Text"].apply(clean_text)

In [29]:
# ------------------------- #
# 🔹 Convert Labels to Numeric
# ------------------------- #
label_mapping = {"positive": 1, "negative": 0, "neutral": 2}  # Neutral = 2 (optional)
df["Sentiment_Label"] = df["Sentiment"].map(label_mapping)

In [31]:
# ------------------------- #
# 🔹 Train-Test Split
# ------------------------- #
X_train, X_test, y_train, y_test = train_test_split(df["Cleaned_Text"], df["Sentiment_Label"], test_size=0.2, random_state=42)

In [33]:
# ------------------------- #
# 🔹 Convert Text to Vectors (TF-IDF)
# ------------------------- #
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [35]:
# ------------------------- #
# 🔹 Train & Evaluate ML Models
# ------------------------- #
def train_and_evaluate(model, name):
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))

In [37]:
# Logistic Regression
train_and_evaluate(LogisticRegression(), "Logistic Regression")


Logistic Regression Accuracy: 0.8572
              precision    recall  f1-score   support

           0       0.86      0.73      0.79      2353
           1       0.89      0.90      0.89      4716
           2       0.80      0.90      0.85      2931

    accuracy                           0.86     10000
   macro avg       0.85      0.84      0.84     10000
weighted avg       0.86      0.86      0.86     10000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
# Naïve Bayes
train_and_evaluate(MultinomialNB(), "Naïve Bayes")


Naïve Bayes Accuracy: 0.7084
              precision    recall  f1-score   support

           0       0.75      0.52      0.62      2353
           1       0.67      0.91      0.77      4716
           2       0.79      0.54      0.64      2931

    accuracy                           0.71     10000
   macro avg       0.74      0.66      0.68     10000
weighted avg       0.73      0.71      0.70     10000



In [41]:
# SVM
train_and_evaluate(SVC(kernel="linear"), "SVM")


SVM Accuracy: 0.8905
              precision    recall  f1-score   support

           0       0.88      0.79      0.83      2353
           1       0.93      0.91      0.92      4716
           2       0.84      0.95      0.89      2931

    accuracy                           0.89     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.89      0.89      0.89     10000



In [43]:
# Random Forest
train_and_evaluate(RandomForestClassifier(n_estimators=100), "Random Forest")


Random Forest Accuracy: 0.8088
              precision    recall  f1-score   support

           0       0.84      0.63      0.72      2353
           1       0.86      0.83      0.85      4716
           2       0.73      0.91      0.81      2931

    accuracy                           0.81     10000
   macro avg       0.81      0.79      0.79     10000
weighted avg       0.82      0.81      0.81     10000

