In [0]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from scripts.data_utils import Data

In [0]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [0]:
# Fetch the NLTK resources used further down in this notebook.
nltk.download('stopwords')  # corpus read via stopwords.words('english') in preprocess_text
nltk.download('punkt_tab')  # presumably the tokenizer data word_tokenize needs — confirm for the installed NLTK version

In [0]:
# Raw preview of the CSV. The modelling cells below work on `data.df`, not on
# this `df`, so this frame is only here for a first look at the columns.
df = pd.read_csv(filepath_or_buffer="/Workspace/data/data.csv")
df.head(n=5)

In [0]:

# Build the project's Data helper for the same CSV and let it load and split
# the rows. NOTE(review): the exact meaning of test_size / holdout_size and
# whether the split is seeded lives in scripts.data_utils.Data — confirm there.
data = Data(
    file_path="/Workspace/data/data.csv",
    holdout=True,
    test_size=0.2,
    holdout_size=0.5,
    stratify=['Sentiment'],
)
data.read()
data.split()

In [0]:
# Sanity check: number of rows per value of the 'split' column
# (presumably written by data.split() — the cells below filter on 'train' and 'test').
data.df['split'].value_counts()

In [0]:
# Cache for the stop-word set so the NLTK corpus is read once, not once per row.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, built on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Normalise one sentence for bag-of-words style vectorisation.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenise with NLTK's ``word_tokenize``, drop English stop
    words, and join the remaining tokens with single spaces.

    Args:
        text: The raw sentence. Non-string values (e.g. ``None`` or the NaN
            pandas produces for an empty CSV cell) are treated as empty text.

    Returns:
        The cleaned sentence as a single space-separated string; ``''`` when
        the input is not a string or nothing survives the filtering.
    """
    # Missing cells arrive as float NaN / None; calling .lower() on them would
    # raise AttributeError and abort the whole Series.apply.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Punctuation and digits are deleted (not replaced by a space), so
    # "don't" becomes "dont" before the stop-word filter runs.
    # NOTE(review): check whether such forms are meant to be filtered.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = _english_stop_words()
    return ' '.join(token for token in tokens if token not in stop_words)

In [0]:
# Clean every sentence element-wise and keep the result next to the raw text,
# then preview the frame to eyeball the new column.
data.df['preprocessed_text'] = data.df['Sentence'].map(preprocess_text)
data.df.head(n=5)

In [0]:
# Fixed seed so the stochastic estimators give the same scores on every
# Restart & Run All; without it the tree ensembles change from run to run.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used when reporting results
# (the same keys index `param_grids`).
models = {
    # The seed only matters for solvers that shuffle the data (e.g. the
    # liblinear/saga options listed in param_grids); harmless otherwise.
    'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'SVM': SVC(),
    'Multinomial NB': MultinomialNB(),
}

# Hyper-parameter search spaces, keyed by the same display names as `models`.
# Each inner mapping is {estimator parameter name: list of candidate values}.
param_grids = {
    'Logistic Regression': dict(
        C=[0.01, 0.1, 1, 10, 100],
        penalty=['l1', 'l2'],
        solver=['liblinear', 'saga'],
    ),
    'Random Forest': dict(
        n_estimators=[50, 100, 200],
        max_depth=[None, 10, 20, 30],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        max_features=['sqrt', 'log2'],
    ),
    'Gradient Boosting': dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
    ),
    'SVM': dict(
        C=[0.1, 1, 10],
        kernel=['linear', 'rbf', 'poly'],
        gamma=['scale', 'auto'],
    ),
    'Multinomial NB': dict(
        # alpha is the smoothing parameter
        alpha=[0.1, 0.5, 1.0, 2.0],
    ),
}

In [0]:
# Select the splits first so the vectorizer can be fitted on training text only.
train_data = data.df[data.df['split'] == 'train']
test_data = data.df[data.df['split'] == 'test']

# Fit TF-IDF on the training rows only. Fitting on all of data.df would let
# vocabulary and IDF statistics from the test rows (and any other split Data
# creates) leak into the features and inflate the evaluation scores.
tfidf_vectorizer = TfidfVectorizer(max_features=500)
tfidf = tfidf_vectorizer.fit(train_data['preprocessed_text'])

# Feature matrices come from the fitted vectorizer; labels are the raw
# 'Sentiment' column of the matching split.
X_train = tfidf.transform(train_data['preprocessed_text'])
X_test = tfidf.transform(test_data['preprocessed_text'])
y_train = train_data['Sentiment']
y_test = test_data['Sentiment']

In [0]:
# Fit each candidate on the training features with its constructor settings
# (param_grids is not used here) and report how it scores on the test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Both metrics compare the test labels with this model's predictions.
    report = classification_report(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    # One print call, one item per line — same text as four separate prints.
    print(
        f"\n--- {name} Results ---",
        f"Accuracy: {accuracy}",
        "Classification Report:",
        report,
        sep="\n",
    )