# Logistic Regression Baseline (TF-IDF)
This notebook loads precomputed TF-IDF matrices, trains a Logistic Regression baseline, and reports accuracy and macro-F1 on the validation and test splits.

In [1]:
import os
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Single seed for the notebook; seeds NumPy's global (legacy) RNG so code that
# draws from np.random is repeatable when cells are run top to bottom.
SEED = 42
np.random.seed(SEED)


In [2]:
# Every split artefact lives in the same directory; change DATA_DIR to relocate them all.
DATA_DIR = "../data/combined_split"

# Sparse TF-IDF feature matrices (.npz), one per split.
PATH_X_TRAIN = f"{DATA_DIR}/X_train_tfidf.npz"
PATH_X_VAL   = f"{DATA_DIR}/X_val_tfidf.npz"
PATH_X_TEST  = f"{DATA_DIR}/X_test_tfidf.npz"

# Split CSVs; their 'label' column is read as the target.
PATH_Y_TRAIN = f"{DATA_DIR}/train_split.csv"
PATH_Y_VAL   = f"{DATA_DIR}/val_split.csv"
PATH_Y_TEST  = f"{DATA_DIR}/test_split.csv"

In [3]:
# Load the precomputed sparse TF-IDF feature matrices, one per split.
X_train = load_npz(PATH_X_TRAIN)
X_val   = load_npz(PATH_X_VAL)
X_test  = load_npz(PATH_X_TEST)

# Only the 'label' column of each split CSV is used as the target.
y_train = pd.read_csv(PATH_Y_TRAIN)['label']
y_val   = pd.read_csv(PATH_Y_VAL)['label']
y_test  = pd.read_csv(PATH_Y_TEST)['label']

# Fail fast if a feature matrix and its label file are out of sync (e.g. one was
# regenerated without the other), instead of erroring later during fit/scoring.
_misaligned = [
    split
    for split, features, labels in (
        ("train", X_train, y_train),
        ("val", X_val, y_val),
        ("test", X_test, y_test),
    )
    if features.shape[0] != len(labels)
]
assert not _misaligned, f"Row count mismatch between features and labels for: {_misaligned}"

print("Shapes -> X_train:", X_train.shape, " X_val:", X_val.shape, " X_test:", X_test.shape)
print("y value counts (train):")
print(y_train.value_counts())

Shapes -> X_train: (8165, 51593)  X_val: (1167, 51593)  X_test: (2333, 51593)
y value counts (train):
label
positive    5923
negative    1467
neutral      775
Name: count, dtype: int64


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Baseline classifier: L2-regularised logistic regression fitted on the training
# TF-IDF features. class_weight=None leaves the classes unweighted.
# C=0.45 is presumably a tuned value -- TODO confirm where it came from.
#
# random_state is pinned to SEED: scikit-learn documents it as used by the
# liblinear solver, and when left as None the estimator falls back to NumPy's
# global RNG, whose state depends on which cells were executed beforehand.
#
# NOTE(review): recent scikit-learn releases appear to deprecate solver='liblinear'
# for targets with more than two classes -- verify against the installed version.
model = LogisticRegression(
    max_iter=1000,
    solver='liblinear',
    penalty='l2',
    C=0.45,
    class_weight=None,
    random_state=SEED,
)
model.fit(X_train, y_train)

def evaluate_split(name, X, y, estimator=None):
    """Score a fitted classifier on one data split.

    Parameters
    ----------
    name : str
        Human-readable split label (e.g. "Validation"). It is not used in the
        computation; it is kept so existing call sites continue to work.
    X : feature matrix
        Passed unchanged to ``estimator.predict``.
    y : array-like
        True labels for the rows of ``X``.
    estimator : object with a ``predict`` method, optional
        Classifier to evaluate. When omitted, the notebook-level ``model`` is
        used (looked up at call time), which matches the previous behaviour.

    Returns
    -------
    tuple
        ``(accuracy, macro_f1)`` as computed by ``accuracy_score`` and
        ``f1_score(..., average='macro')``.
    """
    # Fall back to the global model only when no estimator is supplied, so the
    # helper can also score other fitted classifiers without rebinding `model`.
    clf = model if estimator is None else estimator
    predictions = clf.predict(X)
    return (
        accuracy_score(y, predictions),
        f1_score(y, predictions, average='macro'),
    )

# Score the fitted baseline on both held-out splits.
val_acc, val_f1 = evaluate_split("Validation", X_val, y_val)
test_acc, test_f1 = evaluate_split("Test", X_test, y_test)

# Assemble the one-line summary from its four fragments, then emit it once.
report_fragments = (
    f"Validation Accuracy: {val_acc*100:.2f}% ",
    f"Validation Macro-F1: {val_f1:.4f} ",
    f"Test Accuracy: {test_acc*100:.2f}% ",
    f"Test Macro-F1: {test_f1:.4f}",
)
print("".join(report_fragments))


Validation Accuracy: 81.92% Validation Macro-F1: 0.5201 Test Accuracy: 81.23% Test Macro-F1: 0.5146
