# 2.0 - Baseline Model

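This notebook mirrors the TF-IDF + Logistic Regression and Random Forest baseline parts: it fits both classifiers on the same TF-IDF features, prints a validation classification report for each, and saves each model together with the fitted vectorizer.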

In [None]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# --- Load dataset ---
# The CSV must provide a "text" column and a "label" column castable to int.
df = pd.read_csv("data/processed/combined_dataset.csv")

# Drop incomplete rows before casting: astype(str) would otherwise turn a
# missing text into the literal token "nan" (and train on it), and astype(int)
# raises on a missing label. `df` itself is left as loaded.
labeled = df.dropna(subset=["text", "label"])

X = labeled["text"].astype(str).tolist()
y = labeled["label"].astype(int).tolist()

# Hold out 20% for validation. Stratifying on y keeps the class proportions
# the same in both splits; the fixed seed makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# --- TF-IDF Vectorization ---
# Unigram and bigram features, with the vocabulary capped at 20,000 terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
)

# Fit the vocabulary and IDF weights on the training split only, then reuse
# the fitted vectorizer for validation so no validation text shapes the features.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)


In [None]:
# --- Logistic Regression ---
# Baseline linear classifier on the TF-IDF features; seeded for reproducibility.
log_reg = LogisticRegression(max_iter=200, random_state=42)
log_reg.fit(X_train_tfidf, y_train)
y_pred_lr = log_reg.predict(X_val_tfidf)

print("[Baseline Logistic Regression]")
print(classification_report(y_val, y_pred_lr))

# Persist the model together with the fitted vectorizer, since the model can
# only score text transformed by this exact vocabulary. Create the checkpoint
# directory first: joblib.dump does not create missing parent directories and
# would otherwise fail after training has already finished.
logreg_checkpoint = "models/checkpoints/baseline_logreg.pkl"
Path(logreg_checkpoint).parent.mkdir(parents=True, exist_ok=True)
joblib.dump((log_reg, vectorizer), logreg_checkpoint)



In [None]:
# --- Random Forest ---
# Baseline tree ensemble (200 trees) on the same TF-IDF features; seeded for
# reproducibility.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_tfidf, y_train)
y_pred_rf = rf.predict(X_val_tfidf)

print("[Baseline Random Forest]")
print(classification_report(y_val, y_pred_rf))

# Persist the model together with the fitted vectorizer, since the model can
# only score text transformed by this exact vocabulary. Create the checkpoint
# directory first: joblib.dump does not create missing parent directories and
# would otherwise fail after the forest has already been trained.
rf_checkpoint = "models/checkpoints/baseline_rf.pkl"
Path(rf_checkpoint).parent.mkdir(parents=True, exist_ok=True)
joblib.dump((rf, vectorizer), rf_checkpoint)
