# 2.0 - Baseline Model

Simple scikit-learn baseline using TF-IDF + Logistic Regression.

In [None]:
# Cell 1 - imports & paths
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Locations shared by the rest of the notebook: the cleaned sample is read
# from DATA_INTERIM and the fitted artifacts are written to MODELS_DIR.
DATA_INTERIM = Path('data/interim')
MODELS_DIR = Path('models/checkpoints')

# Create the checkpoint directory (and any missing parents) up front so the
# later save step cannot fail on a missing folder.
MODELS_DIR.mkdir(exist_ok=True, parents=True)

print("Paths set. DATA_INTERIM:", DATA_INTERIM.resolve())

In [None]:
# Cell 2 - load preprocessed sample
# Loads the cleaned sample, derives the text/label series and makes an 80/20
# train/test split, stratified whenever the label distribution allows it.
from sklearn.model_selection import train_test_split  # also imported in Cell 1

TEST_SIZE = 0.2

train_csv = DATA_INTERIM / 'bharat_sample_clean.csv'
if train_csv.exists():
    df = pd.read_csv(train_csv)
else:
    print("Interim file not found:", train_csv)
    df = pd.DataFrame({'clean_text': [], 'label': []})

# Fail here with an actionable message instead of letting the split below die
# on an obscure "n_samples=0" error when the file is missing or nearly empty.
if 'clean_text' not in df.columns:
    raise KeyError(
        f"'clean_text' column missing from {train_csv}; "
        f"found columns: {list(df.columns)}"
    )
if len(df) < 2:
    raise ValueError(
        f"Need at least 2 rows to build a train/test split, got {len(df)} "
        f"(source: {train_csv})"
    )

# Unlabelled data: every row gets the placeholder label 0.
if 'label' not in df.columns:
    df['label'] = 0

X = df['clean_text'].fillna('')  # missing text is treated as the empty string
y = df['label']

# A stratified split needs at least 2 samples of every class and at least one
# sample per class in both folds; otherwise train_test_split raises. The fold
# sizes are computed the same way train_test_split derives them from a float
# test_size (ceil for the test fold, remainder for the train fold).
class_counts = y.value_counts()
n_test = int(np.ceil(TEST_SIZE * len(y)))
n_train = len(y) - n_test
can_stratify = (
    len(class_counts) > 1
    and class_counts.min() >= 2
    and min(n_train, n_test) >= len(class_counts)
)
if len(class_counts) > 1 and not can_stratify:
    print("Stratified split not possible for this label distribution; "
          "using a plain random split.")
strat = y if can_stratify else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=42, stratify=strat
)
print("Loaded dataset. Train size:", len(X_train))

In [None]:
# Cell 3 - vectorize & train baseline
# Unigram + bigram TF-IDF features capped at 20k terms. The vocabulary is
# learned from the training split only and reused unchanged for the test split.
vec = TfidfVectorizer(ngram_range=(1, 2), max_features=20000)
Xtr = vec.fit_transform(X_train)
Xte = vec.transform(X_test)

# Logistic regression on the sparse TF-IDF matrix, with the iteration cap set
# explicitly to 400.
model = LogisticRegression(max_iter=400)
model.fit(Xtr, y_train)

# save both fitted objects side by side under MODELS_DIR
for fitted, filename in ((vec, 'baseline_vec.joblib'), (model, 'baseline_model.pkl')):
    joblib.dump(fitted, MODELS_DIR / filename)
print("Saved baseline model and vectorizer to", MODELS_DIR)

In [None]:
# Cell 4 - evaluation
# Print the per-class report followed by the confusion matrix for the held-out
# split, or a notice when there is nothing to evaluate on.
if len(X_test) == 0:
    print("No test set available for evaluation.")
else:
    yhat = model.predict(Xte)
    for report in (classification_report, confusion_matrix):
        print(report(y_test, yhat))

In [None]:
# Cell 5 - threat confidence function
def threat_confidence(text) -> float:
    """Score a single text with the baseline ``vec`` + ``model`` pair.

    Args:
        text: Raw text to score. ``None`` and float NaN are treated as the
            empty string, matching how missing ``clean_text`` values are
            filled before training, so they no longer crash the vectorizer.

    Returns:
        For a two-class model, the probability in the second column of
        ``predict_proba`` (the class listed second in ``model.classes_``).
        For any other number of classes, the highest class probability.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    p = model.predict_proba(vec.transform([text]))[0]
    if p.shape[0] == 2:
        return float(p[1])
    return float(p.max())


print('sample score', threat_confidence('This is a test example.'))