
 # Milestone II — Why these 3 models and how they work

 We keep one familiar baseline and add two models with different inductive biases so their errors are less correlated.
 The trio gives strong accuracy, calibrated probabilities (needed for UI), and robustness to noisy/misspelled text.

 - 3 models, 3 views of the text: a word-level linear model (LR), a word-level Naive Bayes model (CNB), and a character-level max-margin model (SVM).

 ## 1) TF-IDF + Logistic Regression (LR)
 Word n-grams + LR. Fast, smooth probabilities, interpretable features. Misses char clues; linear boundary.

 ## 2) TF-IDF + Complement Naive Bayes (CNB)
 Word n-grams + CNB. Strong on short/imbalanced data, trains very fast. Independence assumption limits boundary.

 ## 3) Char TF-IDF (3–5) + Calibrated Linear SVM
 Character n-grams capture typos/subword patterns. SVM margins are strong; calibration yields probabilities.

 **Safety knobs**: dynamic `min_df`, capped `max_features`, `class_weight="balanced"` (LR/SVM), SVM probability calibration.


## === Cell 1. Imports, paths, reproducibility ===

In [11]:

import json, warnings, random, re
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier
import joblib

# Silence all warnings for the rest of the notebook session.
warnings.filterwarnings("ignore")

# Resolve the dataset no matter where the notebook runs: look for
# data/assignment3_II.csv under the working directory, then under its parent,
# then under its grandparent.
NB_DIR = Path.cwd().resolve()
# Path.parents has fewer than two entries when the notebook runs from "/" or
# a first-level directory. Slicing (instead of indexing parents[1]) avoids an
# IndexError there and simply yields fewer candidates.
_SEARCH_ROOTS = [NB_DIR, NB_DIR.parent, *list(NB_DIR.parents)[1:2]]
CANDIDATES = [root / "data" / "assignment3_II.csv" for root in _SEARCH_ROOTS]
DATA_CSV = next((p for p in CANDIDATES if p.exists()), None)
if DATA_CSV is None:
    raise FileNotFoundError("Place 'assignment3_II.csv' under project_root/data/")

# The project root is the directory that contains data/.
PROJECT_ROOT = DATA_CSV.parent.parent
DATA_DIR  = PROJECT_ROOT / "data"
MODEL_DIR = PROJECT_ROOT / "model"
DATA_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Artifact locations written by the export cell.
ENSEMBLE_PKL   = MODEL_DIR / "ensemble_soft.pkl"
MANIFEST_JSON  = MODEL_DIR / "manifest.json"

# Seed both the stdlib and NumPy global RNGs for reproducibility.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_CSV    :", DATA_CSV)
print("MODEL_DIR   :", MODEL_DIR)

PROJECT_ROOT: /Users/mac/Desktop/dem-web
DATA_CSV    : /Users/mac/Desktop/dem-web/data/assignment3_II.csv
MODEL_DIR   : /Users/mac/Desktop/dem-web/model


## === Cell 2. Load CSV & normalize columns ===
 - Accept either `Recommended IND` or `Recommended` as label.
 - Minimal cleaning; keep "not/never/no" in vocabulary.

In [12]:

df = pd.read_csv(DATA_CSV)

# Display fields: prefer the dedicated column, fall back to the raw review
# column, and finally to an empty string when neither exists.
# DataFrame.get returns its default unchanged, so nesting two .get calls with
# a "" default would hand a plain str to .fillna and raise AttributeError
# whenever both columns are absent.
for display_col, fallback_col in (
    ("Clothes Title", "Title"),
    ("Clothes Description", "Review Text"),
):
    if display_col in df.columns:
        df[display_col] = df[display_col].fillna("")
    elif fallback_col in df.columns:
        df[display_col] = df[fallback_col].fillna("")
    else:
        df[display_col] = ""

# Ensure required columns exist (missing ones are created as empty strings)
for c in [
    "Clothing ID","Clothes Title","Clothes Description","Rating",
    "Division Name","Department Name","Class Name","Review Text","Title"
]:
    if c not in df.columns:
        df[c] = ""

# Types
def to_int_safe(x, default=0):
    """Convert *x* to ``int``, returning *default* when the conversion fails.

    Only conversion failures are absorbed: ``TypeError`` (e.g. ``None``),
    ``ValueError`` (e.g. non-numeric text or NaN) and ``OverflowError``
    (e.g. infinity). Anything else, such as ``KeyboardInterrupt``, propagates
    instead of being silently turned into *default*.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return default

# Coerce types: unparseable IDs/ratings fall back to 0, missing text to "".
df["Clothing ID"] = df["Clothing ID"].apply(to_int_safe)
df["Rating"]      = pd.to_numeric(df["Rating"], errors="coerce").fillna(0).astype(int)
df["Review Text"] = df["Review Text"].fillna("").astype(str)
df["Title"]       = df["Title"].fillna("").astype(str)

# Label column: first of the accepted names that is present.
label_col = next(
    (name for name in ("Recommended IND", "Recommended") if name in df.columns),
    None,
)
if label_col is None:
    raise ValueError("No label found. Expect 'Recommended IND' or 'Recommended'.")

# Keep only rows whose label parses to exactly 0 or 1. Missing or unparseable
# labels become NaN and are dropped here; filling them with 0 would silently
# relabel them as the negative class and let them through the 0/1 filter.
df[label_col] = pd.to_numeric(df[label_col], errors="coerce")
df = df[df[label_col].isin([0, 1])].copy()
df[label_col] = df[label_col].astype(int)

print("Rows:", len(df))
print("Label:", label_col, "| Positives:", int(df[label_col].sum()))

Rows: 19662
Label: Recommended IND | Positives: 16087


## === Cell 3. Light text cleaning for modeling ===

In [13]:
def clean_text(text: str) -> str:
    """Lower-case *text* and reduce it to a-z, 0-9, apostrophes and single spaces."""
    lowered = text.lower()
    # Every character outside a-z, 0-9, whitespace and the apostrophe becomes a space.
    whitelisted = re.sub(r"[^a-z0-9\s']", " ", lowered)
    # Collapse each whitespace run to one space, then trim both ends.
    collapsed = re.sub(r"\s+", " ", whitelisted)
    return collapsed.strip()

# Normalise both free-text columns in place. The originals are overwritten,
# so later cells only see the punctuation-stripped text.
for _text_col in ("Review Text", "Title"):
    df[_text_col] = df[_text_col].apply(clean_text)

# Drop rows whose review is empty after cleaning.
mask_nonempty = df["Review Text"].str.len().gt(0)
df = df.loc[mask_nonempty].copy()

print("Remaining samples:", len(df))

Remaining samples: 19662


## === Cell 4. Build corpus & labels ===

In [14]:
# Labels as an integer NumPy vector; corpus as a plain list of review strings
# in the same row order.
y = df[label_col].astype(int).to_numpy()
X = df["Review Text"].tolist()

print(f"Samples: {len(y)} | Positives: {int(y.sum())}")
# Preview the first 120 characters of the first review, if there is one.
print("Example:", (X[0][:120] + "...") if X else "(empty)")

Samples: 19662 | Positives: 16087
Example: i had such high hopes for this dress and really wanted it to work for me i initially ordered the petite small my usual s...


## === Cell 5. Define the 3 pipelines ===
 - Word TF-IDF (1–3) + LR (balanced).
 - Word TF-IDF (1–3) + Complement NB.
 - Char TF-IDF (3–5) + LinearSVC (calibrated to get `predict_proba`).


In [15]:

SEED = 42

# Negation and intensity words are taken out of the stop-word list so they
# stay in the vocabulary; every other English stop word is still removed.
_KEPT_TOKENS = frozenset({"no", "not", "never", "too", "very"})
STOP_WORDS_KEEP_NEG = sorted(ENGLISH_STOP_WORDS - _KEPT_TOKENS)

def choose_min_df(n_docs: int) -> int:
    """Return the TF-IDF ``min_df`` for a corpus of *n_docs* documents.

    Corpora below 5000 documents use 1 so that rare bigrams such as
    'too tight' stay in the vocabulary; larger corpora use 2.
    """
    if n_docs < 5000:
        return 1
    return 2

def make_tfidf_lr(n_docs: int, max_features: int = 40_000) -> Pipeline:
    """Build an unfitted word-level TF-IDF (1-3 grams) + Logistic Regression pipeline.

    Args:
        n_docs: Corpus size; passed to ``choose_min_df`` to pick ``min_df``.
        max_features: Upper bound on the TF-IDF vocabulary size.

    Returns:
        A ``Pipeline`` with steps ``"tfidf"`` and ``"clf"``.
    """
    word_tfidf = TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=max_features,
        min_df=choose_min_df(n_docs),
        stop_words=STOP_WORDS_KEEP_NEG,
        sublinear_tf=True,
    )
    # class_weight="balanced" reweights classes inversely to their frequency.
    log_reg = LogisticRegression(
        max_iter=2000,
        class_weight="balanced",
        solver="liblinear",
        random_state=SEED,
    )
    return Pipeline([("tfidf", word_tfidf), ("clf", log_reg)])

def make_tfidf_cnb(n_docs: int, max_features: int = 40_000, alpha: float = 0.5) -> Pipeline:
    """Build an unfitted word-level TF-IDF (1-3 grams) + Complement Naive Bayes pipeline.

    Args:
        n_docs: Corpus size; passed to ``choose_min_df`` to pick ``min_df``.
        max_features: Upper bound on the TF-IDF vocabulary size.
        alpha: Smoothing parameter forwarded to ``ComplementNB``.

    Returns:
        A ``Pipeline`` with steps ``"tfidf"`` and ``"clf"``.
    """
    word_tfidf = TfidfVectorizer(
        ngram_range=(1, 3),
        max_features=max_features,
        min_df=choose_min_df(n_docs),
        stop_words=STOP_WORDS_KEEP_NEG,
        sublinear_tf=True,
    )
    complement_nb = ComplementNB(alpha=alpha)
    return Pipeline([("tfidf", word_tfidf), ("clf", complement_nb)])

def make_char_svm(n_docs: int, C: float = 0.5) -> Pipeline:
    """Build an unfitted char-level TF-IDF (3-5 grams) + calibrated LinearSVC pipeline.

    Args:
        n_docs: Corpus size; passed to ``choose_min_df`` to pick ``min_df``,
            as the word-level factories do. For corpora of 5000+ documents
            this yields 2, the value that was previously hard-coded.
        C: Regularisation strength forwarded to ``LinearSVC``.

    Returns:
        A ``Pipeline`` with steps ``"vec"`` and ``"clf"``; the classifier
        exposes ``predict_proba`` through sigmoid calibration.
    """
    # char_wb is strong on misspellings/short phrases
    vec = TfidfVectorizer(
        analyzer="char_wb",
        ngram_range=(3, 5),
        sublinear_tf=True,
        min_df=choose_min_df(n_docs),
    )
    svm = LinearSVC(C=C, class_weight="balanced", random_state=SEED)
    # LinearSVC has no predict_proba; 3-fold sigmoid calibration provides it.
    cal = CalibratedClassifierCV(svm, method="sigmoid", cv=3)
    return Pipeline([("vec", vec), ("clf", cal)])

## === Cell 6. Build X (corpus) and y (labels) ===

In [16]:


def _clean_text(s: str) -> str:
    s = (s or "").lower()
    # keep basic punctuation that can form bigrams like "too tight"
    s = re.sub(r"[^a-z0-9\s.,!'?-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Work on a copy so the frame from the earlier cells is left untouched.
dfe = df.copy()
dfe["Review Text"] = dfe["Review Text"].fillna("").astype(str).map(_clean_text)

# Keep rows that have non-empty text and a 0/1 label.
y_all = pd.to_numeric(dfe[label_col], errors="coerce").fillna(0).astype(int)
_has_text = dfe["Review Text"].str.len() > 0
_has_binary_label = y_all.isin([0, 1])
mask = _has_text & _has_binary_label
dfe = dfe.loc[mask].reset_index(drop=True)

# Final arrays
y = dfe[label_col].astype(int).to_numpy()
X = dfe["Review Text"].tolist()

print(f"[Cell 6] Samples kept: {len(y)} | Positives: {int(y.sum())} | Negatives: {len(y) - int(y.sum())}")
print("Example:", (X[0][:120] + "...") if len(X) else "(empty)")

[Cell 6] Samples kept: 19662 | Positives: 16087 | Negatives: 3575
Example: i had such high hopes for this dress and really wanted it to work for me i initially ordered the petite small my usual s...


## === Cell 7. Quick CV sanity check (Acc/F1/AUC) ===



In [17]:

def _safe_cv(y_vec: np.ndarray, seed: int = SEED) -> StratifiedKFold:
    """Return a shuffled ``StratifiedKFold`` sized to the smaller class.

    The number of folds is the count of the rarer of labels 0 and 1, capped
    at 5 and never below 2.
    """
    n_negative = int((y_vec == 0).sum())
    n_positive = int((y_vec == 1).sum())
    folds = min(5, n_negative, n_positive)
    if folds < 2:
        folds = 2
    return StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)

def _eval_model(name: str, pipe: Pipeline, X_list, y_vec):
    """Cross-validate *pipe* and report mean accuracy, F1 and ROC-AUC.

    Args:
        name: Display name used in the printed line and the returned dict.
        pipe: Unfitted pipeline to evaluate.
        X_list: Documents.
        y_vec: Binary labels aligned with *X_list*.

    Returns:
        ``{"name": name, "accuracy": ..., "f1": ..., "roc_auc": ...}`` with
        each metric averaged over the folds from ``_safe_cv``.
    """
    # Imported here because the notebook's import cell only brings in cross_val_score.
    from sklearn.model_selection import cross_validate

    metric_names = ("accuracy", "f1", "roc_auc")
    cv = _safe_cv(y_vec)
    # One multi-metric run fits each fold once and scores all metrics on it,
    # instead of refitting every fold once per metric.
    scores = cross_validate(pipe, X_list, y_vec, cv=cv, scoring=list(metric_names))
    metrics = {metric: scores[f"test_{metric}"].mean() for metric in metric_names}
    print(f"[{name}]  Acc={metrics['accuracy']:.3f}  F1={metrics['f1']:.3f}  AUC={metrics['roc_auc']:.3f}")
    return {"name": name, **metrics}

# (display name, pipeline factory) for each candidate, evaluated in this order.
_cv_candidates = [
    ("TFIDF+LR (1–3)", make_tfidf_lr),
    ("TFIDF+CNB (1–3)", make_tfidf_cnb),
    ("Char TFIDF+SVM (3–5)", make_char_svm),
]
results = [
    _eval_model(display_name, factory(len(X)), X, y)
    for display_name, factory in _cv_candidates
]

# Notebook display: one row per model, best ROC-AUC first.
pd.DataFrame(results).sort_values("roc_auc", ascending=False)

[TFIDF+LR (1–3)]  Acc=0.881  F1=0.924  AUC=0.936
[TFIDF+CNB (1–3)]  Acc=0.893  F1=0.936  AUC=0.930
[Char TFIDF+SVM (3–5)]  Acc=0.892  F1=0.936  AUC=0.935


Unnamed: 0,name,accuracy,f1,roc_auc
0,TFIDF+LR (1–3),0.880582,0.924484,0.935627
2,Char TFIDF+SVM (3–5),0.892279,0.935727,0.934982
1,TFIDF+CNB (1–3),0.892839,0.935686,0.930439


## === Cell 8. Train on all data & export artifacts ===

In [18]:

# Train base models through the ensemble.
# VotingClassifier.fit clones every estimator it is given and fits the clones,
# so fitting the pipelines beforehand would train each model twice and the
# pre-fitted objects would never be used by the ensemble. The unfitted
# pipelines are therefore passed in directly.

# Slightly higher weight on char model (often helps short/misspelled reviews)
VOTE_WEIGHTS = (0.9, 0.9, 1.2)

ensemble = VotingClassifier(
    estimators=[
        ("tfidf_lr",  make_tfidf_lr(len(X))),
        ("tfidf_cnb", make_tfidf_cnb(len(X))),
        ("char_svm",  make_char_svm(len(X))),
    ],
    voting="soft",
    weights=list(VOTE_WEIGHTS),
    n_jobs=None
).fit(X, y)

# Fitted base models, taken from the ensemble so the names stay available.
mdl_lr  = ensemble.named_estimators_["tfidf_lr"]
mdl_cnb = ensemble.named_estimators_["tfidf_cnb"]
mdl_svm = ensemble.named_estimators_["char_svm"]

# --- Runtime post-probability heuristic (documented & used by API) ---
# Keep this set in sync with model_info.py or let the API read from manifest.
NEG_PHRASES = {
    "too tight", "too small", "itchy", "scratchy", "see through",
    "cheap fabric", "poor fit", "returned", "uncomfortable",
    "runs small", "runs large",
}
# Probability subtracted per matched phrase (API uses this)
HEURISTIC_DELTA = 0.08
# Decision threshold used by API
THRESHOLD = 0.60

# Persist the pure sklearn ensemble with joblib.
joblib.dump(ensemble, ENSEMBLE_PKL)

# Manifest with the parameters the Flask app reads.
manifest = {
    "bundle_name": "VotingClassifier (LR + CNB + CharSVM)",
    # Estimator name -> vote weight, in the same order as VOTE_WEIGHTS.
    "weights": dict(zip(("tfidf_lr", "tfidf_cnb", "char_svm"), VOTE_WEIGHTS)),
    "threshold": THRESHOLD,
    "post_prob_heuristic": True,
    "heuristic_delta": HEURISTIC_DELTA,
    "neg_phrases": sorted(NEG_PHRASES),
    "word_ngram": "(1,3)",
    "char_ngram": "(3,5)",
    "samples": int(len(X)),
    "positives": int(y.sum()),
    "notes": "Keep negation/intensity tokens; slight char-model upweight; optional heuristic nudge (too tight, runs small, ...).",
}
MANIFEST_JSON.write_text(json.dumps(manifest, indent=2), encoding="utf-8")

print(f"Saved → {ENSEMBLE_PKL}")
print(f"Saved → {MANIFEST_JSON}")

# --- Optional: quick sanity check mirroring the API (raw vs adjusted vs label) ---
import numpy as np
from pathlib import Path

def _post_prob_adjust(text: str, p: float, phrases=NEG_PHRASES, delta=HEURISTIC_DELTA, enabled=True) -> float:
    """Lower *p* by *delta* for every phrase found in *text*, clamped to [0, 1].

    Matching is a case-insensitive substring test against the lower-cased
    text. With ``enabled=False`` the probability is returned untouched.
    """
    if not enabled:
        return p
    haystack = (text or "").lower()
    matched = [phrase for phrase in phrases if phrase in haystack]
    nudged = p - len(matched) * delta
    return max(0.0, min(1.0, nudged))

def _decide_label(prob: float, thr: float = THRESHOLD) -> str:
    """Return "Positive" when *prob* reaches the threshold *thr*, else "Negative"."""
    if prob >= thr:
        return "Positive"
    return "Negative"

def predict_like_api(model, text: str) -> tuple[str, float, float]:
    """Return (label, prob_adjusted, prob_raw) using the same rules as the API.

    The raw positive-class probability comes from the first available source:
    ``predict_proba`` (column 1), a logistic squash of ``decision_function``,
    or a fixed 0.75 / 0.25 derived from the hard prediction. The phrase
    heuristic and the decision threshold are then applied to it.
    """
    if hasattr(model, "predict_proba"):
        proba = model.predict_proba([text])
        raw = float(proba[:, 1][0])
    elif hasattr(model, "decision_function"):
        margin = float(model.decision_function([text])[0])
        raw = float(1.0 / (1.0 + np.exp(-margin)))
    else:
        predicted = int(model.predict([text])[0])
        if predicted == 1:
            raw = 0.75
        else:
            raw = 0.25

    adjusted = _post_prob_adjust(text, raw, phrases=NEG_PHRASES, delta=HEURISTIC_DELTA, enabled=True)
    label = _decide_label(adjusted, thr=THRESHOLD)
    return label, adjusted, raw

# Load the artifact back from disk so the check runs on exactly what was
# saved, as the Flask runtime would.
_model_check = joblib.load(ENSEMBLE_PKL)

# Two reviews that contain heuristic phrases and two that contain none.
_examples = [
    "Lovely ugly top, but too tight",
    "This dress is beautiful and I love it!",
    "Cheap fabric and very uncomfortable, runs small",
    "Perfect fit, high quality material.",
]

print("\nSanity check (raw -> adjusted -> label):")
for t in _examples:
    # predict_like_api returns (label, adjusted probability, raw probability).
    lab, p_adj, p_raw = predict_like_api(_model_check, t)
    print(f"- {t}\n  raw={p_raw:.3f}  adj={p_adj:.3f}  thr={THRESHOLD:.2f}  → {lab}\n")

Saved → /Users/mac/Desktop/dem-web/model/ensemble_soft.pkl
Saved → /Users/mac/Desktop/dem-web/model/manifest.json

Sanity check (raw -> adjusted -> label):
- Lovely ugly top, but too tight
  raw=0.614  adj=0.534  thr=0.60  → Negative

- This dress is beautiful and I love it!
  raw=0.915  adj=0.915  thr=0.60  → Positive

- Cheap fabric and very uncomfortable, runs small
  raw=0.074  adj=0.000  thr=0.60  → Negative

- Perfect fit, high quality material.
  raw=0.798  adj=0.798  thr=0.60  → Positive

