In [None]:
!pip -q install numpy pandas scikit-learn scipy

import os, json, numpy as np, pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MaxAbsScaler, normalize
from sklearn.impute import SimpleImputer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from scipy.optimize import linear_sum_assignment

# --- Run configuration ---------------------------------------------------
DATA_PATH   = "/content/data.csv"        # CSV file holding features and the label
OUT_DIR     = "artifacts_model2_short"   # directory that receives every artifact
EMBED_DIMS  = 192                        # upper bound on the number of SVD components
USE_L2      = True                       # L2-normalise the SVD embeddings when True
SEED        = 42                         # random_state shared by splits, SVD and k-means

os.makedirs(OUT_DIR, exist_ok=True)

# Exact duplicate rows are removed before any splitting happens.
df = pd.read_csv(DATA_PATH).drop_duplicates().reset_index(drop=True)

# Column names (compared stripped and lower-cased) that are taken as the label.
_LABEL_NAMES = frozenset({
    "label", "labels", "target", "class", "classes",
    "category", "y", "outcome", "diagnosis",
})


def _infer_label_column(frame):
    """Return the name of the column to use as the classification label.

    Resolution order:
      1. the first column whose stripped, lower-cased name is in ``_LABEL_NAMES``
         (this already covers a column literally named ``"Class"``);
      2. otherwise the first column whose number of distinct values (NaN
         counted as a value) lies between 2 and ``min(50, max(2, 20% of rows))``.

    Raises:
        ValueError: if neither rule selects a column.
    """
    for col in frame.columns:
        # str() keeps the scan from failing on non-string column labels.
        if str(col).strip().lower() in _LABEL_NAMES:
            return col

    n_rows = len(frame)
    max_classes = min(50, max(2, int(0.2 * n_rows)))
    for col in frame.columns:
        if 2 <= frame[col].nunique(dropna=False) <= max_classes:
            return col

    raise ValueError("Couldn't infer label column.")


LABEL_COL = _infer_label_column(df)

# Labels are handled as strings throughout; every other column is a feature.
y = df[LABEL_COL].astype(str).to_numpy()
feat_cols = [c for c in df.columns if c != LABEL_COL]

def is_num(s):
    """Tell whether ``s`` (a Series, array or dtype) is numeric per pandas."""
    numeric = pd.api.types.is_numeric_dtype(s)
    return numeric
# Categorical features = every non-numeric column plus numeric columns with at
# most 5 distinct values. Testing "not numeric" rather than dtype == "object"
# keeps text columns categorical even when pandas stores them with a dedicated
# string dtype (e.g. under the `future.infer_string` option) or as `category`;
# otherwise they would end up in num_cols and be sent to the median imputer.
obj_cols      = [c for c in feat_cols if not is_num(df[c])]
low_card_cols = [c for c in feat_cols if is_num(df[c]) and df[c].nunique() <= 5]
cat_cols      = sorted(set(obj_cols + low_card_cols))
num_cols      = [c for c in feat_cols if c not in cat_cols]


# Stratified 80/20 train/test split on the raw (not yet preprocessed) features.
Xtr_df, Xte_df, ytr, yte = train_test_split(df[feat_cols], y, test_size=0.20, stratify=y, random_state=SEED)


def make_ohe_sparse():
    """Build a sparse-output ``OneHotEncoder`` that ignores unseen categories.

    The ``sparse_output`` keyword is tried first; if the constructor rejects
    it, the encoder is built with the older ``sparse`` keyword instead.

    Returns:
        An unfitted ``OneHotEncoder`` with ``handle_unknown="ignore"``.
    """
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        # An unknown keyword argument surfaces as TypeError; any other error
        # (including KeyboardInterrupt) must propagate instead of being hidden.
        return OneHotEncoder(handle_unknown="ignore", sparse=True)

# Per-type preprocessing: mode-impute + one-hot for categoricals,
# median-impute + max-abs scaling for numerics.
cat_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", make_ohe_sparse()),
])
num_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("scale", MaxAbsScaler()),
])

# Only register a branch when it actually has columns to work on.
trs = [
    (tag, pipe, cols)
    for tag, pipe, cols in (("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols))
    if cols
]
pre_train = ColumnTransformer(transformers=trs, remainder="drop", sparse_threshold=1.0)

# Fit on the training split only, then apply the fitted transform to the test split.
Xtr_pre = pre_train.fit_transform(Xtr_df)
Xte_pre = pre_train.transform(Xte_df)
n_features = Xtr_pre.shape[1]

# Component count: min(EMBED_DIMS, n_features - 1), but never fewer than 2.
svd_dims = max(2, min(EMBED_DIMS, n_features - 1))
svd = TruncatedSVD(n_components=svd_dims, random_state=SEED)

Ztr = svd.fit_transform(Xtr_pre)
Zte = svd.transform(Xte_pre)

if USE_L2:
    Ztr, Zte = normalize(Ztr), normalize(Zte)

explained = svd.explained_variance_ratio_.sum()
print(f"[Info] Preprocessed features: {n_features} | SVD dims used: {svd_dims} | var≈{explained:.4f}")

def hungarian_map(clusters, labels):
    """Map every cluster id to a class label by maximum-overlap assignment.

    A cluster-by-label count table is built from the paired inputs and
    ``linear_sum_assignment`` is run on its negation, so the one-to-one
    assignment maximises the total number of agreeing samples. Clusters left
    unassigned (more clusters than labels) take their own most frequent label.

    Args:
        clusters: Cluster id per sample.
        labels: Ground-truth label per sample, paired with ``clusters``.

    Returns:
        Dict from each distinct cluster id to a label.
    """
    cluster_ids = np.unique(clusters)
    label_ids = np.unique(labels)
    row_of = {cid: pos for pos, cid in enumerate(cluster_ids)}
    col_of = {lab: pos for pos, lab in enumerate(label_ids)}

    # Contingency table: overlap[i, j] = samples in cluster i carrying label j.
    overlap = np.zeros((len(cluster_ids), len(label_ids)), dtype=int)
    for (cid, lab), count in Counter(zip(clusters, labels)).items():
        overlap[row_of[cid], col_of[lab]] = count

    # Negate so that the cost-minimising solver maximises the overlap.
    rows, cols = linear_sum_assignment(-overlap)
    mapping = {cluster_ids[i]: label_ids[j] for i, j in zip(rows, cols)}

    # Clusters the assignment skipped fall back to their majority label.
    for cid in cluster_ids:
        if cid not in mapping:
            mapping[cid] = label_ids[np.argmax(overlap[row_of[cid]])]
    return mapping


# Candidate cluster counts: 1x, 2x, 3x and 4x the number of training classes.
n_labels = len(np.unique(ytr))
k_grid = sorted({n_labels, 2*n_labels, 3*n_labels, 4*n_labels})

# Hold out 20% of the training embeddings to choose k.
Ztr_tr, Ztr_val, ytr_tr, ytr_val = train_test_split(Ztr, ytr, test_size=0.2, stratify=ytr, random_state=SEED)

# hungarian_map only has entries for clusters that received at least one
# fitting sample. A validation point may still be nearest to a centre that got
# none, so unmapped clusters fall back to the majority label instead of raising
# KeyError -- the same policy the final model uses further down.
val_fallback = Counter(ytr_tr).most_common(1)[0][0]

best = {"k": None, "val_acc": -1, "model": None, "map": None}
for k in k_grid:
    mbk = MiniBatchKMeans(n_clusters=k, random_state=SEED, n_init=15, batch_size=4096, max_iter=250).fit(Ztr_tr)
    mapping = hungarian_map(mbk.labels_, ytr_tr)
    yval_pred = np.array([mapping.get(c, val_fallback) for c in mbk.predict(Ztr_val)], dtype=object)
    acc = accuracy_score(ytr_val, yval_pred)
    # Strict '>' keeps the smallest k among ties.
    if acc > best["val_acc"]:
        best.update({"k": k, "val_acc": acc, "model": mbk, "map": mapping})

print(f"[k-choice] k∈{k_grid} | picked k={best['k']} | val_acc={best['val_acc']:.4f}")


# Refit on the full training embeddings with the chosen k, then label both splits.
final = MiniBatchKMeans(n_clusters=best["k"], random_state=SEED, n_init=20, batch_size=8192, max_iter=300).fit(Ztr)
c_tr, c_te = final.labels_, final.predict(Zte)
cl2lb = hungarian_map(c_tr, ytr)
fallback = Counter(ytr).most_common(1)[0][0]  # majority class for unmapped clusters
ytr_pred = np.array([cl2lb.get(c, fallback) for c in c_tr], dtype=object)
yte_pred = np.array([cl2lb.get(c, fallback) for c in c_te], dtype=object)

def report(name, y_true, y_pred):
    """Print accuracy, macro-F1 and the per-class report for one data split.

    Args:
        name: Text printed in front of the metric line.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        Tuple ``(accuracy, macro_f1)``.
    """
    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average="macro", zero_division=0)

    headline = f"{name} Accuracy: {acc:.4f} | Macro-F1: {f1m:.4f}"
    print(headline)

    per_class = classification_report(y_true, y_pred, zero_division=0)
    print(per_class)

    return acc, f1m

print("\n=== MODEL 2 (SHORT+FIXED) RESULTS ===")
acc_tr, f1m_tr = report("TRAIN", ytr, ytr_pred)
acc_te, f1m_te = report("TEST ", yte, yte_pred)
err_tr, err_te = 1 - acc_tr, 1 - acc_te
gap = err_te - err_tr
print(f"Train Error: {err_tr:.4f} | Test Error: {err_te:.4f} | Gap: {gap:.4f}")

# One-vs-rest TP/FP/FN/TN per class, read off the test confusion matrix.
labels = sorted(np.unique(np.concatenate([ytr, yte]).astype(str)))
cm = confusion_matrix(yte, yte_pred, labels=labels)
tp = np.diag(cm)
fn = cm.sum(axis=1) - tp   # row total minus the diagonal
fp = cm.sum(axis=0) - tp   # column total minus the diagonal
tn = cm.sum() - tp - fp - fn
rows = [
    {"class": lab, "TP": int(tp[i]), "FP": int(fp[i]), "FN": int(fn[i]), "TN": int(tn[i])}
    for i, lab in enumerate(labels)
]
fpfn_test = pd.DataFrame(rows).set_index("class")

# Per-sample test predictions, keyed by the row index of the test frame.
pred_df = pd.DataFrame({
    "index": Xte_df.index,
    "y_true": yte,
    "cluster": c_te,
    "y_pred": yte_pred,
    "correct": (yte_pred == yte),
})

fpfn_test.to_csv(os.path.join(OUT_DIR, "fpfn_test.csv"))
pred_df.to_csv(os.path.join(OUT_DIR, "predictions_test.csv"), index=False)

summary = {
    "embed": {
        "type": "TruncatedSVD",
        "dims": int(svd_dims),
        "l2": bool(USE_L2),
        "preprocessed_features": int(n_features),
        "variance_est": float(svd.explained_variance_ratio_.sum()),
    },
    "k": int(best["k"]),
    "val_acc": float(best["val_acc"]),
    "train": {"accuracy": float(acc_tr), "macro_f1": float(f1m_tr), "error": float(err_tr)},
    "test": {
        "accuracy": float(acc_te),
        "macro_f1": float(f1m_te),
        "error": float(err_te),
        "gap": float(gap),
    },
}
# Serialise once; the same text goes to disk and to stdout.
summary_json = json.dumps(summary, indent=2)
with open(os.path.join(OUT_DIR, "results_summary.json"), "w") as f:
    f.write(summary_json)
print("\n[SUMMARY]\n", summary_json)
print(f"\nArtifacts in: {OUT_DIR}")


[Info] Preprocessed features: 68 | SVD dims used: 67 | var≈1.0000
[k-choice] k∈[9, 18, 27, 36] | picked k=36 | val_acc=0.7891

=== MODEL 2 (SHORT+FIXED) RESULTS ===
TRAIN Accuracy: 0.8306 | Macro-F1: 0.7239
              precision    recall  f1-score   support

           1       0.85      0.89      0.87      1203
           2       0.83      0.94      0.88      1982
           3       0.99      0.85      0.92      2353
           4       0.43      0.89      0.58       379
           5       0.04      0.29      0.07        34
           6       0.80      0.79      0.80       577
           7       0.89      0.82      0.85       318
           8       0.99      0.71      0.83       978
           9       0.93      0.59      0.72       806

    accuracy                           0.83      8630
   macro avg       0.75      0.75      0.72      8630
weighted avg       0.88      0.83      0.84      8630

TEST  Accuracy: 0.8295 | Macro-F1: 0.7199
              precision    recall  f1-score   

In [None]:

import numpy as np, matplotlib.pyplot as plt, os
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

os.makedirs(OUT_DIR, exist_ok=True)

# Re-run the k sweep on the train/validation split to record both error curves.
k_grid = sorted({n_labels, 2*n_labels, 3*n_labels, 4*n_labels})  # e.g., [9,18,27,36]
train_errs, val_errs = [], []

# hungarian_map only covers clusters that received fitting samples; a point
# predicted into any other cluster gets the majority label instead of raising
# KeyError.
sweep_fallback = Counter(ytr_tr).most_common(1)[0][0]

for k in k_grid:
    mbk = MiniBatchKMeans(n_clusters=k, random_state=SEED, n_init=15, batch_size=4096, max_iter=250).fit(Ztr_tr)
    mp = hungarian_map(mbk.labels_, ytr_tr)
    y_tr_hat  = np.array([mp.get(c, sweep_fallback) for c in mbk.predict(Ztr_tr)], dtype=object)
    y_val_hat = np.array([mp.get(c, sweep_fallback) for c in mbk.predict(Ztr_val)], dtype=object)
    train_errs.append(1.0 - accuracy_score(ytr_tr, y_tr_hat))
    val_errs.append(1.0 - accuracy_score(ytr_val, y_val_hat))

# Error-vs-k plot, written straight to disk and closed without being shown.
plt.figure(figsize=(6.5,4.5), dpi=140)
plt.plot(k_grid, train_errs, marker="o", label="Train error")
plt.plot(k_grid, val_errs, marker="o", label="Validation error")
plt.title("Model 2: fitting graph (k vs. error)")
plt.xlabel("k (clusters)")
plt.ylabel("Error = 1 - accuracy")
plt.legend()
plt.tight_layout()
plt.savefig(os.path.join(OUT_DIR, "fitting_graph_k.png"), dpi=160)
plt.close()

def save_cm_png(y_true, y_pred, title, fname):
    """Draw the confusion matrix of ``y_pred`` vs ``y_true`` and save it.

    The axis labels are the sorted union of the values found in both inputs
    (compared as strings). The figure is written to ``OUT_DIR/fname`` and
    closed afterwards.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Title placed above the plot.
        fname: File name created inside ``OUT_DIR``.
    """
    every_label = np.concatenate([y_true, y_pred]).astype(str)
    labs = sorted(np.unique(every_label))
    matrix = confusion_matrix(y_true, y_pred, labels=labs)

    fig, ax = plt.subplots(figsize=(6.2, 5.4), dpi=140)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labs)
    display.plot(ax=ax, colorbar=False)
    ax.set_title(title)
    plt.xticks(rotation=30)

    fig.tight_layout()
    fig.savefig(os.path.join(OUT_DIR, fname), dpi=160)
    plt.close(fig)

# Confusion-matrix images for both splits, produced from the final predictions.
cm_jobs = [
    (ytr, ytr_pred, "Model 2 — Confusion Matrix (TRAIN)", "cm_train.png"),
    (yte, yte_pred, "Model 2 — Confusion Matrix (TEST)",  "cm_test.png"),
]
for truth, pred, title, fname in cm_jobs:
    save_cm_png(truth, pred, title, fname)

# Report every image this cell wrote.
saved = [
    os.path.join(OUT_DIR, name)
    for name in ("fitting_graph_k.png", "cm_train.png", "cm_test.png")
]
print("Saved:", *saved)


Saved: artifacts_model2_short/fitting_graph_k.png artifacts_model2_short/cm_train.png artifacts_model2_short/cm_test.png
