<a href="https://colab.research.google.com/github/yassienshaalan/applied-ml-ai-systems-articles/blob/main/LLM_DS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q pandas numpy scikit-learn xgboost optuna

import numpy as np
import pandas as pd
import json
import time
from pathlib import Path
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
from xgboost import XGBClassifier


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/404.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m307.2/404.7 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# =========================
# Dataset
# =========================
def load_adult():
    """Load the OpenML "adult" dataset (version 2) with a binary ``target`` column.

    The original ``class`` column is renamed to ``target`` and recoded to
    ``1`` when its string form contains ``">50K"`` and ``0`` otherwise.

    Returns:
        A DataFrame with a fresh 0..n-1 index holding the features plus
        the integer ``target`` column.
    """

    def _to_binary(label):
        # Substring match, so any spelling that contains ">50K" counts as positive.
        return 1 if ">50K" in label else 0

    frame = fetch_openml(name="adult", version=2, as_frame=True).frame
    frame = frame.rename(columns={"class": "target"})
    frame["target"] = frame["target"].astype(str).map(_to_binary)

    # Safeguard against unlabeled rows before handing the frame to callers.
    labeled = frame.dropna(subset=["target"])
    return labeled.reset_index(drop=True)

# Materialise the dataset once; every later cell reuses these globals.
df = load_adult()
y = df["target"].to_numpy()       # labels as a NumPy array
X = df.drop(columns=["target"])   # feature frame without the label column

# =========================
# Preprocessing
# =========================
def build_preprocessor(X):
    """Build an unfitted ColumnTransformer matched to the dtypes of ``X``.

    Numeric columns are median-imputed and standardised; every other column
    is imputed with its most frequent value and one-hot encoded (dense
    output, unknown categories ignored at transform time).

    Args:
        X: DataFrame whose columns define the numeric/categorical split.

    Returns:
        The unfitted ``ColumnTransformer``.
    """
    numeric_cols, categorical_cols = [], []
    for col in X.columns:
        if pd.api.types.is_numeric_dtype(X[col]):
            numeric_cols.append(col)
        else:
            categorical_cols.append(col)

    numeric_steps = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler()),
    ])
    categorical_steps = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("oh", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ])

    transformers = [
        ("num", numeric_steps, numeric_cols),
        ("cat", categorical_steps, categorical_cols),
    ]
    return ColumnTransformer(transformers)

# =========================
# Metrics
# =========================
def metrics(y_true, y_score):
    """Score predicted positive-class probabilities against true labels.

    Args:
        y_true: Ground-truth binary labels.
        y_score: Predicted scores/probabilities for the positive class.

    Returns:
        Dict with keys ``"auroc"``, ``"auprc"`` and ``"brier"`` (in that order).
    """
    scorers = (
        ("auroc", roc_auc_score),
        ("auprc", average_precision_score),
        ("brier", brier_score_loss),
    )
    return {name: scorer(y_true, y_score) for name, scorer in scorers}

# =========================
# Injections
# =========================
def inject_label_noise(y, rate=0.15, seed=42):
    """Return a copy of binary labels ``y`` with a fraction of entries flipped.

    Args:
        y: Array of 0/1 labels; it is copied, not modified.
        rate: Fraction of labels to flip; ``int(len(y) * rate)`` positions
            are drawn without replacement.
        seed: Seed for the NumPy random generator that picks the positions.

    Returns:
        The copied labels with the selected positions replaced by ``1 - label``.
    """
    generator = np.random.default_rng(seed)
    noisy = y.copy()

    n_total = len(y)
    n_flip = int(n_total * rate)
    flip_at = generator.choice(n_total, n_flip, replace=False)

    noisy[flip_at] = 1 - noisy[flip_at]
    return noisy

def inject_leakage(X, y, strength=0.98, seed=42):
    """Return a copy of ``X`` with a ``__leak__`` column derived from the labels.

    For each row a uniform draw in [0, 1) is compared with ``strength``:
    when the draw is smaller the leak value equals the label, otherwise it
    is ``1 - label``.

    Args:
        X: Feature DataFrame; it is copied, not modified.
        y: Array of 0/1 labels aligned row-by-row with ``X``.
        strength: Probability that the leak value equals the label.
        seed: Seed for the NumPy random generator.

    Returns:
        A copy of ``X`` with the extra ``__leak__`` column appended.
    """
    draws = np.random.default_rng(seed).random(len(y))
    agrees_with_label = draws < strength

    leaked = X.copy()
    leaked["__leak__"] = np.where(agrees_with_label, y, 1 - y)
    return leaked


In [None]:
# =========================
# Experiment Runner
# =========================
# One full experiment is repeated per seed; every run appends one dict to `results`.
SEEDS = [7, 13, 21, 42, 99]
results = []

for seed in SEEDS:
    # IID split: 20% held out as test, then 25% of the remainder as validation
    # (i.e. 60/20/20 overall). X_va / y_va are not used inside this loop.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y
    )
    X_tr, X_va, y_tr, y_va = train_test_split(
        X_tr, y_tr, test_size=0.25, random_state=seed, stratify=y_tr
    )

    # Preprocessor for the clean feature set; shared by both models below.
    prep = build_preprocessor(X_tr)

    # Fresh estimator objects are created for every seed.
    models = {
        "logreg": LogisticRegression(max_iter=500, n_jobs=-1),
        "xgb": XGBClassifier(
            n_estimators=700,
            learning_rate=0.05,
            max_depth=4,
            subsample=0.9,
            colsample_bytree=0.9,
            eval_metric="aucpr",
            tree_method="hist",
            n_jobs=-1,
            random_state=seed
        )
    }

    for model_name, model in models.items():
        # -------- Baseline: clean labels, clean features --------
        pipe = Pipeline([("prep", prep), ("model", model)])
        pipe.fit(X_tr, y_tr)

        base = metrics(y_te, pipe.predict_proba(X_te)[:,1])
        results.append({
            "seed": seed,
            "model": model_name,
            "regime": "IID",
            "injection": "NONE",
            **base
        })

        # -------- Label Noise --------
        # The same pipeline object is fitted again, this time on flipped training
        # labels; evaluation still uses the untouched test labels.
        y_tr_noisy = inject_label_noise(y_tr, seed=seed)
        pipe.fit(X_tr, y_tr_noisy)
        noisy = metrics(y_te, pipe.predict_proba(X_te)[:,1])
        results.append({
            "seed": seed,
            "model": model_name,
            "regime": "IID",
            "injection": "LABEL_NOISE",
            **noisy
        })

        # -------- Leakage --------
        # A `__leak__` column derived from each split's own labels is added to both
        # train and test (different seeds), so the preprocessor is rebuilt for the
        # widened frame and `pipe` is rebound to a new pipeline around the same model.
        X_tr_leak = inject_leakage(X_tr, y_tr, seed=seed)
        X_te_leak = inject_leakage(X_te, y_te, seed=seed+1)
        prep_leak = build_preprocessor(X_tr_leak)
        pipe = Pipeline([("prep", prep_leak), ("model", model)])
        pipe.fit(X_tr_leak, y_tr)
        leak = metrics(y_te, pipe.predict_proba(X_te_leak)[:,1])
        results.append({
            "seed": seed,
            "model": model_name,
            "regime": "IID",
            "injection": "LEAKAGE",
            **leak
        })

# =========================
# Final Outputs
# =========================
# Long-format table: one row per entry appended to `results`.
results_df = pd.DataFrame(results)
results_df.to_csv("final_results_long.csv", index=False)

# Mean and standard deviation of each metric per (model, injection) group.
summary = results_df.groupby(["model", "injection"])[["auroc", "auprc", "brier"]].agg(["mean", "std"]).reset_index()
summary.to_csv("final_summary.csv", index=False)

# Last expression of the cell, so the notebook displays both tables.
results_df, summary


(    seed   model regime    injection     auroc     auprc     brier
 0      7  logreg    IID         NONE  0.905609  0.762717  0.102325
 1      7  logreg    IID  LABEL_NOISE  0.892331  0.729824  0.124998
 2      7  logreg    IID      LEAKAGE  0.995354  0.986595  0.013306
 3      7     xgb    IID         NONE  0.929147  0.831054  0.087475
 4      7     xgb    IID  LABEL_NOISE  0.919190  0.812790  0.106236
 5      7     xgb    IID      LEAKAGE  0.995626  0.987684  0.012642
 6     13  logreg    IID         NONE  0.907755  0.768424  0.101381
 7     13  logreg    IID  LABEL_NOISE  0.899475  0.747593  0.122863
 8     13  logreg    IID      LEAKAGE  0.995763  0.985920  0.013242
 9     13     xgb    IID         NONE  0.931485  0.834594  0.086293
 10    13     xgb    IID  LABEL_NOISE  0.920976  0.818921  0.105414
 11    13     xgb    IID      LEAKAGE  0.996194  0.988151  0.012671
 12    21  logreg    IID         NONE  0.904831  0.767589  0.102158
 13    21  logreg    IID  LABEL_NOISE  0.894816 

In [None]:
# =========================
# CELL 4 — LLM-as-Data-Scientist (Robust JSON + Validation + Safe Loop)
# =========================
# What this cell does:
# - Queries an LLM to propose a model + hyperparameters (logreg or xgb)
# - Makes the response robust to malformed JSON using:
#     1) JSON response_format (preferred)
#     2) JSON extraction + minimal repair (fallback)
#     3) "repair" call (last resort)
# - Validates/clamps hyperparameters to avoid runtime crashes
# - Runs K iterations and logs proposals + validation metrics
#
# Prereqs:
# - You must have already run Cells 1–3 (so X, y, build_preprocessor(), metrics(), etc. exist)
# - Set OPENAI_API_KEY in Colab:
#     import os; os.environ["OPENAI_API_KEY"]="..."
#
# Output:
# - llm_df: one row per iteration with val metrics
# - llm_history: full proposals + metrics (for your article narrative)

import os
import re
import json
import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# -------------------------
# Client
# -------------------------

# Read the key from the OPENAI_API_KEY environment variable (see the prerequisites
# in this cell's header) rather than hard-coding it, so the secret never ends up in
# the notebook. NOTE(review): with the variable unset the OpenAI client is expected
# to raise at construction time instead of failing later on the first request.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# -------------------------
# Packet sent to the LLM
# -------------------------
def dataset_packet(X, y):
    """Summarise a dataset as a small dict to embed in the LLM prompt.

    Args:
        X: Feature DataFrame.
        y: Label array; its mean is reported as the target rate.

    Returns:
        Dict with the row/feature counts, the target rate, the numeric and
        non-numeric column names, and the five highest per-column missing
        rates.
    """
    n_rows, n_features = X.shape

    numeric_cols, categorical_cols = [], []
    for col in X.columns:
        if pd.api.types.is_numeric_dtype(X[col]):
            numeric_cols.append(col)
        else:
            categorical_cols.append(col)

    # Fraction of missing values per column, largest first.
    missing_rates = X.isna().mean().sort_values(ascending=False)

    summary = {}
    summary["n_rows"] = int(n_rows)
    summary["n_features"] = int(n_features)
    summary["target_rate"] = float(np.mean(y))
    summary["numeric_features"] = numeric_cols
    summary["categorical_features"] = categorical_cols
    summary["missing_rate_top5"] = missing_rates.head(5).to_dict()
    return summary

# System message sent with every proposal request in query_llm(). It pins the
# objective (validation AUPRC) and the exact JSON shape the validator expects:
# a "model" key ("logreg" or "xgb") and a "hyperparameters" object.
# This string is sent as-is (never passed through str.format), so the literal
# braces below need no escaping.
SYSTEM_PROMPT = """
You are acting as a senior data scientist doing pragmatic hyperparameter search for a tabular binary classifier.

Your goal: maximize validation AUPRC.

You MUST return ONLY valid JSON with EXACT keys:
{
  "model": "logreg" or "xgb",
  "hyperparameters": { ... }
}

Rules:
- Keep hyperparameters conservative (do not propose exotic settings).
- Do not include explanations or markdown.
- Do not include any keys beyond model and hyperparameters.
""".strip()

# User message template filled in by query_llm() via str.format with two fields:
#   {dataset_summary} - JSON dump of the dataset summary dict
#   {history}         - JSON dump of the most recent entries of the attempt history
# Any other literal braces added here would have to be doubled for str.format.
USER_PROMPT_TEMPLATE = """
Dataset summary:
{dataset_summary}

History of previous attempts (iteration, model, hyperparameters, val metrics):
{history}

Propose the NEXT model and hyperparameters.
Return ONLY JSON.
""".strip()

# -------------------------
# Robust JSON extraction/repair
# -------------------------
def _extract_json_object(text: str) -> str:
    """Pull the outermost ``{...}`` span out of free-form model output.

    Args:
        text: Raw text that should contain a JSON object, possibly wrapped
            in prose or markdown fences.

    Returns:
        The stripped text itself when it already starts with ``{`` and ends
        with ``}``; otherwise the span from the first ``{`` to the last
        ``}`` (newlines included).

    Raises:
        ValueError: If no ``{...}`` span exists in the text.
    """
    candidate = text.strip()

    looks_like_object = candidate.startswith("{") and candidate.endswith("}")
    if looks_like_object:
        return candidate

    # Greedy match with DOTALL: first opening brace through the last closing brace.
    match = re.search(r"\{.*\}", candidate, flags=re.DOTALL)
    if match is None:
        raise ValueError(f"No JSON object found in output. First 400 chars:\n{candidate[:400]}")
    return match.group(0)

def _parses_as_json(s: str) -> bool:
    """Return True when *s* is already syntactically valid JSON."""
    try:
        json.loads(s)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return False
    return True


def _repair_common_json_issues(s: str) -> str:
    """Apply minimal, staged repairs to almost-JSON text.

    The repairs are heuristics that can damage text which is already valid
    (e.g. turning the apostrophe in ``"don't"`` into a double quote), so they
    are applied one stage at a time and the function returns as soon as the
    text parses:

    1. strip surrounding whitespace;
    2. remove trailing commas before ``}`` or ``]``;
    3. replace curly quotes with their ASCII counterparts;
    4. replace unescaped single quotes with double quotes.

    Args:
        s: Candidate JSON text.

    Returns:
        The repaired text. It is NOT guaranteed to be valid JSON; callers
        must still parse it and handle failure.
    """
    s = s.strip()
    if _parses_as_json(s):
        return s

    # Trailing commas: {"a": 1,} -> {"a": 1}
    s = re.sub(r",\s*([}\]])", r"\1", s)
    if _parses_as_json(s):
        return s

    # Curly quotes -> ASCII quotes (double and single, left and right).
    s = (
        s.replace("\u201c", '"')
        .replace("\u201d", '"')
        .replace("\u2018", "'")
        .replace("\u2019", "'")
    )
    if _parses_as_json(s):
        return s

    # Last heuristic: single quotes not preceded by a backslash -> double quotes.
    return re.sub(r"(?<!\\)'", '"', s)

def _validate_and_normalize_proposal(p: dict) -> dict:
    """Validate an LLM proposal and clamp its hyperparameters to safe ranges.

    Unknown hyperparameters are dropped. Numeric values that are missing,
    non-numeric, ``None`` or NaN fall back to the default for that parameter
    instead of raising, and every numeric value is clipped to a fixed range.

    For ``logreg`` the solver/penalty pair is forced to a compatible
    combination, ``l1_ratio`` is kept (defaulting to 0.5, clipped to [0, 1])
    only for the elastic-net penalty, and ``class_weight`` is kept only when
    it is ``None``, ``"balanced"`` or a non-empty dict whose keys convert to
    ``int`` and values to ``float``.

    Args:
        p: Parsed proposal, expected to look like
            ``{"model": "logreg" | "xgb", "hyperparameters": {...}}``.

    Returns:
        ``{"model": <name>, "hyperparameters": <normalized dict>}``.

    Raises:
        ValueError: If ``p`` is not a dict, the model name is not one of the
            two supported strings, or ``hyperparameters`` is not a dict.
    """

    def _clipped(value, default, lo, hi):
        # Coerce to float, fall back to the default for unusable input, then clip.
        try:
            number = float(value)
        except (TypeError, ValueError, OverflowError):
            number = float(default)
        if number != number:  # NaN never compares equal to itself
            number = float(default)
        return float(np.clip(number, lo, hi))

    if not isinstance(p, dict):
        raise ValueError("Proposal is not a dict.")

    model = p.get("model", None)
    # isinstance guard first: an unhashable value (e.g. a list) would otherwise
    # raise TypeError from the set membership test.
    if not isinstance(model, str) or model not in {"logreg", "xgb"}:
        raise ValueError(f"Invalid model '{model}'. Must be 'logreg' or 'xgb'.")

    hp = p.get("hyperparameters", {})
    if hp is None:
        hp = {}
    if not isinstance(hp, dict):
        raise ValueError("hyperparameters must be a dict.")

    if model == "logreg":
        allowed = {"C", "solver", "penalty", "class_weight", "l1_ratio"}
        hp = {k: v for k, v in hp.items() if k in allowed}

        hp["C"] = _clipped(hp.get("C"), 1.0, 1e-4, 1e3)

        solver = str(hp.get("solver", "lbfgs"))
        penalty = str(hp.get("penalty", "l2"))

        if solver not in {"lbfgs", "liblinear", "saga", "newton-cg"}:
            solver = "lbfgs"

        # enforce solver/penalty compatibility
        if solver in {"lbfgs", "newton-cg"} and penalty != "l2":
            penalty = "l2"
        if solver == "liblinear" and penalty not in {"l1", "l2"}:
            penalty = "l2"
        if solver == "saga" and penalty not in {"l1", "l2", "elasticnet"}:
            penalty = "l2"

        hp["solver"] = solver
        hp["penalty"] = penalty

        # Elastic-net needs a mixing ratio; for other penalties the value is unused.
        if penalty == "elasticnet":
            hp["l1_ratio"] = _clipped(hp.get("l1_ratio"), 0.5, 0.0, 1.0)
        else:
            hp.pop("l1_ratio", None)

        if "class_weight" in hp:
            cw = hp["class_weight"]
            if isinstance(cw, dict):
                # JSON object keys are always strings; the labels in this file are
                # the ints 0/1, so convert keys back to int.
                try:
                    converted = {int(k): float(v) for k, v in cw.items()}
                except (TypeError, ValueError):
                    converted = {}
                if converted:
                    hp["class_weight"] = converted
                else:
                    del hp["class_weight"]
            elif cw is not None and cw != "balanced":
                del hp["class_weight"]

    if model == "xgb":
        allowed = {
            "n_estimators", "learning_rate", "max_depth", "min_child_weight",
            "subsample", "colsample_bytree", "reg_lambda", "reg_alpha", "gamma"
        }
        hp = {k: v for k, v in hp.items() if k in allowed}

        # Integer parameters are clipped first and then truncated.
        hp["n_estimators"] = int(_clipped(hp.get("n_estimators"), 600, 100, 2000))
        hp["learning_rate"] = _clipped(hp.get("learning_rate"), 0.05, 0.005, 0.3)
        hp["max_depth"] = int(_clipped(hp.get("max_depth"), 4, 2, 10))
        hp["min_child_weight"] = _clipped(hp.get("min_child_weight"), 1.0, 1.0, 20.0)
        hp["subsample"] = _clipped(hp.get("subsample"), 0.9, 0.5, 1.0)
        hp["colsample_bytree"] = _clipped(hp.get("colsample_bytree"), 0.9, 0.5, 1.0)
        hp["reg_lambda"] = _clipped(hp.get("reg_lambda"), 1.0, 0.0, 50.0)
        hp["reg_alpha"] = _clipped(hp.get("reg_alpha"), 0.0, 0.0, 10.0)
        hp["gamma"] = _clipped(hp.get("gamma"), 0.0, 0.0, 10.0)

    return {"model": model, "hyperparameters": hp}

def query_llm(dataset_summary, history, model_name="gpt-4o-mini"):
    """Ask the LLM for the next model/hyperparameter proposal.

    Three strategies are tried in order, each ending in
    ``_validate_and_normalize_proposal``:

    1. a chat completion with ``response_format={"type": "json_object"}``;
    2. a plain chat completion whose text is passed through
       ``_extract_json_object`` and ``_repair_common_json_issues``;
    3. a second request asking the model to repair the text from step 2
       into valid JSON.

    Args:
        dataset_summary: JSON-serialisable dataset description for the prompt.
        history: List of previous attempts; only the last six are sent.
        model_name: Chat model identifier passed to the client.

    Returns:
        The validated proposal dict with keys ``model`` and ``hyperparameters``.

    Raises:
        Exception: Whatever the final attempt raises (API, parsing or
            validation errors) when all three strategies fail; an API error
            on the step-2 request also propagates directly.
    """
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT_TEMPLATE.format(
            dataset_summary=json.dumps(dataset_summary, indent=2),
            history=json.dumps(history[-6:], indent=2)  # cap history
        )}
    ]

    # 1) Preferred: strict JSON response_format
    try:
        resp = client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.2,
            response_format={"type": "json_object"},
        )
        raw = resp.choices[0].message.content
        proposal = json.loads(raw)
        return _validate_and_normalize_proposal(proposal)
    except Exception:
        # Deliberately broad: any failure here (API, parsing, validation) falls
        # through to the plain-text strategy.
        # 2) Fallback: extract + repair
        resp = client.chat.completions.create(
            model=model_name,
            messages=messages,
            temperature=0.2,
        )
        # Guard against a missing message body so the string helpers get a str.
        raw = resp.choices[0].message.content or ""

        try:
            # Extraction and repair live inside the try so that a reply without
            # any {...} span still reaches the last-resort repair call below.
            blob = _extract_json_object(raw)
            blob = _repair_common_json_issues(blob)
            proposal = json.loads(blob)
            return _validate_and_normalize_proposal(proposal)
        except Exception:
            # 3) Last resort: ask model to repair into valid JSON
            resp2 = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": "Return ONLY valid JSON with keys: model, hyperparameters."},
                    {"role": "user", "content": f"Repair into valid JSON only:\n{raw}"},
                ],
                temperature=0.0,
                response_format={"type": "json_object"},
            )
            proposal = json.loads(resp2.choices[0].message.content)
            return _validate_and_normalize_proposal(proposal)

# -------------------------
# Run the LLM loop on a fixed split (fair comparison)
# -------------------------
LLM_ITERS = 8  # bump if you want more exploration
seed = 42

# Same 60/20/20 split recipe as the experiment runner, but with one fixed seed.
# Only the train and validation parts are used below; X_te / y_te stay untouched.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)
X_tr, X_va, y_tr, y_va = train_test_split(X_tr, y_tr, test_size=0.25, random_state=seed, stratify=y_tr)

prep = build_preprocessor(X_tr)
# Summary of the training split only; this is what the LLM gets to see.
packet = dataset_packet(X_tr, y_tr)

llm_history = []  # full proposals + metrics, fed back to the LLM on the next iteration
llm_results = []  # flat rows for the results DataFrame

for i in range(LLM_ITERS):
    print(f"LLM Iteration {i+1}/{LLM_ITERS}")

    # hard guard: if the LLM fails, continue with a safe default so the suite never stops
    try:
        proposal = query_llm(packet, llm_history, model_name="gpt-4o-mini")
    except Exception as e:
        print("LLM proposal failed; using safe fallback. Error:", str(e))
        proposal = {"model": "xgb", "hyperparameters": {"n_estimators": 600, "learning_rate": 0.05, "max_depth": 4}}
        # Run the fallback through the validator too so it gets the same defaults.
        proposal = _validate_and_normalize_proposal(proposal)

    # Build the proposed estimator; fixed settings first, proposed hyperparameters on top.
    if proposal["model"] == "logreg":
        model = LogisticRegression(max_iter=500, n_jobs=-1, **proposal["hyperparameters"])
    else:
        model = XGBClassifier(
            eval_metric="aucpr",
            tree_method="hist",
            n_jobs=-1,
            random_state=seed,
            **proposal["hyperparameters"]
        )

    pipe = Pipeline([("prep", prep), ("model", model)])

    # `time` is imported in the notebook's first cell, not in this one.
    t0 = time.time()
    pipe.fit(X_tr, y_tr)
    train_time_s = time.time() - t0

    # Score on the validation split; these metrics go back to the LLM via llm_history.
    val_scores = pipe.predict_proba(X_va)[:, 1]
    val_m = metrics(y_va, val_scores)

    llm_history.append({
        "iteration": i + 1,
        "proposal": proposal,
        "val_metrics": val_m,
        "train_time_s": train_time_s
    })

    llm_results.append({
        "iteration": i + 1,
        "model": proposal["model"],
        # Stored as a JSON string so each row stays flat in the DataFrame.
        "hyperparameters": json.dumps(proposal["hyperparameters"], sort_keys=True),
        "val_auroc": val_m["auroc"],
        "val_auprc": val_m["auprc"],
        "val_brier": val_m["brier"],
        "train_time_s": train_time_s
    })

# Best validation AUPRC first.
llm_df = pd.DataFrame(llm_results).sort_values("val_auprc", ascending=False).reset_index(drop=True)
llm_df


LLM Iteration 1/8
LLM Iteration 2/8
LLM Iteration 3/8
LLM Iteration 4/8
LLM Iteration 5/8
LLM Iteration 6/8
LLM Iteration 7/8
LLM Iteration 8/8


Unnamed: 0,iteration,model,hyperparameters,val_auroc,val_auprc,val_brier,train_time_s
0,6,xgb,"{""colsample_bytree"": 0.8, ""gamma"": 0.1, ""learn...",0.930909,0.834651,0.086354,2.27953
1,8,xgb,"{""colsample_bytree"": 0.8, ""gamma"": 0.1, ""learn...",0.930895,0.834583,0.086371,2.741769
2,7,xgb,"{""colsample_bytree"": 0.8, ""gamma"": 0.1, ""learn...",0.930977,0.834575,0.086322,2.447523
3,5,xgb,"{""colsample_bytree"": 0.8, ""gamma"": 0.1, ""learn...",0.930761,0.834284,0.086497,3.797983
4,4,xgb,"{""colsample_bytree"": 0.8, ""gamma"": 0.1, ""learn...",0.930836,0.834256,0.086493,1.651811
5,3,xgb,"{""colsample_bytree"": 0.8, ""gamma"": 0.1, ""learn...",0.930374,0.83409,0.086561,1.979546
6,2,xgb,"{""colsample_bytree"": 0.8, ""gamma"": 0.1, ""learn...",0.929286,0.831238,0.087413,1.066486
7,1,xgb,"{""colsample_bytree"": 0.8, ""gamma"": 0.0, ""learn...",0.92302,0.816605,0.091688,1.772087
