In [4]:
# ╔═══════════════════════════════════════════════════════════════════════════╗
# 0 ▸ Imports
# ╚═══════════════════════════════════════════════════════════════════════════╝
import numpy as np, pandas as pd, os, warnings, json
from pathlib import Path
from numpy.random          import default_rng
from sklearn.compose       import ColumnTransformer
from sklearn.pipeline      import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute        import SimpleImputer
from sklearn.linear_model  import LogisticRegressionCV, LassoCV
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from sklearn.metrics       import roc_auc_score, accuracy_score, mean_squared_error

warnings.filterwarnings("ignore")

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 1 ▸ Dataset registry  (update the paths if yours differ)
# ╚═══════════════════════════════════════════════════════════════════════════╝

# ╔═══════════════════════════════════════════════════════════════════════════╗
# Dataset catalogue – matches every entry in `csv_paths`
# ╚═══════════════════════════════════════════════════════════════════════════╝
DATASETS = {
    # ────────────────────────────────  BANK MARKETING  ────────────────────────────────
    "bank-full": {
        "path": "bank+marketing/bank/bank-full.csv",
        "sep":  ";",
        "y":    "y",                 # binary: "yes"/"no"
        "task": "cls"
    },
    "bank": {
        "path": "bank+marketing/bank/bank.csv",
        "sep":  ";",
        "y":    "y",
        "task": "cls"
    },
    "bank-additional-full": {
        "path": "bank+marketing/bank-additional/bank-additional-full.csv",
        "sep":  ";",
        "y":    "y",
        "task": "cls"
    },
    "bank-additional": {
        "path": "bank+marketing/bank-additional/bank-additional.csv",
        "sep":  ";",
        "y":    "y",
        "task": "cls"
    },

    # ────────────────────────────────  GLIOMA / TCGA  ────────────────────────────────
    "TCGA_Mutations_all": {
        "path": "glioma+grading+clinical+and+mutation+features+dataset/TCGA_GBM_LGG_Mutations_all.csv",
        "sep":  ",",
        "y":    "Grade",             # or use 'yclass' if your copy labels that way
        "task": "cls"
    },
    "TCGA_InfoWithGrade": {
        "path": "glioma+grading+clinical+and+mutation+features+dataset/TCGA_InfoWithGrade.csv",
        "sep":  ",",
        "y":    "Grade",
        "task": "cls"
    },

    # ────────────────────────────────  WINE QUALITY  ────────────────────────────────
    "winequality-red": {
        "path": "wine+quality/winequality-red.csv",
        "sep":  ";",
        "y":    "quality",           # integer 0-10 – treat as regression (paper’s choice)
        "task": "reg"
    },
    "winequality-white": {
        "path": "wine+quality/winequality-white.csv",
        "sep":  ";",
        "y":    "quality",
        "task": "reg"
    },

    # ────────────────────────────────  PIMA DIABETES  ────────────────────────────────
    "diabetes": {
        "path": "diabetes.csv",
        "sep":  ",",
        "y":    "Outcome",           # 0 / 1
        "task": "cls"
    },

    # ────────────────────────────────  SPOTIFY STREAMS  ────────────────────────────────
    "spotify": {
        "path": "Most Streamed Spotify Songs 2024.csv",
        "sep":  ",",
        "y": "Spotify Streams",   # numeric count
        "task": "reg"
    },
}
# ╔═══════════════════════════════════════════════════════════════════════════╗
# 2 ▸ Utility helpers
# ╚═══════════════════════════════════════════════════════════════════════════╝
def read_csv_safely(path, sep=","):
    """UTF-8 reader with ISO-8859-1 fallback."""
    try:
        return pd.read_csv(path, sep=sep)
    except UnicodeDecodeError:
        return pd.read_csv(path, sep=sep, encoding="ISO-8859-1")

def drop_text_columns(df):
    """Keep only numeric + categorical (object / category / bool) columns."""
    keep = df.select_dtypes(include=["number", "object", "category", "bool"]).columns
    return df[keep]

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 3 ▸ Probabilistic Categorical Imputer
# ╚═══════════════════════════════════════════════════════════════════════════╝
from sklearn.base import BaseEstimator, TransformerMixin

class ProbabilisticCategoricalImputer(BaseEstimator, TransformerMixin):
    """
    Replace NaNs in each categorical column by randomly sampling from that
    column’s observed distribution.
    """
    def __init__(self, random_state=None):
        self.random_state = random_state
        self.rng_         = default_rng(random_state)
        self.proba_       = {}               # col → (values, probs)

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        for col in X.columns:
            counts = X[col].value_counts(dropna=True)
            values = counts.index.to_numpy()
            probs  = (counts / counts.sum()).to_numpy()
            self.proba_[col] = (values, probs)
        return self

    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col, (vals, probs) in self.proba_.items():
            mask = X[col].isna()
            if mask.any():
                X.loc[mask, col] = self.rng_.choice(vals, size=mask.sum(), p=probs)
        return X.values  # keep column order intact

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 4 ▸ Pre-processor factory  (uses probabilistic imputer for categoricals)
# ╚═══════════════════════════════════════════════════════════════════════════╝
def make_preprocessor(df, seed=42):
    num_cols = df.select_dtypes(include=["number"]).columns
    cat_cols = df.columns.difference(num_cols)

    numeric_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("scale",  StandardScaler())
    ])

    cat_pipe = Pipeline([
        ("impute", ProbabilisticCategoricalImputer(random_state=seed)),
        ("oh",     OneHotEncoder(handle_unknown="ignore"))
    ])

    return ColumnTransformer([
        ("num", numeric_pipe, num_cols),
        ("cat", cat_pipe,     cat_cols)
    ])

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 5 ▸ Model runner (10 random 80/20 splits → mean ± std score)
# ╚═══════════════════════════════════════════════════════════════════════════╝
def run_plain_l1_model(X, y, task, seed=42):
    pre = make_preprocessor(X, seed=seed)

    if task == "cls":
        model = LogisticRegressionCV(
            penalty="l1", solver="saga", max_iter=10_000,
            Cs=np.logspace(-4,4,10), cv=5, scoring="roc_auc"
        )
        splitter   = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
        metric_fn  = lambda m, Xt, yt: roc_auc_score(yt, m.predict_proba(Xt)[:, 1])
        metric_tag = "AUROC"
    else:  # regression
        model      = LassoCV(cv=10, max_iter=10_000)
        splitter   = ShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
        metric_fn  = lambda m, Xt, yt: mean_squared_error(yt, m.predict(Xt))
        metric_tag = "MSE"

    pipe   = Pipeline([("prep", pre), ("model", model)])
    scores = []

    for tr_idx, te_idx in splitter.split(X, y):
        pipe.fit(X.iloc[tr_idx], y.iloc[tr_idx])
        scores.append(metric_fn(pipe, X.iloc[te_idx], y.iloc[te_idx]))

    return metric_tag, np.mean(scores), np.std(scores)

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 6 ▸ Main loop
# ╚═══════════════════════════════════════════════════════════════════════════╝
results = []

for name, cfg in DATASETS.items():
    path = Path(cfg["path"])
    if not path.exists():
        print(f"⚠️  {name}: file not found → {path}")
        continue

    print(f"\n🔹 {name}")
    raw = read_csv_safely(path, sep=cfg["sep"])

    if cfg["y"] not in raw.columns:
        print("   ❌ target column not found; skipping."); continue

    y = raw[cfg["y"]].copy()
    X = raw.drop(columns=[cfg["y"]])
    X = drop_text_columns(X).dropna(axis=1, how="all")  # keep only valid cols

    # optional row-drop if very few NaNs
    drop_row = X.dropna(axis=0, how="any")
    if len(drop_row) >= 0.9 * len(X):
        X, y = drop_row, y.loc[drop_row.index]
    # else: keep rows & impute inside pipeline

    # encode class labels
    if cfg["task"] == "cls":
        y = pd.Categorical(y).codes

    metric, mean_s, std_s = run_plain_l1_model(X, y, cfg["task"])
    print(f"   {metric}: {mean_s:.3f} ± {std_s:.3f}")
    results.append({"dataset": name, "task": cfg["task"],
                    metric: mean_s, f"{metric}_std": std_s})

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 7 ▸ Save results
# ╚═══════════════════════════════════════════════════════════════════════════╝
pd.DataFrame(results).to_csv("plain_l1_results.csv", index=False)
print("\n✅  Finished — results saved to plain_l1_results.csv")


🔹 bank-full


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.linear_model import LassoCV, LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score, mean_squared_error

# ────────────────────────────────────────────────────────────────────────────
#  Helper utilities
# ────────────────────────────────────────────────────────────────────────────
def load_csv(path, sep=","):
    try:
        return pd.read_csv(path, sep=sep)
    except UnicodeDecodeError:
        return pd.read_csv(path, sep=sep, encoding="ISO-8859-1")

def coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        coerced = pd.to_numeric(
            df[col].astype(str).str.replace(r"[,\s]", "", regex=True),
            errors="coerce",
        )
        if coerced.notna().any():
            df[col] = coerced.astype(float)
    return df

def summarize(df: pd.DataFrame) -> pd.DataFrame:
    num = df.select_dtypes(include="number")
    return pd.concat(
        {
            "n_unique": num.nunique(),
            "mean":     num.mean(),
            "median":   num.median(),
            "std":      num.std(),
        },
        axis=1,
    ).round(2)

# ────────────────────────────────────────────────────────────────────────────
#  Plain-L1 baseline
# ────────────────────────────────────────────────────────────────────────────
def run_plain_l1_model(X, y, task, seed=42, n_splits=10):
    rng = np.random.RandomState(seed)

    # choose splitter + metric + model
    if task == "cls":
        splitter   = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=rng)
        base_model = LogisticRegressionCV(
            Cs=50,
            penalty="l1",
            solver="saga",
            cv=5,
            max_iter=5000,
            random_state=rng,
        )
        metric_tag = "AUROC"
        metric_fn  = lambda mdl, Xte, yte: roc_auc_score(yte, mdl.predict_proba(Xte)[:, 1])
    else:  # regression
        splitter   = KFold(n_splits=n_splits, shuffle=True, random_state=rng)
        base_model = LassoCV(cv=5, random_state=rng)
        metric_tag = "MSE"
        metric_fn  = lambda mdl, Xte, yte: mean_squared_error(yte, mdl.predict(Xte))

    scores = []
    for tr_idx, te_idx in splitter.split(X, y):
        base_model.fit(X[tr_idx], y[tr_idx])            # ← dropped .iloc
        scores.append(metric_fn(base_model, X[te_idx], y[te_idx]))  # ← dropped .iloc

    return metric_tag, np.mean(scores), np.std(scores)

# ────────────────────────────────────────────────────────────────────────────
#  File list  (path, delimiter, task, target-column)
# ────────────────────────────────────────────────────────────────────────────
csv_paths = {
    # name             path                                            sep   task  target
    "bank":            ("bank+marketing/bank/bank.csv",                ";", "cls", "y"),
    "diabetes":        ("diabetes.csv",                                ",", "cls", "Outcome"),
    "winequality-red": ("wine+quality/winequality-red.csv",            ";", "reg", "quality"),
    "winequality-white":("wine+quality/winequality-white.csv",         ";", "reg", "quality"),
    "spotify":         ("Most Streamed Spotify Songs 2024.csv",        ",", "reg", "Streams"),
}

out_dir = Path("numeric_summaries")
out_dir.mkdir(exist_ok=True)

results = []

# ────────────────────────────────────────────────────────────────────────────
#  Main loop
# ────────────────────────────────────────────────────────────────────────────
for name, (path_str, sep, task, target) in csv_paths.items():
    path = Path(path_str)
    print(f"\n🔹 {name} ({task})")
    if not path.exists():
        print("   ⚠️  File not found, skipping.")
        continue

    # 1 ▸ read + coerce numeric
    df = coerce_numeric(load_csv(path, sep=sep))

    # 2 ▸ split X / y
    if target not in df.columns:
        print(f"   ⚠️  Target '{target}' not found, skipping.")
        continue
    y = df[target].copy()
    X = df.drop(columns=[target])

    # 3 ▸ keep numeric features only + drop NaNs
    X = X.select_dtypes(include="number").dropna(axis=1, how="any")
    X = X.dropna(axis=0, how="any")
    y = y.loc[X.index]

    if X.empty:
        print("   ℹ️  No numeric cols after cleaning; skipping.")
        continue

    # 4 ▸ convert to NumPy (keeps pipeline fast & avoids .iloc)
    X = X.to_numpy()
    y = pd.Categorical(y).codes if task == "cls" else y.to_numpy()

    # 5 ▸ plain-L1 model
    metric, mean_s, std_s = run_plain_l1_model(X, y, task)
    print(f"   {metric}: {mean_s:.3f} ± {std_s:.3f}")
    results.append({"dataset": name, "task": task,
                    metric: mean_s, f"{metric}_std": std_s})

# ────────────────────────────────────────────────────────────────────────────
#  Summary table
# ────────────────────────────────────────────────────────────────────────────
results_df = pd.DataFrame(results)
print("\n📊  Plain-L1 baseline results")
display(results_df)



🔹 bank (cls)
   AUROC: 0.631 ± 0.019

🔹 diabetes (cls)
   AUROC: 0.723 ± 0.051

🔹 winequality-red (reg)
   MSE: 0.433 ± 0.053

🔹 winequality-white (reg)
   MSE: 0.581 ± 0.039

🔹 spotify (reg)
   ⚠️  Target 'Streams' not found, skipping.

📊  Plain-L1 baseline results


Unnamed: 0,dataset,task,AUROC,AUROC_std,MSE,MSE_std
0,bank,cls,0.631458,0.018879,,
1,diabetes,cls,0.723185,0.051458,,
2,winequality-red,reg,,,0.43283,0.052549
3,winequality-white,reg,,,0.581187,0.039482


In [7]:
# ╔═══════════════════════════════════════════════════════════════════════════╗
# 0 ▸ Imports
# ╚═══════════════════════════════════════════════════════════════════════════╝
import numpy as np, pandas as pd, warnings
from pathlib import Path
from numpy.random          import default_rng
from sklearn.base          import BaseEstimator, TransformerMixin
from sklearn.compose       import ColumnTransformer
from sklearn.pipeline      import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute        import SimpleImputer
from sklearn.linear_model  import LogisticRegressionCV, LassoCV
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit
from sklearn.metrics       import roc_auc_score, mean_squared_error

warnings.filterwarnings("ignore")

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 1 ▸ Dataset registry  (paths & targets mirror your `csv_paths`)
# ╚═══════════════════════════════════════════════════════════════════════════╝
DATASETS = {
    # ───── Bank marketing (4 variants) ─────
    "bank-full": {
        "path": "bank+marketing/bank/bank-full.csv",
        "sep":  ";",
        "y":    "y",
        "task": "cls"
    },
    "bank": {
        "path": "bank+marketing/bank/bank.csv",
        "sep":  ";",
        "y":    "y",
        "task": "cls"
    },
    "bank-additional-full": {
        "path": "bank+marketing/bank-additional/bank-additional-full.csv",
        "sep":  ";",
        "y":    "y",
        "task": "cls"
    },
    "bank-additional": {
        "path": "bank+marketing/bank-additional/bank-additional.csv",
        "sep":  ";",
        "y":    "y",
        "task": "cls"
    },

    # ───── Glioma / TCGA ─────
    "TCGA_Mutations_all": {
        "path": "glioma+grading+clinical+and+mutation+features+dataset/TCGA_GBM_LGG_Mutations_all.csv",
        "sep":  ",",
        "y":    "Grade",            # change to 'yclass' if that’s your label
        "task": "cls"
    },
    "TCGA_InfoWithGrade": {
        "path": "glioma+grading+clinical+and+mutation+features+dataset/TCGA_InfoWithGrade.csv",
        "sep":  ",",
        "y":    "Grade",
        "task": "cls"
    },

    # ───── Wine quality ─────
    "winequality-red": {
        "path": "wine+quality/winequality-red.csv",
        "sep":  ";",
        "y":    "quality",
        "task": "reg"
    },
    "winequality-white": {
        "path": "wine+quality/winequality-white.csv",
        "sep":  ";",
        "y":    "quality",
        "task": "reg"
    },

    # ───── Diabetes (Pima) ─────
    "diabetes": {
        "path": "diabetes.csv",
        "sep":  ",",
        "y":    "Outcome",
        "task": "cls"
    },

    # ───── Spotify streams ─────
    "spotify": {
        "path": "Most Streamed Spotify Songs 2024.csv",
        "sep":  ",",
        "y":    "Spotify Streams",
        "task": "reg"
    },
}

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 2 ▸ Robust CSV reader
# ╚═══════════════════════════════════════════════════════════════════════════╝
def read_csv_safely(path, sep=","):
    try:
        return pd.read_csv(path, sep=sep)
    except UnicodeDecodeError:
        return pd.read_csv(path, sep=sep, encoding="ISO-8859-1")

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 3 ▸ Feature-type inference  (detect numeric-coded categoricals)
# ╚═══════════════════════════════════════════════════════════════════════════╝
def infer_feature_types(df, cat_thresh=15):
    """
    Returns two lists: numeric_cols, categorical_cols.
    Any numeric column with ≤ `cat_thresh` unique values is treated as categorical.
    """
    numeric_raw = df.select_dtypes(include=["number"]).columns.tolist()
    categorical = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

    # find low-cardinality numerics
    for col in numeric_raw:
        if df[col].nunique(dropna=True) <= cat_thresh:
            categorical.append(col)

    numeric = [c for c in df.columns if c not in categorical]
    return numeric, categorical

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 4 ▸ Probabilistic imputer for categoricals
# ╚═══════════════════════════════════════════════════════════════════════════╝
class ProbCatImputer(BaseEstimator, TransformerMixin):
    def __init__(self, random_state=None):
        self.rng_ = default_rng(random_state)
        self.proba_ = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        for col in X.columns:
            counts = X[col].value_counts(dropna=True)
            self.proba_[col] = (counts.index.to_numpy(),
                                (counts / counts.sum()).to_numpy())
        return self

    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col, (vals, probs) in self.proba_.items():
            m = X[col].isna()
            if m.any():
                X.loc[m, col] = self.rng_.choice(vals, size=m.sum(), p=probs)
        return X.values

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 5 ▸ Pre-processor factory
# ╚═══════════════════════════════════════════════════════════════════════════╝
def make_preprocessor(df, seed=42):
    num_cols, cat_cols = infer_feature_types(df)

    num_pipe = Pipeline([
        ("impute", SimpleImputer(strategy="median")),
        ("scale",  StandardScaler())
    ])

    cat_pipe = Pipeline([
        ("impute", ProbCatImputer(random_state=seed)),
        ("as_str", FunctionTransformer(lambda x: x.astype(str), feature_names_out="one-to-one")),
        ("oh",     OneHotEncoder(handle_unknown="ignore"))
    ])

    return ColumnTransformer([
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols)
    ])

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 6 ▸ Model runner  (10× 80/20 splits → mean ± std AUROC or MSE)
# ╚═══════════════════════════════════════════════════════════════════════════╝
def run_plain_l1(X, y, task, seed=42):
    pre = make_preprocessor(X, seed)

    if task == "cls":
        model = LogisticRegressionCV(
            penalty="l1", solver="saga", max_iter=10_000,
            Cs=np.logspace(-4,4,10), cv=5, scoring="roc_auc"
        )
        splitter   = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
        metric_fn  = lambda m, Xt, yt: roc_auc_score(yt, m.predict_proba(Xt)[:,1])
        metric_tag = "AUROC"
    else:
        model      = LassoCV(cv=10, max_iter=10_000)
        splitter   = ShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
        metric_fn  = lambda m, Xt, yt: mean_squared_error(yt, m.predict(Xt))
        metric_tag = "MSE"

    pipe   = Pipeline([("prep", pre), ("model", model)])
    scores = []
    for tr, te in splitter.split(X, y):
        pipe.fit(X.iloc[tr], y.iloc[tr])
        scores.append(metric_fn(pipe, X.iloc[te], y.iloc[te]))
    return metric_tag, np.mean(scores), np.std(scores)

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 7 ▸ Main loop
# ╚═══════════════════════════════════════════════════════════════════════════╝
rows = []

for name, cfg in DATASETS.items():
    path = Path(cfg["path"])
    if not path.exists():
        print(f"⚠️  skipping {name}: file not found → {path}")
        continue

    raw = read_csv_safely(path, sep=cfg["sep"])
    if cfg["y"] not in raw.columns:
        print(f"⚠️  skipping {name}: target column '{cfg['y']}' not found."); continue

    y = raw[cfg["y"]]
    X = raw.drop(columns=[cfg["y"]])

    # optional: drop columns that are entirely NaN
    X = X.dropna(axis=1, how="all")

    # keep rows if ≥90 % complete; otherwise let the pipeline impute
    drop_rows = X.dropna(axis=0, how="any")
    if len(drop_rows) >= 0.9 * len(X):
        X, y = drop_rows, y.loc[drop_rows.index]
    if cfg["task"] == "cls":
        y = pd.Categorical(y).codes

    mtag, mean_s, std_s = run_plain_l1(X, y, cfg["task"])
    rows.append({"dataset": name, "task": cfg["task"],
                 mtag: mean_s, f"{mtag}_std": std_s})
    print(f"{name:25s} → {mtag}: {mean_s:.3f} ± {std_s:.3f}")

# ╔═══════════════════════════════════════════════════════════════════════════╗
# 8 ▸ Save summary
# ╚═══════════════════════════════════════════════════════════════════════════╝
pd.DataFrame(rows).to_csv("plain_l1_results.csv", index=False)
print("\n✅  Finished: results saved to plain_l1_results.csv")


AttributeError: 'numpy.ndarray' object has no attribute 'iloc'