In [1]:
import pandas as pd, numpy as np
from pathlib import Path

# ──────────────────────────────────────────────────────────────────────────────
# Helper utilities (assumes you have these already, but they are repeated here
# so the cell is completely self-contained)
# ──────────────────────────────────────────────────────────────────────────────
def load_csv(path, sep=","):
    """Read CSV with a fallback encoding."""
    try:
        return pd.read_csv(path, sep=sep)
    except UnicodeDecodeError:
        return pd.read_csv(path, sep=sep, encoding="ISO-8859-1")

def coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Turn numeric-like object columns into float.""" 
    for col in df.columns:
        coerced = pd.to_numeric(
            df[col].astype(str).str.replace(r"[,\s]", "", regex=True),
            errors="coerce",
        )
        if coerced.notna().any():
            df[col] = coerced.astype(float)
    return df

def summarize(df: pd.DataFrame) -> pd.DataFrame:
    """Return n_unique / mean / median / std for numeric cols (2-decimals)."""
    num = df.select_dtypes(include="number")
    return pd.concat(
        {
            "n_unique": num.nunique(),
            "mean":     num.mean(),
            "median":   num.median(),
            "std":      num.std(),
        },
        axis=1,
    ).round(2)

# ──────────────────────────────────────────────────────────────────────────────
# File list  (path, delimiter)
# ──────────────────────────────────────────────────────────────────────────────
csv_paths = {
    "bank-full":            ("bank+marketing/bank/bank-full.csv",               ";"),
    "bank":                 ("bank+marketing/bank/bank.csv",                    ";"),
    "bank-additional-full": ("bank+marketing/bank-additional/bank-additional-full.csv", ";"),
    "bank-additional":      ("bank+marketing/bank-additional/bank-additional.csv",     ";"),
    "TCGA_Mutations_all":   ("glioma+grading+clinical+and+mutation+features+dataset/TCGA_GBM_LGG_Mutations_all.csv", ","),
    "TCGA_InfoWithGrade":   ("glioma+grading+clinical+and+mutation+features+dataset/TCGA_InfoWithGrade.csv",         ","),
    "winequality-red":      ("wine+quality/winequality-red.csv",   ";"),
    "winequality-white":    ("wine+quality/winequality-white.csv", ";"),
    "diabetes":             ("diabetes.csv", ","),
    "spotify":              ("Most Streamed Spotify Songs 2024.csv", ","),
}

out_dir = Path("numeric_summaries")
out_dir.mkdir(exist_ok=True)

# ──────────────────────────────────────────────────────────────────────────────
# 1-5 ▸ load → numeric-only → drop-NaNs (cols then rows) → summarise → save
# ──────────────────────────────────────────────────────────────────────────────
for name, (path_str, sep) in csv_paths.items():
    path = Path(path_str)
    print(f"\n🔹 {name} — delimiter '{sep}'")
    if not path.exists():
        print("   ⚠️  File not found, skipping.")
        continue

    # 1 ▸ read + coerce numeric-looking strings
    df = coerce_numeric(load_csv(path, sep=sep))

    # 2 ▸ keep only numeric columns
    df = df.select_dtypes(include="number")

    # 3 ▸ drop columns with any NaNs, then rows with any NaNs
    df = df.dropna(axis=1, how="any").dropna(axis=0, how="any")

    if df.empty:
        print("   ℹ️  Nothing left after cleaning; skipping.")
        continue

    # 4 ▸ calculate summary statistics
    summary = summarize(df)

    # 5 ▸ save + quick report
    print(f"   Rows (after cleaning): {len(df):,}")
    print(f"   Numeric features     : {len(summary)}")
    display(summary.head())           # comment out if not in Jupyter

    outfile = out_dir / f"{name}_summary.csv"
    summary.to_csv(outfile)
    print(f"   ✅  saved → {outfile}")

print("\n🎉  Done. All summaries are in numeric_summaries/")



🔹 bank-full — delimiter ';'
   Rows (after cleaning): 45,211
   Numeric features     : 7


Unnamed: 0,n_unique,mean,median,std
age,77,40.94,39.0,10.62
balance,7168,1362.27,448.0,3044.77
day,31,15.81,16.0,8.32
duration,1573,258.16,180.0,257.53
campaign,48,2.76,2.0,3.1


   ✅  saved → numeric_summaries/bank-full_summary.csv

🔹 bank — delimiter ';'
   Rows (after cleaning): 4,521
   Numeric features     : 7


Unnamed: 0,n_unique,mean,median,std
age,67,41.17,39.0,10.58
balance,2353,1422.66,444.0,3009.64
day,31,15.92,16.0,8.25
duration,875,263.96,185.0,259.86
campaign,32,2.79,2.0,3.11


   ✅  saved → numeric_summaries/bank_summary.csv

🔹 bank-additional-full — delimiter ';'
   Rows (after cleaning): 41,188
   Numeric features     : 10


Unnamed: 0,n_unique,mean,median,std
age,78,40.02,38.0,10.42
duration,1544,258.29,180.0,259.28
campaign,42,2.57,2.0,2.77
pdays,27,962.48,999.0,186.91
previous,8,0.17,0.0,0.49


   ✅  saved → numeric_summaries/bank-additional-full_summary.csv

🔹 bank-additional — delimiter ';'
   Rows (after cleaning): 4,119
   Numeric features     : 10


Unnamed: 0,n_unique,mean,median,std
age,67,40.11,38.0,10.31
duration,828,256.79,181.0,254.7
campaign,25,2.54,2.0,2.57
pdays,21,960.42,999.0,191.92
previous,7,0.19,0.0,0.54


   ✅  saved → numeric_summaries/bank-additional_summary.csv

🔹 TCGA_Mutations_all — delimiter ','
   ℹ️  Nothing left after cleaning; skipping.

🔹 TCGA_InfoWithGrade — delimiter ','
   Rows (after cleaning): 839
   Numeric features     : 24


Unnamed: 0,n_unique,mean,median,std
Grade,2,0.42,0.0,0.49
Gender,2,0.42,0.0,0.49
Age_at_diagnosis,766,50.94,51.55,15.7
Race,4,0.11,0.0,0.37
IDH1,2,0.48,0.0,0.5


   ✅  saved → numeric_summaries/TCGA_InfoWithGrade_summary.csv

🔹 winequality-red — delimiter ';'
   Rows (after cleaning): 1,599
   Numeric features     : 12


Unnamed: 0,n_unique,mean,median,std
fixed acidity,96,8.32,7.9,1.74
volatile acidity,143,0.53,0.52,0.18
citric acid,80,0.27,0.26,0.19
residual sugar,91,2.54,2.2,1.41
chlorides,153,0.09,0.08,0.05


   ✅  saved → numeric_summaries/winequality-red_summary.csv

🔹 winequality-white — delimiter ';'
   Rows (after cleaning): 4,898
   Numeric features     : 12


Unnamed: 0,n_unique,mean,median,std
fixed acidity,68,6.85,6.8,0.84
volatile acidity,125,0.28,0.26,0.1
citric acid,87,0.33,0.32,0.12
residual sugar,310,6.39,5.2,5.07
chlorides,160,0.05,0.04,0.02


   ✅  saved → numeric_summaries/winequality-white_summary.csv

🔹 diabetes — delimiter ','
   Rows (after cleaning): 768
   Numeric features     : 9


Unnamed: 0,n_unique,mean,median,std
Pregnancies,17,3.85,3.0,3.37
Glucose,136,120.89,117.0,31.97
BloodPressure,47,69.11,72.0,19.36
SkinThickness,51,20.54,23.0,15.95
Insulin,186,79.8,30.5,115.24


   ✅  saved → numeric_summaries/diabetes_summary.csv

🔹 spotify — delimiter ','
   Rows (after cleaning): 4,600
   Numeric features     : 3


Unnamed: 0,n_unique,mean,median,std
All Time Rank,4577,2290.68,2290.5,1322.88
Track Score,862,41.84,29.9,38.54
Explicit Track,2,0.36,0.0,0.48


   ✅  saved → numeric_summaries/spotify_summary.csv

🎉  Done. All summaries are in numeric_summaries/


In [2]:
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score

In [8]:
# ──────────────────────────────────────────────────────────────────────────────
# 0 ▸ Imports
# ──────────────────────────────────────────────────────────────────────────────
import pandas as pd, numpy as np, warnings, json
from pathlib import Path

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing  import StandardScaler
from sklearn.pipeline       import Pipeline
from sklearn.linear_model   import Lasso, LogisticRegression
from sklearn.metrics        import r2_score, roc_auc_score, accuracy_score

warnings.filterwarnings("ignore")

# ──────────────────────────────────────────────────────────────────────────────
# 1 ▸ Helper functions  (load_csv ◂─► coerce_numeric ◂─► clean_features)
# ──────────────────────────────────────────────────────────────────────────────
def load_csv(path, sep=","):
    """Read CSV with UTF-8 fallback → ISO-8859-1."""
    try:
        return pd.read_csv(path, sep=sep)
    except UnicodeDecodeError:
        return pd.read_csv(path, sep=sep, encoding="ISO-8859-1")

def coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Turn numeric-like object columns into float dtype."""
    for col in df.columns:
        coerced = pd.to_numeric(
            df[col].astype(str).str.replace(r"[,\s]", "", regex=True),
            errors="coerce",
        )
        if coerced.notna().any():          # at least one real number
            df[col] = coerced.astype(float)
    return df

def clean_features(df: pd.DataFrame) -> pd.DataFrame:
    """Numeric-only → drop NaN columns → drop NaN rows."""
    df = coerce_numeric(df)
    df = df.select_dtypes(include="number")       # keep numeric cols only
    df = df.dropna(axis=1, how="any")             # drop cols w/ NaN
    df = df.dropna(axis=0, how="any")             # drop rows w/ NaN
    return df

# ──────────────────────────────────────────────────────────────────────────────
# 2 ▸ Dataset list & target-column mapping
# ──────────────────────────────────────────────────────────────────────────────
csv_paths = {
    "bank-full":            ("bank+marketing/bank/bank-full.csv",               ";"),
    "bank":                 ("bank+marketing/bank/bank.csv",                    ";"),
    "bank-additional-full": ("bank+marketing/bank-additional/bank-additional-full.csv", ";"),
    "bank-additional":      ("bank+marketing/bank-additional/bank-additional.csv",     ";"),
    "TCGA_InfoWithGrade":   ("glioma+grading+clinical+and+mutation+features+dataset/TCGA_InfoWithGrade.csv", ","),
    "winequality-red":      ("wine+quality/winequality-red.csv",   ";"),
    "winequality-white":    ("wine+quality/winequality-white.csv", ";"),
    "diabetes":             ("diabetes.csv", ","),
    "spotify":              ("Most Streamed Spotify Songs 2024.csv", ","),
}

y_cols = {
    # classification
    "bank-full":            "y",
    "bank":                 "y",
    "bank-additional-full": "y",
    "bank-additional":      "y",
    "diabetes":             "Outcome",
    "TCGA_InfoWithGrade":   "Grade",
    # regression
    "winequality-red":      "quality",
    "winequality-white":    "quality",
    "spotify":              "Spotify Streams",       # <── updated column name
}

class_tasks = {
    "bank-full", "bank", "bank-additional-full", "bank-additional",
    "diabetes", "TCGA_InfoWithGrade",
}

# ──────────────────────────────────────────────────────────────────────────────
# 3 ▸ Hyper-parameter grids
# ──────────────────────────────────────────────────────────────────────────────
lasso_grid  = {"model__alpha": 10.0**np.linspace(-4, 1, 10)}
logreg_grid = {"model__C":     10.0**np.linspace(-3, 3, 13)}

# ──────────────────────────────────────────────────────────────────────────────
# 4 ▸ Main loop
# ──────────────────────────────────────────────────────────────────────────────
results = []

for name, (csv_path, sep) in csv_paths.items():
    path = Path(csv_path)
    print(f"\n🔹 {name}")
    if not path.exists():
        print("   ⚠️  File not found, skipping.")
        continue

    # -- load
    raw = load_csv(path, sep=sep)
    if name not in y_cols or y_cols[name] not in raw.columns:
        print("   ❌  Target column missing; skipping.")
        continue

    y_raw = raw[y_cols[name]]
    y_raw = pd.to_numeric(
    y_raw.astype(str).str.replace(r"[,\s]", "", regex=True),
    errors="coerce"
)
    X_raw = raw.drop(columns=[y_cols[name]])

    # -- preprocess X
    X = clean_features(X_raw)
    y = y_raw.loc[X.index]          # align indices
    if y.isna().any():
        mask = y.notna()
        X, y = X.loc[mask], y[mask]

    if X.empty:
        print("   ℹ️  No usable numeric features; skipping.")
        continue

    task = "classification" if name in class_tasks else "regression"

    # -- split 50-50 (stratify if classification)
    strat = y if task == "classification" else None
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.5, random_state=42, stratify=strat
    )

    # -- pipeline & CV
    if task == "regression":
        pipe = Pipeline(
            [("scaler", StandardScaler()), ("model", Lasso(max_iter=10000))]
        )
        grid = lasso_grid
        cv   = 10
        scorer = "r2"
    else:
        y_tr = pd.Categorical(y_tr).codes
        y_te = pd.Categorical(y_te).codes
        pipe = Pipeline(
            [("scaler", StandardScaler()),
             ("model",  LogisticRegression(penalty="l1",
                                           solver="liblinear",
                                           max_iter=2000))]
        )
        grid = logreg_grid
        cv   = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
        scorer = "roc_auc"

    gscv = GridSearchCV(pipe, grid, cv=cv, n_jobs=-1, scoring=scorer)
    gscv.fit(X_tr, y_tr)
    best = gscv.best_estimator_
    best_param = list(gscv.best_params_.values())[0]
    print("   Best hyper-parameter:", best_param)

    # -- evaluate
    if task == "regression":
        y_pred = best.predict(X_te)
        r2  = r2_score(y_te, y_pred)
        err = 1 - r2
        print(f"   R² = {r2:.3f}  ⇒  test-error (1-R²) = {err:.3f}")
        results.append({"dataset": name, "task": "reg", "alpha": best_param,
                        "R2": r2, "1-R2": err})
    else:
        prob = best.predict_proba(X_te)[:, 1]
        auc  = roc_auc_score(y_te, prob)
        acc  = accuracy_score(y_te, best.predict(X_te))
        err  = 1 - acc
        print(f"   AUROC = {auc:.3f},  Accuracy = {acc:.3f},  Error = {err:.3f}")
        results.append({"dataset": name, "task": "cls", "C": best_param,
                        "AUROC": auc, "Accuracy": acc, "Error": err})

# ──────────────────────────────────────────────────────────────────────────────
# 5 ▸ Save consolidated results
# ──────────────────────────────────────────────────────────────────────────────
pd.DataFrame(results).to_csv("lasso_results.csv", index=False)
print("\n🎉  Done — per-dataset results in lasso_results.csv")



🔹 bank-full
   ℹ️  No usable numeric features; skipping.

🔹 bank
   ℹ️  No usable numeric features; skipping.

🔹 bank-additional-full
   ℹ️  No usable numeric features; skipping.

🔹 bank-additional
   ℹ️  No usable numeric features; skipping.

🔹 TCGA_InfoWithGrade
   Best hyper-parameter: 1.0
   AUROC = 0.927,  Accuracy = 0.871,  Error = 0.129

🔹 winequality-red
   Best hyper-parameter: 0.016681005372000592
   R² = 0.325  ⇒  test-error (1-R²) = 0.675

🔹 winequality-white
   Best hyper-parameter: 0.004641588833612782
   R² = 0.261  ⇒  test-error (1-R²) = 0.739

🔹 diabetes
   Best hyper-parameter: 100.0
   AUROC = 0.836,  Accuracy = 0.758,  Error = 0.242

🔹 spotify
   Best hyper-parameter: 10.0
   R² = 0.096  ⇒  test-error (1-R²) = 0.904

🎉  Done — per-dataset results in lasso_results.csv


In [9]:
raw = load_csv("bank+marketing/bank/bank.csv", sep=";")
num = coerce_numeric(raw.drop(columns=["y"])).select_dtypes("number")
print(num.isna().sum().loc[lambda s: s>0])   # shows every numeric col has NaNs

Series([], dtype: int64)
