In [1]:
import pandas as pd, numpy as np
from pathlib import Path

# ──────────────────────────────────────────────────────────────────────────────
# Helper utilities (you may already have these defined; they are repeated here
# so the cell is completely self-contained)
# ──────────────────────────────────────────────────────────────────────────────
def load_csv(path, sep=","):
    """Load a delimited text file into a DataFrame.

    The file is first read with pandas' default encoding; if that raises
    ``UnicodeDecodeError`` the read is retried once as ISO-8859-1.

    Args:
        path: Location of the file, passed straight to ``pd.read_csv``.
        sep: Field delimiter (defaults to a comma).

    Returns:
        The parsed ``pd.DataFrame``.
    """
    fallback_encoding = "ISO-8859-1"
    try:
        frame = pd.read_csv(path, sep=sep)
    except UnicodeDecodeError:
        # Default decoding failed; retry with the single-byte fallback.
        frame = pd.read_csv(path, sep=sep, encoding=fallback_encoding)
    return frame

def coerce_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Turn numeric-like columns into float, leaving real text columns alone.

    Each column is rendered as strings, commas and whitespace are stripped
    (so ``"1,234"`` and ``"2 500"`` parse), and the result is passed through
    ``pd.to_numeric(errors="coerce")``.  A column is replaced by its float
    version only when it yields at least one number AND every value that was
    non-missing in the original column parsed successfully.  Columns that are
    only partly numeric (e.g. free text containing the odd ``"7"``) are left
    untouched instead of being overwritten with NaN.

    Args:
        df: Frame to convert.  It is modified in place.

    Returns:
        The same ``df`` object, for call chaining.
    """
    for col in df.columns:
        original = df[col]
        coerced = pd.to_numeric(
            original.astype(str).str.replace(r"[,\s]", "", regex=True),
            errors="coerce",
        )
        # Values that were already missing may stay NaN; any *other* NaN
        # means a real value failed to parse, so the column is not numeric.
        was_present = original.notna()
        if coerced.notna().any() and coerced[was_present].notna().all():
            df[col] = coerced.astype(float)
    return df

def summarize(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-column statistics table for the numeric columns of ``df``.

    Non-numeric columns are ignored.  The result has one row per numeric
    column and the columns ``n_unique``, ``mean``, ``median`` and ``std``,
    all rounded to two decimals.
    """
    numeric = df.select_dtypes(include="number")

    # Output label -> name of the DataFrame reduction that produces it.
    reductions = (
        ("n_unique", "nunique"),
        ("mean", "mean"),
        ("median", "median"),
        ("std", "std"),
    )
    stats = {label: getattr(numeric, method)() for label, method in reductions}

    table = pd.concat(stats, axis=1)
    return table.round(2)

# ──────────────────────────────────────────────────────────────────────────────
# File list  (path, delimiter)
# ──────────────────────────────────────────────────────────────────────────────
# Dataset label -> (relative file path, field delimiter), built from a flat
# table of (label, path, delimiter) triples.  Insertion order is the order in
# which the datasets are processed.
csv_paths = {
    label: (location, delimiter)
    for label, location, delimiter in (
        ("bank-full", "bank+marketing/bank/bank-full.csv", ";"),
        ("bank", "bank+marketing/bank/bank.csv", ";"),
        (
            "bank-additional-full",
            "bank+marketing/bank-additional/bank-additional-full.csv",
            ";",
        ),
        (
            "bank-additional",
            "bank+marketing/bank-additional/bank-additional.csv",
            ";",
        ),
        (
            "TCGA_Mutations_all",
            "glioma+grading+clinical+and+mutation+features+dataset/TCGA_GBM_LGG_Mutations_all.csv",
            ",",
        ),
        (
            "TCGA_InfoWithGrade",
            "glioma+grading+clinical+and+mutation+features+dataset/TCGA_InfoWithGrade.csv",
            ",",
        ),
        ("winequality-red", "wine+quality/winequality-red.csv", ";"),
        ("winequality-white", "wine+quality/winequality-white.csv", ";"),
        ("diabetes", "diabetes.csv", ","),
        ("spotify", "Most Streamed Spotify Songs 2024.csv", ","),
    )
}

# Directory that receives one "<label>_summary.csv" per dataset; created on
# first run and reused afterwards.
out_dir = Path("numeric_summaries")
out_dir.mkdir(exist_ok=True)

# ──────────────────────────────────────────────────────────────────────────────
# 1-5 ▸ load → numeric-only → drop-NaNs (cols then rows) → summarise → save
# ──────────────────────────────────────────────────────────────────────────────
# `display` is only injected by IPython/Jupyter.  Fall back to print() so the
# same cell also runs as a plain script instead of raising NameError.
try:
    _show = display
except NameError:
    _show = print

# For every configured dataset: load it, keep the clean numeric columns,
# compute summary statistics, show a preview and write the table to out_dir.
for name, (path_str, sep) in csv_paths.items():
    path = Path(path_str)
    print(f"\n🔹 {name} — delimiter '{sep}'")
    if not path.exists():
        print("   ⚠️  File not found, skipping.")
        continue

    # 1 ▸ read + coerce numeric-looking strings
    df = coerce_numeric(load_csv(path, sep=sep))

    # 2 ▸ keep only numeric columns
    df = df.select_dtypes(include="number")

    # 3 ▸ drop columns with any NaNs, then rows with any NaNs
    #     (once the incomplete columns are gone no NaN remains, so the row
    #     drop is effectively a safety net)
    df = df.dropna(axis=1, how="any").dropna(axis=0, how="any")

    if df.empty:
        print("   ℹ️  Nothing left after cleaning; skipping.")
        continue

    # 4 ▸ calculate summary statistics
    summary = summarize(df)

    # 5 ▸ save + quick report
    print(f"   Rows (after cleaning): {len(df):,}")
    print(f"   Numeric features     : {len(summary)}")
    _show(summary.head())             # rich table in Jupyter, plain text otherwise

    outfile = out_dir / f"{name}_summary.csv"
    summary.to_csv(outfile)
    print(f"   ✅  saved → {outfile}")

# Report the directory actually used rather than repeating its literal name.
print(f"\n🎉  Done. All summaries are in {out_dir}/")



🔹 bank-full — delimiter ';'
   Rows (after cleaning): 45,211
   Numeric features     : 7


Unnamed: 0,n_unique,mean,median,std
age,77,40.94,39.0,10.62
balance,7168,1362.27,448.0,3044.77
day,31,15.81,16.0,8.32
duration,1573,258.16,180.0,257.53
campaign,48,2.76,2.0,3.1


   ✅  saved → numeric_summaries/bank-full_summary.csv

🔹 bank — delimiter ';'
   Rows (after cleaning): 4,521
   Numeric features     : 7


Unnamed: 0,n_unique,mean,median,std
age,67,41.17,39.0,10.58
balance,2353,1422.66,444.0,3009.64
day,31,15.92,16.0,8.25
duration,875,263.96,185.0,259.86
campaign,32,2.79,2.0,3.11


   ✅  saved → numeric_summaries/bank_summary.csv

🔹 bank-additional-full — delimiter ';'
   Rows (after cleaning): 41,188
   Numeric features     : 10


Unnamed: 0,n_unique,mean,median,std
age,78,40.02,38.0,10.42
duration,1544,258.29,180.0,259.28
campaign,42,2.57,2.0,2.77
pdays,27,962.48,999.0,186.91
previous,8,0.17,0.0,0.49


   ✅  saved → numeric_summaries/bank-additional-full_summary.csv

🔹 bank-additional — delimiter ';'
   Rows (after cleaning): 4,119
   Numeric features     : 10


Unnamed: 0,n_unique,mean,median,std
age,67,40.11,38.0,10.31
duration,828,256.79,181.0,254.7
campaign,25,2.54,2.0,2.57
pdays,21,960.42,999.0,191.92
previous,7,0.19,0.0,0.54


   ✅  saved → numeric_summaries/bank-additional_summary.csv

🔹 TCGA_Mutations_all — delimiter ','
   ℹ️  Nothing left after cleaning; skipping.

🔹 TCGA_InfoWithGrade — delimiter ','
   Rows (after cleaning): 839
   Numeric features     : 24


Unnamed: 0,n_unique,mean,median,std
Grade,2,0.42,0.0,0.49
Gender,2,0.42,0.0,0.49
Age_at_diagnosis,766,50.94,51.55,15.7
Race,4,0.11,0.0,0.37
IDH1,2,0.48,0.0,0.5


   ✅  saved → numeric_summaries/TCGA_InfoWithGrade_summary.csv

🔹 winequality-red — delimiter ';'
   Rows (after cleaning): 1,599
   Numeric features     : 12


Unnamed: 0,n_unique,mean,median,std
fixed acidity,96,8.32,7.9,1.74
volatile acidity,143,0.53,0.52,0.18
citric acid,80,0.27,0.26,0.19
residual sugar,91,2.54,2.2,1.41
chlorides,153,0.09,0.08,0.05


   ✅  saved → numeric_summaries/winequality-red_summary.csv

🔹 winequality-white — delimiter ';'
   Rows (after cleaning): 4,898
   Numeric features     : 12


Unnamed: 0,n_unique,mean,median,std
fixed acidity,68,6.85,6.8,0.84
volatile acidity,125,0.28,0.26,0.1
citric acid,87,0.33,0.32,0.12
residual sugar,310,6.39,5.2,5.07
chlorides,160,0.05,0.04,0.02


   ✅  saved → numeric_summaries/winequality-white_summary.csv

🔹 diabetes — delimiter ','
   Rows (after cleaning): 768
   Numeric features     : 9


Unnamed: 0,n_unique,mean,median,std
Pregnancies,17,3.85,3.0,3.37
Glucose,136,120.89,117.0,31.97
BloodPressure,47,69.11,72.0,19.36
SkinThickness,51,20.54,23.0,15.95
Insulin,186,79.8,30.5,115.24


   ✅  saved → numeric_summaries/diabetes_summary.csv

🔹 spotify — delimiter ','
   Rows (after cleaning): 4,600
   Numeric features     : 3


Unnamed: 0,n_unique,mean,median,std
All Time Rank,4577,2290.68,2290.5,1322.88
Track Score,862,41.84,29.9,38.54
Explicit Track,2,0.36,0.0,0.48


   ✅  saved → numeric_summaries/spotify_summary.csv

🎉  Done. All summaries are in numeric_summaries/
