# Binary Classification — Bank Marketing (UCI)


# Pinned installs for reproducibility in Colab


# Show GPU status (Colab: Runtime -> Change runtime type -> GPU)



In [None]:
# === PyCaret Binary Classification: Bank Marketing (one-cell runner) ===
# Works best on Colab with: Runtime → Change runtime type → T4 GPU, Runtime version = Python 3.11

# 1) Install deps (pin to PyCaret 3.x which supports Python 3.11)
# This is an IPython shell magic, so it only runs inside a notebook (Colab/Jupyter).
# The version bounds keep PyCaret on the 3.x line; xgboost, lightgbm and catboost
# are unpinned and upgraded to whatever pip resolves at install time.
# NOTE(review): pandas-datareader is not referenced anywhere in the visible code —
# confirm whether it is still needed.
!pip -q install "pycaret>=3.0.4,<4" "pandas-datareader>=0.10.0" xgboost lightgbm catboost --upgrade

# 2) Imports + environment info
import sys, subprocess, zipfile, io, requests, pandas as pd
import matplotlib.pyplot as plt

# Report the interpreter version, then probe for a GPU by invoking nvidia-smi.
print("Python:", sys.version)
try:
    smi_output = subprocess.check_output(["nvidia-smi"])
    print(smi_output.decode())
except Exception:
    # Best-effort probe: any failure (missing binary, non-zero exit, ...) is
    # reported as "no GPU" rather than aborting the notebook.
    print("No NVIDIA GPU detected (ok: PyCaret will fall back to CPU).")

# 3) Load dataset (UCI Bank Marketing — not a PyCaret demo dataset)
zip_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"

# Bound the request so a stalled connection cannot hang the notebook, and fail
# on an HTTP error status instead of handing an error page to ZipFile (which
# would otherwise surface as a confusing BadZipFile).
response = requests.get(zip_url, timeout=60)
response.raise_for_status()
content = response.content

# Context managers close both the archive and the member file once the CSV
# has been parsed into memory. The file is semicolon-delimited.
with zipfile.ZipFile(io.BytesIO(content)) as zf:
    with zf.open("bank-additional/bank-additional-full.csv") as f:
        df = pd.read_csv(f, sep=";")

# Basic cleanups: remove known leakage column, drop NAs
# errors="ignore" keeps this step safe to re-run after "duration" is already gone;
# the index is rebuilt so it stays contiguous after rows are dropped.
df = df.drop(columns=["duration"], errors="ignore").dropna().reset_index(drop=True)
print("Data shape:", df.shape)
print("Target distribution:\n", df["y"].value_counts())

# 4) PyCaret classification workflow (AutoML)
from pycaret.classification import (
    setup, compare_models, tune_model, finalize_model,
    plot_model, save_model, predict_model
)

# Initialise the experiment on the cleaned frame with "y" as the label.
# session_id fixes the random seed so runs are repeatable.
# NOTE: the `silent` argument was removed in PyCaret 3.x (the version pinned by
# the install step); passing it makes setup() raise TypeError, so it is omitted.
exp = setup(
    data=df,
    target="y",
    session_id=42,
    use_gpu=True,                    # uses GPU where supported, otherwise falls back
)

# Compare a bunch of models and pick the top by AUC
top = compare_models(sort="AUC")

# Tune the top model for AUC
best = tune_model(top, optimize="AUC")

# Quick plot (confusion matrix)
plot_model(best, plot="confusion_matrix")

# Lock in the preprocessing + tuned model
final = finalize_model(best)

# 5) Save the pipeline (download from the Files panel when done)
# save_model returns a (pipeline, filename) tuple; unpack it so that only the
# file name is reported instead of the repr of the whole tuple.
_, save_path = save_model(final, "bank_marketing_classifier")
print("Saved model artifact at:", save_path)

# 6) Tiny inference demo
# Fixed random_state so the same five rows are scored on every run.
sample = df.sample(5, random_state=7)
preds = predict_model(final, data=sample)
print("Sample predictions:")
display(preds)



# Download & load bank marketing dataset (UCI) - not a PyCaret demo dataset
# Source: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing
import pandas as pd, zipfile, io, requests

zip_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"

# Bound the request so a stalled connection cannot hang the notebook, and fail
# on an HTTP error status instead of handing an error page to ZipFile.
response = requests.get(zip_url, timeout=60)
response.raise_for_status()
content = response.content

# Context managers close both the archive and the member file once the CSV
# has been parsed into memory. The file is semicolon-delimited.
with zipfile.ZipFile(io.BytesIO(content)) as zf:
    with zf.open("bank-additional/bank-additional-full.csv") as f:
        df = pd.read_csv(f, sep=';')

# Quick sanity check: frame dimensions and the first ten column names.
print(df.shape, df.columns.tolist()[:10])

# It's common to drop 'duration' as it's known to leak target information
df = df.drop(columns=["duration"], errors="ignore")
df = df.dropna()

# Target is 'y' (yes/no)
from pycaret.classification import (
    setup, compare_models, tune_model, finalize_model, evaluate_model,
    pull, save_model, predict_model, plot_model
)

# Object-typed columns are declared categorical explicitly. The target "y" is
# itself object-typed, so it is excluded here: it is the label, not a feature.
categorical_cols = [
    col for col in df.columns
    if col != "y" and df[col].dtype == "object"
]

# NOTE: the `silent` argument was removed in PyCaret 3.x; passing it makes
# setup() raise TypeError, so it is omitted.
exp = setup(
    data=df,
    target="y",
    session_id=42,
    use_gpu=True,      # GPU where supported, else CPU fallback
    categorical_features=categorical_cols,
)

# Rank candidate models by AUC, keep the single best, then tune it for AUC.
top = compare_models(n_select=1, sort="AUC")
best = tune_model(top, optimize="AUC")
plot_model(best, plot="confusion_matrix")

# Finalise the tuned model before persisting it.
final = finalize_model(best)

# save_model returns a (pipeline, filename) tuple; unpack it so that only the
# file name is reported instead of the repr of the whole tuple.
_, save_path = save_model(final, "bank_marketing_classifier")
print("Saved pipeline at:", save_path)

# Example inference on a small sample
# Fixed random_state so the same five rows are scored on every run.
sample = df.sample(5, random_state=7)
preds = predict_model(final, data=sample)
preds.head()
