
# Bank Marketing — Data Cleaning, EDA & Decision Tree Classifier

This notebook cleans the data, performs exploratory data analysis (EDA), and builds a Decision Tree classifier
to predict whether a customer will subscribe to a term deposit (`y` = yes/no) using the UCI Bank Marketing dataset.

**How to use**
1. Place the dataset CSV `bank-additional-full.csv` in the same folder as this notebook (or in a `bank-additional/` subfolder next to it), **or**
2. Let the notebook download the dataset from the UCI repository automatically (requires internet in the environment).
3. Run cells step-by-step in Jupyter, VS Code (Jupyter), or Google Colab.

**Outputs**
- Cleaned dataset saved as `bank_cleaned.csv`
- Trained Decision Tree model and evaluation metrics
- Plots for EDA and model interpretation


In [None]:

# === Setup ===
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder

# Display settings
pd.set_option('display.max_columns', 200)
sns.set(style='whitegrid')
%matplotlib inline


In [None]:

# === Load dataset (try local, else download from UCI) ===
# Candidate locations, checked in order; the first one that is a regular file wins.
local_paths = [
    "bank-additional-full.csv",
    "bank-additional/bank-additional-full.csv",
    "bank.csv",
    "bank-additional.csv"
]

df = None
for p in local_paths:
    # isfile (not exists) so a directory that happens to carry one of these names is skipped.
    if os.path.isfile(p):
        df = pd.read_csv(p, sep=';')
        print(f"Loaded local file: {p}")
        break

if df is None:
    print("Local file not found. Attempting to download from UCI repository...")
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank-additional.zip"
    try:
        # Imported lazily so the notebook still runs without `requests` when a local CSV is present.
        import requests, zipfile, io
        # Bound the wait so an unreachable server cannot hang the notebook indefinitely.
        r = requests.get(url, timeout=60)
        # Fail on HTTP errors instead of handing an error page to ZipFile.
        r.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            # The zip contains directory 'bank-additional' with 'bank-additional-full.csv'
            fname = next(
                (name for name in z.namelist() if name.endswith("bank-additional-full.csv")),
                None,
            )
            if fname is None:
                raise RuntimeError("Dataset file not found inside zip archive.")
            # extract() returns the path it actually wrote, e.g. 'bank-additional/bank-additional-full.csv'
            extracted_path = z.extract(fname, ".")
        df = pd.read_csv(extracted_path, sep=';')
        print("Downloaded and extracted:", extracted_path)
    except Exception as e:
        # Any failure (missing package, network, bad archive, parse error) is reported uniformly;
        # the original exception stays attached as the cause.
        raise RuntimeError("Could not load dataset locally and failed to download. Please upload the CSV file to the environment.") from e

print('\nDataset shape:', df.shape)
df.head()


In [None]:

# === Data Cleaning ===
# Overview
print("\nColumns:\n", df.columns.tolist())
print("\nInfo:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in display()
# would only add a stray "None" to the output.
df.info()

# The UCI bank dataset uses 'unknown' to indicate missing values for categorical features.
df = df.replace('unknown', np.nan)

print("\nMissing values per column:")
print(df.isnull().sum())

# Strategy:
# - For numeric columns: fill NaN with median
# - For categorical columns: fill NaN with mode
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

for col in numeric_cols:
    if df[col].isnull().any():
        med = df[col].median()
        # Assign the filled column back: `df[col].fillna(..., inplace=True)` acts on a temporary
        # object and does not reliably update `df` (it is a no-op under pandas Copy-on-Write).
        df[col] = df[col].fillna(med)
        print(f"Filled numeric {col} NaNs with median: {med}")

for col in categorical_cols:
    if df[col].isnull().any():
        modes = df[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to impute with, so leave it untouched.
            print(f"Skipped categorical {col}: no non-missing values to compute a mode")
            continue
        mode = modes[0]
        df[col] = df[col].fillna(mode)
        print(f"Filled categorical {col} NaNs with mode: {mode}")

# Convert month to an ordered categorical (calendar order)
if 'month' in df.columns:
    month_order = ['jan','feb','mar','apr','may','jun','jul','aug','sep','oct','nov','dec']
    # pd.Categorical turns any value outside `categories` into NaN, so only convert when every
    # label is a known month abbreviation; otherwise keep the column as-is and say so.
    unexpected_months = set(df['month'].dropna().unique()) - set(month_order)
    if unexpected_months:
        print(f"Left month unchanged; unexpected values: {sorted(unexpected_months)}")
    else:
        df['month'] = pd.Categorical(df['month'], categories=month_order, ordered=True)

# Save a cleaned copy (before encoding)
cleaned_path = "bank_cleaned.csv"
df.to_csv(cleaned_path, index=False)
print("\nSaved cleaned dataset to:", cleaned_path)

print("\nAfter cleaning, missing values:")
print(df.isnull().sum())
df.head()


In [None]:

# === Exploratory Data Analysis (EDA) ===
# How the target is split between the two outcomes
plt.figure(figsize=(6,4))
sns.countplot(x='y', data=df)
plt.title('Target Distribution (y)')
plt.xlabel('Subscribed (yes/no)')
plt.show()

# Numeric 0/1 copy of the target, used for the rates and correlations below
target_codes = {'no':0, 'yes':1}
df['y_binary'] = df['y'].map(target_codes)

# One count plot per categorical feature that is present, most frequent category first
cat_features = ['job','marital','education','default','housing','loan','contact','poutcome']
for feature in [name for name in cat_features if name in df.columns]:
    plt.figure(figsize=(8,3))
    by_frequency = df[feature].value_counts().index
    sns.countplot(data=df, x=feature, order=by_frequency)
    plt.title(f'Distribution of {feature}')
    plt.xticks(rotation=45)
    plt.show()

# Share of subscribers within each job, highest first
if 'job' in df.columns:
    subscribed_by_job = df.groupby('job')['y_binary']
    job_rate = subscribed_by_job.mean().sort_values(ascending=False)
    display(job_rate.head(10))

    plt.figure(figsize=(10,4))
    job_rate.plot.bar()
    plt.title('Subscription rate by Job')
    plt.ylabel('Proportion subscribed')
    plt.show()

# Histogram (with KDE overlay) for every listed numeric feature that exists in the frame
num_feats = ['age','duration','campaign','pdays','previous','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']
existing_num_feats = [name for name in num_feats if name in df.columns]
for column in existing_num_feats:
    plt.figure(figsize=(8,3))
    sns.histplot(df[column], bins=40, kde=True)
    plt.title(f'Distribution of {column}')
    plt.show()

# Box plots of selected numeric features, split by the yes/no target
for column in ('age','duration','campaign','pdays'):
    if column not in df.columns:
        continue
    plt.figure(figsize=(8,4))
    sns.boxplot(data=df, x='y', y=column)
    plt.title(f'{column} by Subscription')
    plt.show()

# Pairwise correlations across all numeric columns (y_binary included)
plt.figure(figsize=(10,8))
numeric_frame = df.select_dtypes(include=[np.number])
corr = numeric_frame.corr()
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0)
plt.title('Correlation Matrix (numeric)')
plt.show()


In [None]:

# === Modeling: Decision Tree Classifier ===
# Prepare data: one-hot encode the predictors with get_dummies; the target is the 0/1 form of `y`.
# NOTE(review): every non-target column, including `duration`, is used as a predictor.
# Presumably `duration` is only known once a call has ended — confirm against the dataset
# description whether it should be excluded for a realistic predictive model.
class_names = ['no', 'yes']

# Reuse `y_binary` when an earlier cell created it; otherwise derive it here so this cell does
# not silently depend on the EDA cell having been run.
y = df['y_binary'] if 'y_binary' in df.columns else df['y'].map({'no': 0, 'yes': 1})
if y.isnull().any():
    # Labels other than 'yes'/'no' map to NaN; stop here with a clear message instead of
    # failing later inside train_test_split / fit.
    raise ValueError("Target column contains values other than 'yes'/'no'; cannot build a binary target.")

# errors='ignore' drops whichever of the target columns are present.
X = df.drop(columns=['y', 'y_binary'], errors='ignore')

# One-hot encode categorical columns
X_encoded = pd.get_dummies(X, drop_first=True)

print("Shape after encoding:", X_encoded.shape)

# Train-test split (stratified so both splits keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42, stratify=y)

# Train Decision Tree
clf = DecisionTreeClassifier(max_depth=6, random_state=42)
clf.fit(X_train, y_train)

# Predictions & evaluation
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:,1]

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix (rows = actual, columns = predicted), with the class labels on both axes
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# ROC AUC
# Only the metric computation is guarded: ValueError is what the metric functions raise for
# unusable input (e.g. a single class in y_test). Plotting errors are no longer swallowed.
try:
    roc_auc = roc_auc_score(y_test, y_proba)
    fpr, tpr, _thresholds = roc_curve(y_test, y_proba)
except ValueError as e:
    print('ROC AUC could not be computed:', e)
else:
    print(f"ROC AUC: {roc_auc:.4f}")
    plt.figure(figsize=(6,4))
    plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
    plt.plot([0,1],[0,1], linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

# Feature importances (top 20)
feat_imp = pd.Series(clf.feature_importances_, index=X_encoded.columns).sort_values(ascending=False).head(20)
plt.figure(figsize=(10,5))
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title('Top 20 Feature Importances (Decision Tree)')
plt.show()

# Visualize the tree (only the top 3 levels are drawn to keep the figure readable)
plt.figure(figsize=(20,10))
# Pass a plain list: some scikit-learn releases reject a pandas Index for `feature_names`.
plot_tree(clf, feature_names=list(X_encoded.columns), class_names=class_names, filled=True, rounded=True, max_depth=3)
plt.show()


In [None]:

# === Confirm the cleaned dataset is on disk ===
# Nothing is written in this cell: the cleaned (pre-encoding) dataset is exported by the data
# cleaning cell. Check for the file rather than announcing success unconditionally.
# Falls back to the default file name when `cleaned_path` has not been defined in this session.
final_path = globals().get('cleaned_path', 'bank_cleaned.csv')
if os.path.isfile(final_path):
    print(f"Cleaned dataset is saved as '{final_path}' in the current directory.")
else:
    print(f"Cleaned dataset '{final_path}' was not found; re-run the data cleaning cell to create it.")
