In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA  # allowed for visualization


sns.set(style='whitegrid')

DATA_PATH = 'bank-full.csv'

# Fail fast with an actionable message: continuing past a missing file only
# postpones the same FileNotFoundError to the pandas call below.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(
        f"{DATA_PATH} not found in working directory. "
        "Please place the CSV here or change DATA_PATH."
    )

# First attempt assumes a ';'-delimited file. A wrong delimiter does not raise
# in pandas -- it silently yields a single wide column -- so the comma
# fallback is triggered by the column count rather than by an exception.
df = pd.read_csv(DATA_PATH, sep=';')
if df.shape[1] == 1:
    df = pd.read_csv(DATA_PATH)

print('loaded dataset shape:', df.shape)

# First look at the raw table: leading rows, dtypes and overall size.
display(df.head())
print('\ninfo:')
df.info()

n_samples, n_columns = df.shape
print('\nnumber of samples:', n_samples)
print('number of features (including target):', n_columns)

# Per-column NaN counts, restricted to columns that actually contain NaNs.
null_counts = df.isna().sum()
missing = null_counts[null_counts > 0]
print('\nmissing values (if any):')
if missing.empty:
    print('no missing values detected (check for special codes like 999)')
else:
    print(missing)

# Class balance of the target, as absolute counts and then as proportions.
if 'y' in df.columns:
    for header, normalize in (('\nTarget (y) distribution:', False),
                              ('\nproportions:', True)):
        print(header)
        print(df['y'].value_counts(normalize=normalize))

# Split the columns by dtype; the target 'y' is never treated as a feature.
numeric_features = list(df.select_dtypes(include=[np.number]).columns)
cat_features = [
    name
    for name in df.select_dtypes(exclude=[np.number]).columns
    if name != 'y'
]

print('numeric features:', numeric_features)
print('categorical features:', cat_features)

# Show floats with three decimals from here on, then summarize the numeric
# columns with one row per feature.
pd.options.display.float_format = '{:.3f}'.format
numeric_summary = df[numeric_features].describe()
print(numeric_summary.transpose())

# Histograms of every numeric column on a grid that is `cols` panels wide.
nnum = len(numeric_features)
cols = 3
rows = int(np.ceil(nnum / cols))
if nnum:
    # squeeze=False always yields a 2-D array of Axes, so ravel() works for
    # any grid size.
    fig, axes = plt.subplots(rows, cols, figsize=(5*cols, 3.2*rows), squeeze=False)
    axes = axes.ravel()
    for ax, col in zip(axes, numeric_features):
        df[col].hist(bins=30, ax=ax)
        ax.set_title(col)
    # The grid is padded up to a multiple of `cols`; hide the unused panels
    # instead of drawing empty frames.
    for ax in axes[nnum:]:
        ax.set_visible(False)
    plt.tight_layout()
    plt.show()

# Boxplots for 'age' (when it is a numeric column) plus up to six further
# numeric columns, one panel per column.
selected = (['age'] if 'age' in numeric_features else []) \
    + [c for c in numeric_features if c != 'age'][:6]
if selected:
    # squeeze=False keeps `axes` iterable even when only one column is drawn.
    fig, axes = plt.subplots(len(selected), 1, figsize=(8, 2.2*len(selected)), squeeze=False)
    axes = axes.ravel()
    for ax, col in zip(axes, selected):
        sns.boxplot(x=df[col], ax=ax)
        ax.set_title(f'boxplot: {col}')
    plt.tight_layout()
    plt.show()

# Frequency table (ten most common levels) for every categorical feature.
for col in cat_features:
    print('\n----', col, '----')
    vc = df[col].value_counts()
    print(vc.iloc[:10])

# Horizontal bar charts of the most frequent levels for four selected
# categorical columns, arranged on a 2x2 grid.
cats_to_plot = ['job', 'marital', 'education', 'month']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))
axes = axes.ravel()
for idx, col in enumerate(cats_to_plot):
    ax = axes[idx]
    vc = df[col].value_counts().head(10)
    sns.barplot(x=vc.values, y=vc.index, ax=ax)
    ax.set_title(col)
plt.tight_layout()
plt.show()

# Numeric copy of the target (no -> 0, yes -> 1) so it can take part in the
# correlation matrix; any other label becomes NaN.
if 'y' in df.columns:
    target_codes = {'no': 0, 'yes': 1}
    df['_y_num'] = df['y'].map(target_codes)

# Correlate the numeric features, plus the encoded target when it exists.
corr_columns = list(numeric_features)
if '_y_num' in df.columns:
    corr_columns.append('_y_num')
corr = df[corr_columns].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='vlag', center=0)
plt.title('correlation matrix (numeric features)')
plt.show()

if 'pdays' in df.columns:
    print('pdays unique values (top):', df['pdays'].value_counts().head())
    # "Never contacted before" is encoded as -1 in bank-full.csv and as 999 in
    # the bank-additional variant of the dataset; report both so the check is
    # meaningful whichever file DATA_PATH points at.
    n_rows = max(len(df), 1)  # avoid dividing by zero on an empty frame
    for sentinel in (-1, 999):
        cnt = int((df['pdays'] == sentinel).sum())
        print(f'pdays == {sentinel} count:', cnt, f'({100*cnt/n_rows:.2f}% of rows)')

if 'duration' in df.columns:
    print('\nDuration stats:')
    print(df['duration'].describe())
    print('num samples with duration>1000:', (df['duration']>1000).sum())

# Column groups for preprocessing: every categorical feature is one-hot
# encoded and every numeric feature is standardized. `ordinal_cols` is kept as
# an (empty) placeholder and is not wired into the transformer.
ordinal_cols = []
one_hot_cols = cat_features.copy()
numeric_cols = numeric_features.copy()
# Make sure neither form of the target ends up among the model inputs.
for t in ['_y_num', 'y']:
    if t in numeric_cols:
        numeric_cols.remove(t)

num_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed the dense-output flag from `sparse` to
# `sparse_output`; older releases reject the new keyword with a TypeError.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_pipeline = Pipeline([
    ('onehot', onehot_encoder)
])

preprocessor = ColumnTransformer([
    ('num', num_pipeline, numeric_cols),
    ('cat', cat_pipeline, one_hot_cols)
])

X_trans = preprocessor.fit_transform(df)

# Recover readable names for the one-hot columns (nothing to name when there
# are no categorical features).
cat_feature_names = []
if one_hot_cols:
    oh = preprocessor.named_transformers_['cat']['onehot']
    if hasattr(oh, 'get_feature_names_out'):
        cat_feature_names = oh.get_feature_names_out(one_hot_cols).tolist()
    else:
        # Build the names from the fitted `categories_`, which is the order
        # the encoder emits its output columns in; the order of first
        # appearance in the data would mislabel them.
        cat_feature_names = [
            f"{col}_{v}"
            for col, levels in zip(one_hot_cols, oh.categories_)
            for v in levels
        ]

# ColumnTransformer concatenates outputs in declaration order: numeric first,
# then the one-hot block.
all_feature_names = numeric_cols + cat_feature_names
X_df = pd.DataFrame(X_trans, columns=all_feature_names)

print('\ntransformed data shape:', X_df.shape)

print(X_df.head())
print('\nany NaNs after transform?', X_df.isna().sum().sum())

# Convenience columns on the raw frame. NOTE: X_df was assembled before these
# columns were created, so they are not part of X_data below.
if 'pdays' in df.columns:
    # "Never contacted before" is encoded as -1 in bank-full.csv and as 999 in
    # the bank-additional variant; comparing against 999 alone flags every row
    # of bank-full.csv as previously contacted.
    df['was_contacted_before'] = (~df['pdays'].isin([-1, 999])).astype(int)
if 'duration' in df.columns:
    # log1p compresses the long right tail and is defined at duration == 0.
    df['duration_log1p'] = np.log1p(df['duration'])

# Clustering input: the transformed features without the scaled 'duration'
# column (presumably excluded because call duration is only known after the
# call -- confirm the intent). errors='ignore' tolerates its absence.
X_data = X_df.drop(columns=['duration'], errors='ignore').to_numpy()

def euclidean(a, b):
    """Return the Euclidean distance between each row of ``a`` and ``b``.

    ``b`` may be anything that broadcasts against ``a`` (for example a single
    point); the squared differences are summed along axis 1.
    """
    diff = a - b
    squared_norms = np.square(diff).sum(axis=1)
    return np.sqrt(squared_norms)

def kmeans_numpy(X, k, max_iters=100, random_state=42):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster; converted to a float array.
    k : int
        Number of clusters, ``1 <= k <= n_samples``.
    max_iters : int, default 100
        Upper bound on the number of centroid updates.
    random_state : int or None, default 42
        Seed for choosing the ``k`` initial centroids among the rows of ``X``.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Index of the nearest returned centroid for every row.
    centroids : ndarray of shape (k, n_features)
        Final cluster centres.
    inertia : float
        Sum of squared distances from each row to its assigned centroid.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or ``k`` is outside ``1..n_samples``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and n_samples={n_samples}, got {k}")

    # A private generator produces the same draws as seeding the global
    # stream would, without disturbing NumPy's global random state.
    rng = np.random.RandomState(random_state)
    centroids = X[rng.choice(n_samples, k, replace=False)]

    def squared_distances(centers):
        # Shape (k, n_samples). One pass per centroid keeps peak memory at
        # O(k * n_samples) instead of materializing a (n, k, d) tensor.
        return np.stack([np.sum((X - c) ** 2, axis=1) for c in centers])

    for _ in range(max_iters):
        labels = np.argmin(squared_distances(centroids), axis=0)

        # A cluster that lost all its points keeps its previous centroid;
        # averaging an empty slice would give NaN and poison every later
        # distance computation.
        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            if len(members):
                new_centroids[i] = members.mean(axis=0)

        converged = np.allclose(centroids, new_centroids, atol=1e-4)
        centroids = new_centroids
        if converged:
            break

    # Final assignment against the centroids actually returned, so labels and
    # inertia stay consistent with them even when max_iters is exhausted (or 0).
    final_sq = squared_distances(centroids)
    labels = np.argmin(final_sq, axis=0)
    inertia = np.sum(np.min(final_sq, axis=0))
    return labels, centroids, inertia

# Elbow method: fit the custom k-means for k = 2..10 and record the inertia
# of each fit.
K_range = range(2, 11)
inertias = []
for k in K_range:
    fit = kmeans_numpy(X_data, k)
    labels, centroids, inertia = fit
    inertias += [inertia]

# Inertia against k; the bend of the curve suggests a cluster count.
plt.figure(figsize=(8, 5))
plt.plot(K_range, inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia (sum of squared distances)')
plt.title('Elbow Method (Custom K-Means)')
plt.show()

# Final clustering with the chosen number of clusters.
k_opt = 4
labels, centroids, inertia = kmeans_numpy(X_data, k_opt)

# Attach the assignment to the raw frame and report how many rows fall into
# each cluster.
df['cluster'] = labels
cluster_sizes = df['cluster'].value_counts()
print('cluster sizes:\n', cluster_sizes)

# Project the clustering input onto its first two principal components for a
# 2-D view, coloured by cluster.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_data)
pc1 = X_pca[:, 0]
pc2 = X_pca[:, 1]

plt.figure(figsize=(8,6))
plt.scatter(pc1, pc2, c=labels, cmap='Set2', s=40)
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title(f'Custom K-Means (k={k_opt}) visualized with PCA')
plt.show()
from scipy.spatial.distance import cdist

def fast_dunn_index(X, labels, sample_size=1000, random_state=42):
    """Approximate the Dunn index of a clustering.

    The value returned is ``min centroid-to-centroid distance / max cluster
    diameter``. Two shortcuts keep it cheap: separation is measured between
    cluster centroids rather than between closest points, and the diameter of
    a cluster with more than ``sample_size`` points is taken over a random
    subsample of that size.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data that was clustered.
    labels : array-like of shape (n_samples,)
        Cluster label of each row of ``X``.
    sample_size : int, default 1000
        Maximum number of points per cluster used for the pairwise diameter.
    random_state : int or None, default 42
        Seed for the subsampling.

    Returns
    -------
    float
        The approximate Dunn index, or ``0.0`` when it is undefined (fewer
        than two clusters, or every cluster has zero diameter).
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    # A private generator produces the same draws as seeding the global
    # stream would, without disturbing NumPy's global random state.
    rng = np.random.RandomState(random_state)

    clusters = [X[labels == k] for k in np.unique(labels)]
    if len(clusters) < 2:
        # No pair of clusters to separate; without this guard the all-inf
        # distance matrix below would make the function return inf.
        return 0.0

    # Centroids are O(n) to compute, so use every point of each cluster;
    # only the O(n^2) diameter below needs subsampling.
    centroids = np.array([C.mean(axis=0) for C in clusters])
    inter = cdist(centroids, centroids, 'euclidean')
    np.fill_diagonal(inter, np.inf)  # ignore each centroid's distance to itself
    min_inter = np.min(inter)

    max_intra = 0.0
    for C in clusters:
        if len(C) > sample_size:
            C = C[rng.choice(len(C), sample_size, replace=False)]
        if len(C) > 1:  # a single point has diameter 0
            max_intra = max(max_intra, float(np.max(cdist(C, C, 'euclidean'))))

    return float(min_inter / max_intra) if max_intra > 0 else 0.0

# Score the final clustering with the approximate Dunn index and report it
# to four decimals.
dunn_value = fast_dunn_index(X=X_data, labels=labels)
print(
    f"Approx. Dunn Index for k={k_opt}: {dunn_value:.4f}"
)









FileNotFoundError: [Errno 2] No such file or directory: '/bank-full.csv'