## Load Data

In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [2]:
bank_marketing = fetch_ucirepo(id=222)
X = bank_marketing.data.features
y = bank_marketing.data.targets

In [3]:
# Create a DataFrame from the features and add the target column
df = pd.DataFrame(X, columns=bank_marketing.metadata.features)
df['target'] = y

# Display the DataFrame
print(len(df))
df.head()

45211


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,target
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no


In [4]:
df.to_csv("bank_marketing.csv",index=False)

In [3]:
# Step 2: Load the Bank Marketing Feature Dataset
data = X
print(len(data))
data.head()

45211


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,


In [6]:
data.to_csv("bank_marketing_features.csv",index=False)

## Data Preprocessing

In [4]:
data = pd.read_csv("bank_marketing_features.csv")
print(len(data))
data.head()

45211


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          45211 non-null  int64 
 1   job          44923 non-null  object
 2   marital      45211 non-null  object
 3   education    43354 non-null  object
 4   default      45211 non-null  object
 5   balance      45211 non-null  int64 
 6   housing      45211 non-null  object
 7   loan         45211 non-null  object
 8   contact      32191 non-null  object
 9   day_of_week  45211 non-null  int64 
 10  month        45211 non-null  object
 11  duration     45211 non-null  int64 
 12  campaign     45211 non-null  int64 
 13  pdays        45211 non-null  int64 
 14  previous     45211 non-null  int64 
 15  poutcome     8252 non-null   object
dtypes: int64(7), object(9)
memory usage: 5.5+ MB


In [6]:
job_unique_values = data['job'].unique()
print("Job Unique values :",job_unique_values)

marital_unique_values = data['marital'].unique()
print("marital Unique values :",marital_unique_values)

education_unique_values = data['education'].unique()
print("education Unique values :",education_unique_values)

default_unique_values = data['default'].unique()
print("default Unique values :",default_unique_values)

loan_unique_values = data['loan'].unique()
print("loan Unique values :",loan_unique_values)

contact_unique_values = data['contact'].unique()
print("contact Unique values :",contact_unique_values)

campaign_unique_values = data['campaign'].unique()
print("campaign Unique values :",campaign_unique_values)

pdays_unique_values = data['pdays'].unique()
print("pdays Unique values :",pdays_unique_values)

previous_unique_values = data['previous'].unique()
print("previous Unique values :",previous_unique_values)

poutcome_unique_values = data['poutcome'].unique()
print("poutcome Unique values :",poutcome_unique_values)

Job Unique values : ['management' 'technician' 'entrepreneur' 'blue-collar' nan 'retired'
 'admin.' 'services' 'self-employed' 'unemployed' 'housemaid' 'student']
marital Unique values : ['married' 'single' 'divorced']
education Unique values : ['tertiary' 'secondary' nan 'primary']
default Unique values : ['no' 'yes']
loan Unique values : ['no' 'yes']
contact Unique values : [nan 'cellular' 'telephone']
campaign Unique values : [ 1  2  3  5  4  6  7  8  9 10 11 12 13 19 14 24 16 32 18 22 15 17 25 21
 43 51 63 41 26 28 55 50 38 23 20 29 31 37 30 46 27 58 33 35 34 36 39 44]
pdays Unique values : [ -1 151 166  91  86 143 147  89 140 176 101 174 170 167 195 165 129 188
 196 172 118 119 104 171 117 164 132 131 123 159 186 111 115 116 173 178
 110 152  96 103 150 175 193 181 185 154 145 138 126 180 109 158 168  97
 182 127 130 194 125 105 102  26 179  28 183 155 112 120 137 124 187 190
 113 162 134 169 189   8 144 191 184 177   5  99 133  93  92  10 100 156
 198 106 153 146 128   7 121 160 

### 1. Handeling Null values

In [7]:
from sklearn.impute import SimpleImputer
import pandas as pd

In [8]:
data = data.drop(columns=['poutcome'])
print(len(data))
data.head()

45211


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0


In [9]:
columns = ["age", "job", "marital", "education","default","balance","housing","loan", "contact","day_of_week","month","duration","campaign","pdays","previous"] 

imputer = SimpleImputer(strategy='most_frequent') 
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=columns)
data_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          45211 non-null  object
 1   job          45211 non-null  object
 2   marital      45211 non-null  object
 3   education    45211 non-null  object
 4   default      45211 non-null  object
 5   balance      45211 non-null  object
 6   housing      45211 non-null  object
 7   loan         45211 non-null  object
 8   contact      45211 non-null  object
 9   day_of_week  45211 non-null  object
 10  month        45211 non-null  object
 11  duration     45211 non-null  object
 12  campaign     45211 non-null  object
 13  pdays        45211 non-null  object
 14  previous     45211 non-null  object
dtypes: object(15)
memory usage: 5.2+ MB


In [10]:
print(len(data_imputed))
data_imputed.head()

45211


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous
0,58,management,married,tertiary,no,2143,yes,no,cellular,5,may,261,1,-1,0
1,44,technician,single,secondary,no,29,yes,no,cellular,5,may,151,1,-1,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,cellular,5,may,76,1,-1,0
3,47,blue-collar,married,secondary,no,1506,yes,no,cellular,5,may,92,1,-1,0
4,33,blue-collar,single,secondary,no,1,no,no,cellular,5,may,198,1,-1,0


### 2. Encode Categorical Column

In [11]:
# Encode categorical variables
label_encoder = LabelEncoder()
# Apply Label Encoding to all categorical columns
for col in ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month']:
    data_imputed[col] = label_encoder.fit_transform(data_imputed[col])

print(len(data_imputed))
data_imputed.head()

45211


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous
0,58,4,1,2,0,2143,1,0,0,5,8,261,1,-1,0
1,44,9,2,1,0,29,1,0,0,5,8,151,1,-1,0
2,33,2,1,1,0,2,1,1,0,5,8,76,1,-1,0
3,47,1,1,1,0,1506,1,0,0,5,8,92,1,-1,0
4,33,1,2,1,0,1,0,0,0,5,8,198,1,-1,0


### 3. Features Normalization

In [12]:
from sklearn.preprocessing import StandardScaler

# Define the numerical columns you want to normalize
numerical_cols = ['age', 'balance', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous']

# Initialize the scaler
scaler = StandardScaler()

# Apply z-score normalization only to the numerical columns
data_imputed[numerical_cols] = scaler.fit_transform(data_imputed[numerical_cols])

# Print the transformed data
print(len(data_imputed))
data_imputed.head()


45211


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous
0,1.606965,4,1,2,0,0.256419,1,0,0,-1.298476,8,0.011016,-0.569351,-0.411453,-0.25194
1,0.288529,9,2,1,0,-0.437895,1,0,0,-1.298476,8,-0.416127,-0.569351,-0.411453,-0.25194
2,-0.747384,2,1,1,0,-0.446762,1,1,0,-1.298476,8,-0.707361,-0.569351,-0.411453,-0.25194
3,0.571051,1,1,1,0,0.047205,1,0,0,-1.298476,8,-0.645231,-0.569351,-0.411453,-0.25194
4,-0.747384,1,2,1,0,-0.447091,0,0,0,-1.298476,8,-0.23362,-0.569351,-0.411453,-0.25194


In [13]:
X_scaled = data_imputed.values
X_scaled

array([[ 1.60696496,  4.        ,  1.        , ..., -0.56935064,
        -0.41145311, -0.25194037],
       [ 0.28852927,  9.        ,  2.        , ..., -0.56935064,
        -0.41145311, -0.25194037],
       [-0.74738448,  2.        ,  1.        , ..., -0.56935064,
        -0.41145311, -0.25194037],
       ...,
       [ 2.92540065,  5.        ,  1.        , ...,  0.72181052,
         1.43618859,  1.05047333],
       [ 1.51279098,  1.        ,  1.        , ...,  0.39902023,
        -0.41145311, -0.25194037],
       [-0.37068857,  2.        ,  1.        , ..., -0.24656035,
         1.4761376 ,  4.52357654]])

## EVALUATION METRICS

In [14]:
def dunn_index(X, labels):
    """
    Calculates the Dunn index for a given clustering result.


    Parameters:
        X (array-like): The data points.
        labels (array-like): Cluster labels for each data point.


    Returns:
        float: The Dunn index.
    """
    unique_clusters = np.unique(labels)
    num_clusters = len(unique_clusters)
    max_intracluster_dist = 0.0
    min_intercluster_dist = np.inf


    # Calculate maximum intracluster distance
    for i in unique_clusters:
        cluster_i_points = X[labels == i]
       
        # Handle clusters with single or no data points
        if len(cluster_i_points) <= 1:
            intracluster_dist = 0  # or any other appropriate value
        else:
            intracluster_dist = np.max(pdist(cluster_i_points))  
           
        if intracluster_dist > max_intracluster_dist:
            max_intracluster_dist = intracluster_dist


    # Calculate minimum intercluster distance
    for i in range(num_clusters):
        for j in range(i + 1, num_clusters):
            cluster_i_points = X[labels == unique_clusters[i]]
            cluster_j_points = X[labels == unique_clusters[j]]
            intercluster_dist = np.min(cdist(cluster_i_points, cluster_j_points))
            if intercluster_dist < min_intercluster_dist:
                min_intercluster_dist = intercluster_dist


    # Calculate Dunn index
    if max_intracluster_dist == 0:  # Handle case where all clusters have single points
        return 0  # or any other appropriate value
    else:
        return min_intercluster_dist / max_intracluster_dist


In [15]:
def beta_cv(X, labels):
    n_clusters = len(np.unique(labels))
    unique_clusters = np.unique(labels)
    # Cohesion: Intra-cluster distances (within-cluster compactness)
    intra_distances = []
    for cluster in unique_clusters:
        points_in_cluster = X[labels == cluster]
        intra_distances.append(np.mean(pdist(points_in_cluster)))
    # Separation: Inter-cluster distances (between-cluster separation)
    inter_distances = []
    for i in range(n_clusters):
        for j in range(i+1, n_clusters):
            points_cluster_i = X[labels == unique_clusters[i]]
            points_cluster_j = X[labels == unique_clusters[j]]
            inter_distances.append(np.mean(pairwise_distances(points_cluster_i, points_cluster_j)))
    cohesion = np.mean(intra_distances)
    separation = np.mean(inter_distances)
    return cohesion / separation  # Beta CV = Cohesion / Separation


## K-Means Clustering

In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, pairwise_distances
from scipy.spatial.distance import pdist
from sklearn.metrics import davies_bouldin_score
from scipy.spatial.distance import cdist

In [17]:
# Initialize K-Means with a certain number of clusters (say 5)
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_scaled)

# Add the cluster labels to the dataset
data['Cluster'] = kmeans.labels_

In [18]:
data.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,Cluster
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,4
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,1
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,4
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,4
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,4
5,35,management,married,tertiary,no,231,yes,no,,5,may,139,1,-1,0,4
6,28,management,single,tertiary,no,447,yes,yes,,5,may,217,1,-1,0,4
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,,5,may,380,1,-1,0,4
8,58,retired,married,primary,no,121,yes,no,,5,may,50,1,-1,0,1
9,43,technician,single,secondary,no,593,yes,no,,5,may,55,1,-1,0,1


In [19]:
# Calculate the Silhouette Score to evaluate the quality of the clusters
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(X_scaled, kmeans.labels_)
print(f"Silhouette Score: {silhouette_avg}")

sse = kmeans.inertia_  # Sum of Squared Errors (inertia)
print(f"SSE (Sum of Squared Errors): {sse}")

dunn = dunn_index(X_scaled, kmeans.labels_)
print(f"Dunn's Index: {dunn}")

beta_cv_value = beta_cv(X_scaled, kmeans.labels_)
print(f"Beta CV (Cohesion vs Separation Index): {beta_cv_value}")

Silhouette Score: 0.20451077113343166
SSE (Sum of Squared Errors): 561148.3777066192
Dunn's Index: 0.006767220814328692
Beta CV (Cohesion vs Separation Index): 0.6043759781831832


## K MEDOIDS:

In [40]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn_extra.cluster import KMedoids
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.cluster import cluster_visualizer
from pyclustering.utils import distance_metric, type_metric
from sklearn.metrics import silhouette_score, davies_bouldin_score
import numpy as np

In [41]:
# Initialize K-Medoids
# Choose initial medoids randomly (indices can be adjusted for testing)
initial_medoids = [0, 100]  # Random indices as initial medoids

# Set the distance metric (Euclidean)
metric = distance_metric(type_metric.EUCLIDEAN)

# Apply K-Medoids using PyClustering
#kmedoids_instance = kmedoids(X_sampled_scaled, initial_medoids, metric=metric)
kmedoids_instance = kmedoids(X_scaled, initial_medoids, metric=metric)
kmedoids_instance.process()

# Get clusters and medoids
clusters = kmedoids_instance.get_clusters()
medoids = kmedoids_instance.get_medoids()

In [None]:
# Convert cluster labels for each point
# Create a list of labels based on clusters
labels = np.zeros(len(X_scaled), dtype=int)
for cluster_id, cluster in enumerate(clusters):
    for index in cluster:
        labels[index] = cluster_id

In [None]:
# Calculate SSE (Sum of Squared Errors for K-Medoids)
sse_kmedoids = np.sum([np.linalg.norm(X_sampled_scaled[i] - X_sampled_scaled[medoids[labels[i]]])**2 for i in range(len(X_sampled_scaled))])
print(f"SSE (Sum of Squared Errors for K-Medoids): {sse_kmedoids}")

# Calculate Silhouette Score
silhouette_avg_kmedoids = silhouette_score(X_sampled_scaled, labels)
print(f"Silhouette Score for K-Medoids: {silhouette_avg_kmedoids}")

dunn = dunn_index(X_sampled_scaled, labels)
print(f"Dunn's Index: {dunn}")

beta_cv_value = beta_cv(X_sampled_scaled, labels)
print(f"Beta CV (Cohesion vs Separation Index): {beta_cv_value}")

## EM CLUSTERING:

In [20]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=5, random_state=42)
gmm.fit(X_scaled)
data['Cluster'] = gmm.predict(X_scaled)

In [21]:
data.head(10)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,Cluster
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,4
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,4
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,4
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,4
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,4
5,35,management,married,tertiary,no,231,yes,no,,5,may,139,1,-1,0,4
6,28,management,single,tertiary,no,447,yes,yes,,5,may,217,1,-1,0,4
7,42,entrepreneur,divorced,tertiary,yes,2,yes,no,,5,may,380,1,-1,0,2
8,58,retired,married,primary,no,121,yes,no,,5,may,50,1,-1,0,4
9,43,technician,single,secondary,no,593,yes,no,,5,may,55,1,-1,0,4


In [22]:
# Step 6: Calculate SSE (Sum of Squared Errors for GMM)
# We can calculate SSE as the sum of squared distances from each point to its assigned Gaussian component mean
means = gmm.means_
labels = gmm.predict(X_scaled)

#SSE
sse_gmm = np.sum([np.linalg.norm(X_scaled[i] - means[labels[i]])**2 for i in range(len(X_scaled))])
print(f"SSE (Sum of Squared Errors for GMM): {sse_gmm}")

# Step 7: Calculate Silhouette Score
silhouette_avg_gmm = silhouette_score(X_scaled, labels)
print(f"Silhouette Score for GMM: {silhouette_avg_gmm}")

dunn = dunn_index(X_scaled, labels)
print(f"Dunn's Index: {dunn}")

beta_cv_value = beta_cv(X_scaled, labels)
print(f"Beta CV (Cohesion vs Separation Index): {beta_cv_value}")


SSE (Sum of Squared Errors for GMM): 1200097.238497798
Silhouette Score for GMM: 0.045876959273043104
Dunn's Index: 0.002964801927608569
Beta CV (Cohesion vs Separation Index): 0.9180522778251379


## DBSCAN Clustering

In [16]:
from sklearn.cluster import DBSCAN
import numpy as np

In [17]:
# DBSCAN model
dbscan = DBSCAN(eps=3, min_samples=2)
dbscan.fit(X_scaled)

In [18]:
# Results
print("Labels:", dbscan.labels_)
print("Core Sample Indices:", dbscan.core_sample_indices_)
print("Number of clusters:", len(set(dbscan.labels_)) - (1 if -1 in dbscan.labels_ else 0))
print("Number of noise points:", list(dbscan.labels_).count(-1))
labels = dbscan.labels_

Labels: [0 0 0 ... 0 0 0]
Core Sample Indices: [    0     1     2 ... 45208 45209 45210]
Number of clusters: 12
Number of noise points: 167


In [24]:
#Calculate Silhouette Score
silhouette_avg_dbscan = silhouette_score(X_scaled, labels)
print(f"Silhouette Score for dbscan: {silhouette_avg_dbscan}")


dunn = dunn_index(X_scaled, dbscan.labels_)
print(f"Dunn's Index: {dunn}")


beta_cv_value = beta_cv(X_scaled, dbscan.labels_)
print(f"Beta CV (Cohesion vs Separation Index): {beta_cv_value}")

Silhouette Score for dbscan: 0.34944842718687924
Dunn's Index: 0.024192593944684322
Beta CV (Cohesion vs Separation Index): 0.24807653265347737


## SLINK Clustering

In [26]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from sklearn.metrics import silhouette_score

In [29]:
# Single-linkage clustering (SLINK-like behavior)
single_link_clustering = AgglomerativeClustering(n_clusters=20, linkage='single')
labels = single_link_clustering.fit_predict(X_scaled)
# Results
print("Labels:", single_link_clustering.labels_)


Labels: [0 0 0 ... 0 0 0]


In [39]:
silhouette_avg_slink = silhouette_score(X_scaled, labels)
print(f"Silhouette Score for slink: {silhouette_avg_slink}")


dunn = dunn_index(X_scaled, single_link_clustering.labels_)
print(f"Dunn's Index: {dunn}")


beta_cv_value = beta_cv(X_scaled,  single_link_clustering.labels_)
print(f"Beta CV (Cohesion vs Separation Index): {beta_cv_value}")

Silhouette Score for slink: 0.5308427576997019
Dunn's Index: 0.1753186313381719


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Beta CV (Cohesion vs Separation Index): nan


In [37]:
unique_values = np.unique(m)
print("Unique values:", unique_values)

Unique values: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
