In [None]:
import pandas as pd
import numpy as np
# Load the customer table and print its column/dtype summary.
# NOTE(review): the CSV path is an empty string in this copy of the notebook —
# fill in the real file location before running.
data = pd.read_csv('')
data.info()

# Discard the two columns that are not used as clustering features.
data = data.drop(columns = ['CustomerID', 'Genre'])

In [None]:
# Collect the names of every column stored as a 64-bit integer or float.
numeric_dtypes = ['int64', 'float64']
numerical_columns = list(data.select_dtypes(include = numeric_dtypes).columns)
numerical_columns

In [None]:
# Display only the numeric columns listed in `numerical_columns`.
data.loc[:, numerical_columns]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Standardise only the numeric feature columns captured in `numerical_columns`.
# Fitting on `data` directly would also scale the 'Clusters' label column
# whenever this cell is re-run after the clustering cell has added it to `data`.
sc = StandardScaler()
data_scaled = sc.fit_transform(data[numerical_columns])

# Find a suitable epsilon: distance from every point to its 5 nearest neighbours.
# The query set is the training set itself, so column 0 holds each point's
# zero distance to itself.
neighbor = NearestNeighbors(n_neighbors = 5)
neighbordist = neighbor.fit(data_scaled)
distance, indices = neighbordist.kneighbors(data_scaled)
distance

In [None]:
import matplotlib.pyplot as plt
# k-distance curve used to pick eps: sort every neighbour-distance column in
# ascending order, then draw the last column (index 4), i.e. the farthest of the
# 5 neighbours requested from NearestNeighbors.
distance = np.sort(distance, axis = 0)
farthest_neighbour_distance = distance[:, 4]

fig, ax = plt.subplots()
ax.plot(farthest_neighbour_distance)
plt.show()

In [None]:
from sklearn.cluster import DBSCAN


def _count_clusters_and_noise(labels):
    """Return (n_clusters, n_noise) for an array of DBSCAN labels, where -1 marks noise."""
    is_noise = labels == -1
    return len(np.unique(labels[~is_noise])), int(is_noise.sum())


# Candidate hyper-parameters for the grid search.
eps = [0.6, 0.7, 0.8, 0.9]
min_pts = [4, 5, 6]  # 2 * number of dimensions, i.e. 2 * 3 = 6, is the value to be chosen

# Fit one DBSCAN per (eps, min_samples) pair and record how many clusters and
# how many noise points each setting produces.
result = []
for e in eps:
    for n in min_pts:
        y_cluster = DBSCAN(eps = e, min_samples = n).fit_predict(data_scaled)
        result.append((e, n, *_count_clusters_and_noise(y_cluster)))

result_df = pd.DataFrame(result, columns = ['eps', 'minsamples', 'n_clusters', 'n_noise'])
result_df

In [None]:
# Final model. eps = 0.6 gave the maximum number of clusters (the cluster count
# decreases as eps increases); min_samples = 2 * 3 = 6.
db_model = DBSCAN(eps = 0.6, min_samples = 6).fit(data_scaled)
y_cluster = db_model.labels_

# Attach each row's cluster label to the frame and average every column per cluster.
data['Clusters'] = y_cluster
cluster_analysis = data.groupby('Clusters').mean()
cluster_analysis

In [None]:
# Income vs. spending score for every row, coloured by cluster label.
# Noise points are kept in this view so that outliers can be identified.
fig, ax = plt.subplots()
ax.scatter(data['Annual Income (k$)'], data['Spending Score (1-100)'],
           c = data['Clusters'], cmap = 'viridis')
ax.set(xlabel = 'Annual Income',
       ylabel = 'Spending Score',
       title = 'DBSCAN for Mall Customer')
plt.show()

In [None]:
# Same scatter without the noise points (cluster label -1).
# The filtered rows go into their own frame instead of rebinding `data`:
# overwriting `data` would drop the noise rows for good, and re-running the
# clustering cell would then fail because `y_cluster` no longer matches the
# frame's length.
data_without_noise = data[data['Clusters'] != -1]
plt.scatter(data_without_noise['Annual Income (k$)'],
            data_without_noise['Spending Score (1-100)'],
            c = data_without_noise['Clusters'], cmap = 'viridis')
plt.xlabel('Annual Income')
plt.ylabel('Spending Score')
plt.title('DBSCAN for Mall Customer')
plt.show()

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

# Internal validation scores for the final DBSCAN labelling on the scaled features.
# NOTE(review): any noise points (label -1) in `y_cluster` are scored as if they
# formed one more cluster — confirm that is intended.
ss_score = silhouette_score(data_scaled, y_cluster)
dv_score = davies_bouldin_score(data_scaled, y_cluster)
ch_score = calinski_harabasz_score(data_scaled, y_cluster)

for metric_name, metric_value in (
    ('Silhouette Score', ss_score),
    ('Davis Bouldin', dv_score),
    ('Calinski Score', ch_score),
):
    print(metric_name, metric_value)