In [95]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

In [161]:
data = pd.read_csv('auto93.csv')

In [162]:
data.head()

Unnamed: 0,Clndrs,Volume,HpX,Model,origin,Lbs-,Acc+,Mpg+
0,8,304.0,193,70,1,4732,18.5,10
1,8,360.0,215,70,1,4615,14.0,10
2,8,307.0,200,70,1,4376,15.0,10
3,8,318.0,210,70,1,4382,13.5,10
4,8,429.0,208,72,1,4633,11.0,10


In [163]:
X = data[['Clndrs', 'Volume', 'HpX', 'Model', 'origin']]
y = data[['Lbs-', 'Acc+', 'Mpg+']]

In [164]:
X.replace('?', np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.replace('?', np.nan, inplace=True)


In [165]:
column_names = X.columns.tolist()
y_column_names = y.columns.tolist()
features = y_column_names
print(features)

['Lbs-', 'Acc+', 'Mpg+']


In [166]:
centroids_data = []

In [167]:
#using standard scaler
#X = StandardScaler().fit_transform(X.astype(float))
print(X.shape)

(398, 5)


In [168]:
#using minmax scaler to retain original values
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [169]:
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

In [170]:
#pca = PCA(n_components=2)
#X_pca = pca.fit_transform(X)

In [171]:
dbscan = DBSCAN(eps=0.1, min_samples=3)

In [172]:
dbscan.fit(X)

In [173]:
labels = dbscan.labels_
data['Cluster'] = labels

In [174]:
unique_labels = np.unique(labels)
#print(f"\t\t     {column_names}")
for label in unique_labels:
    if label == -1:
        continue  # Skip noise points
    cluster_size = np.sum(labels == label)
    cluster_centroid = np.mean(X[labels == label], axis=0)
    #getting back original values to print
    cluster_centroid_original_scale = scaler.inverse_transform([cluster_centroid])
    cluster_centroid_formatted = [f"{value:.2f}" for value in cluster_centroid_original_scale[0]]
    y_centroid = np.mean(y.values[labels == label], axis=0)
    y_centroid_original_scale = y_centroid  # Assuming y values are not scaled
    y_centroid_formatted = [f"{value:.2f}" for value in y_centroid_original_scale]
    
    centroid_data = {'Cluster': label, **dict(zip(column_names, cluster_centroid_formatted)), 
                     **dict(zip(y_column_names, y_centroid_formatted))}
    
    centroids_data.append(centroid_data)
    #print(f"Cluster {label}: Centroid = {cluster_centroid_formatted}, Size = {cluster_size}")
    '''print(f"Cluster {label}:")
    for col, value in zip(column_names, cluster_centroid_formatted):
        print(f"    {col}: {value}")
    print(f"Size = {cluster_size}\n")'''

In [175]:
centroids_df = pd.DataFrame(centroids_data)
print(centroids_df)

    Cluster Clndrs  Volume     HpX  Model origin     Lbs-   Acc+   Mpg+
0         0   8.00  309.67  201.00  70.00   1.00  4496.67  15.67  10.00
1         1   8.00  438.25  211.50  72.75   1.00  4817.75  11.12  10.00
2         2   8.00  334.79  150.30  74.10   1.00  4084.21  13.07  14.68
3         3   8.00  446.60  216.60  70.00   1.00  4103.60   9.50  12.00
4         4   6.00  230.63   97.58  76.06   1.00  3315.12  17.28  20.00
5         5   6.00  243.38   98.88  71.00   1.00  3171.88  14.75  20.00
6         6   6.00  199.00   91.75  70.00   1.00  2710.50  15.50  20.00
7         7   4.00  121.00  111.33  72.67   2.00  2820.33  14.67  20.00
8         8   4.00  124.69   81.45  78.23   1.00  2445.36  16.43  28.41
9         9   4.00  102.88   78.20  75.33   2.00  2308.35  16.32  28.43
10       10   4.00  109.50   93.07  72.43   3.00  2344.07  15.57  24.29
11       11   4.00  126.50   96.00  78.00   3.00  2445.00  14.65  25.00
12       12   4.00   95.96   69.27  79.18   3.00  2094.04  16.93

In [176]:
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(f'Number of clusters formed: {n_clusters}')

Number of clusters formed: 13


In [189]:
centroids_df = centroids_df.apply(pd.to_numeric, errors='ignore')

  centroids_df = centroids_df.apply(pd.to_numeric, errors='ignore')


In [190]:
maxValues = centroids_df.max()
minValues = centroids_df.min()

In [191]:
def normalize_values(min_val, max_val, value):
    return (value - min_val) / (max_val - min_val)

In [192]:
print(centroids_df.min())

Cluster       0.00
Clndrs        4.00
Volume       95.96
HpX          69.27
Model        70.00
origin        1.00
Lbs-       2094.04
Acc+          9.50
Mpg+         10.00
dtype: float64


In [198]:
d2h = []
for index, centroid in centroids_df.iterrows():
    d = 0
    for column in features:
        if column.endswith("+"):
            d += (abs(1.0 - normalize_values(minValues[column], maxValues[column], centroid[column])))**2
        else:
            d += (abs(0.0 - normalize_values(minValues[column], maxValues[column], centroid[column])))**2
    d2h.append((d/len(features))**(1/2))

d2h = [round(value, 3) for value in d2h]
centroids_df['d2h'] = d2h

# Print the centroids DataFrame
print(centroids_df)


    Cluster  Clndrs  Volume     HpX  Model  origin     Lbs-   Acc+   Mpg+  \
0         0     8.0  309.67  201.00  70.00     1.0  4496.67  15.67  10.00   
1         1     8.0  438.25  211.50  72.75     1.0  4817.75  11.12  10.00   
2         2     8.0  334.79  150.30  74.10     1.0  4084.21  13.07  14.68   
3         3     8.0  446.60  216.60  70.00     1.0  4103.60   9.50  12.00   
4         4     6.0  230.63   97.58  76.06     1.0  3315.12  17.28  20.00   
5         5     6.0  243.38   98.88  71.00     1.0  3171.88  14.75  20.00   
6         6     6.0  199.00   91.75  70.00     1.0  2710.50  15.50  20.00   
7         7     4.0  121.00  111.33  72.67     2.0  2820.33  14.67  20.00   
8         8     4.0  124.69   81.45  78.23     1.0  2445.36  16.43  28.41   
9         9     4.0  102.88   78.20  75.33     2.0  2308.35  16.32  28.43   
10       10     4.0  109.50   93.07  72.43     3.0  2344.07  15.57  24.29   
11       11     4.0  126.50   96.00  78.00     3.0  2445.00  14.65  25.00   

In [199]:
print(d2h)

[0.779, 0.936, 0.7, 0.891, 0.422, 0.446, 0.382, 0.415, 0.161, 0.153, 0.267, 0.297, 0.026]


In [200]:
#plt.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='rainbow')
#plt.xlabel('PC1')
#plt.ylabel('PC2')
#plt.title('dbscan clusters (after pca)')
#plt.show()

In [201]:
core_samples_mask = np.zeros_like(dbscan.labels_, dtype=bool)
core_samples_mask[dbscan.core_sample_indices_] = True
print("Indices of core samples:", dbscan.core_sample_indices_)

Indices of core samples: [  0   2   3   4   5   6   7   8   9  11  12  13  14  15  16  18  19  20
  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38
  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  56  58  59
  60  61  62  63  64  65  66  67  68  69  70  73  74  75  76  78  79  80
  82  83  84  86  90  91  92  93  94  95  98  99 100 101 102 103 104 105
 108 109 110 111 112 113 114 115 116 117 119 120 121 123 125 126 127 128
 129 130 131 132 133 135 136 137 138 139 140 142 143 144 145 147 148 149
 151 152 153 155 157 158 159 160 161 162 165 166 167 168 169 170 171 172
 173 174 175 177 178 179 180 182 184 185 186 187 188 189 190 191 194 196
 197 198 199 200 201 202 203 204 205 206 209 211 212 214 215 216 217 218
 219 220 221 222 224 226 227 228 229 230 231 233 234 235 237 238 240 243
 244 245 249 250 251 252 253 254 255 256 257 258 259 260 262 265 266 267
 268 269 270 271 272 273 274 276 277 278 279 280 281 282 283 285 286 287
 288 289 291 294 295 296 2