In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder

In [2]:
# Column labels applied to the Iris table when it is loaded:
# four flower measurements followed by the species label.
column_names = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "class",
]

In [3]:
# Location of the Iris CSV. The machine-specific default below can be overridden by setting
# the IRIS_DATA_PATH environment variable, so the notebook also runs on other machines.
abo_path = os.environ.get(
    'IRIS_DATA_PATH',
    'D:/Users/masoodw/ML_FINANCE/asigmo/github/asigmo/data/Iris/iris.data',
)
# Passing `names` makes pandas label the columns itself and treat the first file row as data.
p_df_raw = pd.read_csv(abo_path, sep=',', encoding="UTF-8", names=column_names)

In [4]:
# Assign the column labels explicitly. read_csv was already given names=column_names,
# so this re-applies the same labels to the loaded frame.
p_df_raw.columns = column_names

In [5]:
# Preview the first rows to sanity-check the parsed columns.
p_df_raw.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Count the missing values in each column (isnull is the pandas alias of isna).
p_df_raw.isnull().sum(axis=0)

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
class           0
dtype: int64

In [7]:
# Features: every column except the species label.
df_X = p_df_raw.drop(columns="class")

# Targets: the species names encoded as integer class ids.
species_names = p_df_raw["class"].tolist()
df_y = LabelEncoder().fit_transform(species_names)

In [8]:
# Inspect the encoded class ids.
df_y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [33]:
# Density-based clustering of the feature table; fit() returns the estimator itself,
# so configuring and fitting in two steps leaves `db` as the fitted model.
db = DBSCAN(eps=1.2, min_samples=5)
db.fit(df_X)

# One cluster id per sample, in the same order as the rows of df_X.
labels = db.labels_

In [34]:
# Inspect the cluster id assigned to each sample.
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [35]:
# DBSCAN uses the label -1 for noise, so -1 is left out of the cluster count
# and tallied separately.
distinct_labels = set(labels)
n_clusters_ = len(distinct_labels - {-1})
n_noise_ = sum(1 for cluster_id in labels if cluster_id == -1)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 2
Estimated number of noise points: 0


### Internal Measures

In [36]:
from sklearn import metrics

In [37]:
# Background on homogeneity / completeness (used by the external measures further below):
# Perfect labelings are both homogeneous and complete, hence have score 1.0:
# Labelings that assign all classes members to the same clusters are complete but not homogeneous
# Labelings that have pure clusters with members coming from the same classes are homogeneous but un-necessary splits harms completeness
# If classes members are completely split across different clusters, the assignment is totally incomplete

# Internal measures judge the clustering from the data alone (no ground-truth classes needed).
# Silhouette coefficient: ranges from -1 to 1, best score is 1.
# Davies-Bouldin index: lowest possible value is 0, best score is 0.
# The Davies-Bouldin score gets its own name so that `db` keeps referring to the fitted
# DBSCAN model instead of being overwritten by a float.
sill = metrics.silhouette_score(df_X, labels, metric='euclidean', sample_size=None)
davies_bouldin = metrics.davies_bouldin_score(df_X, labels)
sill, davies_bouldin

(0.6863930543445408, 0.3835952094491425)

### External Measures

In [31]:
# A clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class.
# A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster.
# Both scores have positive values between 0.0 and 1.0, larger values being desirable.
# The V-measure is the harmonic mean between homogeneity and completeness:

In [37]:
# Cluster ids are arbitrary names, so they cannot be compared with class ids one-to-one.
# V-measure does not depend on how the clusters are numbered, so it uses the raw labels.
print('V_score')
print(metrics.homogeneity_completeness_v_measure(df_y, labels))

# Accuracy, confusion matrix, precision/recall and F1 match ids literally, so each cluster
# is first renamed to the class that occurs most often inside it (ties go to the lowest
# class id). Noise points (-1) belong to no cluster and stay -1, i.e. they count as errors.
cluster_to_class = pd.crosstab(labels, df_y).idxmax(axis=1).to_dict()
cluster_to_class[-1] = -1
labels_aligned = np.array([cluster_to_class[cluster_id] for cluster_id in labels])

print('Accuracy')
print(metrics.accuracy_score(df_y, labels_aligned))
print('Confusion Matrix')
print(metrics.confusion_matrix(df_y, labels_aligned))
print('Confusion Report')
print(metrics.classification_report(df_y, labels_aligned))
print('f1 score')
print(metrics.f1_score(df_y, labels_aligned, average='weighted'))

V_score
(0.5793801642856945, 0.9999999999999997, 0.7336804366512104)
Accuracy
0.6666666666666666
Confusion Matrix
[[50  0  0]
 [ 0 50  0]
 [ 0 50  0]]
Confusion Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.50      1.00      0.67        50
           2       0.00      0.00      0.00        50

    accuracy                           0.67       150
   macro avg       0.50      0.67      0.56       150
weighted avg       0.50      0.67      0.56       150

f1 score
0.5555555555555555


  _warn_prf(average, modifier, msg_start, len(result))


## Get the best parameters out and measure both internal and external measures

In [44]:
# Sweep DBSCAN's eps and record, for every setting, one external measure (V-measure against
# the true classes) and one internal measure (silhouette).
# Each row of `internal_measures` is [eps, v_measure, silhouette].
# NOTE: the loop rebinds `db` and `labels`; afterwards they hold the result for the last eps.
internal_measures = []
for eps in np.arange(0.3, 1, 0.01):
    db = DBSCAN(eps=eps, min_samples=10).fit(df_X)
    labels = db.labels_
    v_measure = metrics.homogeneity_completeness_v_measure(df_y, labels)[2]
    # silhouette_score raises ValueError unless there are between 2 and n_samples - 1
    # distinct labels (e.g. everything is noise, or one single cluster); record NaN then.
    n_distinct = len(set(labels))
    if 2 <= n_distinct <= len(labels) - 1:
        silhouette = metrics.silhouette_score(df_X, labels, metric='euclidean', sample_size=None)
    else:
        silhouette = np.nan
    internal_measures.append([eps, v_measure, silhouette])

In [46]:
#

### Internal Measures

### External Measures

## Related Links
### https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
### https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py