In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler, LabelEncoder, PowerTransformer

In [6]:
from skopt.space import Real, Integer, Categorical
import joblib
import gc
import itertools
from skopt import gp_minimize
from sklearn import metrics

In [7]:
# Names assigned to the five comma-separated fields; because `names=` is
# supplied, pandas does not take column labels from the file itself.
column_names = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'class',
]
# NOTE(review): absolute, machine-specific location -- adjust before running elsewhere.
abo_path = 'D:/Users/masoodw/ML_FINANCE/asigmo/github/asigmo/data/Iris/iris.data'
p_df_raw = pd.read_csv(
    abo_path,
    names=column_names,
    sep=',',
    encoding="UTF-8",
)

In [8]:
# Feature matrix: every column of the raw frame except the "class" label.
df_X = p_df_raw.drop(columns="class")
# Ground-truth targets: the "class" strings converted to integer codes.
df_y = LabelEncoder().fit_transform(p_df_raw["class"].tolist())

In [9]:
# Count the missing values in each feature column.
df_X.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
dtype: int64

In [10]:
# Search space handed to gp_minimize. The order matters: `objective` reads the
# sampled point positionally as (eps, min_samples, metric).
# Every dimension is named (the metric dimension previously was not) so the
# optimisation result is self-describing.
# NOTE(review): scikit-learn treats 'l2' as an alias of 'euclidean', so two of
# the three metric choices are the same distance -- confirm 'l2' is intended.
space = [
    Real(0.2, 10, name='eps'),
    Integer(5, 50, name='min_samples'),
    Categorical(['euclidean', 'l1', 'l2'], name='metric'),
]

In [11]:
def objective(values):
    """Objective for the hyper-parameter search: negative V-measure of one DBSCAN fit.

    Parameters
    ----------
    values : sequence
        Candidate point, read positionally: ``values[0]`` is ``eps``,
        ``values[1]`` is ``min_samples`` and ``values[2]`` is ``metric``.

    Returns
    -------
    float
        ``-v_measure``. The sign is flipped because the caller minimises the
        objective while a larger V-measure is better.

    Notes
    -----
    Reads the notebook-level ``df_X`` (features) and ``df_y`` (true labels),
    and prints the V-measure of every evaluation.
    """
    eps = values[0]
    min_samples = values[1]
    metric = values[2]

    clusterer = DBSCAN(eps=eps, min_samples=min_samples, metric=metric)
    clusterer.fit(df_X)
    cluster_ids = clusterer.labels_

    # The helper returns (homogeneity, completeness, v_measure); only the
    # last element is used as the score.
    _, _, v_measure = metrics.homogeneity_completeness_v_measure(df_y, cluster_ids)

    print('v_measure.....', v_measure)
    return -v_measure
    

In [12]:
# Run the optimiser over `space`: 50 evaluations of `objective` in total,
# with a fixed random_state so the search is repeatable.
res_gp = gp_minimize(
    objective,
    space,
    n_calls=50,
    n_random_starts=10,
    acq_func='LCB',
    random_state=0,
)
# res_gp.fun is the minimised objective, i.e. the *negative* V-measure,
# which is why the displayed score is below zero.
"Best score=%.4f" % res_gp.fun

v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.6301893568173853
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.7336804366512104
v_measure..... 0.6889051517417055
v_measure..... 0.5046811542838697
v_measure..... 0.7336804366512104
v_measure..... 0.7336804366512104
v_measure..... 0.7336804366512104
v_measure..... 0.7336804366512104
v_measure..... 0.7336804366512104
v_measure..... 0.7336804366512104
v_measure..... 0.7336804366512104
v_measure..... 0.7336804366512104
v_measure..... 0.0
v_measure..... 0.7130987813117572
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.0
v_measure..... 0.7336804366512104
v_measure..... 0.7336804366512104
v_measure..... 0.0
v_measure..... 0.7098236529971127
v_measure..... 0.192571172828806
v_measure..... 0.7098236529971127
v_measure..... 0.73368043665

'Best score=-0.7337'

In [13]:
# Best point found by the search, in the same order as `space`: [eps, min_samples, metric].
res_gp.x

[1.878699264684464, 7, 'l1']

In [14]:
# Refit DBSCAN on the full feature matrix using the best point from the search
# (res_gp.x is ordered eps, min_samples, metric).
db = DBSCAN(
    eps=res_gp.x[0],
    min_samples=res_gp.x[1],
    metric=res_gp.x[2],
)
db.fit(df_X)
# One cluster id per row of df_X.
labels = db.labels_

In [15]:
# Distinct cluster ids, with the noise marker (-1) excluded from the count.
cluster_ids = set(labels)
cluster_ids.discard(-1)
n_clusters_ = len(cluster_ids)
# Points labelled -1 are the ones counted as noise.
n_noise_ = int(np.count_nonzero(labels == -1))

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

Estimated number of clusters: 2
Estimated number of noise points: 0


In [16]:
# DBSCAN cluster ids are arbitrary integers (with -1 used for noise), so
# comparing them directly with the encoded class ids is only meaningful if the
# numbering happens to coincide. Align them first: each cluster is mapped to
# the true class that is most frequent among its members.
# Caveats: several clusters may map to the same class, ties go to the lowest
# class id, and the mapping uses the true labels, so the scores are optimistic.
contingency = pd.crosstab(
    pd.Series(labels, name='cluster'),
    pd.Series(df_y, name='class'),
)
cluster_to_class = contingency.idxmax(axis=1)
if -1 in cluster_to_class.index:
    # Keep noise points unmatched so they always count as errors.
    cluster_to_class.loc[-1] = -1
labels_aligned = pd.Series(labels).map(cluster_to_class).to_numpy()

print('Accuracy')
print(metrics.accuracy_score(df_y, labels_aligned))
print('Confusion Matrix')
print(metrics.confusion_matrix(df_y, labels_aligned))
print('Confusion Report')
# zero_division=0 reports 0 for classes that are never predicted instead of
# emitting an undefined-metric warning.
print(metrics.classification_report(df_y, labels_aligned, zero_division=0))
print('f1 score')
print(metrics.f1_score(df_y, labels_aligned, average='weighted', zero_division=0))

Accuracy
0.6666666666666666
Confusion Matrix
[[50  0  0]
 [ 0 50  0]
 [ 0 50  0]]
Confusion Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.50      1.00      0.67        50
           2       0.00      0.00      0.00        50

    accuracy                           0.67       150
   macro avg       0.50      0.67      0.56       150
weighted avg       0.50      0.67      0.56       150

f1 score
0.5555555555555555


  _warn_prf(average, modifier, msg_start, len(result))
