In [1]:
import pandas as pd

In [2]:
# Load the venue table from a CSV file located in the working directory.
df = pd.read_csv('venues_all.csv')
# Preview the first rows as this cell's displayed output.
df.head()

Unnamed: 0,name,categories,lat,lng
0,Sayaji,Hotel,18.599535,73.754995
1,Natural Ice Cream,Ice Cream Shop,18.591192,73.75244
2,Barbeque Nation,BBQ Joint,18.59939,73.75509
3,Courtyard by Marriott,Hotel,18.591527,73.746831
4,Little Italy,Italian Restaurant,18.591513,73.743668


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(424, 4)

In [4]:
from sklearn.cluster import DBSCAN

In [5]:
# Keep only the coordinate columns as the clustering input.
Clus_dataSet = df[['lat','lng']] 
# Preview the selected columns.
Clus_dataSet.head()

Unnamed: 0,lat,lng
0,18.599535,73.754995
1,18.591192,73.75244
2,18.59939,73.75509
3,18.591527,73.746831
4,18.591513,73.743668


In [6]:
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [7]:
# Standardise each coordinate column with StandardScaler (zero mean, unit variance).
# NOTE(review): this rebinds Clus_dataSet from the DataFrame selected earlier to the
# scaler's array output, so the raw lat/lng frame is no longer reachable under this name.
# NOTE(review): lat and lng are scaled independently, so DBSCAN's eps is expressed in
# standardised units rather than a ground distance -- confirm this is intended.
Clus_dataSet = StandardScaler().fit_transform(Clus_dataSet)
# Show the first five scaled rows.
Clus_dataSet[0:5]

array([[ 0.86033558, -1.25272439],
       [ 0.66849366, -1.30300637],
       [ 0.85698296, -1.25085174],
       [ 0.67620385, -1.41335426],
       [ 0.67587536, -1.47558923]])

In [40]:
# Fit DBSCAN on the standardised coordinates.
# eps and min_samples are hard-coded here.
# NOTE(review): presumably picked from the parameter sweep elsewhere in this notebook -- confirm.
db = DBSCAN(eps=0.25, min_samples=8).fit(Clus_dataSet)
# Display the fitted estimator and its parameters.
db

DBSCAN(algorithm='auto', eps=0.25, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=8, n_jobs=None, p=None)

In [41]:
# Per-row cluster assignment from the fitted model; DBSCAN marks noise points with -1.
labels = db.labels_
# Show the first 250 labels.
labels[:250]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  0,  2,  0,  2,  0,  1,
        4,  0,  1,  1,  0,  1,  1,  3,  4,  0,  0,  3,  0,  2,  2,  2,  0,
        1,  0,  2,  2,  0,  1,  2, -1,  0,  0,  0,  0,  3,  1,  0,  2,  1,
        0,  2,  1,  0,  2,  0,  0,  1,  1,  0,  1,  2,  1,  2,  4,  4,  0,
       -1,  3,  2,  0,  0,  1,  1,  2,  0,  0, -1,  1,  1,  0,  0,  2,  2,
        0, -1,  0,  0, -1,  1,  1, -1,  4,  1,  2,  2, -1,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  3,  4,  3,  2,  4,  3,  4,  4,  4,  4,  4,  2,
        3, -1,  4,  3,  3,  3,  4,  4,  3, -1,  3,  3, -1, -1,  4,  3,  3,
       -1,  4,  3,  4,  4,  4,  4,  3,  3,  3,  2, -1,  2, -1, -1,  4,  2,
        2,  3,  2,  2, -1,  2,  3,  2,  2, -1,  2,  2,  0,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  5,  1,  1,  1,  5,  1,  5,  5,  1,  5,  5,  5,  1,  5,
        1,  5,  1,  5,  1,  5,  5,  1,  5,  1,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  1,  1

In [42]:
# Number of distinct cluster ids; the -1 noise label is not counted as a cluster.
n_clusters = len(set(labels) - {-1})
n_clusters

8

In [43]:
# Mean silhouette coefficient of the clustering, computed on the standardised coordinates.
# NOTE(review): labels still contains the -1 noise points, which silhouette_score treats
# as one more cluster -- confirm that is the intended way to score this clustering.
print(metrics.silhouette_score(Clus_dataSet, labels))


0.4985971344620749


In [44]:
import folium
from folium import plugins

In [45]:
# Palette used to colour map markers, indexed by cluster label.
# Every entry must be a valid CSS colour name: folium.CircleMarker hands `color`
# to Leaflet as an SVG stroke/fill colour. The folium *Icon* names 'darkpurple'
# and 'lightred' are not CSS colours, so they are replaced here by 'indigo' and
# 'salmon' (same positions, same list length).
color_options = [
    'black', 'blue', 'cadetblue', 'darkblue', 'darkgreen', 'indigo',
    'darkred', 'gray', 'green', 'lightblue', 'lightgreen', 'salmon',
    'orange', 'pink', 'purple', 'red'
]

In [46]:
# Build an interactive map centred on the first venue in the table.
map_ = folium.Map(location=[df.lat[0], df.lng[0]], zoom_start=11)

# DBSCAN labels noise points -1. Give them an explicit colour instead of letting
# color_options[-1] silently wrap around to the last palette entry.
noise_color = 'lightgray'

# add markers to map: one small circle per venue, coloured by its cluster label
for lat, lng, category, name, lab in zip(df['lat'], df['lng'], df['categories'], df['name'], labels):
    is_noise = lab == -1
    cluster_text = 'noise' if is_noise else 'cluster {}'.format(lab)
    # The popup now actually shows the cluster; the original format string had
    # only two placeholders, so the label argument was dropped.
    popup = folium.Popup('{}, {} ({})'.format(name, category, cluster_text), parse_html=True)
    # Wrap around the palette so more clusters than colours cannot raise IndexError.
    marker_color = noise_color if is_noise else color_options[lab % len(color_options)]
    # parse_html is a Popup argument, not a CircleMarker option, so it is not passed here.
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=popup,
        color=marker_color,
        fill=True,
        fill_opacity=1).add_to(map_)

map_

In [39]:
# iterate for best hyperparameters
param_eps = [0.15,0.2,0.225,0.25,0.275,0.3,0.325,0.35]
param_min_samples = [3,4,5,6,7,8,9]
for eps in param_eps:
    for min_samples in param_min_samples:
        print('Value of eps = ',eps)
        print('value of of min samples = ', min_samples) 
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(Clus_dataSet)
        labels = db.labels_
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        print('number of clusters = ', n_clusters)
        score = metrics.silhouette_score(Clus_dataSet, labels)
        print('Silhouette score = {:0.4f}'.format(score))

Value of eps =  0.15
value of of min samples =  3
number of clusters =  16
Silhouette score = 0.3436
Value of eps =  0.15
value of of min samples =  4
number of clusters =  14
Silhouette score = 0.3924
Value of eps =  0.15
value of of min samples =  5
number of clusters =  14
Silhouette score = 0.3921
Value of eps =  0.15
value of of min samples =  6
number of clusters =  12
Silhouette score = 0.3808
Value of eps =  0.15
value of of min samples =  7
number of clusters =  11
Silhouette score = 0.3471
Value of eps =  0.15
value of of min samples =  8
number of clusters =  10
Silhouette score = 0.3231
Value of eps =  0.15
value of of min samples =  9
number of clusters =  10
Silhouette score = 0.2987
Value of eps =  0.2
value of of min samples =  3
number of clusters =  11
Silhouette score = 0.3777
Value of eps =  0.2
value of of min samples =  4
number of clusters =  8
Silhouette score = 0.4359
Value of eps =  0.2
value of of min samples =  5
number of clusters =  9
Silhouette score = 0.