In [1]:
import pandas as pd

In [2]:
# Load the venue listing (name, category, latitude, longitude) from the CSV in
# the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='venues_all.csv')
df.head(n=5)

Unnamed: 0,name,categories,lat,lng
0,Sayaji,Hotel,18.599535,73.754995
1,Natural Ice Cream,Ice Cream Shop,18.591192,73.75244
2,Barbeque Nation,BBQ Joint,18.59939,73.75509
3,Courtyard by Marriott,Hotel,18.591527,73.746831
4,Little Italy,Italian Restaurant,18.591513,73.743668


In [3]:
# Size of the loaded venue table as a (rows, columns) tuple.
df.shape

(424, 4)

In [4]:
from sklearn.cluster import DBSCAN

In [5]:
# Keep only the coordinate columns; these are the features that get clustered.
Clus_dataSet = df.loc[:, ['lat', 'lng']]
Clus_dataSet.head(n=5)

Unnamed: 0,lat,lng
0,18.599535,73.754995
1,18.591192,73.75244
2,18.59939,73.75509
3,18.591527,73.746831
4,18.591513,73.743668


In [6]:
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [7]:
# Standardize latitude and longitude (zero mean, unit variance per column) so
# both axes contribute on the same scale to DBSCAN's euclidean distance.
#
# The scaler is fed straight from `df` instead of from `Clus_dataSet` itself:
# this cell rebinds `Clus_dataSet` from a DataFrame to a NumPy array, and
# reading the input from `df` means a re-run never consumes this cell's own
# previous output.
Clus_dataSet = StandardScaler().fit_transform(df[['lat', 'lng']])
Clus_dataSet[0:5]

array([[ 0.86033558, -1.25272439],
       [ 0.66849366, -1.30300637],
       [ 0.85698296, -1.25085174],
       [ 0.67620385, -1.41335426],
       [ 0.67587536, -1.47558923]])

In [51]:
# Configure DBSCAN with the chosen neighbourhood radius (eps) and core-point
# threshold (min_samples), then fit it on the standardized coordinates.
# `fit` trains the estimator in place, so `db` is the fitted model afterwards.
db = DBSCAN(eps=0.35, min_samples=9)
db.fit(Clus_dataSet)
db

DBSCAN(algorithm='auto', eps=0.35, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=9, n_jobs=None, p=None)

In [52]:
# Cluster id assigned to each row by the fitted model. DBSCAN uses -1 for
# samples it considers noise (not part of any cluster).
labels = db.labels_
# Preview only the first 250 assignments to keep the output short.
labels[:250]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        1,  0,  0,  0,  0,  0,  0,  1,  1,  0,  0,  1,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  0,
        0,  1,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0, -1,  0,  0,  1,  1,  0,  0,  0, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  0,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1, -1,  1,  1,  1,
        0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  0,  1,  0,  1,  0,
        0,  1,  0,  0, -1,  0,  1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  2,  0,  0,  0,  2,  0,  2,  2,  0,  2,  2,  2,  0,  2,
        0,  2,  0,  2,  0,  2,  2,  0,  2,  0,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  0,  0

In [53]:
# Count the distinct cluster ids, leaving out -1 because it marks noise rather
# than a real cluster.
n_clusters = len(set(labels) - {-1})
n_clusters

4

In [54]:
# Silhouette score of the current clustering on the standardized coordinates.
# `labels` is passed unfiltered, so rows labelled -1 are scored as well.
print(metrics.silhouette_score(X=Clus_dataSet, labels=labels))


0.5051792230206322


In [44]:
import folium
from folium import plugins

In [68]:
# Marker colours, looked up by cluster label when the map is drawn
# (position i is used for cluster i; a label of -1 selects the last entry).
# NOTE(review): these look like folium.Icon colour names; 'darkpurple' is not a
# CSS colour name, so confirm it renders as intended on vector markers.
color_options = [
    'black',
    'purple',
    'darkblue',
    'darkgreen',
    'darkpurple',
    'darkred',
    'green',
    'lightgreen',
    'blue',
    'orange',
    'red',
]

In [69]:
# Plot every venue on an interactive map, coloured by its cluster label.
# The map is centred on the first venue in the table; positional `.iloc[0]` is
# used so this does not depend on the index containing the label 0.
map_ = folium.Map(location=[df['lat'].iloc[0], df['lng'].iloc[0]], zoom_start=11)

# add markers to map
for lat, lng, category, name, lab in zip(df['lat'], df['lng'], df['categories'], df['name'], labels):
    # The cluster id was passed to format() but had no placeholder, so it never
    # reached the popup; give it one.
    popup_text = '{}, {} (cluster {})'.format(name, category, lab)
    folium.CircleMarker(
        [lat, lng],
        radius=1,
        popup=folium.Popup(popup_text, parse_html=True),
        # Wrap the label around the palette: -1 (noise) still selects the last
        # colour, and more cluster ids than colours no longer raises IndexError
        # (colours simply repeat).
        color=color_options[lab % len(color_options)],
        fill=True,
        fill_opacity=1,
        parse_html=False).add_to(map_)

map_

In [50]:
# Grid-search DBSCAN's eps / min_samples and keep the combination with the
# highest silhouette score on the standardized coordinates.
param_eps = [0.15,0.2,0.225,0.25,0.275,0.3,0.325,0.35]
param_min_samples = [3,4,5,6,7,8,9]

# The running best must be initialised ONCE, before the loops. Resetting it
# inside the loop body makes every positive score look like a new maximum, so
# the last combination tried is always reported as "best".
max_score = float('-inf')
best_eps = None
best_min_samples = None

for eps in param_eps:
    for min_samples in param_min_samples:
        # Loop-local name on purpose: the notebook-level `db` / `labels` used
        # by the other cells (e.g. the map) are left untouched by the search.
        candidate_labels = DBSCAN(eps=eps, min_samples=min_samples).fit(Clus_dataSet).labels_

        # silhouette_score needs between 2 and n_samples - 1 distinct labels and
        # raises ValueError otherwise (e.g. everything noise, or one big
        # cluster), so skip combinations that cannot be scored.
        n_distinct = len(set(candidate_labels))
        if not 2 <= n_distinct <= len(candidate_labels) - 1:
            continue

        # Labels are passed unfiltered, so rows labelled -1 are scored as well.
        score = metrics.silhouette_score(Clus_dataSet, candidate_labels)
        if score > max_score:
            max_score = score
            best_eps = eps
            best_min_samples = min_samples

if best_eps is None:
    print('No parameter combination produced a clustering that can be scored.')
else:
    print('The best value for eps = ',best_eps)
    print('The best value for min_samples is = ',best_min_samples)
    print('The highest Silhouette score is = ',max_score)

The best value for eps =  0.35
The best value for min_samples is =  9
The highest Silhouette score is =  0.5051792230206322
