# Analyzing business potential of areas in Pune, India

In [1]:
import os

import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim

In [2]:
def get_coordinates(address_list):
    """Geocode each address with Nominatim and return the results as a table.

    Parameters
    ----------
    address_list : iterable of str
        Addresses or place names to look up, one geocoding request each.

    Returns
    -------
    pandas.DataFrame
        One row per input address, in input order, with the columns
        'Area name', 'latitude' and 'longitude'. When a lookup fails, both
        coordinate cells hold the string 'Address not found'.
    """
    not_found = 'Address not found'
    geolocator = Nominatim(user_agent="wain")
    records = []
    for address in address_list:
        try:
            location = geolocator.geocode(address)
            latitude = location.latitude
            longitude = location.longitude
        except Exception:
            # A missing match (geocode() yields no location object, so the
            # attribute access fails) and service errors are both reported
            # through the sentinel value. `Exception` rather than a bare
            # `except` so that KeyboardInterrupt still stops the loop.
            latitude = not_found
            longitude = not_found
        records.append({
            'Area name': address,
            'latitude': latitude,
            'longitude': longitude
        })
    # Build the frame once from the collected rows: DataFrame.append is gone
    # in pandas 2.0 and copied the whole frame on every call.
    return pd.DataFrame(records,
                        columns=['Area name', 'latitude', 'longitude'])

In [3]:
# Areas to compare; each entry is geocoded into a latitude/longitude pair.
address = [
    'Shankar Kalat nagar',
    'Pimpri-Chinchwad',
    'Aundh,Pune',
    'Kothrud',
    'Koregaon Park',
]
locations = get_coordinates(address_list=address)
locations

Unnamed: 0,Area name,latitude,longitude
0,Shankar Kalat nagar,18.604093,73.754065
1,Pimpri-Chinchwad,18.627929,73.800983
2,"Aundh,Pune",18.561883,73.810196
3,Kothrud,18.503889,73.807673
4,Koregaon Park,18.537553,73.893925


In [4]:
# hide this cell
# Foursquare API credentials.
# They are read from the environment so that no secret is stored in the
# notebook: export FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE: a key pair that was previously committed in this cell should be
# treated as compromised and rotated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn right here instead of failing later inside an API call.
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not '
          'set; Foursquare requests will be rejected.')

In [5]:
import json

try:
    # Public location of json_normalize since pandas 1.0.
    from pandas import json_normalize
except ImportError:
    # Older pandas releases only provide it under pandas.io.json.
    from pandas.io.json import json_normalize

In [6]:
def get_venues(lat, lon, radius=5000, limit=100):
    """Fetch venues around a coordinate from Foursquare's venues/explore endpoint.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the search centre.
    radius : int, optional
        Value sent as the ``radius`` query parameter (default 5000).
    limit : int, optional
        Value sent as the ``limit`` query parameter (default 100).

    Returns
    -------
    pandas.DataFrame or None
        One row per venue with the columns 'name', 'categories', 'lat' and
        'lng', where 'categories' holds the value produced by
        ``get_category_type``. ``None`` is returned when the request fails or
        the response does not contain the expected venue fields.
    """
    query = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lon),
        'radius': radius,
        'limit': limit,
    }
    filtered_columns = [
        'venue.name', 'venue.categories', 'venue.location.lat',
        'venue.location.lng'
    ]
    try:
        # Let requests build and escape the query string, and never wait
        # indefinitely on an unresponsive server.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params=query,
            timeout=30)
        response.raise_for_status()
        venues = response.json()['response']['groups'][0]['items']
        nearby_venues = json_normalize(venues)  # flatten JSON
        # .copy() so the column assignment below writes to an independent
        # frame rather than to a slice of the normalized one.
        nearby_venues = nearby_venues.loc[:, filtered_columns].copy()
        nearby_venues['venue.categories'] = nearby_venues.apply(
            get_category_type, axis=1)
    except (requests.RequestException, KeyError, IndexError, TypeError,
            ValueError) as err:
        # Report only the exception type: the message of a requests error can
        # contain the full URL, which includes the client secret.
        print('get_venues: no venues for ({}, {}): {}'.format(
            lat, lon, type(err).__name__))
        return None

    # 'venue.location.lat' -> 'lat', 'venue.name' -> 'name', ...
    nearby_venues.columns = [
        col.split(".")[-1] for col in nearby_venues.columns
    ]
    return nearby_venues

In [7]:
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the category list under the key 'categories' or, when
        that key is absent, under 'venue.categories'.

    Returns
    -------
    str or None
        The 'name' entry of the first category, or ``None`` when the row has
        no usable category list (empty list, or a non-list value such as
        NaN/None).
    """
    try:
        categories_list = row['categories']
    except KeyError:
        categories_list = row['venue.categories']

    # A missing field shows up as NaN/None rather than a list; treat it like
    # an empty list instead of failing on len().
    if not isinstance(categories_list, (list, tuple)) or not categories_list:
        return None
    return categories_list[0]['name']

In [8]:
# Query venues once per area and combine the results with a single concat.
# DataFrame.append is gone in pandas 2.0 and copied the frame on every call.
venue_frames = []
for lat, lon in zip(locations['latitude'], locations['longitude']):
    area_venues = get_venues(lat, lon)
    if area_venues is not None:  # get_venues returns None for a failed lookup
        venue_frames.append(area_venues)

if venue_frames:
    # ignore_index gives every venue a unique row label instead of repeating
    # each area's own 0..n-1 labels.
    df_venues = pd.concat(venue_frames, ignore_index=True)
else:
    df_venues = pd.DataFrame(columns=['name','categories','lat','lng'])

df_venues.shape

(492, 4)

In [9]:
# Preview the first rows of the collected venue table.
df_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Sayaji,Hotel,18.599535,73.754995
1,Natural Ice Cream,Ice Cream Shop,18.591192,73.75244
2,Barbeque Nation,BBQ Joint,18.59939,73.75509
3,Courtyard by Marriott,Hotel,18.591527,73.746831
4,Little Italy,Italian Restaurant,18.591513,73.743668


In [10]:
# Keep the first occurrence of every fully identical venue row.
is_repeat = df_venues.duplicated()
df_venues = df_venues[~is_repeat]

In [11]:
# (rows, columns) of the venue table at this point in the notebook.
df_venues.shape

(424, 4)

In [12]:
import folium

In [23]:
# Centre the map on the `locations` row labelled 2.
map_centre = [locations.latitude[2], locations.longitude[2]]
map_ = folium.Map(location=map_centre, zoom_start=12)

# Draw one small circle per venue; the popup shows "name, category".
venue_rows = zip(df_venues['lat'], df_venues['lng'],
                 df_venues['categories'], df_venues['name'])
for lat, lng, Categories, name in venue_rows:
    label = folium.Popup('{}, {}'.format(name, Categories), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=1,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_)

map_

In [14]:
# Save the venue table to venues_all.csv, leaving out the row index.
df_venues.to_csv('venues_all.csv',index=False)

In [16]:
# Only the coordinates are used as clustering features.
coordinate_columns = ['lat', 'lng']
Clus_dataSet = df_venues.loc[:, coordinate_columns]
Clus_dataSet.head()

Unnamed: 0,lat,lng
0,18.599535,73.754995
1,18.591192,73.75244
2,18.59939,73.75509
3,18.591527,73.746831
4,18.591513,73.743668


In [24]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from folium import plugins

In [41]:
# Standardise both coordinate columns; the result replaces Clus_dataSet.
Clus_dataSet = StandardScaler().fit(Clus_dataSet).transform(Clus_dataSet)
Clus_dataSet[:5]

array([[ 0.86938101, -1.25073464],
       [ 0.67636425, -1.3009986 ],
       [ 0.86600786, -1.24886266],
       [ 0.68412166, -1.41130692],
       [ 0.68379116, -1.47351958]])

In [56]:
# iterate for best hyperparameters
param_eps = [0.15, 0.2, 0.225, 0.25, 0.275, 0.3, 0.325, 0.35]
param_min_samples = [3, 4, 5, 6, 7, 8, 9]
max_score = 0
for eps in param_eps:
    for min_samples in param_min_samples:
        #print('Value of eps = ',eps)
        #print('value of of min samples = ', min_samples)
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(Clus_dataSet)
        labels = db.labels_
        #n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        #print('number of clusters = ', n_clusters)
        score = metrics.silhouette_score(Clus_dataSet, labels)
        #print('Silhouette score = {:0.4f}'.format(score))
        if score > max_score:
            max_score = score
            #print(max_score)
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            num_cluster = n_clusters
            best_eps = eps
            best_min_samples = min_samples
print('The best value for eps = ', best_eps)
print('The best value for min_samples is = ', best_min_samples)
print('Number of clusters = ', num_cluster)
print('The highest Silhouette score is = ', max_score)

The best value for eps =  0.25
The best value for min_samples is =  8
Number of clusters =  8
The highest Silhouette score is =  0.4999415527799792


In [57]:
# Refit with the selected parameters and read off one label per venue.
db = DBSCAN(eps=best_eps, min_samples=best_min_samples)
labels = db.fit(Clus_dataSet).labels_
# Number of clusters, not counting the -1 label.
n_clusters = len(set(labels) - {-1})

In [58]:
color_options = [
    'black', 'blue', 'cadetblue', 'darkblue', 'darkgreen', 'darkpurple',
    'darkred', 'gray', 'green', 'lightblue', 'lightgreen', 'lightred',
    'orange', 'pink', 'purple', 'red'
]
# The last colour is reserved for the -1 label (it was already picked for it
# through negative indexing); the remaining colours are cycled so that any
# number of clusters can be drawn without an IndexError.
noise_color = color_options[-1]
cluster_colors = color_options[:-1]

map_clus = folium.Map(location=[locations.latitude[2], locations.longitude[2]],
                      zoom_start=11)
# add markers to map
for lat, lng, Categories, name, lab in zip(df_venues['lat'], df_venues['lng'],
                                           df_venues['categories'],
                                           df_venues['name'], labels):
    if lab == -1:
        cluster_text = 'noise'
        marker_color = noise_color
    else:
        cluster_text = 'cluster {}'.format(lab)
        marker_color = cluster_colors[lab % len(cluster_colors)]
    # The cluster label was passed to format() before but never shown.
    label = '{}, {} ({})'.format(name, Categories, cluster_text)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng],
                        radius=1,
                        popup=label,
                        color=marker_color,
                        fill=True,
                        fill_opacity=1,
                        parse_html=False).add_to(map_clus)

map_clus

In [49]:
# Cluster label per venue; -1 is the label excluded from the cluster count.
labels

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  0,  2,  0,  2,  0,  1,
        4,  0,  1,  1,  0,  1,  1,  3,  4,  0,  0,  3,  0,  2,  2,  2,  0,
        1,  0,  2,  2,  0,  1,  2, -1,  0,  0,  0,  0,  3,  1,  0,  2,  1,
        0,  2,  1,  0,  2,  0,  0,  1,  1,  0,  1,  2,  1,  2,  4,  4,  0,
       -1,  3,  2,  0,  0,  1,  1,  2,  0,  0, -1,  1,  1,  0,  0,  2,  2,
        0, -1,  0,  0, -1,  1,  1, -1,  4, -1,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  3,  4,  3,  2,  4,  3,  4,  4,  4,  4,  4,  2,  3, -1,  4,
        3,  3,  3,  4,  4,  3, -1,  3,  3, -1, -1,  4,  3,  3, -1,  4,  3,
        4,  4,  3,  3,  3,  4,  2,  4, -1,  2, -1,  4,  2,  3,  2,  2,  2,
       -1,  2,  2,  2,  2,  2,  2,  2, -1,  2,  0,  2,  2,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,
        1,  1,  1,  5,  1,  1,  1,  5,  1,  5,  5,  1,  5,  5,  5,  1,  5,
        1,  5,  1,  5,  1,  5,  5,  1,  5,  1,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  1,  1