## Get Toronto Geo data from wiki pedia

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

response = requests.get('https://en.wikipedia.org/wiki/Demographics_of_Toronto_neighbourhoods')
soup = BeautifulSoup(response.text, features="html.parser")

table_data = soup.select('table.wikitable.sortable > tbody > tr')
theads = table_data[0].select('th')
trows = table_data[1:]

columns = []
for thead in theads:
    columns.append(thead.get_text().strip())

data_rows = [[] for col in columns]
for row in trows:
    tds = row.select('td')
    for idx, td in enumerate(tds):
        data_rows[idx].append(td.get_text().strip())

df = pd.DataFrame()
for idx, col in enumerate(columns):
    df[col] = data_rows[idx]

df.to_csv(r'toronto_neighbor.csv', index=False)

In [23]:
df.head()

Unnamed: 0,Name,FM,Census Tracts,Population,Land area (km2),Density (people/km2),% Change in Population since 2001,Average Income,Transit Commuting %,% Renters,Second most common language (after English) by name,Second most common language (after English) by percentage,Map
0,Toronto CMA Average,,All,5113149,5903.63,866,9.0,40704,10.6,11.4,,,
1,Agincourt,S,"0377.01, 0377.02, 0377.03, 0377.04, 0378.02, 0...",44577,12.45,3580,4.6,25750,11.1,5.9,Cantonese (19.3%),19.3% Cantonese,
2,Alderwood,E,"0211.00, 0212.00",11656,4.94,2360,-4.0,35239,8.8,8.5,Polish (6.2%),06.2% Polish,
3,Alexandra Park,OCoT,0039.00,4355,0.32,13609,0.0,19687,13.8,28.0,Cantonese (17.9%),17.9% Cantonese,
4,Allenby,OCoT,0140.00,2513,0.58,4333,-1.0,245592,5.2,3.4,Russian (1.4%),01.4% Russian,


In [47]:
df[df['Name'] == 'Yonge-St.Clair']['FM'].values
address = 'Humbermede, Toronto'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
print(location)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

None


## Get Geojson of Toronto

In [4]:
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

import folium # plotting library

address = 'Humbermede, Toronto'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.6615225 -79.4097446


In [7]:
!wget https://raw.githubusercontent.com/jasonicarter/toronto-geojson/master/toronto_crs84.geojson

--2019-06-29 07:37:42--  https://raw.githubusercontent.com/jasonicarter/toronto-geojson/master/toronto_crs84.geojson
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.124.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.124.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1972831 (1.9M) [text/plain]
Saving to: ‘toronto_crs84.geojson’


2019-06-29 07:37:42 (49.2 MB/s) - ‘toronto_crs84.geojson’ saved [1972831/1972831]



In [19]:
import json
with open('toronto_crs84.geojson') as json_data:
    tronto_data = json.load(json_data)
neighborhoods_data = tronto_data['features']
neighborhoods_data[0]

{'type': 'Feature',
 'properties': {'AREA_S_CD': '097', 'AREA_NAME': 'Yonge-St.Clair (97)'},
 'geometry': {'type': 'Polygon',
  'coordinates': [[[-79.39119482699992, 43.68108112399995],
    [-79.39140543199991, 43.68096955399996],
    [-79.39322377799992, 43.68016563999995],
    [-79.39580883199991, 43.67897993999995],
    [-79.39734938999993, 43.67827481299995],
    [-79.39745605399992, 43.67822540699995],
    [-79.39756389799992, 43.67816700199996],
    [-79.39767131899993, 43.67811759699996],
    [-79.39777954499992, 43.678068201999956],
    [-79.39788853599991, 43.678014288999954],
    [-79.39793136699991, 43.67799495999996],
    [-79.39794405299993, 43.67802629499996],
    [-79.39801215899993, 43.678203873999955],
    [-79.39814090099992, 43.678530735999956],
    [-79.39835150999991, 43.679039164999956],
    [-79.39856296799991, 43.67955427099996],
    [-79.39873316599993, 43.67996223099995],
    [-79.3989394799999, 43.68046042199996],
    [-79.39906544799993, 43.68076654099995],


## Extract position (latitude, longitude) and combine into a new data frame

In [48]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

for data in neighborhoods_data:
    neighborhood_name = data['properties']['AREA_NAME']
    area_code = data['properties']['AREA_S_CD']
    if area_code.startswith('0'):
        area_code = area_code[1:]
    area_code = ' (' + area_code + ')'
    print(area_code)
    neighborhood_name = neighborhood_name.replace(area_code, '')
    print(neighborhood_name)
    borough = df[df['Name'] == neighborhood_name]['FM'].values
    if len(borough) == 0:
        continue
    borough = borough[0]
    address = neighborhood_name + ', Toronto'

    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    if location is None:
        continue
    latitude = location.latitude
    longitude = location.longitude
    
    neighborhood_lat = latitude
    neighborhood_lon = longitude
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)


 (97)
Yonge-St.Clair
 (27)
York University Heights
 (38)
Lansing-Westgate
 (31)
Yorkdale-Glen Park
 (16)
Stonegate-Queensway
 (118)
Tam O'Shanter-Sullivan
 (63)
The Beaches
 (03)
Thistletown-Beaumond Heights (3)
 (55)
Thorncliffe Park
 (59)
Danforth East York
 (106)
Humewood-Cedarvale
 (14)
Islington-City Centre West
 (66)
Danforth
 (28)
Rustic
 (139)
Scarborough Village
 (85)
South Parkdale
 (70)
South Riverdale
 (40)
St.Andrew-Windfields
 (61)
Taylor-Massey
 (21)
Humber Summit
 (22)
Humbermede
 (133)
Centennial Scarborough
 (75)
Church-Yonge Corridor
 (120)
Clairlea-Birchmount
 (123)
Cliffcrest
 (44)
Flemingdon Park
 (92)
Corso Italia-Davenport
 (125)
Ionview
 (90)
Junction Area
 (57)
Broadview North
 (10)
Princess-Rosethorn
 (68)
North Riverdale
 (13)
Etobicoke West Mall
 (102)
Forest Hill North
 (25)
Glenfield-Jane Heights
 (65)
Greenwood-Coxwell
 (140)
Guildwood
 (81)
Trinity-Bellwoods
 (43)
Victoria Village
 (77)
Waterfront Communities-The Island
 (136)
West Hill
 (01)
West Humbe

In [49]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,NY,York University Heights,43.758781,-79.519434
1,OCoT,The Beaches,43.671024,-79.296712
2,EY,Thorncliffe Park,43.704553,-79.345407
3,S,Scarborough Village,43.743742,-79.211632
4,NY,Humber Summit,43.760078,-79.57176


In [50]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 6 boroughs and 44 neighborhoods.


In [51]:
address = 'York University Heights, NY'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of York University Heights are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of York University Heights are 42.6477671, -73.774189255986.


In [52]:
# create map of York University Heights using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

## Show North York area map

In [56]:
ny_data = neighborhoods[neighborhoods['Borough'] == 'NY'].reset_index(drop=True)
ny_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,NY,York University Heights,43.758781,-79.519434
1,NY,Humber Summit,43.760078,-79.57176
2,NY,Flemingdon Park,43.718432,-79.333204
3,NY,Victoria Village,43.732658,-79.311189
4,NY,Henry Farm,43.769509,-79.354296


In [57]:
address = 'NY, Toronto'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York are 41.6302032, -74.8584829076306.


In [58]:
# create map of Manhattan using latitude and longitude values
map_ny = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(ny_data['Latitude'], ny_data['Longitude'], ny_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ny)  
    
map_ny

In [62]:
CLIENT_ID = '2OGJEGMXUMQB4OESKUJ22DTXAZAYAGPYT1OR310CXAULRYDE' # your Foursquare ID
CLIENT_SECRET = 'ZD3ZKJOB2P2DRCSEONN31KSMT4OBCFMTIPQNOF3T3FJIQJBA' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 2OGJEGMXUMQB4OESKUJ22DTXAZAYAGPYT1OR310CXAULRYDE
CLIENT_SECRET:ZD3ZKJOB2P2DRCSEONN31KSMT4OBCFMTIPQNOF3T3FJIQJBA


In [60]:
ny_data.loc[0, 'Neighborhood']

'York University Heights'

In [61]:
neighborhood_latitude = ny_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = ny_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = ny_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of York University Heights are 43.7587808, -79.5194336.


## Use Fourthsquare api to generate detailed info

In [65]:
search_query = 'York University Heights'
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    lat, 
    lng, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=2OGJEGMXUMQB4OESKUJ22DTXAZAYAGPYT1OR310CXAULRYDE&client_secret=ZD3ZKJOB2P2DRCSEONN31KSMT4OBCFMTIPQNOF3T3FJIQJBA&v=20180604&ll=43.787048,-79.3337137&radius=500&limit=100'

In [66]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d1728a3a87921002c4826dd'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Pleasant View',
  'headerFullLocation': 'Pleasant View, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 16,
  'suggestedBounds': {'ne': {'lat': 43.7915480045, 'lng': -79.32749192586706},
   'sw': {'lat': 43.782547995499996, 'lng': -79.33993547413294}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4c44346cdd1f2d7fbc767ff9',
       'name': 'Rosemary & Thyme',
       'location': {'address': '2798 Victoria Park Ave.',
        'crossStreet': 'at Van Horne Ave.',
        'lat': 43.788243709036514,
        'lng': -79.330209703676,
        'labeledL

In [67]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [69]:
ny_venues = getNearbyVenues(names=ny_data['Neighborhood'],
                            latitudes=ny_data['Latitude'],
                            longitudes=ny_data['Longitude'])

York University Heights
Humber Summit
Flemingdon Park
Victoria Village
Henry Farm
Bathurst Manor
Bayview Village
Don Valley Village
Maple Leaf
Pleasant View


In [70]:
print(ny_venues.shape)
ny_venues.head()

(84, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,York University Heights,43.758781,-79.519434,No Frills,43.75926,-79.518927,Grocery Store
1,York University Heights,43.758781,-79.519434,Pho Com Viet Nam,43.756631,-79.518336,Vietnamese Restaurant
2,York University Heights,43.758781,-79.519434,Shoppers Drug Mart,43.756227,-79.515911,Pharmacy
3,York University Heights,43.758781,-79.519434,Pizza Hut,43.75634,-79.517818,Pizza Place
4,York University Heights,43.758781,-79.519434,KFC,43.756549,-79.519042,Fast Food Restaurant


## Cluster Neighborhoods

In [71]:
# one hot encoding
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ny_onehot['Neighborhood'] = ny_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ny_onehot.columns[-1]] + list(ny_onehot.columns[:-1])
ny_onehot = ny_onehot[fixed_columns]

ny_onehot.head()

Unnamed: 0,Neighborhood,Bakery,Bank,Baseball Field,Basketball Court,Beer Store,Breakfast Spot,Burger Joint,Bus Line,Café,...,Restaurant,Sandwich Place,Science Museum,Shopping Mall,Skating Rink,Sporting Goods Shop,Tennis Court,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant
0,York University Heights,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,York University Heights,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,York University Heights,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,York University Heights,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,York University Heights,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
ny_grouped = ny_onehot.groupby('Neighborhood').mean().reset_index()
ny_grouped

Unnamed: 0,Neighborhood,Bakery,Bank,Baseball Field,Basketball Court,Beer Store,Breakfast Spot,Burger Joint,Bus Line,Café,...,Restaurant,Sandwich Place,Science Museum,Shopping Mall,Skating Rink,Sporting Goods Shop,Tennis Court,Thai Restaurant,Thrift / Vintage Store,Vietnamese Restaurant
0,Bathurst Manor,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,...,0.0,0.083333,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0
2,Don Valley Village,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Flemingdon Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,...,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Henry Farm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
5,Humber Summit,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Maple Leaf,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Pleasant View,0.0625,0.0,0.0,0.0,0.0,0.0625,0.0625,0.0,0.0,...,0.0625,0.0,0.0,0.0625,0.0625,0.0,0.0,0.0,0.125,0.0
8,Victoria Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0
9,York University Heights,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,...,0.0,0.0625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625


In [73]:
num_top_venues = 5

for hood in ny_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = ny_grouped[ny_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor----
               venue  freq
0         Playground  0.25
1     Baseball Field  0.25
2               Park  0.25
3  Convenience Store  0.25
4        Pizza Place  0.00


----Bayview Village----
                 venue  freq
0          Pizza Place  0.08
1                 Bank  0.08
2       Hardware Store  0.08
3       Breakfast Spot  0.08
4  Sporting Goods Shop  0.08


----Don Valley Village----
            venue  freq
0     Coffee Shop   0.3
1  Sandwich Place   0.2
2     Pizza Place   0.1
3   Grocery Store   0.1
4            Park   0.1


----Flemingdon Park----
                  venue  freq
0         Movie Theater  0.08
1  Fast Food Restaurant  0.08
2         Deli / Bodega  0.08
3         Grocery Store  0.08
4        Science Museum  0.08


----Henry Farm----
          venue  freq
0  Tennis Court   0.5
1          Park   0.5
2        Bakery   0.0
3      Pharmacy   0.0
4  Liquor Store   0.0


----Humber Summit----
                        venue  freq
0                      

In [74]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [78]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ny_grouped['Neighborhood']

for ind in np.arange(ny_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bathurst Manor,Convenience Store,Baseball Field,Playground,Park,Vietnamese Restaurant,Grocery Store,Gas Station,Fried Chicken Joint,Fish Market,Fast Food Restaurant
1,Bayview Village,Hardware Store,Sandwich Place,Fish Market,Fast Food Restaurant,Outdoor Supply Store,Coffee Shop,Persian Restaurant,Pizza Place,Gas Station,Breakfast Spot
2,Don Valley Village,Coffee Shop,Sandwich Place,Pet Store,Park,Pizza Place,Grocery Store,Bank,Breakfast Spot,Convenience Store,Gas Station
3,Flemingdon Park,Gym,Office,Deli / Bodega,Grocery Store,Movie Theater,Coffee Shop,Caribbean Restaurant,Café,Pakistani Restaurant,Science Museum
4,Henry Farm,Tennis Court,Park,Vietnamese Restaurant,Convenience Store,Grocery Store,Gas Station,Fried Chicken Joint,Fish Market,Fast Food Restaurant,Falafel Restaurant


In [81]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

ny_grouped_clustering = ny_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ny_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 0, 0, 3, 0, 4, 0, 2, 0], dtype=int32)

In [82]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ny_merged = ny_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ny_merged = ny_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

ny_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,NY,York University Heights,43.758781,-79.519434,0,Pizza Place,Fast Food Restaurant,Grocery Store,Vietnamese Restaurant,Fried Chicken Joint,Liquor Store,Falafel Restaurant,Discount Store,Coffee Shop,Pharmacy
1,NY,Humber Summit,43.760078,-79.57176,0,Bakery,Park,Empanada Restaurant,Construction & Landscaping,Gym,Baseball Field,Grocery Store,Gas Station,Fried Chicken Joint,Fish Market
2,NY,Flemingdon Park,43.718432,-79.333204,0,Gym,Office,Deli / Bodega,Grocery Store,Movie Theater,Coffee Shop,Caribbean Restaurant,Café,Pakistani Restaurant,Science Museum
3,NY,Victoria Village,43.732658,-79.311189,2,Thai Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Bus Line,Vietnamese Restaurant,Convenience Store,Gas Station,Fried Chicken Joint,Fish Market,Fast Food Restaurant
4,NY,Henry Farm,43.769509,-79.354296,3,Tennis Court,Park,Vietnamese Restaurant,Convenience Store,Grocery Store,Gas Station,Fried Chicken Joint,Fish Market,Fast Food Restaurant,Falafel Restaurant


In [90]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ny_merged['Latitude'], ny_merged['Longitude'], ny_merged['Neighborhood'], ny_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [85]:
ny_merged.loc[ny_merged['Cluster Labels'] == 0, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,York University Heights,Pizza Place,Fast Food Restaurant,Grocery Store,Vietnamese Restaurant,Fried Chicken Joint,Liquor Store,Falafel Restaurant,Discount Store,Coffee Shop,Pharmacy
1,Humber Summit,Bakery,Park,Empanada Restaurant,Construction & Landscaping,Gym,Baseball Field,Grocery Store,Gas Station,Fried Chicken Joint,Fish Market
2,Flemingdon Park,Gym,Office,Deli / Bodega,Grocery Store,Movie Theater,Coffee Shop,Caribbean Restaurant,Café,Pakistani Restaurant,Science Museum
6,Bayview Village,Hardware Store,Sandwich Place,Fish Market,Fast Food Restaurant,Outdoor Supply Store,Coffee Shop,Persian Restaurant,Pizza Place,Gas Station,Breakfast Spot
7,Don Valley Village,Coffee Shop,Sandwich Place,Pet Store,Park,Pizza Place,Grocery Store,Bank,Breakfast Spot,Convenience Store,Gas Station
9,Pleasant View,Thrift / Vintage Store,Bakery,Convenience Store,Park,Japanese Restaurant,Pharmacy,Pizza Place,Burger Joint,Restaurant,Breakfast Spot


In [86]:
ny_merged.loc[ny_merged['Cluster Labels'] == 1, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Bathurst Manor,Convenience Store,Baseball Field,Playground,Park,Vietnamese Restaurant,Grocery Store,Gas Station,Fried Chicken Joint,Fish Market,Fast Food Restaurant


In [87]:
ny_merged.loc[ny_merged['Cluster Labels'] == 2, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Victoria Village,Thai Restaurant,Mediterranean Restaurant,Middle Eastern Restaurant,Bus Line,Vietnamese Restaurant,Convenience Store,Gas Station,Fried Chicken Joint,Fish Market,Fast Food Restaurant


In [88]:
ny_merged.loc[ny_merged['Cluster Labels'] == 3, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Henry Farm,Tennis Court,Park,Vietnamese Restaurant,Convenience Store,Grocery Store,Gas Station,Fried Chicken Joint,Fish Market,Fast Food Restaurant,Falafel Restaurant


In [89]:
ny_merged.loc[ny_merged['Cluster Labels'] == 4, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Maple Leaf,Bakery,Basketball Court,Deli / Bodega,Convenience Store,Gym,Grocery Store,Gas Station,Fried Chicken Joint,Fish Market,Fast Food Restaurant
