# Scrape wiki website

In [120]:
!pip install BeautifulSoup4



In [121]:
import requests
import pandas as pd
import numpy as np # library to handle data in a vectorized manner
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import matplotlib.colors as colors

# Get URL

In [122]:
# Download the Wikipedia page listing the Canadian postal codes starting with M.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)
# Fail fast on an HTTP error instead of parsing an error page.
response.raise_for_status()
results = response.text

# Parse the HTML; the parsed document is named `soup` so that it does not
# shadow the name of the standard-library `re` module.
soup = BeautifulSoup(results, 'html.parser')

# The postal-code data is taken from the first <table> element of the page.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(url))
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park / Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor / Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park / Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern / Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3B
</td>
<td>North York
</td>
<td>Don Mills
</td></tr>
<tr>
<td>M4B
</td>
<td>Ea

# Create empty dataframe

In [123]:
# Column layout of the scraped table: one column per <td> cell of a table row.
column_names = ['PostalCode', 'Borough', 'Neighborhood']
# Empty frame with that layout; the parsing loop in the next cell fills it.
frame = pd.DataFrame(columns = column_names)

# Extract the text of every 'td' cell and filter out the rows that are not assigned

In [124]:
# Collect the stripped text of the <td> cells of every table row. The header
# row only contains <th> cells, so it yields an empty list and is skipped.
rows = []
for tr in table.find_all("tr"):
    cells = [td.text.strip() for td in tr.find_all('td')]

    # Keep complete rows only: three cells, an assigned borough and a
    # non-empty neighborhood.
    if len(cells) == 3 and cells[1] != 'Not assigned' and cells[2] != '':
        rows.append(cells)

# Build the frame in one step instead of growing it row by row; this is faster
# and makes the cell idempotent (re-running it no longer duplicates the rows).
frame = pd.DataFrame(rows, columns=column_names)

# A neighborhood marked 'Not assigned' takes the name of its borough.
frame['Neighborhood'] = frame['Neighborhood'].mask(
    frame['Neighborhood'] == 'Not assigned', frame['Borough'])

# check for bad rows: every cell must hold a value
assert not frame.isnull().values.any(), 'scraped table contains missing values'

In [125]:
# Collapse the rows that share a postal code and borough into a single row,
# joining their neighborhood names with commas.
postal_groups = frame.groupby(['PostalCode', 'Borough'])
final_frame = postal_groups['Neighborhood'].agg(','.join).reset_index()
final_frame

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,Malvern / Rouge
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,M1E,Scarborough,Guildwood / Morningside / West Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...


In [126]:
# Sanity check: (number of rows, number of columns) of the grouped frame.
final_frame.shape

(103, 3)

In [127]:
!pip install geopy



In [128]:
from  geopy.geocoders import Nominatim

In [129]:
# Load the postal code -> latitude/longitude lookup table from a remote CSV
# and preview its first ten rows.
file = pd.read_csv('http://cocl.us/Geospatial_data')
file.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [130]:
# Rename the key column to 'PostalCode' so that it matches the column name
# used in final_frame.
file = file.rename(columns = {'Postal Code':'PostalCode'})
file

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [131]:
# Attach latitude and longitude to every postal code: index both frames by
# 'PostalCode', join them, then turn the key back into a regular column.
coordinates_by_code = file.set_index('PostalCode')
neighborhoods_by_code = final_frame.set_index('PostalCode')
final_frame = neighborhoods_by_code.join(coordinates_by_code).reset_index()

In [132]:
# Display the neighborhoods together with their joined coordinates.
final_frame

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
1,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek,43.784535,-79.160497
2,M1E,Scarborough,Guildwood / Morningside / West Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.688905,-79.554724
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,43.739416,-79.588437


# Use geopy library to get the latitude and longitude values of Toronto

In [133]:
address = 'Toronto'

# Look the address up with the Nominatim geocoder.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# Guard against a lookup without a result, which would otherwise surface as
# an opaque AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


# create map of Toronto using latitude and longitude values

In [134]:
# Install folium, pinned to version 0.5.0, from the conda-forge channel;
# --yes answers the confirmation prompt automatically.
!conda install -c conda-forge folium=0.5.0 --yes

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [135]:
import folium
from sklearn.cluster import KMeans

In [136]:
# List the distinct borough names present in the data.
final_frame['Borough'].unique() #find the unique values of 'Borough'

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

# Select the boroughs whose name contains 'York' for clustering

In [137]:
# Keep only the rows whose borough name contains "York". The explicit copy
# makes the result an independent frame, so that later modifications neither
# raise SettingWithCopyWarning nor depend on final_frame.
toronto_york = final_frame[final_frame['Borough'].str.contains("York")].copy()
toronto_york.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
17,M2H,North York,Hillcrest Village,43.803762,-79.363452
18,M2J,North York,Fairview / Henry Farm / Oriole,43.778517,-79.346556
19,M2K,North York,Bayview Village,43.786947,-79.385975
20,M2L,North York,York Mills / Silver Hills,43.75749,-79.374714
21,M2M,North York,Willowdale / Newtonbrook,43.789053,-79.408493


In [138]:
import os
import warnings

# Foursquare credentials are read from the environment instead of being
# hardcoded, so that secrets are never stored in the notebook.
# NOTE(review): the key pair previously committed in this cell should be
# considered leaked and be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604' # value sent as the `v` query parameter of every request
LIMIT = 500 # value sent as the `limit` query parameter of every request

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn('Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
                  'environment variables before querying the Foursquare API.')

In [139]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in parallel; each triple describes one location to query.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but has these columns) when nothing is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; fail loudly on an HTTP error status
        response = requests.get(endpoint, params=params, timeout=timeout)
        response.raise_for_status()
        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # a venue without any category gets None instead of raising IndexError
            categories = venue.get('categories') or []
            records.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works when no venue
    # was found at all (assigning `.columns` afterwards would raise).
    return pd.DataFrame(records, columns=columns)

In [140]:
# Fetch the nearby venues of every selected neighborhood; `radius` is left at
# the default of getNearbyVenues. The function prints each neighborhood name
# as it is processed.
york_venues = getNearbyVenues(names=toronto_york['Neighborhood'],
                                   latitudes=toronto_york['Latitude'],
                                   longitudes=toronto_york['Longitude']
                                  )

Hillcrest Village
Fairview / Henry Farm / Oriole
Bayview Village
York Mills / Silver Hills
Willowdale / Newtonbrook
Willowdale
York Mills West
Willowdale
Parkwoods
Don Mills
Don Mills
Bathurst Manor / Wilson Heights / Downsview North
Northwood Park / York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill / Woodbine Gardens
Woodbine Heights
Leaside
Thorncliffe Park
East Toronto
Bedford Park / Lawrence Manor East
Lawrence Manor / Lawrence Heights
Glencairn
Humewood-Cedarvale
Caledonia-Fairbanks
North Park / Maple Leaf Park / Upwood Park
Del Ray / Mount Dennis / Keelsdale and Silverthorn
Runnymede / The Junction North
Humber Summit
Humberlea / Emery
Weston


In [141]:
# Preview the first venues returned.
york_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
1,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
2,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
3,Hillcrest Village,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run
4,Fairview / Henry Farm / Oriole,43.778517,-79.346556,The LEGO Store,43.778207,-79.343483,Toy / Game Store


In [142]:
# (number of venues, number of columns)
york_venues.shape

(338, 7)

# Check how many venues were returned for each neighborhood

In [143]:
# Count the non-null entries of every column per neighborhood name.
york_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bathurst Manor / Wilson Heights / Downsview North,19,19,19,19,19,19
Bayview Village,4,4,4,4,4,4
Bedford Park / Lawrence Manor East,26,26,26,26,26,26
Caledonia-Fairbanks,4,4,4,4,4,4
Del Ray / Mount Dennis / Keelsdale and Silverthorn,4,4,4,4,4,4
Don Mills,27,27,27,27,27,27
Downsview,13,13,13,13,13,13
East Toronto,3,3,3,3,3,3
Fairview / Henry Farm / Oriole,67,67,67,67,67,67
Glencairn,4,4,4,4,4,4


In [144]:
# one hot encoding: one indicator column per venue category
york = pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
york['Neighborhood'] = york_venues['Neighborhood']

# move neighborhood column to the first column; it is selected by name rather
# than by position, so the result does not depend on where it ended up
fixed_columns = ['Neighborhood'] + [col for col in york.columns if col != 'Neighborhood']
york = york[fixed_columns]

york

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,Bank,...,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Hillcrest Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Fairview / Henry Farm / Oriole,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
333,Humberlea / Emery,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
334,Humberlea / Emery,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
335,Weston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
336,Weston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Group rows by neighborhood, taking the mean of the frequency of occurrence of each category

In [150]:
# Average the one-hot indicators per neighborhood, which gives the relative
# frequency of every venue category; the neighborhood stays a regular column.
york_grouped = york.groupby('Neighborhood', as_index=False).mean()
york_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bagel Shop,Bakery,Bank,...,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Women's Store,Yoga Studio
0,Bathurst Manor / Wilson Heights / Downsview North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.105263,...,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bedford Park / Lawrence Manor East,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0
4,Del Ray / Mount Dennis / Keelsdale and Silvert...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Don Mills,0.0,0.0,0.0,0.0,0.074074,0.037037,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Downsview,0.0,0.076923,0.0,0.0,0.0,0.076923,0.0,0.0,0.076923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Fairview / Henry Farm / Oriole,0.0,0.0,0.014925,0.0,0.014925,0.0,0.0,0.029851,0.029851,...,0.0,0.014925,0.014925,0.0,0.014925,0.0,0.0,0.0,0.029851,0.0
9,Glencairn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
# (number of neighborhoods, number of columns: 'Neighborhood' plus one per category)
york_grouped.shape

(28, 121)

# Let's print each neighborhood along with the top 5 most common venues

In [147]:
num_top_venues = 5

for hood in york_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Transpose the row of this neighborhood into a two-column table.
    temp = york_grouped[york_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Skip the first row (it holds the 'Neighborhood' entry itself). The copy
    # makes the assignment below write to an independent frame instead of a
    # slice, which avoids SettingWithCopyWarning.
    temp = temp.iloc[1:].copy()
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    # Show the most frequent categories first.
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Bathurst Manor / Wilson Heights / Downsview North----
              venue  freq
0       Coffee Shop  0.11
1              Bank  0.11
2     Deli / Bodega  0.05
3  Sushi Restaurant  0.05
4    Sandwich Place  0.05


----Bayview Village----
                 venue  freq
0                 Bank  0.25
1                 Café  0.25
2   Chinese Restaurant  0.25
3  Japanese Restaurant  0.25
4        Metro Station  0.00


----Bedford Park / Lawrence Manor East----
                venue  freq
0    Sushi Restaurant  0.08
1         Pizza Place  0.08
2          Restaurant  0.08
3         Coffee Shop  0.08
4  Italian Restaurant  0.08


----Caledonia-Fairbanks----
               venue  freq
0               Park  0.50
1      Women's Store  0.25
2               Pool  0.25
3  Accessories Store  0.00
4          Locksmith  0.00


----Del Ray / Mount Dennis / Keelsdale and Silverthorn----
                  venue  freq
0  Fast Food Restaurant  0.50
1        Sandwich Place  0.25
2        Discount Store  0.25


In [151]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [185]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: '1st', '2nd' and '3rd' use
# their own suffix, every later rank uses 'th' (an explicit test replaces the
# former bare `except:`, which would have hidden unrelated errors as well)
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = york_grouped['Neighborhood']

# fill the venue columns of each row with its most common categories
for ind in range(york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bathurst Manor / Wilson Heights / Downsview North,Coffee Shop,Bank,Fried Chicken Joint,Supermarket,Pharmacy,Pizza Place,Deli / Bodega,Diner,Restaurant,Bridal Shop
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
2,Bedford Park / Lawrence Manor East,Sandwich Place,Coffee Shop,Pizza Place,Sushi Restaurant,Italian Restaurant,Restaurant,Indian Restaurant,Locksmith,Café,Pharmacy
3,Caledonia-Fairbanks,Park,Women's Store,Pool,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
4,Del Ray / Mount Dennis / Keelsdale and Silvert...,Fast Food Restaurant,Sandwich Place,Discount Store,Yoga Studio,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice


In [186]:
# set number of clusters
kclusters = 5

# Cluster on the category frequencies only. The column is dropped by keyword:
# the positional form drop('Neighborhood', 1) is rejected by pandas 2.x.
york_grouped_clustering = york_grouped.drop(columns='Neighborhood')

# run k-means clustering (fixed random_state)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 1, 4, 0, 0, 3, 0, 0], dtype=int32)

In [188]:
# Work on an independent copy: a plain assignment would only alias
# toronto_york, so later in-place changes would silently modify it too.
york_merged = toronto_york.copy()

In [201]:
# add clustering labels as the first column; overwrite the column when it is
# already there, because DataFrame.insert raises if the cell is run again
cluster_labels = kmeans.labels_.astype(int)
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = cluster_labels
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', cluster_labels)

In [203]:
# Inspect the merged frame.
# NOTE(review): the recorded output already shows the cluster label and venue
# columns, although the join that adds them only appears in a later cell —
# presumably the cells were executed out of order; confirm with a fresh
# "Restart & Run All".
york_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,M2H,North York,Hillcrest Village,43.803762,-79.363452,0.0,Dog Run,Golf Course,Pool,Mediterranean Restaurant,Yoga Studio,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice
18,M2J,North York,Fairview / Henry Farm / Oriole,43.778517,-79.346556,0.0,Clothing Store,Coffee Shop,Fast Food Restaurant,Japanese Restaurant,Food Court,Tea Room,Restaurant,Sporting Goods Shop,Bank,Convenience Store
19,M2K,North York,Bayview Village,43.786947,-79.385975,0.0,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
20,M2L,North York,York Mills / Silver Hills,43.75749,-79.374714,1.0,Park,Cafeteria,Yoga Studio,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
21,M2M,North York,Willowdale / Newtonbrook,43.789053,-79.408493,,,,,,,,,,,
22,M2N,North York,Willowdale,43.77012,-79.408493,0.0,Pizza Place,Ramen Restaurant,Coffee Shop,Sushi Restaurant,Grocery Store,Restaurant,Sandwich Place,Café,Plaza,Juice Bar
23,M2P,North York,York Mills West,43.752758,-79.400049,3.0,Park,Convenience Store,Bank,Bar,Yoga Studio,Electronics Store,Construction & Landscaping,Cosmetics Shop,Curling Ice,Deli / Bodega
24,M2R,North York,Willowdale,43.782736,-79.442259,0.0,Pizza Place,Ramen Restaurant,Coffee Shop,Sushi Restaurant,Grocery Store,Restaurant,Sandwich Place,Café,Plaza,Juice Bar
25,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Food & Drink Shop,Fireworks Store,Yoga Studio,Dog Run,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice
26,M3B,North York,Don Mills,43.745906,-79.352188,0.0,Beer Store,Asian Restaurant,Restaurant,Gym,Coffee Shop,Japanese Restaurant,Athletics & Sports,Bike Shop,Discount Store,Sandwich Place


In [212]:
# Attach the cluster labels and most common venues if that has not happened
# yet; without them the 'Cluster Labels' column used below does not exist.
if 'Cluster Labels' not in york_merged.columns:
    york_merged = york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# remove NaN in the cluster labels column (reassigning instead of using
# inplace=True keeps the cell safe to re-run)
york_merged = york_merged.dropna(subset=['Cluster Labels'])
york_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,M2H,North York,Hillcrest Village,43.803762,-79.363452,0,Dog Run,Golf Course,Pool,Mediterranean Restaurant,Yoga Studio,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice
18,M2J,North York,Fairview / Henry Farm / Oriole,43.778517,-79.346556,0,Clothing Store,Coffee Shop,Fast Food Restaurant,Japanese Restaurant,Food Court,Tea Room,Restaurant,Sporting Goods Shop,Bank,Convenience Store
19,M2K,North York,Bayview Village,43.786947,-79.385975,0,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
20,M2L,North York,York Mills / Silver Hills,43.75749,-79.374714,1,Park,Cafeteria,Yoga Studio,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
22,M2N,North York,Willowdale,43.77012,-79.408493,0,Pizza Place,Ramen Restaurant,Coffee Shop,Sushi Restaurant,Grocery Store,Restaurant,Sandwich Place,Café,Plaza,Juice Bar


In [209]:
# Attach the cluster labels and most common venues, unless they are already
# present (joining a second time would raise on the overlapping columns).
if 'Cluster Labels' not in york_merged.columns:
    york_merged = york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

# Rows without a cluster label hold NaN, which cannot be cast to int, so they
# are dropped first.
#convert cluster labels to integer instead of float
york_merged = (
    york_merged
    .dropna(subset=['Cluster Labels'])
    .astype({'Cluster Labels': int})
)

york_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,M2H,North York,Hillcrest Village,43.803762,-79.363452,0,Dog Run,Golf Course,Pool,Mediterranean Restaurant,Yoga Studio,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice
18,M2J,North York,Fairview / Henry Farm / Oriole,43.778517,-79.346556,0,Clothing Store,Coffee Shop,Fast Food Restaurant,Japanese Restaurant,Food Court,Tea Room,Restaurant,Sporting Goods Shop,Bank,Convenience Store
19,M2K,North York,Bayview Village,43.786947,-79.385975,0,Chinese Restaurant,Café,Bank,Japanese Restaurant,Yoga Studio,Electronics Store,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega
20,M2L,North York,York Mills / Silver Hills,43.75749,-79.374714,1,Park,Cafeteria,Yoga Studio,Comfort Food Restaurant,Construction & Landscaping,Convenience Store,Cosmetics Shop,Curling Ice,Deli / Bodega,Department Store
22,M2N,North York,Willowdale,43.77012,-79.408493,0,Pizza Place,Ramen Restaurant,Coffee Shop,Sushi Restaurant,Grocery Store,Restaurant,Sandwich Place,Café,Plaza,Juice Bar


In [211]:

# create the base map, centred on the coordinates geocoded for Toronto
# (map_clusters was used below without ever being defined)
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighborhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # index the palette directly by the label (the former `cluster-1` relied
    # on negative-index wraparound for cluster 0); int() also accepts labels
    # stored as floats
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters