## PART 1: Getting the table from the Wikipedia page

In [2]:
import pandas as pd
import numpy as np

In [3]:
import requests
from bs4 import BeautifulSoup

# Wikipedia page listing the Canadian postal codes that start with "M".
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use a timeout so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of silently parsing an error page.
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()

# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL.
website_url = response.text


In [4]:
from io import StringIO

# `website_url` contains HTML text, so wrap it in a file-like object:
# recent pandas releases deprecate passing literal HTML strings to read_html.
# The first table found in the page is used.
df = pd.read_html(StringIO(website_url))[0]

# Keep the column names in the saved file (`header=0` is falsy for to_csv and
# silently dropped the header row); the row index is not written.
df.to_csv('beautifulsoup_pandas.csv', header=True, index=False)

In [5]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
# Keep only the rows whose borough has actually been assigned.
df = df.loc[df['Borough'] != 'Not assigned']
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [7]:
# (rows, columns) remaining after the 'Not assigned' boroughs were removed.
df.shape

(103, 3)

## PART 2: Getting Latitudes and Longitudes

In [3]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [8]:
# `import geocoder` is left disabled in this cell, as in the original notebook.

# Postal codes of the remaining rows, as a Series.
postal_code = df.loc[:, 'Postal Code']
postal_code

2      M3A
3      M4A
4      M5A
5      M6A
6      M7A
      ... 
160    M8X
165    M4Y
168    M7Y
169    M8Y
178    M8Z
Name: Postal Code, Length: 103, dtype: object

In [None]:
import time

import geocoder  # provided by the `pip install geocoder` cell earlier in the notebook

# Upper bound on lookups: the original `while` loop never terminated when the
# service kept returning no coordinates.
MAX_GEOCODE_ATTEMPTS = 5

# initialize your variable to None
lat_lng_coords = None

# retry a bounded number of times until coordinates are returned
for _attempt in range(MAX_GEOCODE_ATTEMPTS):
    g = geocoder.google('{}, Toronto, Ontario'.format('M5G'))
    lat_lng_coords = g.latlng
    if lat_lng_coords:  # treats both None and an empty result as "no answer"
        break
    time.sleep(1)  # brief pause before asking again

if lat_lng_coords:
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
else:
    # Best effort only: report the failure instead of looping forever.
    # NOTE(review): presumably the Google provider needs an API key - confirm.
    latitude = longitude = None
    print('Geocoding failed after {} attempts.'.format(MAX_GEOCODE_ATTEMPTS))

In [30]:
# Load the postal-code coordinates; the first line of the file supplies the
# column names (pandas' default behaviour when no names are given).
coord = pd.read_csv('Geospatial_Coordinates.csv')
coord

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [23]:
# Index the borough table by postal code so it can be aligned with the
# coordinates table. `set_index` moves the column into the index in one step
# (the old positional `drop('Postal Code', 1)` is rejected by pandas 2.x and,
# with the set_index line commented out, discarded the postal codes entirely).
# The guard keeps the cell re-runnable.
if 'Postal Code' in df.columns:
    df = df.set_index('Postal Code')
df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [32]:
# Index the coordinates by postal code. `set_index` with a column name moves
# the column into the index, replacing the set_index + positional
# `drop('Postal Code', 1)` pair (positional `axis` is rejected by pandas 2.x).
# The guard keeps the cell re-runnable.
if 'Postal Code' in coord.columns:
    coord = coord.set_index('Postal Code')
coord.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [33]:
# Column-wise inner join: keep only the index labels present in both frames.
table = pd.concat((df, coord), axis='columns', join='inner')
table.head(15)

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M3B,North York,Don Mills,43.745906,-79.352188
M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [34]:
# Move the index back into a regular column and restore a 0..n-1 index.
table = table.reset_index(drop=False)

In [35]:
# Display the combined table (boroughs, neighbourhoods and coordinates).
table

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## PART 3: Neighbourhood clusters in Toronto

In [36]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans


Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.9.1
  latest version: 4.9.2

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.



In [37]:
# Install the pinned folium release into the active conda environment, then
# import it for the map cells below.
!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Confirmation message printed once the import has succeeded.
print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 4.9.1
  latest version: 4.9.2

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

Libraries imported.


In [39]:
# Look up the coordinates used to centre the maps below.
address = 'Toronto, TO'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('No geocoding result for address {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Ontario are 43.65238435, -79.38356765.


In [41]:
# create map of Toronto centred on the geocoded latitude and longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# appearance shared by every marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# add one circle marker per row of the table, labelled "neighbourhood, borough"
marker_rows = zip(table['Latitude'], table['Longitude'], table['Borough'], table['Neighbourhood'])
for lat, lng, borough, neighborhood in marker_rows:
    label = f'{neighborhood}, {borough}'
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    marker.add_to(map_toronto)

map_toronto

In [50]:
# one hot encoding of the borough column; the empty prefix and separator leave
# the bare borough names as column labels
toronto_onehot = pd.get_dummies(table[['Borough']], prefix="", prefix_sep="")

# place the neighbourhood name in front of the indicator columns
toronto_onehot.insert(0, 'Neighbourhood', table['Neighbourhood'])

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,Parkwoods,0,0,0,0,0,0,1,0,0,0
1,Victoria Village,0,0,0,0,0,0,1,0,0,0
2,"Regent Park, Harbourfront",0,1,0,0,0,0,0,0,0,0
3,"Lawrence Manor, Lawrence Heights",0,0,0,0,0,0,1,0,0,0
4,"Queen's Park, Ontario Provincial Government",0,1,0,0,0,0,0,0,0,0


In [65]:
# (rows, columns) of the one-hot encoded table.
toronto_onehot.shape

(103, 11)

In [66]:
# Labels of every column after the first one (the first holds the neighbourhood name).
column = toronto_onehot.iloc[:, 1:].columns
column

Index(['Central Toronto', 'Downtown Toronto', 'East Toronto', 'East York',
       'Etobicoke', 'Mississauga', 'North York', 'Scarborough', 'West Toronto',
       'York'],
      dtype='object')

In [102]:
# Average the borough indicator columns per neighbourhood; the result is
# indexed by neighbourhood and stored as floats.
toronto_grouped = (
    toronto_onehot
    .groupby('Neighbourhood')[list(column)]
    .mean()
    .astype(float)
)
toronto_grouped

Unnamed: 0_level_0,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"Alderwood, Long Branch",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
"Willowdale, Willowdale West",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Woodbine Heights,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
York Mills West,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [103]:
# (rows, columns) of the per-neighbourhood table.
toronto_grouped.shape

(99, 10)

In [104]:
# number of clusters to look for
kclusters = 5

# build the k-means model with a fixed seed, then fit it on the grouped table
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(toronto_grouped)

# peek at the labels assigned to the first ten rows
kmeans.labels_[:10]

array([3, 4, 1, 1, 1, 2, 3, 0, 0, 2], dtype=int32)

In [106]:
# add clustering labels
# Drop labels left over from a previous run first: DataFrame.insert raises if
# the column already exists, which made this cell fail when re-executed.
toronto_grouped = toronto_grouped.drop(columns='Cluster Labels', errors='ignore')
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

# The grouping cell already sets 'Neighbourhood' as the index, so an
# unconditional set_index('Neighbourhood') raises KeyError. Only move it into
# the index when it is still an ordinary column.
if 'Neighbourhood' in toronto_grouped.columns:
    toronto_grouped = toronto_grouped.set_index('Neighbourhood')
toronto_grouped.head() # check the last columns!

Unnamed: 0_level_0,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Agincourt,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"Alderwood, Long Branch",4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"Bathurst Manor, Wilson Heights, Downsview North",1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Bayview Village,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"Bedford Park, Lawrence Manor East",1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [116]:
# Reduce `table` to the coordinates indexed by neighbourhood, which is the
# shape the merge in the next cell expects. These statements were commented
# out after being run once, so a fresh kernel skipped them; the guard lets the
# cell run any number of times.
if 'Neighbourhood' in table.columns:
    table = table[['Neighbourhood', 'Latitude', 'Longitude']].set_index('Neighbourhood')
print(table.shape, toronto_grouped.shape)

(103, 2) (99, 11)


In [118]:
# Attach the cluster labels and borough indicators to each row's coordinates.
# Match explicitly on the neighbourhood name instead of concatenating by index:
# several rows share a neighbourhood name, and index-based concat cannot align
# duplicated labels reliably. `table` may hold 'Neighbourhood' either as a
# column or as its index, so normalise it to a column first.
coords = table if 'Neighbourhood' in table.columns else table.reset_index()
toronto_merged = (
    coords[['Neighbourhood', 'Latitude', 'Longitude']]
    .merge(toronto_grouped, left_on='Neighbourhood', right_index=True, how='inner')
    .set_index('Neighbourhood')
)
toronto_merged.head(15)

Unnamed: 0_level_0,Latitude,Longitude,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Parkwoods,43.753259,-79.329656,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Victoria Village,43.725882,-79.315572,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"Regent Park, Harbourfront",43.65426,-79.360636,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Islington Avenue, Humber Valley Village",43.667856,-79.532242,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"Malvern, Rouge",43.806686,-79.194353,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Don Mills,43.745906,-79.352188,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"Garden District, Ryerson",43.657162,-79.378937,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
# Turn the index back into an ordinary column and restore a 0..n-1 index.
toronto_merged = toronto_merged.reset_index()
toronto_merged.head(3)

Unnamed: 0,Neighbourhood,Latitude,Longitude,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,East York,Etobicoke,Mississauga,North York,Scarborough,West Toronto,York
0,Parkwoods,43.753259,-79.329656,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,Victoria Village,43.725882,-79.315572,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,"Regent Park, Harbourfront",43.65426,-79.360636,2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster (the former x/ys arithmetic was only ever used for its length,
# which equals kclusters)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    # Labels run from 0 to kclusters - 1, so index the palette directly;
    # `cluster - 1` only worked for cluster 0 through negative-index wraparound.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters