Coursera Capstone Project - Week 3:
Toronto Neighborhood K-Means Clustering

In [8]:
import numpy as np # library to handle data in a vectorized manner
import urllib.request
import wget
import pandas as pd # library for data analysis
# Lift the display limits so DataFrames are shown with all columns and all rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# json_normalize: transform a nested JSON structure into a flat pandas dataframe.
# pandas 1.0 exposed it at the top level and deprecated the pandas.io.json
# location, so try the current location first and fall back for older versions.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

# Wikipedia page listing the postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Send a descriptive User-Agent (Wikimedia asks clients not to use a library
# default) and bound the wait so a stalled connection cannot hang the cell.
page_request = urllib.request.Request(
    url,
    headers={'User-Agent': 'toronto-neighbourhood-clustering-notebook/1.0 (educational project)'},
)
# `page` is the open HTTP response; it is parsed by BeautifulSoup in a later cell.
page = urllib.request.urlopen(page_request, timeout=30)

Libraries imported.


In [9]:
# %pip install shapely  # uncomment and run once if shapely is missing from this kernel's environment

In [10]:
# Show the response object returned by urlopen (its repr, not the page body).
page

<http.client.HTTPResponse at 0x7fee7d7244d0>

In [11]:
# Import Beautiful Soup to parse the HTML page
from bs4 import BeautifulSoup

In [12]:
# Parse the downloaded page, naming the parser ("lxml") explicitly instead of
# leaving the choice to Beautiful Soup.
soup = BeautifulSoup(page, "lxml")

# The data of interest is in the table tagged <table class="wikitable sortable">.

In [13]:
# Locate the postal-code table by its class attribute.
right_table = soup.find('table', class_='wikitable sortable')
# find() returns None when nothing matches; stop here with a clear message
# rather than failing with an AttributeError when the rows are read later.
if right_table is None:
    raise ValueError(
        "No <table class='wikitable sortable'> found on the page; "
        "the page layout may have changed."
    )

In [14]:
# Collect the three table columns into parallel lists. They are used later as
# A = 'Postal Code', B = 'Borough', C = 'Neighbourhood'.
A = []
B = []
C = []
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells are data rows; anything else
    # (e.g. a header row) is skipped.
    if len(cells) == 3:
        # get_text() joins every text node in the cell, including text nested
        # inside links, and always returns a plain str. The previous
        # find(text=True) kept only the first text node and gave None for an
        # empty cell.
        postal_code, borough, neighbourhood = (
            cell.get_text().strip() for cell in cells
        )
        A.append(postal_code)
        B.append(borough)
        C.append(neighbourhood)

In [15]:
import pandas as pd

In [16]:
# Assemble the scraped columns into one frame and delete every whitespace
# character from every string value. The pattern \s also removes spaces inside
# names, which is why values read 'NorthYork' and 'Notassigned'.
df = pd.DataFrame(
    {'Postal Code': A, 'Borough': B, 'Neighbourhood': C}
).replace(r'\s', '', regex=True)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Notassigned,Notassigned
1,M2A,Notassigned,Notassigned
2,M3A,NorthYork,Parkwoods
3,M4A,NorthYork,VictoriaVillage
4,M5A,DowntownToronto,"RegentPark,Harbourfront"
5,M6A,NorthYork,"LawrenceManor,LawrenceHeights"
6,M7A,DowntownToronto,"Queen'sPark,OntarioProvincialGovernment"
7,M8A,Notassigned,Notassigned
8,M9A,Etobicoke,"IslingtonAvenue,HumberValleyVillage"
9,M1B,Scarborough,"Malvern,Rouge"


In [17]:
# Drop rows whose borough is not assigned. Whitespace was removed from every
# value when df was built, so the placeholder reads 'Notassigned' (no space).
cleaned_df = df[df.Borough != 'Notassigned']
# Take an explicit, independent copy. Wrapping a DataFrame in pd.DataFrame(...)
# does not copy its data, so the result would still be tied to the slice of df.
toronto_df = cleaned_df.copy()

In [18]:
# Preview the first rows of the filtered frame; the index labels are the ones
# carried over from df, so they no longer start at 0.
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,NorthYork,Parkwoods
3,M4A,NorthYork,VictoriaVillage
4,M5A,DowntownToronto,"RegentPark,Harbourfront"
5,M6A,NorthYork,"LawrenceManor,LawrenceHeights"
6,M7A,DowntownToronto,"Queen'sPark,OntarioProvincialGovernment"


In [19]:
# (rows, columns) of the filtered frame — a quick sanity check after dropping rows.
toronto_df.shape

(103, 3)

In [20]:
# Download the postal-code coordinates file. The value returned by
# wget.download is handed to pd.read_csv in the next cell.
# NOTE(review): nothing guards this call, so every run downloads again, and the
# URL is plain http — confirm whether an https URL or a cached copy should be used.
wget_url = 'http://cocl.us/Geospatial_data'
file_name = wget.download(wget_url)

In [21]:
# Load the downloaded coordinates file (columns: Postal Code, Latitude, Longitude)
# and preview its first rows.
toronto_geo = pd.read_csv(file_name)
toronto_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
# Attach latitude/longitude to each postal code. This is an inner join on
# 'Postal Code', so only codes present in both frames are kept.
result = pd.merge(toronto_df, toronto_geo, how='inner', on='Postal Code')
result.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,NorthYork,Parkwoods,43.753259,-79.329656
1,M4A,NorthYork,VictoriaVillage,43.725882,-79.315572
2,M5A,DowntownToronto,"RegentPark,Harbourfront",43.65426,-79.360636
3,M6A,NorthYork,"LawrenceManor,LawrenceHeights",43.718518,-79.464763
4,M7A,DowntownToronto,"Queen'sPark,OntarioProvincialGovernment",43.662301,-79.389494


In [23]:
# Pair each row's latitude and longitude into a single (lat, lng) tuple column.
result['Coordinates'] = list(
    zip(result['Latitude'].tolist(), result['Longitude'].tolist())
)

In [24]:
# Preview the merged frame, now including the Coordinates column.
result.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Coordinates
0,M3A,NorthYork,Parkwoods,43.753259,-79.329656,"(43.7532586, -79.3296565)"
1,M4A,NorthYork,VictoriaVillage,43.725882,-79.315572,"(43.725882299999995, -79.31557159999998)"
2,M5A,DowntownToronto,"RegentPark,Harbourfront",43.65426,-79.360636,"(43.6542599, -79.3606359)"
3,M6A,NorthYork,"LawrenceManor,LawrenceHeights",43.718518,-79.464763,"(43.718517999999996, -79.46476329999999)"
4,M7A,DowntownToronto,"Queen'sPark,OntarioProvincialGovernment",43.662301,-79.389494,"(43.6623015, -79.3894938)"


In [25]:
address = 'Toronto, Canada'

# Resolve the address to coordinates with the Nominatim geocoder.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on `.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [26]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of `result`, with a popup reading
# "<neighbourhood>, <borough>".
for lat, lng, borough, neighbourhood in zip(result['Latitude'], result['Longitude'], result['Borough'], result['Neighbourhood']):
    label_text = '{}, {}'.format(neighbourhood, borough)
    # parse_html is a Popup option and is kept as before. It used to be passed
    # to CircleMarker as well, where it is not a documented parameter, so it is
    # dropped there.
    popup = folium.Popup(label_text, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

# Last expression of the cell: render the map.
map_toronto