This Notebook will be used for the IBM DATA SCIENCE CERTIFICATION Capstone

In [83]:
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.core.display import HTML 
from IPython.display import Image 
from IPython.display import display_html
from sklearn.cluster import KMeans

# Sanity check: reaching this line means every import above resolved.
greeting = "Hello Capstone Project Course!"
print(greeting)

Hello Capstone Project Course!


In [44]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook from parsing an HTTP error page as data.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
torontoPage = response.text

# Parse the page with the lxml backend.
soup = BeautifulSoup(torontoPage, 'lxml')

# soup.table is the first <table> element on the page, or None when there is none.
# Fail loudly here instead of handing the string 'None' to the later cells.
if soup.table is None:
    raise ValueError('No <table> element found in the downloaded page')

# Keep the table markup as a string for pandas and preview it inline.
table = str(soup.table)
display_html(table, raw = True)


Postal Code,Borough,Neighbourhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,Not assigned
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"


<h1>Now it is time to convert the table into a pandas DataFrame</h1>

In [74]:
# read_html returns a list of DataFrames, one per parsed <table>; take the first.
# The markup is wrapped in StringIO because recent pandas versions deprecate
# passing literal HTML text directly to read_html.
torontoDf = pd.read_html(StringIO(table))[0]
torontoDf.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


<h1>DataFrame cleaning and preprocessing</h1>

In [77]:
# Step 1: discard every row whose Borough is the placeholder 'Not assigned'.
is_assigned = torontoDf['Borough'] != 'Not assigned'
df1 = torontoDf.loc[is_assigned]

# Step 2: collapse rows that share a postal code and borough into one row,
# joining their remaining text values with ', ' (sort=False keeps source order).
df2 = (
    df1.groupby(['Postal Code', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# Step 3: where the neighbourhood is still 'Not assigned', fall back to the
# borough name; every other neighbourhood value is kept unchanged.
has_neighbourhood = df2['Neighbourhood'] != 'Not assigned'
df2['Neighbourhood'] = df2['Neighbourhood'].where(has_neighbourhood, df2['Borough'])
df2

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


<h1>DataFrame shape</h1>

In [82]:
# (number of rows, number of columns) of the cleaned postal-code table
(len(df2.index), len(df2.columns))

(103, 3)

<h1>Import and merge the latitudes and longitudes of Canada's neighbourhoods</h1>

In [79]:
# Load the postal code -> (Latitude, Longitude) lookup table from its CSV link.
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'
cross = pd.read_csv(GEOSPATIAL_CSV_URL)
cross.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [80]:
# Attach coordinates to each cleaned row by joining on the shared 'Postal Code'
# column. No `how` is given, so pandas uses its default join type.
df3 = df2.merge(cross, on='Postal Code')
df3.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


<h1>Sick! Now it's time to cluster and visualize the Toronto boroughs</h1>

In [81]:
# Keep only the rows whose borough name contains the literal text 'Toronto'
# (regex=False means a plain substring match, not a regular expression).
# .copy() detaches the result from df3, so adding columns to dfToronto later
# does not trigger pandas' SettingWithCopyWarning.
dfToronto = df3[df3['Borough'].str.contains('Toronto', regex = False)].copy()
dfToronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [94]:
# Base map for the Toronto rows selected above.
map_toronto = folium.Map(location=[43.6510, -79.3470], zoom_start=10.5)

# One circle marker per row of dfToronto, with a "neighbourhood, borough" popup.
marker_rows = zip(
    dfToronto['Latitude'],
    dfToronto['Longitude'],
    dfToronto['Borough'],
    dfToronto['Neighbourhood'],
)
for lat, lng, borough, neighbourhood in marker_rows:
    popup_text = f'{neighbourhood}, {borough}'
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

<h1>K-Means clustering of the neighbourhoods</h1>

In [99]:
# Number of clusters to fit; also the number of distinct marker colours.
k = 5

# Cluster on the coordinates only: drop the descriptive text columns by name.
# (The positional form drop(labels, 1) is rejected by pandas 2.x.)
toronto_clustering = dfToronto.drop(columns=['Postal Code', 'Borough', 'Neighbourhood'])

# n_init is pinned to 10 so the fit does not shift with the scikit-learn
# version's default; random_state makes the run reproducible.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_clustering)

# Attach the labels to a new frame instead of inserting into dfToronto, so the
# cell works on a fresh kernel and can be re-run without a
# "column already exists" error.
toronto_clustered = dfToronto.assign(**{'Cluster Labels': kmeans.labels_})

# create map
clustered_map = folium.Map(location=[43.651070, -79.347015], zoom_start=10.3)

# color scheme for the clusters: one hex colour per cluster, evenly spaced
# along matplotlib's rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# markers to the map; the marker colour is looked up directly by cluster label
for lat, lon, cluster in zip(
    toronto_clustered['Latitude'],
    toronto_clustered['Longitude'],
    toronto_clustered['Cluster Labels'],
):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(clustered_map)

clustered_map