Code for importing the table from Wikipedia, data cleansing and presentation

In [32]:
# Library imports
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Web scraping: download the Wikipedia page and parse its first <table>
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
res = requests.get(WIKI_URL, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were the table
res.raise_for_status()
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')[0]
# read_html is given a file-like object; passing literal HTML text is deprecated in recent pandas
df_raw = pd.read_html(StringIO(str(table)))[0]

# Cleaning, done as one chain so re-running the cell always starts from the raw table:
#   1. drop rows that have no neighborhood
#   2. replace the " /" separator by ","  (literal match, no regex)
#   3. sort by postal code and use it as the index
df = (
    df_raw
    .dropna(subset=['Neighborhood'])
    .assign(Neighborhood=lambda frame: frame['Neighborhood'].str.replace(" /", ",", regex=False))
    .sort_values(by=['Postalcode'], ascending=True)
    .set_index('Postalcode')
)

print(df.head(10))
print('\n')
print(("This dataframe is {} long").format(df.shape[0]))

                Borough                                     Neighborhood
Postalcode                                                              
M1B         Scarborough                                   Malvern, Rouge
M1C         Scarborough           Rouge Hill, Port Union, Highland Creek
M1E         Scarborough                Guildwood, Morningside, West Hill
M1G         Scarborough                                           Woburn
M1H         Scarborough                                        Cedarbrae
M1J         Scarborough                              Scarborough Village
M1K         Scarborough      Kennedy Park, Ionview, East Birchmount Park
M1L         Scarborough                  Golden Mile, Clairlea, Oakridge
M1M         Scarborough  Cliffside, Cliffcrest, Scarborough Village West
M1N         Scarborough                      Birch Cliff, Cliffside West


This dataframe is 103 long


Attempt to retrieve the coordinates using the geocoder library. The attempt failed, so the coordinate data provided in the course is used instead.

In [None]:
# import geocoder
import geocoder

# Upper bound on lookups so the cell cannot loop forever when the service
# keeps returning no result
MAX_GEOCODE_ATTEMPTS = 5

# initialize variable to None
lat_lng_coords = None
postal_code = "M3A"

# retry until coordinates are returned or the attempt budget is exhausted
for _attempt in range(MAX_GEOCODE_ATTEMPTS):
    g = geocoder.google('{}, Toronto'.format(postal_code))
    lat_lng_coords = g.latlng
    if lat_lng_coords is not None:
        break

if lat_lng_coords is None:
    # No coordinates obtained: leave both values unset and report it
    latitude = None
    longitude = None
    print('No coordinates returned for {} after {} attempts'.format(postal_code, MAX_GEOCODE_ATTEMPTS))
else:
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    print(latitude, longitude)

Using Geospatial_Coordinates.csv file

In [33]:
# Load the coordinates from the local CSV file, order the rows by postal
# code, and index the frame by that code under the name 'Postalcode'
df_coor = (
    pd.read_csv(
        'Geospatial_Coordinates.csv',
        header=0,
        names=['Postal Code', 'Latitude', 'Longitude'],
        index_col=False,
    )
    .sort_values(by=['Postal Code'], ascending=True)
    .set_index('Postal Code')
    .rename_axis('Postalcode')
)

print(df_coor.head())
print(df_coor.shape)

             Latitude  Longitude
Postalcode                      
M1B         43.806686 -79.194353
M1C         43.784535 -79.160497
M1E         43.763573 -79.188711
M1G         43.770992 -79.216917
M1H         43.773136 -79.239476
(103, 2)


Merging both dataframes

In [34]:
# Combine the neighborhood table with the coordinates table, matching rows
# on the 'Postalcode' key that both frames share
res = df.merge(df_coor, on='Postalcode')
res.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Postalcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


This code displays the neighborhoods of Toronto on a map using the Folium library.

In [35]:
# Coordinates used to centre the map
tor_latitude = 43.651070
tor_longitude = -79.347015

mymap = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=10)

# add one red circle marker per neighborhood
for lat, lng, label in zip(res.Latitude, res.Longitude, res.Neighborhood):
    # parse_html=True makes folium escape the label text, so characters such
    # as apostrophes in a neighborhood name cannot break the generated HTML
    popup = folium.Popup(str(label), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color='red',
        popup=popup,
        fill=True,
        fill_color='red',
        fill_opacity=0.6,
    ).add_to(mymap)

# display map
mymap

Initializing the clustering object 

In [36]:
from sklearn.cluster import KMeans

# number of clusters to form
kclusters = 4

# NOTE(review): this estimator is configured but never fitted or used in the
# visible cells; the model actually fitted is `kmeans` below. Kept only so the
# name stays defined — confirm and remove if nothing else needs it.
k_means = KMeans(init="k-means++", n_clusters=kclusters, n_init=12)

# Keep only the Latitude/Longitude columns as clustering features.
# `columns=` is used because passing the axis positionally (`drop(labels, 1)`)
# is rejected by pandas 2.0 and later.
grouped_clustering = res.drop(columns=['Borough', 'Neighborhood'])

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

The clustering uses only the geographical location (latitude and longitude) of each neighborhood as its distinguishing features.

In [37]:
# create map
map_clusters = folium.Map(location=[tor_latitude, tor_longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings that folium accepts
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(res['Latitude'], res['Longitude'], res['Neighborhood'], kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels run from 0 to kclusters - 1, so they index the palette
    # directly (no reliance on negative-index wrap-around for cluster 0)
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters