In [1]:
# standard computing libraries
import pandas as pd
import numpy as np
import json

# webscraping
from bs4 import BeautifulSoup
import requests

# geocoder for retrieving coordinates of postcodes
import geocoder

# k-means clustering
from sklearn.cluster import KMeans

# folium for maps
import folium

# geopandas for advanced geojson handling
#import geopandas

# library to access overpass api in a more convenient way
#import overpy

# ipython command to use matplotlib
%matplotlib inline


In [2]:
# Download the Wikipedia page "List of postal codes of Canada: M" at the
# revision selected by the `oldid` query parameter.
# NOTE(review): the trailing "." after the oldid value looks like a typo - confirm.
# The timeout keeps this cell from blocking forever on a stalled connection.
postcode = requests.get(
    "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050.",
    timeout=30,
)

# Fail loudly on an HTTP error status instead of parsing an error page later on.
postcode.raise_for_status()

In [3]:
# Parse the downloaded HTML into a navigable tree.
# The parser is named explicitly so the result does not depend on which
# optional parser libraries happen to be installed (without it BeautifulSoup
# picks one on its own and emits a warning). "html.parser" ships with Python.
postcodesoup = BeautifulSoup(postcode.content, "html.parser")

In [4]:
# Collect the text of every "td" cell, one inner list per table row.
# Rows are materialised as lists rather than generator expressions so that
# `list_postcodes` can be read more than once (e.g. when the cell that builds
# the DataFrame is re-run); a generator is empty after its first use.
# The cell text is kept verbatim here; whitespace clean-up happens downstream.
# NOTE(review): the [2:-5] slice presumably skips leading header rows and
# trailing non-data rows of this particular page revision - confirm.
list_postcodes = [
    [td.text for td in tr.find_all("td")]
    for tr in postcodesoup.find_all("tr")[2:-5]
]

In [5]:
# Tabulate the scraped rows: each entry of `list_postcodes` becomes one
# DataFrame row whose cells are mapped positionally onto the three columns.
df_raw = pd.DataFrame(
    data=list_postcodes,
    columns=["Postcode", "Borough", "Neighbourhood"],
)

In [6]:
def concatNeighbourhood(row):
    """Collapse a group of rows into one Borough/Neighbourhood record.

    Parameters
    ----------
    row : pandas.DataFrame
        Despite the name, this is a table-like group (as passed by
        ``groupby(...).apply``) exposing 'Borough' and 'Neighbourhood'
        columns. It must contain at least one row.

    Returns
    -------
    pandas.Series
        'Borough'       - the first distinct borough value of the group.
        'Neighbourhood' - all neighbourhood strings joined with ', '.
    """
    first_borough = row['Borough'].unique()[0]
    joined_names = ', '.join(row['Neighbourhood'])

    return pd.Series({'Borough': first_borough, 'Neighbourhood': joined_names})

In [7]:
# Clean the scraped table and collapse it to one row per postcode:
#   1. drop the rows whose borough is "Not assigned";
#   2. strip surrounding whitespace from the neighbourhood text (presumably the
#      trailing newline left by the scraped table cell) and, where the
#      neighbourhood itself is "Not assigned", fall back to the borough name;
#   3. merge all rows of a postcode into one comma-separated record.
df_post = (
    df_raw
    .query('Borough != "Not assigned"')
    .assign(
        Neighbourhood=lambda x: (
            x["Neighbourhood"]
            .str.strip()
            # Series.mask substitutes the index-aligned borough wherever the
            # condition holds. Series.replace with a Series as the replacement
            # value does not perform such a row-wise substitution.
            .mask(lambda s: s == "Not assigned", x["Borough"])
        )
    )
    .groupby('Postcode').apply(concatNeighbourhood)
    .reset_index()
)

df_post.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [8]:
# Sanity check: (rows, columns) of the per-postcode table.
df_post.shape

(103, 3)

In [10]:
# Read the postal code -> latitude/longitude lookup table from the CSV file
# in the working directory, then preview its first rows.
df_coord = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')

df_coord.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
# Attach coordinates to the postcodes by matching "Postcode" against the
# lookup table's "Postal Code" column (default inner join, so only postcodes
# present in both tables are kept). The duplicate key column is dropped and
# the remaining key is renamed to "PostalCode".
df_post_coord = (
    df_post
    .merge(df_coord, left_on="Postcode", right_on="Postal Code")
    .drop(columns="Postal Code")
    .rename(columns={"Postcode": "PostalCode"})
)

df_post_coord.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [12]:
# Keep only the rows whose borough name contains "Toronto",
# then report the size of the resulting subset.
df_toronto = df_post_coord.loc[lambda d: d['Borough'].str.contains('Toronto')]

df_toronto.shape

(39, 5)