In [1]:
import folium
import geocoder
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# download the Wikipedia page that lists the Canadian postal codes starting with "M"
# (timeout so a stalled connection cannot hang the kernel forever)
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
# fail here on an HTTP error instead of silently parsing an error page
r.raise_for_status()
html_doc = r.content.decode('utf-8')
soup = BeautifulSoup(html_doc, 'html.parser')

# locate the postal code table; find() returns None when nothing matches
table_tag = soup.find('table', {'class': 'wikitable sortable'})
if table_tag is None:
    raise ValueError('postal code table not found; the page layout may have changed')

In [3]:
# column names of the final dataframe (hard-coded here, not read from the page)
columns = ['PostalCode', 'Borough', 'Neighborhood']

# collect one [postal code, borough, neighborhood] list per usable table row
row_tags = table_tag.find_all('tr')
rows = []
for tr in row_tags:
    cells = [td.text.strip() for td in tr.find_all('td')]
    # skip rows that do not have exactly one <td> cell per column
    if len(cells) != len(columns):
        continue
    borough = cells[1]
    # rows whose borough is unassigned are dropped entirely
    if borough == 'Not assigned':
        continue
    # an unassigned neighborhood falls back to the borough name
    if cells[2] == 'Not assigned':
        cells[2] = borough
    rows.append(cells)

# Requirements

* The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood.
* Only process the cells that have an assigned borough. Ignore cells with a borough that is **Not assigned**.
* More than one neighborhood can exist in one postal code area. These rows will be combined into one row with the neighborhoods separated with a comma.
* If a cell has a borough but a **Not assigned** neighborhood, then the neighborhood will be the same as the borough.
* Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.
* In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [4]:
# generate dataframe as requested
df = pd.DataFrame(rows, columns=columns)
# Combine the neighborhoods that share a postal code into one comma-separated string.
# dict.fromkeys drops duplicates while keeping first-seen order; iterating a set of
# strings instead would give an order that can change from one kernel run to the next.
df_postal = (
    df.groupby(['PostalCode', 'Borough'])
    .agg(lambda x: ', '.join(dict.fromkeys(x)))
    .reset_index()
)
df_postal

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Morningside, Guildwood, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, East Birchmount Park, Ionview"
7,M1L,Scarborough,"Oakridge, Clairlea, Golden Mile"
8,M1M,Scarborough,"Cliffside, Scarborough Village West, Cliffcrest"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [5]:
# (rows, columns) of the grouped postal code dataframe
df_postal.shape

(103, 3)

I first tried the `geocoder` package, but the location coordinates turned out to be unavailable for me.

So I use the provided CSV file of geospatial coordinates instead.

In [6]:
# Download the postal code coordinates CSV with requests instead of shelling out to wget:
# this works where wget is not installed, and a failed download raises in this cell
# rather than passing silently and breaking the read_csv call later.
response = requests.get('https://cocl.us/Geospatial_data', timeout=30)
response.raise_for_status()
with open('Geospatial_Coordinates.csv', 'wb') as csv_file:
    csv_file.write(response.content)

In [7]:
# load the coordinates downloaded to Geospatial_Coordinates.csv
df_coord = pd.read_csv('Geospatial_Coordinates.csv')
# Rename the first column to the join key used by df_postal.
# rename() builds a new column index; writing into columns.values would mutate the
# backing array of an immutable Index in place, which pandas does not support.
df_coord = df_coord.rename(columns={df_coord.columns[0]: 'PostalCode'})
df_coord

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [8]:
# attach Latitude/Longitude to every postal code row; the left join keeps all rows of df_postal
key_column = 'PostalCode'
postal_by_code = df_postal.set_index(key_column)
coord_by_code = df_coord.set_index(key_column)
df_total = postal_by_code.join(coord_by_code, how='left').reset_index()
df_total

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, Guildwood, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, East Birchmount Park, Ionview",43.727929,-79.262029
7,M1L,Scarborough,"Oakridge, Clairlea, Golden Mile",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Scarborough Village West, Cliffcrest",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [9]:
# (rows, columns) after joining the coordinate columns onto the postal code rows
df_total.shape

(103, 5)

In [10]:
# keep only the rows whose borough name contains "Toronto"
is_toronto_borough = df_total['Borough'].str.contains('Toronto')
df_toronto = df_total[is_toronto_borough]
df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"Riverdale, The Danforth West",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Summerhill East, Moore Park",43.689574,-79.38316
49,M4V,Central Toronto,"Deer Park, Forest Hill SE, Summerhill West, Ra...",43.686412,-79.400049


In [11]:
fill_color = '#3186cc'

# base map centred on Toronto
toronto_coord = (43.682467, -79.390713)
toronto_map = folium.Map(location=toronto_coord, zoom_start=12)

# one blue circle marker per row of df_toronto, with a "neighborhood, borough" popup
marker_fields = zip(
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Borough'],
    df_toronto['Neighborhood'],
)
for lat, lng, borough, neighborhood in marker_fields:
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color=fill_color,
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)

toronto_map

In [12]:
from sklearn.cluster import KMeans

fill_colors = ['red', 'green', 'yellow', 'blue']
cluster_fill_colors = ['darkred', 'darkgreen', 'gold', 'darkblue']
n_clusters = 4
# every cluster label is used as an index into both color lists when the map is drawn
assert n_clusters <= min(len(fill_colors), len(cluster_fill_colors))

# do a k-means clustering with k=4
# random_state pins the centroid initialisation, so the cluster numbering (and with it the
# color given to each cluster) is the same on every run; n_init is set explicitly because
# its default value differs between scikit-learn versions
kmean_classifier = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
X_train = df_toronto[['Latitude', 'Longitude']]
kmean_classifier.fit(X_train)
kmean_classifier.cluster_centers_

array([[ 43.66943648, -79.32465436],
       [ 43.70563855, -79.39811351],
       [ 43.65506566, -79.44547176],
       [ 43.65434514, -79.38272671]])

In [13]:
# fresh base map centred on Toronto
toronto_coord = (43.682467, -79.390713)
toronto_map = folium.Map(location=toronto_coord, zoom_start=12)

# add clustered markers to the map; labels_ is paired with the rows of df_toronto by position
clustered_points = zip(
    kmean_classifier.labels_,
    df_toronto['Latitude'],
    df_toronto['Longitude'],
    df_toronto['Borough'],
    df_toronto['Neighborhood'],
)
for cluster_label, lat, lng, borough, neighborhood in clustered_points:
    point_color = fill_colors[cluster_label]
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=point_color,
        fill=True,
        fill_color=point_color,
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)

# add cluster center markers to the map: white fill, outlined in the darker cluster color
for center_no, center in enumerate(kmean_classifier.cluster_centers_, start=1):
    lat, lng = center
    popup = folium.Popup('cluster #{}'.format(center_no), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=cluster_fill_colors[center_no - 1],
        fill=True,
        fill_color='white',
        fill_opacity=1.0,
        parse_html=False,
    )
    marker.add_to(toronto_map)

toronto_map

# My Analysis

After filtering down to the neighborhoods whose borough name contains "Toronto", I decided to run K-Means with 4 clusters.

The result makes sense to me.

* The west wing (yellow), east wing (red), and north wing (green) lie on three sides.
* Downtown (blue) is in the center.
* Downtown has the highest density of neighborhoods.
* The east wing is the sparsest cluster.
* Interestingly, the cluster center of the north wing happens to lie on a crossroad.
* Note: K-Means numbers its clusters arbitrarily, so the colors above refer to the run shown in this notebook.