In [1]:
import numpy as np
import pandas as pd

In [2]:
# obtain relevant HTML file from Wikipedia
from bs4 import BeautifulSoup
import requests 

# URL of the Wikipedia page that lists the Canadian postal codes starting with "M".
toronto_wiki_html = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
toronto_html_doc = requests.get(toronto_wiki_html, timeout=30)
toronto_html_doc.raise_for_status()

# Parse the downloaded markup with the standard-library HTML parser backend.
soup = BeautifulSoup(toronto_html_doc.text, 'html.parser')

In [3]:
# Preparing dataframe with PostalCode, Borough and Neighbourhood

# Locate the postal-code table; fail with a clear message if it is not on the page.
table = soup.find('table', attrs={'class': 'wikitable sortable'})  # extract relevant table from BeautifulSoup
if table is None:
    raise ValueError("No table with class 'wikitable sortable' found in the downloaded page")

# Use the <tbody> when the markup has one, otherwise search the table itself.
table_body = table.find('tbody')
if table_body is None:
    table_body = table

# Extract all rows: one list of stripped <td> texts per <tr>.
# A row that contains no <td> cells (e.g. a header row built from <th>) yields
# an empty list and is kept in place, so the positions in `data` match `rows`.
rows = table_body.find_all('tr')
data = [[cell.text.strip() for cell in row.find_all('td')] for row in rows]
    
# Convert the scraped rows to a DataFrame. data[0] is skipped because it is
# expected to be the table's header row rather than a postal-code record.
toronto_df = pd.DataFrame(data[1:], columns=['PostalCode', 'Borough', 'Neighbourhood'])

# Drop rows with Borough = 'Not assigned'.
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned'].reset_index(drop=True)

# Where Neighbourhood = 'Not assigned', use the Borough name instead.
# A single .loc assignment is used on purpose: chained indexing
# (df['col'][rows] = ...) may write to a temporary copy and leave the frame unchanged.
missing_neighbourhood = toronto_df['Neighbourhood'] == 'Not assigned'
toronto_df.loc[missing_neighbourhood, 'Neighbourhood'] = toronto_df.loc[missing_neighbourhood, 'Borough']

# Collapse to one row per postal code: keep the first Borough seen for the code
# and concatenate all of its Neighbourhoods into one comma-separated string.
toronto_df = (
    toronto_df
    .groupby('PostalCode', as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)

# Make the column order explicit.
toronto_df = toronto_df[['PostalCode', 'Borough', 'Neighbourhood']]

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [4]:
# The code was removed by Watson Studio for sharing.

In [5]:
# Read the latitude/longitude CSV into a DataFrame.
# `body` is not defined in this cell; it has to come from the preceding cell,
# whose code was removed before sharing.
lat_long_df = pd.read_csv(filepath_or_buffer=body)

# Preview the first rows.
lat_long_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# Merge the postal-code frame with the lat/long data: a left join that looks up
# each 'PostalCode' value in lat_long_df's 'Postal Code' column.
# Coordinate columns left over from an earlier run of this cell are dropped
# first, so re-running the cell does not fail on overlapping column names.
toronto_df = (
    toronto_df
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .join(lat_long_df.set_index('Postal Code'), on='PostalCode')
)
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [7]:
# Number of rows in the merged frame.
print(len(toronto_df))

103
