## 1.  Create Canada Postal Code dataframe from the table on the Wiki page

In [3]:
#import the libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
print('Libraries Imported')

Libraries Imported


In [4]:
# get data from webpage
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
urltext = requests.get(url).text
postaldata = BeautifulSoup(urltext, 'html.parser')

In [5]:
# display webpage HTML tags
print(postaldata.prettify)

<bound method Tag.prettify of 
<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"5420f6b0-5d99-4191-a6e6-7e68e9679578","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":969510799,"wgRevisionId":969510799,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in 

In [6]:
#create dataframe
columnLabels = ['Postalcode', 'Borough', 'Neighborhood']
torontoCodes = pd.DataFrame(columns = columnLabels)
torontoCodes

Unnamed: 0,Postalcode,Borough,Neighborhood


In [7]:
content = postaldata.find('div', class_='mw-parser-output')
table = content.table.tbody
postalCode = 0
borough = 0
neighborhood = 0

# loop through dataset to find postalCode, borough, neighborhood, using the HTML tags
for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postalCode = td.text.strip('\n').replace(']','')
            i = i + 1
        elif i ==1:
            borough = td.text.strip('\n').replace(']','')
            i = i + 1
        elif i == 2:
            neighborhood = td.text.strip('\n').replace(']','')
    torontoCodes = torontoCodes.append({'Postalcode': postalCode, 'Borough': borough, 'Neighborhood': neighborhood}, ignore_index=True)
torontoCodes

Unnamed: 0,Postalcode,Borough,Neighborhood
0,0,0,0
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"


In [8]:
#clean data
# remove rows where Borough = Not assigned or 0
torontoCodes = torontoCodes[torontoCodes.Borough !='Not assigned']
torontoCodes = torontoCodes[torontoCodes.Borough !=0]
torontoCodes.reset_index(drop=True, inplace=True)
i = 0
for i in range(0, torontoCodes.shape[0]):
    if torontoCodes.iloc[i][2] == 'Not assigned':
        torontoCodes.iloc[i][2] = torontoCodes.iloc[i][1]
        i = i+ 1

# if a postal code is listed more than once, combine Neighborhood into one row
df = torontoCodes.groupby(['Postalcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
df = df.dropna()
empty = 'Not assigned'
df = df[(df.Postalcode != empty) & (df.Borough != empty) & (df.Neighborhood != empty)]
df.shape

(103, 3)

In [9]:
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
# group by Borough
def neighborhood_list(grouped):
    return ', '.join(sorted(grouped['Neighborhood'].tolist()))

grp = df.groupby(['Postalcode','Borough'])
dfClean = grp.apply(neighborhood_list).reset_index(name='Neighborhood')
dfClean.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [11]:
dfClean.shape

(103, 3)

## 2. Add latitude and longitude columns for the neighborhoods to the dataframe

In [12]:
# use csv file with coordinates to create a dataframe
path = "http://cocl.us/Geospatial_data"
df_coord = pd.read_csv(path)
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
df_coord.shape

(103, 3)

In [14]:
df_coord.sort_values(by=['Postal Code'], inplace=True)
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
dfClean.sort_values(by=['Postalcode'], inplace=True)
dfClean.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [16]:
df_co = df_coord.rename(columns = {'Postal Code':'Postalcode'})
df_co.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
df_LatLong = pd.merge(dfClean, df_co, on=['Postalcode'])
df_LatLong.shape

(103, 5)

In [19]:
df_LatLong.head(51)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
