In [1]:
# Part 1
# 1. Create the notebook and import the required libraries and packages.
# NOTE(review): numpy, requests and BeautifulSoup are not referenced by name in the
# cells visible here -- confirm whether later cells need them before removing.
import pandas as pd # library to analyze data
import numpy
import requests # library to handle web requests
from bs4 import BeautifulSoup
# Printed only if every import above succeeded.
print('Libraries imported')

Libraries imported


In [2]:
# 2. Fetch the Wikipedia page and extract the Postcode / Borough / Neighbourhood table.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html returns a list of DataFrames, one per matching <table>; tables are
# selected by their class attribute and row 0 is used as the column header.
wikitables = pd.read_html(
    io=url,
    attrs={"class": "wikitable sortable"},
    header=0,
)
print("Extracted {num} wikitables".format(num=len(wikitables)))
# Work with the first (here: only) table found on the page.
wikidf = wikitables[0]
wikidf.head(n=10)

Extracted 1 wikitables


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [3]:
# 3. a) Show the column headers of the scraped table.
wikidf.columns

Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')

In [4]:
# Show the row index of the scraped table.
wikidf.index

RangeIndex(start=0, stop=289, step=1)

In [5]:
# Show the (rows, columns) shape of the scraped table before any filtering.
wikidf.shape

(289, 3)

In [6]:
# 3. b) Keep only the rows that have an assigned Borough;
# rows whose Borough is the literal text 'Not assigned' are discarded.
wikidf = wikidf.loc[wikidf['Borough'].ne('Not assigned')]
wikidf.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [7]:
# Show the (rows, columns) shape after dropping the 'Not assigned' boroughs.
wikidf.shape

(212, 3)

In [8]:
# 3. c) Combine Neighbourhoods that share a Postcode (e.g. M5A) into a single row.
#
# One groupby pass replaces the per-row rescans of wikidf:
#   * sort=False keeps postcodes in order of first appearance, matching the
#     order that wikidf.Postcode.unique() would give.
#   * Borough: the first value seen for the postcode is kept (each postcode is
#     expected to belong to exactly one borough).
#   * Neighbourhood: distinct names are joined with ', ' in their original table
#     order. Series.unique() preserves order, whereas going through set() made
#     the joined text depend on string hashing and vary between kernel runs.
new_df = (
    wikidf
    .groupby('Postcode', sort=False, as_index=False)
    .agg({
        'Borough': 'first',
        'Neighbourhood': lambda names: ', '.join(names.unique()),
    })
)
#
new_df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
# 3. d) If a row has a Borough but a 'Not assigned' Neighbourhood, the
# Neighbourhood takes the same value as the Borough.
#
# The update is written through .loc on new_df itself. Assigning to the `row`
# object yielded by DataFrame.iterrows() is not reliable: pandas documents that
# the row may be a copy, in which case the write never reaches the dataframe.
needs_fill = new_df['Neighbourhood'] == 'Not assigned'
new_df.loc[needs_fill, 'Neighbourhood'] = new_df.loc[needs_fill, 'Borough']
#
new_df.head(12) # See M7A Queen's Park in row id #4

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [10]:
# 3. f) Show the (rows, columns) shape of the combined dataframe.
# Cross-check by the author: after testing the table in Excel and removing
# duplicates, the row count was 103.
new_df.shape

(103, 3)

In [16]:
# Part 2
# Load the geospatial lookup table (postal code, latitude, longitude) from the hosted CSV.
gs_data = pd.read_csv(filepath_or_buffer="http://cocl.us/Geospatial_data")
gs_data.head(n=5)


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
# Rename the geospatial key column so it matches the 'Postcode' column of new_df.
# Rebinding gs_data (instead of inplace=True) keeps this cell safe to re-run:
# rename() ignores a missing 'Postal Code' column by default, so running the
# cell a second time is a no-op rather than an error.
# The earlier set_index(...) calls were dropped: their results were never
# assigned, so they had no effect, and the gs_data one raised KeyError on re-run.
gs_data = gs_data.rename(columns={'Postal Code': 'Postcode'})
# Merge the two data sets on the shared postcode key (inner join: only postcodes
# present in both tables are kept).
table_w_coordinates = pd.merge(new_df, gs_data, on='Postcode', how='inner')
table_w_coordinates.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
