In [4]:
# import libraries 

import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [5]:
# Download the Wikipedia page that lists the Canadian postal codes starting
# with "M" and parse it into a BeautifulSoup tree.

WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Open the URL in a context manager so the HTTP response (and its socket)
# is closed as soon as the body has been read, even if read() raises.
with urlopen(WIKI_URL) as response:
    html = response.read()

# Parse with the lxml backend (the lxml package must be installed).
soup = BeautifulSoup(html, features='lxml')

In [6]:
# The data we want lives in the first <table> whose class attribute is
# "wikitable sortable"; grab that element from the parsed page.

My_table = soup.find('table', class_='wikitable sortable')
# My_table  # uncomment to inspect the raw table markup

In [9]:
# Flatten the table into a single list holding the text of every <td> cell,
# in document order.

data = [cell.text for cell in My_table.findAll('td')]

# Preview the first nine entries.
data[:9]

['M1A',
 'Not assigned',
 'Not assigned\n',
 'M2A',
 'Not assigned',
 'Not assigned\n',
 'M3A',
 'North York',
 'Parkwoods\n']

In [10]:
# Split the flat cell list into three separate lists. The cells repeat in
# groups of three (postal code, borough, neighborhood), so stride-3 slices
# starting at offsets 0, 1 and 2 de-interleave them.

PostalCode, Borough = data[0::3], data[1::3]

# Strip the trailing newline(s) from each neighborhood entry.
Neighborhood = [name.rstrip('\n') for name in data[2::3]]

In [12]:
# Assemble the three parallel lists into the dataframe df, one column per list.

df = pd.DataFrame({
    'PostalCode': PostalCode,
    'Borough': Borough,
    'Neighborhood': Neighborhood,
})

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [13]:
# Drop every row whose Borough is the placeholder "Not assigned".
# The original row labels are kept (the index is not reset here).

has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [14]:
# A neighborhood recorded as "Not assigned" takes the name of its own borough.
# This is done per row, on the Neighborhood column only, and BEFORE grouping:
#   * no borough name is hard-coded, so any borough with an unassigned
#     neighborhood is labelled correctly;
#   * the placeholder can never end up embedded in a joined string such as
#     "Not assigned,Foo", which an exact-match replace would miss.

unassigned = df['Neighborhood'] == 'Not assigned'
df = df.assign(Neighborhood=df['Neighborhood'].where(~unassigned, df['Borough']))

# Combine rows that share the same postal code (and borough) into one row whose
# Neighborhood is the comma-separated list of the group's neighborhoods.
# sort=False keeps groups in order of first appearance; as_index=False keeps
# PostalCode and Borough as ordinary columns.

df = (
    df.groupby(['PostalCode', 'Borough'], as_index=False, sort=False)
      .agg({'Neighborhood': ','.join})
)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [15]:
# Dimensions of the grouped dataframe as (rows, columns); after the groupby
# above there is one row per (PostalCode, Borough) pair.
df.shape

(103, 3)