**Packages**

In [108]:
import requests 
import lxml.html as lh
import bs4 as bs
import urllib.request

import numpy as np 
import pandas as pd 

In [109]:
#URL of the Wikipedia page
url   = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

**Downloading the website and reducing it to the relevant table**

In [110]:
#Get page
page  = urllib.request.urlopen(url).read()
#Create soup object out of it
soup = bs.BeautifulSoup(page, 'html.parser')
#Redruce to table
soup_table = soup.table

**Creating the first uncleaned version of the dataframe in pandas**

In [111]:
#Create a list of the entries of the header part of the table
header = [head.text.strip('\n') for head in soup_table.find_all("th")]
# Cycle through rows and then entries and create a two dimensional list
data   = [[td.text.strip('\n') for td in tr.find_all("td")] for tr in soup_table.find_all("tr")]
# THis is to drop the first empty entry
data    = [row for row in data if len(row) == 3]

df = pd.DataFrame(data,columns=header)
#print(df.info(verbose=True))
print(df[0:5])

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront


**Cleaning the dataframe step by step**

In [112]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
df_reduced=df[~df.Borough.str.contains('Not assigned')]
print(df[0:5])

  Postcode           Borough     Neighbourhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront


In [113]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.
print(df_reduced[df_reduced.Neighbourhood.str.contains('Not assigned')])
# It is never the case, so we do not need to care.

Empty DataFrame
Columns: [Postcode, Borough, Neighbourhood]
Index: []


In [114]:
#More than one neighborhood can exist in one postal code area. 
#For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods:
#    Harbourfront and Regent Park.
#These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.
df_reduced=df_reduced.sort_values(by=['Postcode','Borough','Neighbourhood'], ascending=[1,1,1]).reset_index(drop=True)

df_reduced=df_reduced.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()

print(df_reduced[0:5])

  Postcode      Borough                           Neighbourhood
0      M1B  Scarborough                          Malvern, Rouge
1      M1C  Scarborough  Highland Creek, Port Union, Rouge Hill
2      M1E  Scarborough       Guildwood, Morningside, West Hill
3      M1G  Scarborough                                  Woburn
4      M1H  Scarborough                               Cedarbrae


In [115]:
df_reduced.shape

(103, 3)