In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the raw HTML of the Wikipedia page listing the "M" postal codes.
# The timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
# handing an error page to the parser in the following cells.
wiki_response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
wiki_response.raise_for_status()
wiki_website = wiki_response.text

In [3]:
# Parse the downloaded HTML text into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=wiki_website, features='lxml')

In [4]:
# Locate the first <table> element that carries the "wikitable" CSS class.
wiki_table = soup.find('table', class_='wikitable')

# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError on None in a later cell.
if wiki_table is None:
    raise ValueError("No table with class 'wikitable' found in the downloaded page")

In [5]:
# Gather one tuple of whitespace-stripped cell texts per table row.
# Only rows with exactly three <td> cells are kept; any other row is skipped.
toronto_data = []
for row in wiki_table.find_all('tr'):
    cols = row.find_all('td')
    if len(cols) != 3:
        continue
    toronto_data.append(tuple(cell.text.strip() for cell in cols))


In [6]:
# Turn the list of row tuples into a NumPy array.
toronto_array = np.array(toronto_data)

In [7]:
# Wrap the array in a DataFrame; columns get default integer labels at this point.
df = pd.DataFrame(data=toronto_array)

In [8]:
# Give the three columns descriptive labels, then preview the first five rows.
df.columns = pd.Index(['PostalCode', 'Borough', 'Neighborhood'])
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [9]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the index from zero and preview the first ten rows.

df1 = df.loc[df['Borough'].ne('Not assigned')]
df2 = df1.reset_index(drop=True)
df2.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [10]:
# Collapse rows that share a (PostalCode, Borough) pair into a single row whose
# Neighborhood is the comma-separated list of that group's neighborhoods.
def join_rows(a):
    """Join an iterable of strings with ', ' as the separator."""
    return ", ".join(a)


grouped_neighborhoods = df2.groupby(['PostalCode', 'Borough'])['Neighborhood']
df3 = grouped_neighborhoods.agg(join_rows).reset_index()

df3.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [11]:
# Where a postal code has no neighborhood ('Not assigned'), use its borough
# name as the neighborhood instead.
#
# The assignment goes through .loc on df3 itself rather than calling
# replace(..., inplace=True) on the selected column: an in-place method on a
# column picked out of the frame is chained assignment, which pandas warns about
# and which does not update the parent frame under Copy-on-Write. The .loc form
# is also safe to re-run, since already-filled rows no longer match the mask.

unassigned = df3['Neighborhood'] == 'Not assigned'
df3.loc[unassigned, 'Neighborhood'] = df3.loc[unassigned, 'Borough']

# toronto_table is a second name for the same DataFrame object as df3 (no copy).
toronto_table = df3

# Preview rows at positions 85-87; in the recorded output, row 85 (M7A) is one
# whose neighborhood was filled in from its borough.
toronto_table[85:88]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park
86,M7R,Mississauga,Canada Post Gateway Processing Centre
87,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern


In [12]:
# Display the (rows, columns) dimensions of the final table.
toronto_table.shape

(103, 3)