# Canada postal code data

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# The URL carries an `oldid` query parameter, i.e. it requests one specific
# revision of the page rather than whatever the latest version happens to be.
POSTAL_CODES_URL = 'https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=942655364'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(POSTAL_CODES_URL, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
website_text = response.text

# The response is an HTML page, so parse it with an HTML parser
# ('xml' is intended for XML documents and needs lxml installed).
soup = BeautifulSoup(website_text, 'html.parser')

In [4]:
# Take the first <table> in the page and turn its rows into a DataFrame.
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No <table> element found in the downloaded page")
table_rows = table.find_all('tr')

data = []
for row in table_rows:
    cells = [cell.text.strip() for cell in row.find_all('td')]
    # Rows without any <td> cells (e.g. a header row built from <th>) carry
    # no data; skipping them here avoids creating an all-None row that would
    # have to be filtered out afterwards.
    if cells:
        data.append(cells)

# A freshly built frame already has a clean 0..n-1 index, so no reset is needed.
post_df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])


### Check the records

In [5]:
# Preview the first rows to confirm the three columns were parsed as expected.
post_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Remove records whose Borough is "Not assigned"

In [6]:
# Keep only the rows whose Borough is something other than "Not assigned".
# The explicit copy makes the result an independent frame, so later cells
# can modify it without pandas warning about writing to a slice.
has_borough = post_df["Borough"] != "Not assigned"
post_df = post_df.loc[has_borough].copy()

# Number of rows that remain after the filter.
len(post_df)

210

### Merge neighbourhoods that share a postal code

In [7]:
# Group the Neighbourhood column by PostalCode so that all neighbourhoods
# belonging to the same postal code can be handled together.
post_group = post_df.groupby("PostalCode")["Neighbourhood"]

In [9]:
from itertools import chain  # NOTE(review): not used in this cell; kept in case a later cell relies on it

# One comma-separated Neighbourhood string per PostalCode, in row order
# within each group. Computed in a single pass instead of editing and
# dropping rows of post_df while iterating over a groupby built on it.
merged_neighbourhoods = post_group.agg(
    lambda names: ', '.join(str(name) for name in names)
)

# Keep the first row of every postal code (so its index label and Borough
# are preserved) and replace its Neighbourhood with the merged string.
# `assign` works on a copy, so the previous frame is not mutated.
post_df = (
    post_df
    .drop_duplicates(subset="PostalCode", keep="first")
    .assign(Neighbourhood=lambda df: df["PostalCode"].map(merged_neighbourhoods))
)

post_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,"Lawrence Heights, Lawrence Manor"
7,M7A,Downtown Toronto,Queen's Park


### Reset index

In [10]:
# Replace the index with a fresh 0..n-1 range; drop=True discards the old
# labels instead of keeping them as a new column.
post_df = post_df.reset_index(drop=True)
post_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


If a row has a borough but its Neighbourhood is "Not assigned", the Neighbourhood should be set to the same value as the borough. First check whether any such rows exist:

In [11]:
# Select the rows whose Neighbourhood is exactly the string "Not assigned";
# an empty result means there is nothing to fix.
post_df[post_df["Neighbourhood"] == "Not assigned"]

Unnamed: 0,PostalCode,Borough,Neighbourhood


No rows have a "Not assigned" Neighbourhood, so no further processing is needed.

In [12]:
# (number of rows, number of columns) of the cleaned table.
post_df.shape

(103, 3)