### 1. Extract data from Wikipedia into a dataframe

In [10]:
# Data wrangling modules
import io
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

Download the Wikipedia page of postal codes and parse its HTML into a BeautifulSoup object:

In [11]:
# Download the Wikipedia page "List of postal codes of Canada: M".
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() prevents an HTTP error page from being parsed as data.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
# NOTE: despite its name, `url` holds the page's HTML text (name kept as-is).
url = response.text
# Parse the HTML using the lxml parser.
soup = BeautifulSoup(url,'lxml')

Construct table with postal codes

In [12]:
# Locate the postal-code table on the parsed page.
table = soup.find('table', class_='wikitable sortable')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No table with class 'wikitable sortable' found on the page")
table_rows = table.find_all('tr')

# Collect one list of stripped cell texts per table row.
loc = []
for tr in table_rows:
    td = tr.find_all('td')
    # Blank cells are dropped, so a row containing a blank cell yields fewer entries.
    row = [cell.text.strip() for cell in td if cell.text.strip()]
    # Rows with no non-blank <td> cells (e.g. rows made only of <th> headers) are skipped.
    if row:
        loc.append(row)
# Preview the first three parsed rows.
loc[:3]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods']]

Create a dataframe from the data

In [13]:
# Build the dataframe from the parsed rows, naming the three columns explicitly.
column_names = ["Postalcode", "Borough", "Neighbourhood"]
df = pd.DataFrame(data=loc, columns=column_names)
# Preview the first three rows.
df.head(n=3)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods


### 2. Clean Dataframe

Remove rows whose Borough or Neighbourhood is "Not assigned"

In [14]:
# Turn every 'Not assigned' entry into a missing value, drop the rows where
# Borough or Neighbourhood is missing, and renumber the index from 0.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df = (
    df.replace(to_replace='Not assigned', value=np.nan)
      .dropna(axis=0, subset=['Borough', 'Neighbourhood'])
      .reset_index(drop=True)
)
# Preview the first three remaining rows.
df[:3]

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront


Combine rows with identical postal code and borough, joining their neighbourhoods into one comma-separated string

In [15]:
# Group on postal code and borough, then merge each group's neighbourhood
# names into a single ', '-separated string.
group_keys = ['Postalcode', 'Borough']
neighbourhoods_by_code = df.groupby(group_keys)['Neighbourhood']
joined_neighbourhoods = neighbourhoods_by_code.apply(', '.join)
# Move the group keys back from the index into regular columns.
df = joined_neighbourhoods.reset_index()
df.head(3)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"


In [16]:
# Show the dataframe's dimensions as (rows, columns) as a quick size check.
df.shape

(102, 3)