### Import modules

In [85]:
import io
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

### Download the Wikipedia page

In [86]:
# Source page: Wikipedia's "List of postal codes of Canada: M".
wikipedia_page = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook here on a 4xx/5xx response instead of
# handing an error page to the parser in the next cell.
response = requests.get(wikipedia_page, timeout=30)
response.raise_for_status()
file = response.text



### Parsing the data

In [87]:
# Parse the downloaded HTML with the pure-Python parser from the standard library.
soup = BeautifulSoup(markup=file, features='html.parser')

# Keep every <table> element that carries the "sortable" CSS class.
table = soup.find_all('table', class_='sortable')

print('Found {} table(s) in the extracted wiki-html.'.format(len(table)))

Found 1 table(s) in the extracted wiki-html.


### Converting the table to a DataFrame

In [88]:

# Column names are the stripped text of the <th> cells of the first matched table.
table_headings = table[0].find_all('th')
headings = [header_cell.get_text().strip() for header_cell in table_headings]
headings

['Postcode', 'Borough', 'Neighbourhood']

In [89]:
# Rename the first column heading in place (the scraped value was 'Postcode');
# the grouping cell further down refers to this column as 'Postalcode'.
headings[0] =  'Postalcode'

headings

['Postalcode', 'Borough', 'Neighbourhood']

In [90]:
# Look up the <td> cells of every <tr> in the first matched table. The generator
# is consumed once below, so each row is searched exactly once.
cells_per_row = (tr.find_all('td') for tr in table[0].find_all('tr'))

# Rows without any <td> cells (e.g. the header row, whose cells are <th>) are
# skipped; for the rest, the stripped text of at most the first three cells is kept.
table_rows = [
    [cell.text.strip() for cell in cells[:3]]
    for cells in cells_per_row
    if cells
]
table_rows

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B', 

In [91]:
# One DataFrame row per scraped table row, labelled with the (renamed) headings.
Postal_Codes_Canada = pd.DataFrame(data=table_rows, columns=headings)
Postal_Codes_Canada.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Getting rid of "Not assigned" values

In [92]:
# Turn the 'Not assigned' placeholder into a real missing value, drop every row
# whose Borough is missing, and renumber the index from 0.
#
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
# The steps are chained (no inplace=True) so the cell reads top-down and the
# frame is rebound exactly once.
Postal_Codes_Canada = (
    Postal_Codes_Canada
    .replace(to_replace='Not assigned', value=np.nan)
    .dropna(axis=0, subset=['Borough'])
    .reset_index(drop=True)
)

In [82]:
# Preview the first rows of the frame after the 'Not assigned' boroughs were dropped.
Postal_Codes_Canada.head()

Unnamed: 0,Postalcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [93]:
# Boolean mask of the rows that still have no neighbourhood.
s = Postal_Codes_Canada['Neighbourhood'].isna()

# Show which boroughs are affected before filling them in.
print (Postal_Codes_Canada['Borough'].loc[s])

# Use the borough name as the neighbourhood for those rows. A single
# .loc[rows, column] assignment writes to the frame itself; the chained form
# df['col'].loc[mask] = ... may write to a temporary copy
# (SettingWithCopyWarning) and has no effect under pandas Copy-on-Write.
Postal_Codes_Canada.loc[s, 'Neighbourhood'] = Postal_Codes_Canada.loc[s, 'Borough']

6    Queen's Park
Name: Borough, dtype: object


In [94]:
# Collapse the frame to one row per (Postalcode, Borough) pair, joining the
# neighbourhoods of each pair into a single comma-separated string.
groupby = Postal_Codes_Canada.groupby(['Postalcode','Borough'])

rows = [
    [postalcode, borough, ', '.join(members['Neighbourhood'])]
    for (postalcode, borough), members in groupby
]

# Rebuild the frame from the collapsed rows, reusing the original column order.
Postal_Codes_Canada = pd.DataFrame(rows, columns=headings)
Postal_Codes_Canada.shape

(103, 3)