### 1. Import dependencies

In [2]:
import sys
!{sys.executable} -m pip install beautifulsoup4

[33mYou are using pip version 9.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

### 2. Scrape data from Wikipedia webpage

In [4]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
result = requests.get(url).text
result = BeautifulSoup(result, 'html.parser')

In [5]:
# all the postal data are stored in a html table
tag = result.table.tbody 
columns = ['PostalCode', 'Borough', 'Neighborhood']
rows = [row.text.strip().split("\n") for row in tag.find_all('tr')] 
data = pd.DataFrame(rows) #get a basic dataframe
data.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### 3. Edit the dataframe

In [6]:
data.columns = columns # rename the columns
data = data.drop(0) # delete the first row which was the table headers
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [7]:
data = data[data['Borough'] != 'Not assigned'] # remove data with unassinged borough 
data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [8]:
# change unassigned neighborhood to the borough's name
for index, row in data.iterrows():
    if row['Neighborhood'] == 'Not assigned':
        row['Neighborhood'] = row['Borough']


In [9]:
# combine neighborhoods with the same postalcode
group_data = data.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(",".join) 

# make a new dataframe with grouped data
combined_neighborhood = group_data.values
comnined_post = [index[0] for index in group_data.index] 
comnined_borough = [index[1] for index in group_data.index] 
grouped_dataframe = pd.DataFrame({'PostalCode': comnined_post, 'Borough': comnined_borough, 'Neighborhood': combined_neighborhood})
grouped_dataframe.head()

Unnamed: 0,Borough,Neighborhood,PostalCode
0,Scarborough,"Rouge,Malvern",M1B
1,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C
2,Scarborough,"Guildwood,Morningside,West Hill",M1E
3,Scarborough,Woburn,M1G
4,Scarborough,Cedarbrae,M1H


In [10]:
# rearrange the dataframe columns
grouped_dataframe = grouped_dataframe[columns]
grouped_dataframe.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
grouped_dataframe.shape

(103, 3)