# Import Canada Borough Data

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [4]:
# Download the Wikipedia page listing Canadian postal codes that start with "M"
# and parse its HTML with BeautifulSoup's lxml parser.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # fail instead of hanging forever if the server does not answer
)
response.raise_for_status()  # stop on an HTTP error rather than parsing an error page
website_url = response.text  # note: despite its name, this holds the page's HTML text
soup = BeautifulSoup(website_url, 'lxml')

In [5]:
#print(soup.prettify())

## Pull Out Table

In [6]:
# Every <table> element carrying the CSS class "sortable".
tables = soup.find_all(name='table', attrs={'class': 'sortable'})

### Extract Data by Heading

In [7]:
# Select the first sortable table whose first three header cells read
# Postcode / Borough / Neighbourhood. The loop variable `table` is left bound
# to that table for the cells below.
for table in tables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:3] == ['Postcode', 'Borough', 'Neighbourhood']:
        break
else:
    # No break happened: without this guard `table` would silently be the last
    # table examined (or be undefined when `tables` is empty).
    raise ValueError(
        "No sortable table with headings "
        "['Postcode', 'Borough', 'Neighbourhood'] was found on the page"
    )

In [8]:
#print(table)

### Put Data in Dataframe

In [9]:
# Collect the first three cell values of every data row of `table` into three
# parallel lists. Rows are held in memory only; nothing is written to disk.
P = []  # postcodes
B = []  # boroughs
N = []  # neighbourhoods
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    # Skip rows without <td> cells (e.g. a row made only of <th> cells) and
    # rows with fewer than three cells, which cannot be unpacked below.
    if len(tds) < 3:
        continue
    Postcode, Borough, Neighbourhood = [td.text.strip() for td in tds[:3]]
    P.append(Postcode)
    B.append(Borough)
    N.append(Neighbourhood)

In [10]:
# Assemble the scraped lists into one frame, one column per list.
column_names = ('Postcode', 'Borough', 'Neighbourhood')
d = pd.DataFrame(dict(zip(column_names, (P, B, N))))
d.head()  # not the last expression of the cell, so its result is not displayed
d.shape

(288, 3)

### Remove rows whose Borough is 'Not assigned'

In [11]:
# Positional indices of the rows of `d` whose Borough is 'Not assigned'.
drop = [
    position
    for position, borough in enumerate(d['Borough'])
    if borough == 'Not assigned'
]

In [12]:
# New frame without the rows collected in `drop`; `d` itself is left untouched.
df = d.drop(labels=d.index[drop], axis=0)
df.shape

(211, 3)

In [13]:
# Preview the first five rows after removing unassigned boroughs.
df.head(n=5)


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Combine neighbourhoods that share a postcode

In [14]:
# Merge runs of consecutive rows that share a Postcode.
# The first row of each run receives every neighbourhood of the run joined by
# ', ', and the positions of the remaining rows of the run go into `drop2` so
# they can be dropped afterwards. Comparing each row with the start of its run
# (not just its immediate neighbour) keeps all names when a postcode spans
# three or more rows.
# Note: this rewrites df['Neighbourhood'] in place, so running the cell twice
# on the same `df` would append the names a second time.
drop2 = []
postcodes = df['Postcode'].tolist()
merged_names = df['Neighbourhood'].tolist()
run_start = 0  # position of the first row of the current postcode run
for pos in range(1, len(postcodes)):
    if postcodes[pos] == postcodes[run_start]:
        merged_names[run_start] = merged_names[run_start] + ', ' + merged_names[pos]
        drop2.append(pos)
    else:
        run_start = pos
# One whole-column assignment instead of chained `df[col].iloc[i] = ...`,
# which pandas may apply to a temporary copy and leave `df` unchanged.
df['Neighbourhood'] = merged_names

In [15]:
# New frame without the rows whose positions were collected in `drop2`.
df2 = df.drop(labels=df.index[drop2], axis=0)

### Replace the single 'Not assigned' neighbourhood with its borough name

In [16]:
# Where a Neighbourhood is 'Not assigned', fall back to that row's Borough.
# A single .loc assignment on df2 replaces chained indexing
# (`df2[col].iloc[i] = ...`), which pandas may apply to a temporary copy and
# leave df2 unchanged.
unassigned = df2['Neighbourhood'] == 'Not assigned'
df2.loc[unassigned, 'Neighbourhood'] = df2.loc[unassigned, 'Borough']

In [17]:
# Preview the first five rows of the combined frame.
df2.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
8,M7A,Queen's Park,Queen's Park


### Reset Index so it looks nice

In [19]:
# Show the first 12 rows with a fresh 0..n-1 index. The result is only
# displayed; df2 itself keeps its original index labels.
df2.reset_index(drop=True).iloc[:12]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"
