# This notebook will be mainly used for the capstone project.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Print a fixed greeting message.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Scrape the Wikipedia page

In [3]:
import requests
import urllib.request
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [4]:
# URL of the Wikipedia page to scrape (list of Canadian postal codes beginning with "M").
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [5]:
# Download the page body. The `with` block closes the HTTP connection as soon
# as the bytes have been read, instead of leaving the response object open.
# `html` holds the raw bytes, which BeautifulSoup can parse directly.
with urlopen(wiki_url) as response:
    html = response.read()

In [6]:
# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')

In [7]:
# Find the table: the first <table> whose class attribute is "wikitable sortable".
wikitable = soup.find('table', attrs={'class': "wikitable sortable"})
# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError when the table is read later on.
if wikitable is None:
    raise ValueError('No table with class "wikitable sortable" was found on the page')

In [8]:
# Retrieve the data from webpage

def tableDataText(table):
    """Extract the text content of an HTML table as a list of rows.

    Parameters
    ----------
    table : object exposing ``find_all`` (e.g. a BeautifulSoup ``Tag``)
        The ``<table>`` element to read.

    Returns
    -------
    list of list of str
        One inner list per ``<tr>``. When the first row contains ``<th>``
        cells, their stripped text forms the first inner list (the header).
        Every other row contributes the stripped text of its ``<td>`` cells.
        An empty list is returned when the table has no rows at all.
    """
    trs = table.find_all('tr')
    if not trs:  # no rows: indexing trs[0] below would raise IndexError
        return []

    rows = []
    headerow = [th.get_text(strip=True) for th in trs[0].find_all('th')]  # header row
    if headerow:  # if there is a header row, include it first and skip it below
        rows.append(headerow)
        trs = trs[1:]
    # One list of stripped cell texts per remaining table row.
    rows.extend([td.get_text(strip=True) for td in tr.find_all('td')] for tr in trs)
    return rows

In [9]:
# Convert the scraped table into a list of rows and preview the first two
# entries (the header row and the first data row).
list_table = tableDataText(wikitable)
list_table[:2]

[['Postal Code', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned']]

In [10]:
# Build the DataFrame: the first scraped row supplies the column names,
# every following row becomes one record.
column_names = list_table[0]
records = list_table[1:]
dftable = pd.DataFrame(data=records, columns=column_names)
dftable.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Data wrangling

In [13]:
# Discard rows whose borough is 'Not assigned'.
# Every 'Not assigned' value (in any column) is first turned into NaN, then
# rows with a NaN Borough are dropped and the index is renumbered from 0.
dftable = (
    dftable
    .replace('Not assigned', np.nan)
    .dropna(subset=['Borough'], axis=0)
    .reset_index(drop=True)
)
dftable.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
# Merge rows that share a postal code into a single row.
# - groupby returns a new object and leaves dftable untouched, so the result
#   must be assigned back for the merge to take effect.
# - Borough is part of the grouping key so that the column is kept.
#   NOTE: a postal code listed under two different boroughs would stay as two rows.
# - sort=False keeps the rows in their original order of appearance.
# - Neighbourhoods are joined with ", " to match the comma-separated lists
#   already present in the column; missing values are skipped, and a group
#   with no neighbourhood at all stays NaN.
dftable = (
    dftable
    .groupby(['Postal Code', 'Borough'], sort=False)['Neighbourhood']
    .agg(lambda names: ', '.join(names.dropna()) or np.nan)
    .reset_index()
)
dftable

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [23]:
# If a cell has a borough but no assigned neighbourhood, the neighbourhood
# becomes the same as the borough.
# Missing neighbourhoods are real NaN values (or the literal 'Not assigned'),
# never the string 'NaN', so they are detected with isna() / eq().
missing_neighbourhood = (
    dftable['Neighbourhood'].isna() | dftable['Neighbourhood'].eq('Not assigned')
)
dftable.loc[missing_neighbourhood, 'Neighbourhood'] = dftable.loc[missing_neighbourhood, 'Borough']

# Show any rows that still lack a neighbourhood (expected: none).
dftable.loc[dftable['Neighbourhood'].isna() | dftable['Neighbourhood'].eq('Not assigned')]

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [25]:
# Check the size of the cleaned table as (rows, columns).
dftable.shape

(103, 3)

## Add the latitude and longitude coordinates for each postal code

In [28]:
# Read the latitude/longitude data (a CSV fetched from the URL below) into
# df_ll and preview its first rows.
df_ll = pd.read_csv('https://cocl.us/Geospatial_data')
df_ll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [27]:
# Attach the coordinates to the postal-code table with an inner join on
# 'Postal Code' (rows without a match in both tables are not kept).
df = dftable.merge(df_ll, how='inner', on='Postal Code')
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
