## Notebook for week 2 - Data Science Capstone Project

In [1]:
import numpy as np
import pandas as pd

In [2]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTable(url):
    """Fetch *url* and return the text of the first table in its body.

    Args:
        url: Address of the page to download.

    Returns:
        The text content of the first ``<table>`` under ``<body>``, or
        ``None`` when the page cannot be retrieved or contains no such table.
    """
    # Imported locally so the function does not depend on an extra
    # top-of-file import; HTTPError is a subclass of URLError.
    from urllib.error import URLError

    try:
        # The context manager closes the connection even if read() fails.
        with urlopen(url) as response:
            markup = response.read()
    except URLError:
        # Covers HTTP status errors as well as unreachable hosts/paths.
        return None
    try:
        bs = BeautifulSoup(markup, 'html.parser')
        # Returns only the text from the table
        table = bs.body.table.get_text()
    except AttributeError:
        # bs.body or bs.body.table was None: the page has no table.
        return None
    return table

# Transforms the html strings to a list 
def clean_html(html: str) -> list:
    """Split *html* on newlines and return the non-empty pieces in order.

    Only empty strings are discarded; a piece that contains nothing but
    whitespace is kept unchanged.
    """
    return [piece for piece in html.split('\n') if piece]

def html_to_dict(html: str) -> dict:
    """Turn the table text into a column-oriented dict of lists.

    The text is split with ``clean_html``; the first three cells are used
    as the column headers and every following group of three cells is one
    row. Rows whose second cell is ``'Not assigned'`` are skipped.

    NOTE(review): this relies on every row contributing exactly three
    non-empty cells, because ``clean_html`` drops empty strings. A ragged
    tail raises ``IndexError``.
    """
    cells = clean_html(html)
    # The header names are simply taken from the first three cells.
    first_key = cells[0]
    second_key = cells[1]
    third_key = cells[2]
    output = {first_key: [], second_key: [], third_key: []}
    # Walk the remaining cells one three-cell row at a time.
    for start in range(3, len(cells), 3):
        second_cell = cells[start + 1]
        # Filters out any entry without a Borough
        if second_cell == 'Not assigned':
            continue
        output[first_key].append(cells[start])
        output[second_key].append(second_cell)
        output[third_key].append(cells[start + 2])
    return output

### Considerations:

    - The Wikipedia table already lists each Postal Code once, with all of its Neighbourhoods grouped in a single row.
    - Wikipedia has since edited the table: it no longer contains rows that have a named Borough but a "Not assigned" Neighbourhood.

In [3]:
# Scrape the Wikipedia postal-code table; table_dict stays None when the
# page or its table cannot be retrieved.
table_dict = None
html_table = getTable('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
if html_table is None:
    print('Table could not be found')
else:
    table_dict = html_to_dict(html_table)

# pd.DataFrame(None) builds an empty frame, so this also works after a
# failed scrape.
table_df = pd.DataFrame(table_dict)
table_df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [5]:
# Load the per-postal-code latitude/longitude lookup from the local CSV.
coor = pd.read_csv('Geospatial_Coordinates.csv')
# Preview the first five rows (head() defaults to n=5).
coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# Attach coordinates to every scraped row: table_df's 'Postal Code' column
# is matched against coor re-indexed by 'Postal Code' (left join, the
# default for DataFrame.join).
data = table_df.join(
    coor.set_index('Postal Code'),
    on='Postal Code',
    how='left',
)
data.head(n=12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [12]:
# Sanity check: the join must leave the sequence of distinct postal codes
# unchanged. np.array_equal also returns False when the two arrays differ
# in length, where an element-wise == would fail to broadcast.
np.array_equal(data['Postal Code'].unique(), table_df['Postal Code'].unique())

True