In [88]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from urllib.request import urlopen
import geocoder


# Limits applied when pandas renders a DataFrame in the notebook
display_limits = {
    'display.max_columns': 500,
    'display.max_rows': 50,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

# Confirmation that the setup cell ran to completion
print("Libraries imported")

Libraries imported


## Part 1: Fetch the Postal Codes, Boroughs and Neighborhoods of Toronto

#### Read the HTML page and initialize a BeautifulSoup object with its contents

In [89]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Read the page inside a context manager so the HTTP response is closed as soon
# as its body has been read. The timeout (in seconds) stops the cell from
# hanging indefinitely when the server does not answer.
with urlopen(url, timeout=30) as response:
    html = response.read()

# Parse the downloaded markup with the parser bundled with Python
soup = BeautifulSoup(html, 'html.parser')

#### Find all tables in the page

In [90]:
# Collect every <table> element found in the parsed page
tables = soup.find_all(name='table')

#### Iterate through the cells of the first table and retrieve the postal codes, boroughs and neighborhoods

In [91]:
# Parallel lists that receive one entry per parsed table row
postal_codes = []
boroughs = []
neighborhoods = []

# Only the first table is parsed. Slicing (instead of indexing) keeps this
# cell a no-op when the page contains no tables at all.
for table in tables[:1]:
    for row in table.find_all('tr'):
        cells = row.find_all('td')

        # All three columns are read below, so a row needs at least three <td>
        # cells. The previous `len(cells) > 1` check accepted two-cell rows and
        # then failed with IndexError on cells[2]. Rows with fewer cells
        # (e.g. rows without data cells) are skipped.
        if len(cells) < 3:
            continue

        # Drop the newline characters embedded in the cell text
        postal_code, borough, neighborhood = (
            cell.text.replace('\n', '') for cell in cells[:3]
        )

        postal_codes.append(postal_code)
        boroughs.append(borough)
        neighborhoods.append(neighborhood)

#### Transform the data into a DataFrame

In [92]:
# Assemble the three parallel lists into a frame, then keep only the rows whose
# borough has actually been assigned.
df = (
    pd.DataFrame(
        {
            'PostalCode': postal_codes,
            'Borough': boroughs,
            'Neighborhood': neighborhoods,
        }
    )
    .loc[lambda frame: frame['Borough'] != "Not assigned"]
)

#### Finally, keep the postal codes we need and drop the rest

In [93]:
# Postal codes to keep for the rest of the analysis
selected_postal_codes = [
    'M5G', 'M2H', 'M4B', 'M1J', 'M4G', 'M4M',
    'M1R', 'M9V', 'M9L', 'M5V', 'M1B', 'M5A',
]

# Boolean mask marking the rows whose postal code is in the selection
is_selected = df['PostalCode'].isin(selected_postal_codes)
df = df[is_selected]

# Same rows as `df`, renumbered from 0
df_toronto = df.reset_index(drop=True)

df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5A,Downtown Toronto,"Regent Park, Harbourfront"
1,M1B,Scarborough,"Malvern, Rouge"
2,M4B,East York,"Parkview Hill, Woodbine Gardens"
3,M4G,East York,Leaside
4,M5G,Downtown Toronto,Central Bay Street
5,M2H,North York,Hillcrest Village
6,M1J,Scarborough,Scarborough Village
7,M9L,North York,Humber Summit
8,M4M,East Toronto,Studio District
9,M1R,Scarborough,"Wexford, Maryvale"


## Part 2: Fetch the latitude and longitude of the neighborhoods


In [94]:
# Legacy code, no longer needed: it split every row that lists more than one neighborhood for a postal code into one row per neighborhood.

# df_toronto = pd.DataFrame(columns = ['PostalCode', 'Borough', 'Neighborhood']) 
# df_toronto.dtypes

# for index, row in df.iterrows():
#     neighborhoods = row['Neighborhood'].split(',')
#     if len(neighborhoods) > 1 :
#         for neighborhood in neighborhoods:
#             new_row = {'PostalCode': row['PostalCode'], 'Borough': row['Borough'], 'Neighborhood': neighborhood}
#             df_toronto = df_toronto.append(new_row, ignore_index=True)
#     else:
#         new_row = {'PostalCode': row['PostalCode'], 'Borough': row['Borough'], 'Neighborhood': row['Neighborhood']}
#         df_toronto = df_toronto.append(new_row, ignore_index=True)

# df_toronto

#### Load geospatial coordinates for Toronto from CSV file

In [95]:
# The coordinates file is read from a path relative to the working directory
coordinates_csv = "Geospatial_Coordinates.csv"
df_location = pd.read_csv(coordinates_csv)

# Preview the first rows of the loaded table
df_location.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Look up the latitude and longitude of each postal code

In [96]:
# Sanity check: show the coordinate row(s) stored for one postal code
is_m1b = df_location['Postal Code'] == 'M1B'
location = df_location.loc[is_m1b]
location

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353


In [97]:
# Build a lookup table keyed by postal code. Keeping the first row per code
# mirrors the earlier `location.iloc[0]` behaviour and gives `map` a unique index.
coordinates_by_code = (
    df_location
    .drop_duplicates(subset='Postal Code')
    .set_index('Postal Code')
)

# Fail early, naming the offending codes, instead of the opaque IndexError the
# row-by-row lookup produced when a postal code had no coordinates.
missing_codes = df_toronto.loc[
    ~df_toronto['PostalCode'].isin(coordinates_by_code.index), 'PostalCode'
].tolist()
if missing_codes:
    raise KeyError(f"No coordinates found for postal codes: {missing_codes}")

# Look the coordinates up from df_toronto's own postal codes. The previous loop
# iterated `df` and assigned the results positionally, which was only correct
# while `df` and `df_toronto` happened to share the same row order.
df_toronto['Latitude'] = df_toronto['PostalCode'].map(coordinates_by_code['Latitude'])
# NOTE(review): the column is named 'Longitudes' (plural), unlike 'Latitude'.
# The name is kept because later cells may reference it.
df_toronto['Longitudes'] = df_toronto['PostalCode'].map(coordinates_by_code['Longitude'])

# Plain lists are still exposed in case later cells read them.
latitudes = df_toronto['Latitude'].tolist()
longitudes = df_toronto['Longitudes'].tolist()

df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitudes
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,M4G,East York,Leaside,43.70906,-79.363452
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
5,M2H,North York,Hillcrest Village,43.803762,-79.363452
6,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
7,M9L,North York,Humber Summit,43.756303,-79.565963
8,M4M,East Toronto,Studio District,43.659526,-79.340923
9,M1R,Scarborough,"Wexford, Maryvale",43.750072,-79.295849


## Part 3: Cluster Toronto neighborhoods