<h1>Segmenting and Clustering Neighborhoods in Toronto</h1>
<p>Written by: Xavier</p>

<h2>Import Packages</h2>

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import geocoder
print("Imported packages...")

Imported packages...


<h2>Section 1: Initialize Beautiful Soup & Create DataFrame</h2>

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly instead of letting an HTTP error page be parsed.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
r = response.text
print("Got Wikipedia data...")

# Parse using BeautifulSoup
soup = BeautifulSoup(r, 'lxml')
print("Parsed Wikipedia data...")

# One dict per assigned postal code; turned into a dataframe afterwards
table_contents = []

# find() returns only the FIRST <table> in the document; the postal-code grid
# is assumed to be that table.
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found in the downloaded page")
print("Found table...")

# Each <td> of the grid is one postal code: the code sits in the <p> text and
# the "Borough(Neighbourhood / Neighbourhood ...)" label sits in the <span>.
for row in table.find_all('td'):
    # Skip cells that lack the expected <p>/<span> markup rather than
    # crashing with an AttributeError on None.
    if row.span is None or row.p is None:
        continue

    label = row.span.text
    if label == 'Not assigned':
        continue

    parts = label.split('(')

    # Clean the text between the first "(" and the next "(" step by step:
    neighborhood = parts[1].strip(')')              # drop the closing parenthesis
    neighborhood = neighborhood.replace(' /', ',')  # "A / B" -> "A, B"
    neighborhood = neighborhood.replace(')', ' ')   # any leftover inner ")"
    neighborhood = neighborhood.strip(' ')

    cell = {}
    cell['PostalCode'] = row.p.text[:3]  # postal codes here are three characters
    cell['Borough'] = parts[0]
    cell['Neighborhood'] = neighborhood
    table_contents.append(cell)

# The scrape glues some borough names to trailing address text, or to a second
# borough name, with no separator. Map each known mangled label to a readable
# one; boroughs that are not listed are left unchanged.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}

# Build the dataframe from the collected records, one row per postal code
df = pd.DataFrame(table_contents)
print("Created dataframe...")

# Replace whole-cell matches in the Borough column with their cleaned labels
df['Borough'] = df['Borough'].replace(borough_fixes)
print("Formatted dataframe...")

Got Wikipedia data...
Parsed Wikipedia data...
Found table...
Created dataframe...
Formatted dataframe...


In [3]:
# Preview the first rows to confirm the PostalCode / Borough / Neighborhood
# columns were scraped and cleaned as expected
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [4]:
# Check the size of the dataframe as (rows, columns)
df.shape

(103, 3)

<h2>Section 2: Get Latitude & Longitude</h2>

In [5]:
# Coordinates are loaded from a local CSV file instead of a live geocoding call.
# (A previous, now-disabled approach queried
# geocoder.google('<postal code>, Toronto, Ontario') in a loop until it
# returned a latitude/longitude pair.)
geo_data = (
    pd.read_csv("Geospatial_Coordinates.csv")
    # Give the key column the same name as in the scraped dataframe
    .rename(columns={'Postal Code': 'PostalCode'})
)
geo_data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [6]:
# Attach coordinates to every scraped postal code. how='right' keeps every row
# of df, even when geo_data has no matching PostalCode.
# Dropping any Latitude/Longitude columns left by an earlier run keeps this
# cell idempotent: without it, re-running would merge a second time and
# produce duplicated Latitude_x / Latitude_y columns.
df = pd.merge(
    geo_data,
    df.drop(columns=['Latitude', 'Longitude'], errors='ignore'),
    how='right',
    on='PostalCode',
)

In [7]:
# Preview the merged frame: each postal code row now carries Latitude and
# Longitude alongside Borough and Neighborhood
df.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M3A,43.753259,-79.329656,North York,Parkwoods
1,M4A,43.725882,-79.315572,North York,Victoria Village
2,M5A,43.65426,-79.360636,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,43.718518,-79.464763,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,43.662301,-79.389494,Queen's Park,Ontario Provincial Government
