# Segmenting and Clustering Neighborhoods in Toronto

## 1. Scrape postal codes

In [12]:
import pandas as pd
import numpy as np

In [13]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# read_html returns a list with one DataFrame per HTML table it finds.
df_url = pd.read_html(io=url)

# The postal code table is the first table in that list.
df_table = df_url[0]

# Summarise columns, dtypes and non-null counts of the scraped table.
df_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
Postal Code      180 non-null object
Borough          180 non-null object
Neighbourhood    180 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


In [14]:
# Preview the first rows of the scraped table (before any clean-up).
df_table.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [15]:
# Remove every row whose "Borough" is "Not assigned".
is_borough_na = df_table["Borough"].eq("Not assigned")
i_borough_na = df_table.index[is_borough_na]

# drop() returns a new frame; df_table itself is left untouched.
df_table_1 = df_table.drop(index=i_borough_na)
df_table_1.shape

(103, 3)

In [16]:
#if "Neighbourhood" = "Not assigned" take over "Borough"
def adapt_hood(hood, borough):
    """Return the neighbourhood name, falling back to the borough.

    Parameters
    ----------
    hood : neighbourhood value of a row.
    borough : borough value of the same row.

    Returns
    -------
    `borough` when `hood` is exactly "Not assigned", otherwise `hood` unchanged.
    """
    return borough if hood == "Not assigned" else hood

# Apply the fallback row by row: each row's neighbourhood is replaced by the
# result of adapt_hood for that row's neighbourhood/borough pair.
df_table_1["Neighbourhood"] = df_table_1.apply(
    lambda row: adapt_hood(hood=row["Neighbourhood"], borough=row["Borough"]),
    axis="columns",
)

In [17]:
# Check for duplicate postal codes: count rows per postal code and list the
# largest counts first. A top count of 1 means every postal code is unique.
df_group_pc = (
    df_table_1
    .groupby("Postal Code")["Neighbourhood"]
    .count()
    .to_frame()
    .sort_values("Neighbourhood", ascending=False)
)
df_group_pc.head()

Unnamed: 0_level_0,Neighbourhood
Postal Code,Unnamed: 1_level_1
M1B,1
M5R,1
M6G,1
M6E,1
M6C,1


In [18]:
# Re-check the (rows, columns) of the cleaned table.
df_table_1.shape

(103, 3)

## 2. Geocoding

In [19]:
! pip install geocoder



In [20]:
import geocoder # import geocoder

In [21]:
"""
loc_list = []

for index, row in df_table_1.iterrows():
    # initialize your variable to None
    lat_lng_coords = None
    
    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(row["Postal Code"]))
      lat_lng_coords = g.latlng
        
    loc_list.append([row["Postal Code"], lat_lng_coords[0], lat_lng_coords[1]])  
    
loc_list
"""

'\nloc_list = []\n\nfor index, row in df_table_1.iterrows():\n    # initialize your variable to None\n    lat_lng_coords = None\n    \n    # loop until you get the coordinates\n    while(lat_lng_coords is None):\n      g = geocoder.google(\'{}, Toronto, Ontario\'.format(row["Postal Code"]))\n      lat_lng_coords = g.latlng\n        \n    loc_list.append([row["Postal Code"], lat_lng_coords[0], lat_lng_coords[1]])  \n    \nloc_list\n'

The geocoder service was not accessible, so the geocoding loop above is left disabled and the coordinates are loaded from a CSV file instead.

In [25]:
# Load the postal code coordinates from the local CSV file and summarise
# its columns, dtypes and non-null counts.
df_coord = pd.read_csv(filepath_or_buffer="Geospatial_Coordinates.csv")
df_coord.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postal Code    103 non-null object
Latitude       103 non-null float64
Longitude      103 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [26]:
# Preview the first rows of the coordinates lookup table.
df_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [28]:
# Attach latitude/longitude to every neighbourhood row.
# how="left" keeps all rows of df_table_1; validate="one_to_one" makes pandas
# raise a MergeError if a postal code is duplicated on either side, instead of
# silently multiplying rows in the merged result.
df_hoods_coord = pd.merge(
    df_table_1,
    df_coord,
    how="left",
    on="Postal Code",
    validate="one_to_one",
)
df_hoods_coord.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
Postal Code      103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
Latitude         103 non-null float64
Longitude        103 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


In [29]:
# Preview the merged table: neighbourhood data plus Latitude/Longitude.
df_hoods_coord.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
