# Segmenting & Clustering Neighborhoods in Toronto, Canada

In [5]:
import pandas as pd
import numpy as np
# Informational marker that the import statements in this cell completed.
print('Packages downloaded')

Packages downloaded


In [22]:
# Parse every HTML table on the Wikipedia list of Canadian postal codes
# beginning with "M" into a list of DataFrames.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
dfs = pd.read_html(io=url)
# Report how many tables the page yielded.
print(len(dfs))

3


In [23]:
# The first parsed table is the one used for the rest of the notebook.
df = pd.DataFrame(data=dfs[0])
# Preview the first five rows.
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [26]:
# Turn the "Not assigned" placeholder into NaN everywhere, then discard
# every row whose Borough is missing.
df = (
    df.replace(to_replace="Not assigned", value=np.nan)
      .dropna(subset=["Borough"], axis="index")
)

In [29]:
# Renumber rows from 0; the previous index is kept as a regular column.
df = df.reset_index(drop=False)

In [31]:
# Discard the leftover 'index' column and display the cleaned frame.
df = df.drop(columns=['index'])
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [35]:
# Read the latitude/longitude lookup table from its CSV source.
geodf = pd.read_csv(filepath_or_buffer='http://cocl.us/Geospatial_data')
# Preview the first five rows.
geodf.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [36]:
# Attach coordinates to each postal code.
# The join key is named explicitly: without `on=`, pandas joins on *every*
# column name the two frames share, so a stray shared column (for example a
# leftover 'index' column) would silently change the result.
newdf = pd.merge(df, geodf, on='Postal Code', how='inner')
newdf.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [37]:
# Inspect the (rows, columns) dimensions of the merged frame.
newdf.shape

(103, 5)

In [80]:
import folium
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors

In [71]:
# KMeans clustering of the postal codes into k geographic groups
# (clusters are derived from coordinates only; they are not the Borough column).
k = 10
# Select the features by column name rather than by position: positional
# indexing breaks as soon as a column (e.g. 'Cluster Label') is inserted in
# front, and `.values` on a mixed frame yields an object-dtype array.
X = newdf[['Latitude', 'Longitude']].to_numpy(dtype=float)
# A fixed random_state makes the cluster labels reproducible across runs.
k_means = KMeans(init="k-means++", n_clusters=k, n_init=12, random_state=0)
k_means.fit(X)
labels = k_means.labels_
print(labels)

[5 3 6 7 6 4 9 2 3 6 8 4 9 3 3 6 8 4 9 3 6 8 9 3 6 8 9 2 7 3 6 8 0 2 7 3 6
 6 0 2 7 3 6 6 0 2 7 3 6 7 4 0 2 7 3 8 1 4 0 2 7 8 8 1 4 5 2 8 8 1 4 5 2 8
 6 1 4 4 5 8 6 1 5 8 6 5 8 6 1 4 5 6 6 1 4 9 6 6 1 6 3 1 1]


In [72]:
# inserting cluster labels into dataframe for mapping
# DataFrame.insert raises ValueError if the column already exists, so on a
# re-run of this cell the existing column is overwritten instead.
if 'Cluster Label' in newdf.columns:
    newdf['Cluster Label'] = labels
else:
    newdf.insert(0, 'Cluster Label', labels)
newdf.head()

Unnamed: 0,Cluster Label,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,5,M3A,North York,Parkwoods,43.753259,-79.329656
1,3,M4A,North York,Victoria Village,43.725882,-79.315572
2,6,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,7,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [79]:
# Coordinates of Toronto ON, looked up through the Nominatim geocoder
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; fail with a clear message
# instead of an AttributeError on `.latitude` below.
if location is None:
    raise RuntimeError('Geocoding returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [82]:
# create map centred on the geocoded coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster label,
# evenly spaced along the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# add one circle marker per postal code to the map
for lat, lon, poi, cluster in zip(newdf['Latitude'], newdf['Longitude'], newdf['Neighbourhood'], newdf['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run 0..k-1, so index the palette directly; the former
    # `cluster-1` only worked for label 0 through negative-index wraparound.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

#generate map
map_clusters