In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans

### 1. Tha Toronto neighborhoods dataset

The dataset was taken from the https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M website,

and loaded into the excel file Toronto_Boroughs_Neighborhoods.xlsx .

In [2]:
data = pd.read_excel('Toronto_Boroughs_Neighborhoods.xlsx')

In [3]:
data.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [4]:
# replace "/" by ","
data['Neighborhood'] = data['Neighborhood'].str.replace('/',',')

In [5]:
# remove rows with not assigned boroughs
data = data[data['Borough'] != 'Not assigned']

In [6]:
# are there any boroughs with not assigned neignborhoods ?
m = data[data['Neighborhood']=='Not assigned']
print('there are {} boroughs with not assigned neighborhoods in the dataframe'.format(m.shape[0]))

there are 0 boroughs with not assigned neighborhoods in the dataframe


In [7]:
# note that when conditioning two cells above, in the remaining dataframe the first index will be 2.. 
# similarly there will be gaps between the indices also in the remaining dataframe
# here we rename the indices so that they start from 0 and differ by 1
data.index = list(range(data.shape[0]))

In [8]:
data.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [9]:
data.shape

(104, 3)

### 2. Geographical coordinates of Toronto postal codes

In [10]:
# pip install pgeocode
import pgeocode

In [11]:
canada =  pgeocode.Nominatim('ca')

In [12]:
latitude = []
longitude = []
for postal_code in data['Postal code']:
    lat = canada.query_postal_code(str(postal_code)).latitude
    latitude.append(lat)
    long = canada.query_postal_code(str(postal_code)).longitude
    longitude.append(long)

In [13]:
coordinates = { 'Postal code' : data['Postal code'],
              'Latitude' : latitude,
              'Longitude' : longitude}
coordinates = pd.DataFrame(coordinates)
coordinates.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M3A,43.7545,-79.33
1,M4A,43.7276,-79.3148
2,M5A,43.6555,-79.3626
3,M6A,43.7223,-79.4504
4,M7A,43.6641,-79.3889


In [14]:
data =  data.join(coordinates.set_index('Postal code'), on='Postal code')

In [15]:
data.isnull().sum()

Postal code     1
Borough         1
Neighborhood    0
Latitude        2
Longitude       2
dtype: int64

In [16]:
# drop NaN values and keep only rows with data
data = data.dropna()

In [17]:
data.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.6641,-79.3889


### 3. Neighborhood clusters

In [18]:
# pip install folium and geopy if necessary
import folium
from geopy.geocoders import Nominatim

In [19]:
# Toronto map & coordinates

geolocator = Nominatim(user_agent="__")
Toronto = geolocator.geocode("Toronto")

map_toronto = folium.Map(location=[Toronto.latitude, Toronto.longitude], zoom_start=10)

In [20]:
# gather all postal code coordinates into the array X 
x = data['Latitude'].values
lat = x.reshape(x.size,1)
y = data['Longitude'].values
long = y.reshape(y.size,1)

X = np.concatenate((lat,long), axis = 1)

In [21]:
# the map colours
colours = ['red', 'blue', 'yellow', 'black', 'gray', 'purple']

In [22]:
def cluster_the_city(n_clusters):  # pick up to six clusters
    
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X)
    
    labels = kmeans.labels_.reshape(kmeans.labels_.size,1)

    labelled_data = np.concatenate((X,labels), axis = 1)

    for j, color in zip(range(n_clusters),colours[:n_clusters]):
        k = labelled_data[labelled_data[:,2]==j]
        k = k[:,[0,1]]
        for lat, lng in zip(k[:,0], k[:,1]):
            borough = data[(data['Latitude']==lat)&(data['Longitude']==lng)]['Borough'].values
            neighborhood = data[(data['Latitude']==lat)&(data['Longitude']==lng)]['Neighborhood'].values
            label = '{}, {}'.format(neighborhood, borough)
            label = folium.Popup(label, parse_html=True)
            folium.CircleMarker(
                [lat, lng],
                radius=5,
                popup=label,
                color=color,
                fill=True,
                fill_color='#3186cc',
                fill_opacity=0.7,
                parse_html=False).add_to(map_toronto) 
    
    return map_toronto

In [23]:
cluster_the_city(6)