# Segmentation and Clustering of Neighborhoods in Toronto

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import warnings
warnings.filterwarnings('ignore')

Retrieve the data from Wikipedia

In [2]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# read_html returns a list of the tables found on the page; take the first one,
# using its first row as the column header
df = pd.read_html(url, header=0)[0]
# Use the American spelling of the column name, which the later cells rely on
df = df.rename(columns={'Neighbourhood': 'Neighborhood'})
df.head(3)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods


Only process the cells that have an assigned borough. Ignore cells with a borough that is **Not assigned**.

In [3]:
# Keep only the postal codes that have a borough assigned, renumbering rows from 0
df1 = df.loc[df['Borough'].ne("Not assigned")].reset_index(drop=True)
df1.head(3)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"


More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma, as shown in row 2 (M5A) of the table above.

In [4]:
# Report whether any postal code appears on more than one row
codes_are_unique = df1["Postal Code"].is_unique
if codes_are_unique:
    print("There are no repeated postal codes.")
else:
    print("There are repeated postal codes")

There are no repeated postal codes.


Check the shape of the dataframe and print the number of rows

In [5]:
# (number of rows, number of columns) of the filtered dataframe
df1.shape

(103, 3)

The dataframe has 103 rows

### Add the geographical coordinates (latitude and longitude) to the dataframe

In [6]:
import geocoder # import geocoder

**Note:** the `geocoder.google` function was unreliable, so I read the coordinates from the provided CSV file instead.

In [7]:
# Load the postal-code coordinates (latitude/longitude) from the hosted CSV file
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# New frame based on df1 with zero-filled placeholder columns for the coordinates;
# assign() returns a new dataframe, so df1 itself is left unchanged
neighborhoods = df1.assign(Latitude=0, Longitude=0)

In [9]:
# Attach the coordinates from df_geo to the matching rows of neighborhoods.
#
# The lookup is done once per column with Series.map instead of a per-row loop
# with chained indexing (neighborhoods[col][mask] = value): chained assignment
# writes through a temporary object, which pandas does not guarantee to
# propagate back to the dataframe.
#
# drop_duplicates keeps the first row per postal code, so a postal code that
# appears more than once in df_geo contributes its first coordinates.
geo_by_code = df_geo.drop_duplicates(subset='Postal Code').set_index('Postal Code')
postal_codes = neighborhoods['Postal Code']
for coord_column in ('Latitude', 'Longitude'):
    looked_up = postal_codes.map(geo_by_code[coord_column])
    # Postal codes without a match in df_geo keep their existing placeholder value
    neighborhoods[coord_column] = looked_up.fillna(neighborhoods[coord_column])

In [10]:
# Preview the first rows, including the Latitude and Longitude columns
neighborhoods.head(3)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636


### Explore and cluster the neighborhoods in Toronto

In [11]:
# import the k-means and folium library
from sklearn.cluster import KMeans
import folium # map rendering library

For this part, we will work with the Downtown Toronto borough.

In [12]:
# Select the rows whose borough is Downtown Toronto and renumber them from 0
DowntownToronto_data = (
    neighborhoods
    .loc[neighborhoods['Borough'].eq('Downtown Toronto')]
    .reset_index(drop=True)
)
DowntownToronto_data.head(6)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


Show the neighborhoods on the map

In [13]:
# Base map centred on the hard-coded Toronto coordinates
latitude, longitude = 43.6532, -79.3832
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=13)

# One circle marker per row, with a popup reading "<neighborhood>, <borough>"
marker_fields = DowntownToronto_data[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup_text = f'{neighborhood}, {borough}'
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option rather than a
        # CircleMarker one; kept as-is to leave behavior unchanged - confirm
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

Let's create a function that adds, for each postal code, the number of neighborhoods it contains; this count will be used as a clustering feature.

In [14]:
def getNumberOfNeighborhoods(data):
    """Return a copy of ``data`` with a 'Number of Neighborhood' column.

    The count for each row is the number of comma-separated entries in that
    row's 'Neighborhood' string (so a value without a comma counts as 1).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Neighborhood' column of strings.

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the extra 'Number of Neighborhood' column; the
        dataframe passed in is not modified.
    """
    neighborhood_counts = [len(names.split(',')) for names in data['Neighborhood']]
    # assign() builds a new frame, so the caller's dataframe is left untouched
    return data.assign(**{'Number of Neighborhood': neighborhood_counts})

In [15]:
# Add the 'Number of Neighborhood' count column for the Downtown Toronto rows
New_df = getNumberOfNeighborhoods(DowntownToronto_data)

In [16]:
# Preview the first rows with the new count column
New_df.head(3)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Number of Neighborhood
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2


Perform clustering using the latitude and the number of neighborhoods of each postal code

In [17]:
# set number of clusters
kclusters = 4
# run k-means clustering on the latitude and the neighborhood count of each row.
# n_init is pinned explicitly because its default differs between scikit-learn
# releases, which would otherwise make the labels depend on the installed version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(
    New_df[['Latitude', 'Number of Neighborhood']]
)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:17]

array([1, 1, 1, 3, 3, 3, 3, 0, 0, 1, 1, 1, 0, 2, 3, 3, 1], dtype=int32)

In [18]:
# add clustering labels as a new last column.
# assign() returns a new dataframe, so New_df is not modified and this cell can
# be re-run safely (an in-place insert() fails once the column already exists).
New_df_Clusted = New_df.assign(**{'Cluster Labels': kmeans.labels_})
New_df_Clusted

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Number of Neighborhood,Cluster Labels
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2,1
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2,1
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2,1
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,1,3
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,1,3
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,1,3
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564,1,3
7,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568,3,0
8,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752,3,0
9,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576,2,1


In [19]:
import matplotlib.cm as cm
import matplotlib.colors as colors

Plot the clusters on the map

In [20]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map, one per row, colored by cluster label
for lat, lon, poi, cluster in zip(New_df_Clusted['Latitude'], New_df_Clusted['Longitude'], New_df_Clusted['Neighborhood'], New_df_Clusted['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index cluster - 1 is kept so the color assignment is unchanged; for
    # cluster 0 the index -1 wraps around to the last color of the palette
    cluster_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

The map shows the postal codes in downtown Toronto with a similar number of neighborhoods.
We can see that the postal codes with a similar number of neighborhoods are not necessarily adjacent to each other geographically.