# Segmenting and Clustering Neighborhoods in Toronto

First let's import libraries

In [144]:
import pandas as pd
import numpy as np

The html-site is assigned and loaded via pandas read_html. Afterward we will assign the first table on the html site under dfs-pandas dataframe.
Finally we are going to look up our 5 first values of our dfs-dataframe.

In [170]:
url='http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df=pd.read_html(url, header=0, flavor='bs4')
dfs=pd.DataFrame(df[0])
dfs.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Let's exclude "Not assigned" Borough:

In [171]:
dfs=dfs[dfs.Borough != 'Not assigned']
dfs.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Let's join the duplicated Postcodes and split them within Neighbourhood-column with a comma. We specify not to order the dataframe:

In [172]:
dfs = dfs.groupby(['Postcode','Borough'], sort = False).agg(lambda x: ','.join(x))
dfs.reset_index(level=['Postcode','Borough'], inplace=True)
dfs.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


Next we are going to replace the "Not assigned" Neighbourhoods with the Borough-label and illustrate the dataframe:

In [173]:
dfs.loc[dfs['Neighbourhood'] == ('Not assigned'), 'Neighbourhood'] = dfs['Borough']
dfs.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


# PART 1: The dataframe shape:

In [174]:
dfs.shape

(103, 3)

#Adding the location data

In [82]:
!wget -O Geospatial_coordinates.csv https://cocl.us/Geospatial_data

--2019-06-13 11:52:41--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 159.8.72.228
Connecting to cocl.us (cocl.us)|159.8.72.228|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-13 11:52:42--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 185.235.236.197
Connecting to ibm.box.com (ibm.box.com)|185.235.236.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-13 11:52:42--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-06-13 11

In [175]:
coordinates = pd.read_csv('Geospatial_coordinates.csv')

In [176]:
geodf = pd.DataFrame(coordinates)
geodf.head()
#geodf.shape

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Part 2: table with coordinates included:

In [179]:
dfs['Latitude']=geodf['Latitude'].values
dfs['Longitude']=geodf['Longitude'].values
dfs.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.806686,-79.194353
1,M4A,North York,Victoria Village,43.784535,-79.160497
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.763573,-79.188711
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.770992,-79.216917
4,M7A,Queen's Park,Queen's Park,43.773136,-79.239476
5,M9A,Etobicoke,Islington Avenue,43.744734,-79.239476
6,M1B,Scarborough,"Rouge,Malvern",43.727929,-79.262029
7,M3B,North York,Don Mills North,43.711112,-79.284577
8,M4B,East York,"Woodbine Gardens,Parkview Hill",43.716316,-79.239476
9,M5B,Downtown Toronto,"Ryerson,Garden District",43.692657,-79.264848


# Part 3: Cluster & Mapping of the dataset

In [180]:
!pip install folium

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/4f/86/1ab30184cb60bc2b95deffe2bd86b8ddbab65a4fac9f7313c278c6e8d049/folium-0.9.1-py2.py3-none-any.whl (91kB)
[K    100% |████████████████████████████████| 92kB 9.1MB/s ta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
[31mtensorflow 1.3.0 requires tensorflow-tensorboard<0.2.0,>=0.1.0, which is not installed.[0m
Installing collected packages: branca, folium
Successfully installed branca-0.3.1 folium-0.9.1


In [183]:
import folium

In [200]:
map_toronto = folium.Map(location=['43.4430', '-79.2224'], zoom_start=9.5)

for i in range(0,len(dfs)):
    folium.CircleMarker([dfs.iloc[i]['Latitude'], dfs.iloc[i]['Longitude']],
        radius=5,
    weight=1).add_to(map_toronto)

map_toronto

Let's now cluster the data:

In [206]:
from sklearn.cluster import KMeans 
kclusters = 5
toronto_clusters = dfs.drop(['Postcode', 'Borough', 'Neighbourhood'], 1)
k_means = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clusters)

# check cluster labels generated for each row in the dataframe

k_means_labels = kmeans.labels_
k_means_labels

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4,
       4, 4, 2, 2, 2, 4, 4, 4, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4,
       4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3,
       3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 1, 3, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [207]:
k_means_cluster_centers = k_means.cluster_centers_
k_means_cluster_centers

array([[ 43.77480817, -79.24566825],
       [ 43.68279116, -79.53037109],
       [ 43.70021343, -79.32439325],
       [ 43.66240221, -79.39927215],
       [ 43.75057744, -79.41419236]])

In [208]:
locdata = pd.DataFrame({'Latitude':k_means_cluster_centers[:,0],'Longitude':k_means_cluster_centers[:,1]})

locdata.head()

Unnamed: 0,Latitude,Longitude
0,43.774808,-79.245668
1,43.682791,-79.530371
2,43.700213,-79.324393
3,43.662402,-79.399272
4,43.750577,-79.414192


In [None]:
Let's plot the 5 centroids to our map:

In [221]:
map_cluster_centers = folium.Map(location=['43.4430', '-79.2224'], zoom_start=9.5)
for i in range(0,len(locdata)):
    folium.CircleMarker([locdata.iloc[i]['Latitude'], locdata.iloc[i]['Longitude']],
    radius=10,
    weight=5).add_to(map_cluster_centers)

map_cluster_centers