# Segmenting and Clustering Neighborhoods in the city of Toronto, Canada

In [51]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# transform JSON file into a pandas dataframe
# Newer pandas exposes json_normalize at the top level and no longer offers it from
# pandas.io.json; older pandas only has the pandas.io.json location. Try the current
# location first and fall back, so this cell imports cleanly on either.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## Part 1. Toronto neighborhood DataFrame

In [52]:
#!conda install -c conda-forge scrapy --yes 

#### Import the libraries used to scrape data from HTML

In [53]:
import scrapy
from scrapy.http import TextResponse

#### Scrape https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M in order to obtain the data in the table of postal codes and transform it into a pandas dataframe

In [54]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# TLS certificate verification stays enabled (the requests default) so the page cannot be
# silently tampered with in transit; the timeout keeps a stalled connection from hanging the cell.
res = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were the table.
res.raise_for_status()
# Wrap the HTML in a Scrapy response so it can be queried with XPath selectors.
response = TextResponse(res.url, body=res.text, encoding='utf-8')



Get table from wiki

In [55]:
# XPath matching every <table> whose class attribute is exactly "wikitable sortable".
table_xpath = '//table[@class="wikitable sortable"]'
table = response.xpath(table_xpath)

store postal codes in a list

In [56]:
# Build one [postal code, borough, neighborhood] record per data row of the table.
postalCodes = []
for row in table.xpath('tbody//tr'):
    cells = row.xpath('td')
    if not cells:
        # rows without any <td> cell (presumably the header row) carry no data
        continue
    # Collect the text of each cell separately. Flattening every text node of the whole
    # row into one list shifts the columns whenever a cell holds more than one text node
    # (e.g. a link followed by plain text or a trailing newline).
    record = [''.join(cell.xpath('.//text()').extract()).strip() for cell in cells]
    # keep only the three columns used by the dataframe
    postalCodes.append(record[:3])

In [57]:
# instantiate the dataframe
# Column labels, in the same order as the values inside each scraped record.
columns = ['PostalCode', 'Borough', 'Neighborhood']
# Turn the list of records into a table and preview its first rows.
neighborhoods = pd.DataFrame(
    data=postalCodes,
    columns=columns,
)
neighborhoods.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


#####  Ignore cells with a borough that is Not assigned.

In [58]:

# Discard every row whose borough is the 'Not assigned' placeholder, then renumber the index.
has_borough = neighborhoods['Borough'].ne('Not assigned')
neighborhoods = neighborhoods.loc[has_borough].reset_index(drop=True)
neighborhoods.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [59]:
# Rows that have a borough but only the 'Not assigned' placeholder as their neighborhood.
missing_name = neighborhoods['Neighborhood'].eq("Not assigned")
# Reuse the borough name as the neighborhood name for exactly those rows.
neighborhoods.loc[missing_name, 'Neighborhood'] = neighborhoods.loc[missing_name, 'Borough']
neighborhoods.head(12)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row, with the neighborhoods separated by a comma, in the same way as M1B becomes "Rouge,Malvern" in the grouped table below.

In [60]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighborhood value is the comma-separated list of the original names.
names_by_postal_code = neighborhoods.groupby(['PostalCode', 'Borough'])['Neighborhood']
df_neighborhoods = names_by_postal_code.agg(','.join).reset_index()
df_neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [61]:
# Size of the grouped table as (rows, columns).
df_neighborhoods.shape

(103, 3)

In [62]:
# Preview the first rows of the grouped table.
df_neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Part 2. Map Latitude and longitude coordinates to neighborhood

In [63]:
# Read the coordinates file and rename its 'Postal Code' column to 'PostalCode'
# so it shares a key column name with df_neighborhoods.
df_lat_lng_coords = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostalCode'})
)

In [64]:
# Preview the first rows of the coordinates table.
df_lat_lng_coords.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [65]:
# Attach latitude/longitude to every postal code, matching rows on PostalCode.
# (merge defaults to an inner join, so codes missing from either table are dropped.)
df_Toronto_neigh = df_neighborhoods.merge(df_lat_lng_coords, on="PostalCode")
df_Toronto_neigh.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [66]:
# Size of the merged table as (rows, columns).
df_Toronto_neigh.shape

(103, 5)

## Part 3. Explore and cluster the neighborhoods in Toronto

#### Work with only boroughs that contain the word Toronto

In [67]:
# Keep only the rows whose borough name contains "Toronto", then renumber the index.
in_toronto = df_Toronto_neigh['Borough'].str.contains("Toronto")
toronto = df_Toronto_neigh.loc[in_toronto].reset_index(drop=True)

In [68]:
# One indicator column per distinct borough, named after the borough itself
# (empty prefix and separator).
toronto_onehot = pd.get_dummies(toronto[['Borough']], prefix="", prefix_sep="")
# Put the original Borough values back as the first column, ahead of the indicators.
toronto_onehot.insert(0, 'Borough', toronto['Borough'])
# Resulting column order: Borough first, then the indicator columns.
fixed_columns = list(toronto_onehot.columns)

toronto_onehot.head()

Unnamed: 0,Borough,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,East Toronto,0,0,1,0
1,East Toronto,0,0,1,0
2,East Toronto,0,0,1,0
3,East Toronto,0,0,1,0
4,Central Toronto,1,0,0,0


In [69]:
# Average the indicator columns per borough, keeping Borough as an ordinary column.
toronto_grouped = toronto_onehot.groupby('Borough', as_index=False).mean()
toronto_grouped

Unnamed: 0,Borough,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,Central Toronto,1,0,0,0
1,Downtown Toronto,0,1,0,0
2,East Toronto,0,0,1,0
3,West Toronto,0,0,0,1


In [70]:
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Borough', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 1, 2])

In [71]:
# add clustering labels
# DataFrame.insert raises ValueError when the column already exists, so remove any
# labels left over from a previous run of this cell before inserting the fresh ones.
if 'Cluster Labels' in toronto_grouped.columns:
    toronto_grouped = toronto_grouped.drop(columns='Cluster Labels')
# Place the k-means label of each borough in the first column.
toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)


In [72]:
# Start from the Toronto-only table; this binds a second name to the same DataFrame (no copy is made).
toronto_merged = toronto

In [73]:
# Add each borough's cluster label (and its indicator columns) to the per-postal-code rows;
# latitude/longitude are already part of `toronto`.
# The join starts from `toronto` rather than from `toronto_merged` itself so the cell can be
# re-run: joining the already-joined frame again fails because the added columns overlap.
toronto_merged = toronto.join(toronto_grouped.set_index('Borough'), on='Borough')
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,Central Toronto,Downtown Toronto,East Toronto,West Toronto
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,0,0,1,0
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,1,0,0,1,0
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,1,0,0,1,0
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1,0,0,1,0
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,1,0,0,0


In [74]:

# create map
# NOTE(review): the map centre is hard-coded — confirm it is still appropriate if the data changes.
map_clusters = folium.Map(location=[43.676357, -79.352188], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]
print(rainbow)

# add markers to the map, one circle per postal-code row
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
    # plain int so the label can be used as a list index and prints cleanly
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly. The previous
    # `cluster-1` sent label 0 to index -1, i.e. it wrapped around to the last color.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

['#8000ff', '#2adddd', '#d4dd80', '#ff0000']


### The map above shows the clusters for the boroughs whose names contain the word Toronto