# Segmenting and Clustering Neighborhoods in Toronto 
BY: Walter García

Date: 07-02-2020

Country: Chile

City: Santiago

# Importing Libraries 

In [0]:
from bs4 import BeautifulSoup
import time
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Soup

In [2]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

html = requests.get(url)
soup = BeautifulSoup(html.text)
sp = soup.find_all()
sp

[<html class="client-nojs" dir="ltr" lang="en">
 <head>
 <meta charset="utf-8"/>
 <title>List of postal codes of Canada: M - Wikipedia</title>
 <script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XjZO1QpAMFIAALOzc6EAAACH","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal c

# Data

### Acquistion

In [3]:
#importing the data by scrubbing the wiki page of canada postal codes
df_Canada  = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
df_Torronto = pd.DataFrame(df_Canada[0])
df_Torronto.shape

(287, 3)

### Wrangling

In [4]:
#Data wrangling

#remove unassigned boroughs
indexnamesTodrop = df_Torronto[ df_Torronto['Borough'] == 'Not assigned' ].index
df_Torronto.drop( indexnamesTodrop, inplace=True)
print(df_Torronto.shape)

(210, 3)


In [5]:
#mark the borough name for the unassigned neighborhood
df_Torronto.loc[df_Torronto['Neighbourhood'] == ('Not assigned'), 'Neighbourhood'] = df_Torronto['Borough']
print(df_Torronto[df_Torronto['Neighbourhood'] == 'Not assigned'])

Empty DataFrame
Columns: [Postcode, Borough, Neighbourhood]
Index: []


In [6]:
#get the final shape of the dataframe
df_Torronto.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [7]:
#get the final record count in the dataframe
df_Torronto.shape

(210, 3)

# Geo co-ordinates

In [8]:
pip install geocoder



### Get geo co-ordinates

In [0]:
import geocoder # import geocoder

In [10]:
%%time
g = geocoder.google('M1B, Toronto, Ontario')
print('M4Y', g.latlng)

#from google lat and longitude is not returning any, so reading from csv file

M4Y None
CPU times: user 15.7 ms, sys: 2.24 ms, total: 17.9 ms
Wall time: 25.3 ms


In [11]:
df_Coordinates = pd.read_csv("http://cocl.us/Geospatial_data")
df_Coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
#embed the lat and longitude to the Torronto postal code
df_TorrontoFinal = df_Torronto.merge(df_Coordinates,how ='left',left_on='Postcode',right_on='Postal Code')
df_TorrontoFinal.drop('Postal Code',axis=1,inplace=True)
df_TorrontoFinal.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


In [13]:
#get the lat and long for Torronto(whole city)
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
TR_latitude = location.latitude
TR_longitude = location.longitude
print('The geograpical coordinate of Torronto are {}, {}.'.format(TR_latitude, TR_longitude))

The geograpical coordinate of Torronto are 43.653963, -79.387207.


In [14]:
# create map of New York using latitude and longitude values
map_Torronto = folium.Map(location=(TR_latitude, TR_longitude), zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_TorrontoFinal['Latitude'], df_TorrontoFinal['Longitude'], df_TorrontoFinal['Borough'], df_TorrontoFinal['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Torronto)  
    
map_Torronto

# Segementation of Neighborhood

In [15]:
#lets explore the neighborhood which starts with Torronto
df_TR_Neigh = (df_TorrontoFinal[df_TorrontoFinal['Borough'].str.contains('East Tor')]).reset_index(drop=True)
df_TR_Neigh.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,The Danforth West,43.679557,-79.352188
2,M4K,East Toronto,Riverdale,43.679557,-79.352188
3,M4L,East Toronto,The Beaches West,43.668999,-79.315572
4,M4L,East Toronto,India Bazaar,43.668999,-79.315572


In [0]:
#with foursquare lets explore more!!!!!

#credentials
CLIENT_ID = 'FXAKR2VKXED2PPGI2ZPFMEJD5LAXQ3M4KI01THJWPMWAQHDC' # your Foursquare ID
CLIENT_SECRET = 'EJR3IXSZBX2KX0TAP1FYJBZLLPEXPCWV2WEW5XL4ZPM31JD3' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT=100

In [0]:
#function definition... lets loop through all boroughs and explore the venues surrounding it
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [0]:
#Get all the nearby venues by calling the function that we expored using foursquare
Torronto_venues = getNearbyVenues( names=df_TR_Neigh['Neighbourhood'],
                                   latitudes=df_TR_Neigh['Latitude'],
                                   longitudes=df_TR_Neigh['Longitude']
                                 )

In [19]:
Torronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Danforth West,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [20]:
Torronto_grouped = Torronto_venues.groupby('Venue Category').count()
Torronto_grouped.head()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
American Restaurant,4,4,4,4,4,4
Auto Workshop,1,1,1,1,1,1
Bakery,4,4,4,4,4,4
Bank,1,1,1,1,1,1
Bar,1,1,1,1,1,1


### One hot coding to cluster the neighborhood

In [21]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Torronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighborhood'] = Torronto_venues['Neighborhood'] 

# move neighborhood column to the first column
#print([Toronto_onehot.columns[0]])
#list(Toronto_onehot.columns[:-1])
fixed_columns = [Toronto_onehot.columns[48]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Auto Workshop,Bakery,Bank,Bar,Board Shop,Bookstore,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Butcher,Café,Caribbean Restaurant,Cheese Shop,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Frozen Yogurt Shop,Fruit & Vegetable Store,Furniture / Home Store,Garden,Garden Center,Gastropub,Gay Bar,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Health Food Store,Ice Cream Shop,Italian Restaurant,Juice Bar,Latin American Restaurant,Light Rail Station,Liquor Store,Middle Eastern Restaurant,Movie Theater,Neighborhood.1,Park,Pet Store,Pizza Place,Pub,Recording Studio,Restaurant,Sandwich Place,Seafood Restaurant,Skate Park,Smoke Shop,Spa,Stationery Store,Steakhouse,Sushi Restaurant,Thai Restaurant,Trail,Wine Bar
0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,The Danforth West,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,The Danforth West,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
# set number of clusters
kclusters = 5

Cluster_TorontoNeighbor = Toronto_onehot.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Cluster_TorontoNeighbor)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 0, 0, 1, 0, 4, 1, 1, 0], dtype=int32)

In [23]:
# add clustering labels
Torronto_venues.insert(0, 'Cluster Labels', kmeans.labels_)
Torronto_venues_Cluster = Torronto_venues
Torronto_venues_Cluster.head(15)

Unnamed: 0,Cluster Labels,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,0,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,0,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,1,The Danforth West,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
5,0,The Danforth West,43.679557,-79.352188,MenEssentials,43.67782,-79.351265,Cosmetics Shop
6,4,The Danforth West,43.679557,-79.352188,Cafe Fiorentina,43.677743,-79.350115,Italian Restaurant
7,1,The Danforth West,43.679557,-79.352188,Mezes,43.677962,-79.350196,Greek Restaurant
8,1,The Danforth West,43.679557,-79.352188,Messini Authentic Gyros,43.677827,-79.350569,Greek Restaurant
9,0,The Danforth West,43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop


### Analyse each cluster label

In [24]:
local = Torronto_venues_Cluster.loc[Torronto_venues_Cluster['Cluster Labels'] == 0, Torronto_venues_Cluster.columns[[1] + list(range(5, Torronto_venues_Cluster.shape[1]))]]

local.head(20)

Unnamed: 0,Neighborhood,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676821,-79.293942,Trail
1,The Beaches,43.678879,-79.297734,Health Food Store
2,The Beaches,43.679181,-79.297215,Pub
3,The Beaches,43.680563,-79.292869,Neighborhood
5,The Danforth West,43.67782,-79.351265,Cosmetics Shop
9,The Danforth West,43.677773,-79.351187,Ice Cream Shop
10,The Danforth West,43.677663,-79.351313,Brewery
11,The Danforth West,43.67753,-79.352295,Ice Cream Shop
13,The Danforth West,43.677335,-79.35313,Pub
14,The Danforth West,43.677622,-79.352116,Yoga Studio


In [25]:
print(list(range(5, Torronto_venues_Cluster.shape[1])))
print(Torronto_venues_Cluster.shape[1])
print(Torronto_venues_Cluster.columns[1])

[5, 6, 7]
8
Neighborhood
