# Final Assignment: Segmenting and Clustering Neighborhoods in Toronto

First, let's import the required libraries.

In [3]:
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from sklearn.preprocessing import Normalizer

Now, let's load the demographic data for each Toronto neighborhood, taken from the Canadian census.

In [4]:
# Read the census file (header=None, so column names are supplied here) and
# key the table by neighborhood name.
data = (
    pd.read_csv(
        'toronto_clustering-census.csv',
        header=None,
        names=['Neighborhood', 'Income', '2nd Language Proportion', '2nd Language', 'Lat', 'Lon'],
    )
    .set_index('Neighborhood')
)
data.head()

Unnamed: 0_level_0,Income,2nd Language Proportion,2nd Language,Lat,Lon
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Regent Park/Trefann Court,19521,0.105,Bengali,,
Oakridge,21155,0.126,Bengali,,
Crescent Town,23021,0.181,Bengali,,
Leaside,82670,0.004,Bulgarian,,
The Beaches,67536,0.007,Cantonese,,


Let's retrieve the latitude and longitude of each neighborhood using the geopy library.

In [5]:
# Nominatim's public service asks for at most one request per second, so the
# geocoder is wrapped in a rate limiter instead of being called in a tight loop.
geolocator = Nominatim(user_agent='Coursera Project')
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
city = ', Toronto'

for neighborhood in data.index:
    location = geocode(neighborhood + city)
    # Neighborhoods that cannot be resolved keep their missing coordinates.
    if location is not None:
        # Use the parsed float attributes rather than the raw string fields so
        # the float columns are never assigned string values.
        data.loc[neighborhood, 'Lat'] = location.latitude
        data.loc[neighborhood, 'Lon'] = location.longitude

# Make sure both coordinate columns end up with a float dtype.
data['Lat'] = data['Lat'].astype(float)
data['Lon'] = data['Lon'].astype(float)
data.head()

Unnamed: 0_level_0,Income,2nd Language Proportion,2nd Language,Lat,Lon
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Regent Park/Trefann Court,19521,0.105,Bengali,43.659279,-79.366135
Oakridge,21155,0.126,Bengali,43.697174,-79.274823
Crescent Town,23021,0.181,Bengali,43.695403,-79.293099
Leaside,82670,0.004,Bulgarian,43.704798,-79.36809
The Beaches,67536,0.007,Cantonese,43.671024,-79.296712


We can inspect all the neighborhoods on the map of Toronto.

In [6]:
# Create map of Toronto
mapToronto = folium.Map(location=[43.7, -79.384293], zoom_start=11)

# Add one marker per neighborhood. Rows whose geocoding lookup failed still
# have missing coordinates and cannot be placed on the map, so they are skipped.
located = data.dropna(subset=['Lat', 'Lon'])
for lat, lng, name in zip(located['Lat'], located['Lon'], located.index):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(mapToronto)

mapToronto

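Let's construct our features. First, let's keep the second-language proportion only where that language is Chinese (Cantonese, Mandarin or unspecified Chinese) and set it to zero everywhere else.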

In [7]:
# Languages that count towards the Chinese-language feature.
chinese_languages = ['Cantonese', 'Mandarin', 'Unspecified Chinese']
speaks_chinese = data['2nd Language'].isin(chinese_languages)

# Zero the proportion wherever the second language is not one of the above,
# then drop the language label and rename the column to match its new meaning.
data.loc[~speaks_chinese, '2nd Language Proportion'] = 0
data = (
    data
    .drop(columns='2nd Language')
    .rename(columns={'2nd Language Proportion': 'Chinese Language Proportion'})
)
data.head()

Unnamed: 0_level_0,Income,Chinese Language Proportion,Lat,Lon
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Regent Park/Trefann Court,19521,0.0,43.659279,-79.366135
Oakridge,21155,0.0,43.697174,-79.274823
Crescent Town,23021,0.0,43.695403,-79.293099
Leaside,82670,0.0,43.704798,-79.36809
The Beaches,67536,0.007,43.671024,-79.296712


Let's enter our Foursquare API credentials.

In [8]:
# Foursquare credentials are taken from the environment when available so that
# real keys never have to be saved in the notebook; the literals are fallbacks.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'REDACTED')  # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'REDACTED')  # Foursquare Secret
VERSION = '20180604'  # sent as the `v` request parameter
LIMIT = 50  # sent as the `limit` request parameter
search_query = 'Food'  # sent as the `query` request parameter

Now, we will create a function that retrieves nearby venues for a specified location.

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect Foursquare ``venues/explore`` results around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences giving each neighborhood's name and coordinates.
    radius : int, default 500
        Value sent as the request's ``radius`` parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame is empty (but still has these columns) when nothing is
        returned. ``Venue Category`` is ``None`` for a venue that lists no
        categories.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # progress indicator: one name per request
        print(name + ' ', end='')

        # let requests build and URL-encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
            'query': search_query,
        }

        # make the GET request; the timeout keeps one stalled call from hanging the run
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing the columns to the constructor keeps the schema even with zero rows
    return pd.DataFrame(rows, columns=columns)

Let's use this function with our dataset.

In [10]:
# Only query neighborhoods that have coordinates; a missing value would
# otherwise be sent to the API as `ll=nan,nan`.
geocoded = data.dropna(subset=['Lat', 'Lon'])
toronto_food = getNearbyVenues(geocoded.index, geocoded['Lat'], geocoded['Lon'])

Regent Park/Trefann Court Oakridge Crescent Town Leaside The Beaches Upper Beaches Bridle Path Kensington Market Alexandra Park Agincourt Cliffside West Rouge Steeles Milliken East Danforth Riverdale Willowdale Bayview Village Bayview Woods Steeles Leslieville Cricket Club Ledbury Park Port Union Rouge Hill Maryvale Clairlea Wexford Wilson Heights Ionview Lawrence Manor St. James Town Lawrence Park South Hill Moore Park Chaplin Estates Yorkville Bedford Park Old East York Bracondale Hill The Danforth Woburn Fort York/Liberty Village Humberlea Downsview Maple Leaf Lawrence Heights Humber Summit The Queensway Richview York University Heights Pelmo Park Glen Park Financial District Casa Loma Princess Gardens York Mills Islington Six Points Port Lands Henry Farm Bay Street Corridor Toronto Islands Guildwood Davisville Graydon Hall Lansing Centennial Parkdale High Park North Swansea Markland Wood Mimico Roncesvalles Long Branch New Toronto Sunnylea Alderwood Fashion District Fairbank Silver

Let's add a column that represents the number of Chinese food places in each neighborhood.

In [11]:
# Venue categories that are counted as Chinese food.
chinese_categories = ['Asian Restaurant', 'Chinese Restaurant', 'Dim Sum Restaurant', 'Dumpling Restaurant', 'Noodle House']

# Number of matching venues per neighborhood.
chinese_food = (
    toronto_food.loc[toronto_food['Venue Category'].isin(chinese_categories)]
    .groupby('Neighborhood')
    .size()
    .to_frame('Chinese Food Count')
)

# Assigning by index instead of merging keeps this cell safe to re-run: a second
# merge would produce suffixed duplicate columns. Neighborhoods without any
# matching venue get 0.
data = data.assign(**{
    'Chinese Food Count': chinese_food['Chinese Food Count'].reindex(data.index).fillna(0)
})
data.head(10)

Unnamed: 0_level_0,Income,Chinese Language Proportion,Lat,Lon,Chinese Food Count
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Regent Park/Trefann Court,19521,0.0,43.659279,-79.366135,1.0
Oakridge,21155,0.0,43.697174,-79.274823,0.0
Crescent Town,23021,0.0,43.695403,-79.293099,0.0
Leaside,82670,0.0,43.704798,-79.36809,0.0
The Beaches,67536,0.007,43.671024,-79.296712,1.0
Upper Beaches,44346,0.007,43.671024,-79.296712,1.0
Bridle Path,314107,0.12,43.7354,-79.370883,0.0
Kensington Market,23335,0.143,43.655214,-79.40226,3.0
Alexandra Park,19687,0.179,43.650758,-79.404298,4.0
Agincourt,25750,0.193,43.785353,-79.278549,11.0


Now, let's add a column that would represent the total number of food venues for each neighborhood.

In [12]:
# Number of returned venues per neighborhood.
food_count = toronto_food.groupby('Neighborhood').size().to_frame('Total Food Count')

# Assigning by index instead of merging keeps this cell safe to re-run: a second
# merge would produce suffixed duplicate columns. Neighborhoods without any
# venue get 0.
data = data.assign(**{
    'Total Food Count': food_count['Total Food Count'].reindex(data.index).fillna(0)
})
data.head(10)

Unnamed: 0_level_0,Income,Chinese Language Proportion,Lat,Lon,Chinese Food Count,Total Food Count
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Regent Park/Trefann Court,19521,0.0,43.659279,-79.366135,1.0,28.0
Oakridge,21155,0.0,43.697174,-79.274823,0.0,4.0
Crescent Town,23021,0.0,43.695403,-79.293099,0.0,0.0
Leaside,82670,0.0,43.704798,-79.36809,0.0,5.0
The Beaches,67536,0.007,43.671024,-79.296712,1.0,26.0
Upper Beaches,44346,0.007,43.671024,-79.296712,1.0,26.0
Bridle Path,314107,0.12,43.7354,-79.370883,0.0,1.0
Kensington Market,23335,0.143,43.655214,-79.40226,3.0,50.0
Alexandra Park,19687,0.179,43.650758,-79.404298,4.0,50.0
Agincourt,25750,0.193,43.785353,-79.278549,11.0,18.0


Let's normalize the data before applying the k-means clustering algorithm.

In [13]:
# Feature matrix: every column except the neighborhood name and coordinates.
# `columns=` is used because recent pandas releases no longer accept the axis
# as a second positional argument to drop().
clustering = data.reset_index().drop(columns=['Neighborhood', 'Lat', 'Lon'])
# If cluster labels have already been written to the table (cell re-run), keep
# them out of the features.
clustering = clustering.drop(columns=['Cluster Labels'], errors='ignore')

# NOTE(review): Normalizer rescales each row (sample) to unit norm, not each
# column, so the much larger Income values dominate every row. If per-feature
# scaling was intended, a column-wise scaler would be the usual choice; confirm
# before changing, since it alters the resulting clusters.
transformer = Normalizer().fit_transform(clustering)

Now we can apply the k-means clustering algorithm.

In [14]:
kclusters = 4
# n_init is stated explicitly so the number of initialisations does not depend
# on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=10).fit(transformer)

# Work on a copy: a plain assignment would alias `data`, so the label column
# would be written into the feature table as well.
data_clustered = data.copy()
data_clustered['Cluster Labels'] = kmeans.labels_
data_clustered.head(10)

Unnamed: 0_level_0,Income,Chinese Language Proportion,Lat,Lon,Chinese Food Count,Total Food Count,Cluster Labels
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Regent Park/Trefann Court,19521,0.0,43.659279,-79.366135,1.0,28.0,1
Oakridge,21155,0.0,43.697174,-79.274823,0.0,4.0,0
Crescent Town,23021,0.0,43.695403,-79.293099,0.0,0.0,0
Leaside,82670,0.0,43.704798,-79.36809,0.0,5.0,0
The Beaches,67536,0.007,43.671024,-79.296712,1.0,26.0,2
Upper Beaches,44346,0.007,43.671024,-79.296712,1.0,26.0,2
Bridle Path,314107,0.12,43.7354,-79.370883,0.0,1.0,0
Kensington Market,23335,0.143,43.655214,-79.40226,3.0,50.0,3
Alexandra Park,19687,0.179,43.650758,-79.404298,4.0,50.0,3
Agincourt,25750,0.193,43.785353,-79.278549,11.0,18.0,2


Let's show our clusters on the map of Toronto.

In [15]:
# create map
map_clusters = folium.Map(location=[43.7, -79.384293], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map; rows without coordinates cannot be drawn and are skipped
located_clusters = data_clustered.dropna(subset=['Lat', 'Lon'])
for lat, lon, poi, cluster in zip(located_clusters['Lat'], located_clusters['Lon'],
                                  located_clusters.index, located_clusters['Cluster Labels']):
    # labels start at 0, so they index the palette directly
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Finally, let's inspect our clusters and draw some conclusions.

Cluster 1 (label 0):

In [16]:
# Neighborhoods whose k-means label is 0
data_clustered[data_clustered['Cluster Labels'].eq(0)]

Unnamed: 0_level_0,Income,Chinese Language Proportion,Lat,Lon,Chinese Food Count,Total Food Count,Cluster Labels
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Oakridge,21155,0.000,43.697174,-79.274823,0.0,4.0,0
Crescent Town,23021,0.000,43.695403,-79.293099,0.0,0.0,0
Leaside,82670,0.000,43.704798,-79.368090,0.0,5.0,0
Bridle Path,314107,0.120,43.735400,-79.370883,0.0,1.0,0
Cliffside,32701,0.023,43.711170,-79.248177,0.0,6.0,0
West Rouge,44605,0.028,43.785963,-79.130756,0.0,1.0,0
Steeles,26660,0.237,43.816178,-79.314538,0.0,0.0,0
Bayview Village,46752,0.084,43.769197,-79.376662,0.0,6.0,0
Bayview Woods Steeles,41485,0.093,43.798127,-79.382973,0.0,0.0,0
Cricket Club,104362,0.000,43.739826,-79.418975,0.0,5.0,0


Cluster 2 (label 1):

In [17]:
# Neighborhoods whose k-means label is 1
data_clustered[data_clustered['Cluster Labels'].eq(1)]

Unnamed: 0_level_0,Income,Chinese Language Proportion,Lat,Lon,Chinese Food Count,Total Food Count,Cluster Labels
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Regent Park/Trefann Court,19521,0.0,43.659279,-79.366135,1.0,28.0,1
Milliken,25243,0.266,43.823217,-79.301523,15.0,38.0,1
Leslieville,30886,0.094,43.6627,-79.332815,0.0,33.0,1
Fort York/Liberty Village,46086,0.0,43.642155,-79.410934,4.0,49.0,1
Lawrence Heights,29867,0.0,43.722778,-79.450933,2.0,42.0,1
York University Heights,24432,0.0,43.770226,-79.50218,0.0,23.0,1
Bay Street Corridor,40598,0.096,43.665275,-79.387528,1.0,45.0,1
Parkdale,26314,0.0,43.637178,-79.436004,3.0,28.0,1
Junction Triangle,28067,0.0,43.665478,-79.470352,0.0,35.0,1
Trinity Bellwoods,31106,0.0,43.647627,-79.413879,1.0,37.0,1


Cluster 3 (label 2):

In [18]:
# Neighborhoods whose k-means label is 2
data_clustered[data_clustered['Cluster Labels'].eq(2)]

Unnamed: 0_level_0,Income,Chinese Language Proportion,Lat,Lon,Chinese Food Count,Total Food Count,Cluster Labels
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
The Beaches,67536,0.007,43.671024,-79.296712,1.0,26.0,2
Upper Beaches,44346,0.007,43.671024,-79.296712,1.0,26.0,2
Agincourt,25750,0.193,43.785353,-79.278549,11.0,18.0,2
East Danforth,33847,0.042,43.68636,-79.300316,1.0,16.0,2
Riverdale,40139,0.067,43.66547,-79.352594,7.0,22.0,2
Willowdale,39895,0.079,43.769814,-79.41381,1.0,33.0,2
Wexford,28556,0.0,43.745377,-79.294715,1.0,11.0,2
St. James Town,22341,0.0,43.669403,-79.372704,0.0,19.0,2
Yorkville,105239,0.0,43.671386,-79.390168,2.0,50.0,2
Old East York,33172,0.0,43.670862,-79.372792,0.0,20.0,2


Cluster 4 (label 3):

In [19]:
# Neighborhoods whose k-means label is 3
data_clustered[data_clustered['Cluster Labels'].eq(3)]

Unnamed: 0_level_0,Income,Chinese Language Proportion,Lat,Lon,Chinese Food Count,Total Food Count,Cluster Labels
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Kensington Market,23335,0.143,43.655214,-79.40226,3.0,50.0,3
Alexandra Park,19687,0.179,43.650758,-79.404298,4.0,50.0,3
