## Task 3: Explore and cluster neighborhoods in Toronto

#### 1. Import libraries

In [35]:
import pandas as pd
import numpy as np
import requests
from geopy.geocoders import Nominatim
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

#### 2. Import dataframe from Task 2

In [29]:
# Read data
df = pd.read_csv('can_geo_df.csv')

# Extract only rows that are in Toronto neighborhood
df = df[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
df.sort_values(by=['PostalCode'], inplace=True)
df.reset_index(drop=True, inplace=True)

print(f'Shape of final dataframe: {df.shape}')
print('A snippet of the first 5 rows the dataframe: ')
df.head()

Shape of final dataframe: (39, 5)
A snippet of the first 5 rows the dataframe: 


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


#### 3. Cluster neighborhoods

#### 3.1. Cluster using KMeans

#### **Step 1**:  Retrieve nearby venues to generate more attributes

In [3]:
CLIENT_ID = 'QT00YVC2AY2FQHORW0EUGCOWUIE2ZQ0UFAQIDWR43EJRQAWO' # your Foursquare ID
CLIENT_SECRET = '3BP5F0GKG5BCT1ZJ1M41N4YWCOW0EQIQVDOOSJ21MD3I53Z0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [4]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):    
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)    
        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [5]:
# Get nearby venues of all neighborhoods using Postal Code as differentiator
toronto_venues = getNearbyVenues(names=df['PostalCode'], 
                                 latitudes=df['Latitude'], 
                                 longitudes=df['Longitude'])

In [6]:
print('This is the list of all nearby venues in Toronto with the neighborhood that they are in:')
toronto_venues

This is the list of all nearby venues in Toronto with the neighborhood that they are in:


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4E,43.676357,-79.293031,Dip 'n Sip,43.678897,-79.297745,Coffee Shop
...,...,...,...,...,...,...,...
1598,M7Y,43.662744,-79.321558,Amin Car Repair Garage,43.663544,-79.320130,Auto Workshop
1599,M7Y,43.662744,-79.321558,The Ashbridge Estate,43.664691,-79.321805,Garden
1600,M7Y,43.662744,-79.321558,TTC Russell Division,43.664908,-79.322560,Light Rail Station
1601,M7Y,43.662744,-79.321558,Jonathan Ashbridge Park,43.664702,-79.319898,Park


#### **Step 2:** Create a one-hot vector of the mean frequency of each catergory in each neighborhood

In [36]:
# Step 1: Create one-hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Step 2: Add the Postal code column to the dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# Step 3: Group and average the frequency of unique category in each neighborhood
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [37]:
print(f'The shape of the average dataframe grouped by Neighborhood postal code is: {toronto_grouped.shape}')
print('A snippet of the dataframe - ready for k-means clustering:')
toronto_grouped.head()

The shape of the average dataframe grouped by Neighborhood postal code is: (39, 231)
A snippet of the dataframe - ready for k-means clustering:


Unnamed: 0,Neighborhood,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,...,0.0,0.02381,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### **Step 3:** From the engineered features, apply K-mean clustering

In [38]:
# Set up k and run KMean
k = 3
km = KMeans(n_clusters=k, random_state=0).fit(toronto_grouped.drop(columns={'Neighborhood'}, axis=1))

In [39]:
# Label neighborhood with their cluster number
toronto_grouped.insert(1, 'Cluster', km.labels_)
toronto_grouped.head()

Unnamed: 0,Neighborhood,Cluster,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,M4E,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,...,0.0,0.02381,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0
2,M4L,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M4N,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
print('Number of members in each cluster: ')
toronto_grouped['Cluster'].value_counts()

Number of members in each cluster: 


0    34
2     4
1     1
Name: Cluster, dtype: int64

#### 3.2. Characterise clusters

In [22]:
# Create a mean one-hot encoding for each cluster
toronto_cluster = toronto_grouped.groupby('Cluster').mean().reset_index()
toronto_cluster

Unnamed: 0,Cluster,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theme Restaurant,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0.00845,0.00173,0.00173,0.00173,0.00346,0.00519,0.00346,0.009595,0.0022,...,0.000382,0.0007,0.001783,0.006583,0.000882,0.007585,0.001275,0.004385,0.003499,0.000668
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.175,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [42]:
# Characterise cluster based on popular venue types

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Cluster']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
cluster_venues = pd.DataFrame(columns=columns)
cluster_venues['Cluster'] = toronto_cluster['Cluster']

for ind in np.arange(toronto_cluster.shape[0]):
    cluster_venues.iloc[ind, 1:] = return_most_common_venues(toronto_cluster.iloc[ind, :], num_top_venues)

cluster_venues

Unnamed: 0,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,Coffee Shop,Café,Restaurant,Park,Italian Restaurant,Pizza Place,Bakery,Sandwich Place,Pub,Bar
1,1,Pool,Garden,Wine Shop,Dance Studio,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center
2,2,Park,Trail,Bus Line,Playground,Swim School,Tennis Court,Jewelry Store,Sushi Restaurant,Dance Studio,Deli / Bodega


#### **Comment:** Based on the common venues in each cluster, we can conclude that:
- Cluster 1: Eatery cluster with lots of F&B places
- Cluster 2: Relaxed spot with chilling places like pools and gardens
- Cluster 3: Common place with public transport

#### 3.3. Visualise clusters on Map

In [30]:
# Label neighborhoods in DF
df.insert(1, 'Cluster', km.labels_)
df.head()

Unnamed: 0,PostalCode,Cluster,Borough,Neighborhood,Latitude,Longitude
0,M4E,0,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,0,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,0,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,0,East Toronto,Studio District,43.659526,-79.340923
4,M4N,2,Central Toronto,Lawrence Park,43.72802,-79.38879


In [43]:
# Generate the latitude and longitude of Toronto
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ca_toronto")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [46]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighborhood'], df['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### In case the map is not viewable on Github, here is the screenshot of the map run on my notebook:
<img src="cluster_toronto.png">