# Segmenting and Clustering Neighborhoods in Toronto
## By Zhemin Wu
### This is the notebook for Assignment of Week 3 for the Coursera Applied Data Science Capstone project

In [2]:
# Import necessary modules
import pandas as pd
import numpy as np

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## Part 1: Preparing raw DataFrame by scraping from the wiki page

In [5]:
# Use Pandas web Scraping func to get dataframs from html tables
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
df_list = pd.read_html(url)

#The required table is the 1st one
df_raw = df_list[0]
df_raw.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
# Drop rows where borough == Not assigned
df_cleaned = df_raw[df_raw['Borough'] != 'Not assigned']
df_cleaned.reset_index(inplace=True, drop=True)

#Check if we have certain postal codes listed multiple-times
print("We have %d duplicated rows with same postal codes" % len(df_cleaned[df_cleaned.duplicated(subset='Postal Code')]))

#Check if we have certain rows with Neighbourhood Not assigned
print("We have %d rows with Neighbourhood not assigned" % len(df_cleaned[df_cleaned['Neighbourhood'] == 'Not assigned']))

# Actually we don't have any duplicated rows with same postal codes,
# and we don't have any rows with Neighbourhood not assigned
# The wiki page already contains the table in the format we want

We have 0 duplicated rows with same postal codes
We have 0 rows with Neighbourhood not assigned


In [7]:
df_cleaned.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
df_cleaned.shape

(103, 3)

## Part 2: Get latitude and the longitude coordinates of each Postal Code

In [6]:
# Download Geospatial Coordinates data
import wget
url = "https://cocl.us/Geospatial_data"
wget.download(url, 'Geospatial_Coordinates.csv')

100% [################################################################################]     2K / 2K

'Geospatial_Coordinates.csv'

In [9]:
# Read coordinates data and save into a DataFram
geo_df = pd.read_csv('Geospatial_Coordinates.csv')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Merge the coordinates data into our Neighbourhood data
df_merged = df_cleaned.join(geo_df.set_index('Postal Code'), on='Postal Code')
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part 3: Explore and cluster the neighborhoods in Toronto

In [11]:
# Filter out Toroto neighbourhoods
df_Toronto = df_merged[df_merged['Borough'].str.contains('Toronto')]
print("We have %d different postal codes in Toronto" % df_Toronto.shape[0])
df_Toronto.head()

We have 39 different postal codes in Toronto


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [12]:
# Get Latitude and Longitude of Toronto
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Display neighbourhoods in a map before clustering

In [13]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, code, borough, neighbourhood in zip(df_Toronto['Latitude'], df_Toronto['Longitude'], df_Toronto['Postal Code'], df_Toronto['Borough'], df_Toronto['Neighbourhood']):
    label = '{},{}: {}'.format(borough, code, neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

### Define Foursquare Credentials and Version (will hide credentials after running notebook for privacy protection)


In [13]:
CLIENT_ID = 'S11LJUAASU5DXZ0XFFU1F04EWFBFWTLSPLNPUNCLXIHVZPJA' # your Foursquare ID
CLIENT_SECRET = '541XGM2R1ATFA0PW23D3K1P0IE2SWNH1M0VUBML02S35SITW' # your Foursquare Secret
ACCESS_TOKEN = '2L1ML4I0BRXQ2Z3DLR4RRSSS0Q3KZGHSP352BSAD5V1MRZAL' # your FourSquare Access Token
VERSION = '20210201' # Foursquare API version, use 2021Feb1
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: S11LJUAASU5DXZ0XFFU1F04EWFBFWTLSPLNPUNCLXIHVZPJA
CLIENT_SECRET:541XGM2R1ATFA0PW23D3K1P0IE2SWNH1M0VUBML02S35SITW


### Define a func for exploring nearby venues around Toronto's spots with different postal codes

In [14]:
def getNearbyVenues(codes, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for code, lat, lng in zip(codes, latitudes, longitudes):
        print("Exploring around the postal code: %s" % code)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            code, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal Code', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Now get the nearby veneues for each of the postal codes in Toronto

In [15]:
toronto_venues = getNearbyVenues(df_Toronto['Postal Code'], df_Toronto['Latitude'], df_Toronto['Longitude'])

Exploring around the postal code: M5A
Exploring around the postal code: M7A
Exploring around the postal code: M5B
Exploring around the postal code: M5C
Exploring around the postal code: M4E
Exploring around the postal code: M5E
Exploring around the postal code: M5G
Exploring around the postal code: M6G
Exploring around the postal code: M5H
Exploring around the postal code: M6H
Exploring around the postal code: M5J
Exploring around the postal code: M6J
Exploring around the postal code: M4K
Exploring around the postal code: M5K
Exploring around the postal code: M6K
Exploring around the postal code: M4L
Exploring around the postal code: M5L
Exploring around the postal code: M4M
Exploring around the postal code: M4N
Exploring around the postal code: M5N
Exploring around the postal code: M4P
Exploring around the postal code: M5P
Exploring around the postal code: M6P
Exploring around the postal code: M4R
Exploring around the postal code: M5R
Exploring around the postal code: M6R
Exploring ar

In [16]:
toronto_venues.head()

Unnamed: 0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
1,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
2,M5A,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


### Check how many unique catetories we have obtained

In [17]:
print("There are %d different categories" % len(toronto_venues['Venue Category'].unique()))

There are 232 different categories


### Now analyze each postal code zone with one hot encoding

In [18]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Add back postal code
toronto_onehot['Postal Code'] = toronto_venues['Postal Code']

# move postal code column to the first column
columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[columns]

toronto_onehot.head()

Unnamed: 0,Postal Code,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Group by postal code by taking the mean of the frequency of occurrence of each category

In [19]:
toronto_grouped = toronto_onehot.groupby('Postal Code').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Postal Code,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Tibetan Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,...,0.02381,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.02381
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
toronto_grouped.shape

(39, 233)

### Let's create a dataframe to show the top 10 vanues for each postal code zone

In [21]:
# Define a func to return most common venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [22]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postal Code']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Postal Code'] = toronto_grouped['Postal Code']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Health Food Store,Pub,Neighborhood,Trail,Yoga Studio,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run
1,M4K,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Restaurant,Bubble Tea Shop,Bakery,Pub,Pizza Place
2,M4L,Fast Food Restaurant,Park,Food & Drink Shop,Liquor Store,Sandwich Place,Italian Restaurant,Restaurant,Ice Cream Shop,Steakhouse,Fish & Chips Shop
3,M4M,Coffee Shop,Gastropub,Brewery,Café,Bakery,American Restaurant,Yoga Studio,Neighborhood,Cheese Shop,Clothing Store
4,M4N,Park,Bus Line,Swim School,Business Service,Event Space,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


### Now it's the time to build a K-Means cluster model for clustering all the postal code zones

In [23]:
# Create features matrix by dropping the Postal Code column
toronto_grouped_clustering = toronto_grouped.drop('Postal Code', axis=1)

# Number of clusters
num_clusters = 5

# Create KMeans model object and fit to the feature data
km_model = KMeans(init='k-means++', n_clusters=num_clusters, n_init=12, random_state=0)
km_model.fit(toronto_grouped_clustering)


KMeans(n_clusters=5, n_init=12, random_state=0)

In [24]:
# Get cluster labels
cluster_labels = km_model.labels_

# Insert into datafram
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', cluster_labels)
neighborhoods_venues_sorted.head()

Unnamed: 0,Cluster Labels,Postal Code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,M4E,Health Food Store,Pub,Neighborhood,Trail,Yoga Studio,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run
1,4,M4K,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store,Restaurant,Bubble Tea Shop,Bakery,Pub,Pizza Place
2,0,M4L,Fast Food Restaurant,Park,Food & Drink Shop,Liquor Store,Sandwich Place,Italian Restaurant,Restaurant,Ice Cream Shop,Steakhouse,Fish & Chips Shop
3,4,M4M,Coffee Shop,Gastropub,Brewery,Café,Bakery,American Restaurant,Yoga Studio,Neighborhood,Cheese Shop,Clothing Store
4,4,M4N,Park,Bus Line,Swim School,Business Service,Event Space,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [25]:
# Merge with Toronto DF
df_Toronto_cluster = df_Toronto.join(neighborhoods_venues_sorted.set_index('Postal Code'), on='Postal Code' )
df_Toronto_cluster.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4,Coffee Shop,Bakery,Pub,Park,Breakfast Spot,Restaurant,Café,Theater,Yoga Studio,Cosmetics Shop
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,4,Coffee Shop,Sushi Restaurant,College Auditorium,Beer Bar,Smoothie Shop,Sandwich Place,Burrito Place,Café,Portuguese Restaurant,General Entertainment
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,4,Clothing Store,Coffee Shop,Bubble Tea Shop,Cosmetics Shop,Café,Japanese Restaurant,Middle Eastern Restaurant,Ramen Restaurant,Movie Theater,Theater
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,4,Coffee Shop,Café,Gastropub,American Restaurant,Cocktail Bar,Hotel,Department Store,Farmers Market,Restaurant,Bakery
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Pub,Neighborhood,Trail,Yoga Studio,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run


## Finally, display clustering of postal code zone in a map

In [26]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(num_clusters)
ys = [i + x + (i*x)**2 for i in range(num_clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, code, borough, neigh, cluster in zip(df_Toronto_cluster['Latitude'], 
                                                   df_Toronto_cluster['Longitude'], 
                                                   df_Toronto_cluster['Postal Code'], 
                                                   df_Toronto_cluster['Borough'], 
                                                   df_Toronto_cluster['Neighbourhood'], 
                                                   df_Toronto_cluster['Cluster Labels']):
    label = folium.Popup('Cluster {}-{},{}: {}'.format(cluster, borough, code, neigh), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters