**Segmenting and Clustering Neighborhoods in Toronto**

Part 1

*Pre-processing and data retrieval*

In [1]:
# Libraries
import pandas as pd
import numpy as np
import requests

!pip install beautifulsoup4
!pip install lxml
from bs4 import BeautifulSoup



In [2]:
# Data Retrieval
wikiurl="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
table_class="wikitable sortable jquery-tablesorter"
response=requests.get(wikiurl)
print(response.status_code)

200


In [3]:
# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(response.text, 'html.parser')
indiatable=soup.find('table',{'class':"wikitable"})

In [4]:
df=pd.read_html(str(indiatable))

# convert list to dataframe
df=pd.DataFrame(df[0])
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
df.shape

(180, 3)

*Data Cleaning*

In [6]:
df = df[df.Borough != 'Not assigned']

df['Neighbourhood'] = np.where(df['Neighbourhood']=='Not assigned', 
                           df['Borough'],      
                           df['Neighbourhood'])     
df.reset_index(inplace = True)
del df['index']
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
df.shape

(103, 3)

Part 2

In [8]:
geo_df = pd.read_csv('http://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
df2 = pd.merge(left=df, right=geo_df, how='left', left_on='Postal Code', right_on='Postal Code')

Part 3


*Library*

In [10]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


*Generate Map of Toronto*

In [11]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto is {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto is 43.6534817, -79.3839347.


In [12]:
map_t = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df2['Latitude'], df2['Longitude'], df2['Borough'], df2['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_t)  
    
map_t

*screenshot of map in case Github does not show*

In [32]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://raw.githubusercontent.com/tylerhys/capstone/tylerhys-SCNSHOTS/toronto.PNG")

*Exploring Venues in Toronto*

In [13]:
# Credentials
CLIENT_ID = 'FDEQATRDUJMGFTJL5SZH2M3RHIIXJ00BANMCUVBG2TB4FJ5U' # your Foursquare ID
CLIENT_SECRET = 'VV3HJAUBIL1OGJIMQZESK3T1FVDUUQMHPWCAT3PW2IXZ5SWL' # your Foursquare Secret
ACCESS_TOKEN = '4Q3LIZDIPLDM2CHKNKEEDDXV1X5MAKWF4S0T2XBNFSNGDPBK' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: FDEQATRDUJMGFTJL5SZH2M3RHIIXJ00BANMCUVBG2TB4FJ5U
CLIENT_SECRET:VV3HJAUBIL1OGJIMQZESK3T1FVDUUQMHPWCAT3PW2IXZ5SWL


In [14]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [15]:
t_venues = getNearbyVenues(names=df2['Neighbourhood'].where(df2['Borough'].str.contains("Toronto")),
                                   latitudes=df2['Latitude'],
                                   longitudes=df2['Longitude']
                                  )

nan
nan
Regent Park, Harbourfront
nan
Queen's Park, Ontario Provincial Government
nan
nan
nan
nan
Garden District, Ryerson
nan
nan
nan
nan
nan
St. James Town
nan
nan
nan
The Beaches
Berczy Park
nan
nan
nan
Central Bay Street
Christie
nan
nan
nan
nan
Richmond, Adelaide, King
Dufferin, Dovercourt Village
nan
nan
nan
nan
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
nan
nan
nan
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
nan
nan
nan
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
nan
nan
nan
nan
nan
Studio District
nan
nan
nan
nan
nan
nan
Lawrence Park
Roselawn
nan
nan
nan
nan
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
nan
nan
nan
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
nan
nan
nan
Davisville
University of Toronto, Harbord
Runnymede, Swansea
nan
Moore Park, Summerhill East


In [16]:
# Number of venues
t_venues.dropna(axis=0, inplace = True)
print(t_venues.shape)
t_venues.head()

(854, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
8,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
9,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
10,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
11,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
12,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [17]:
#Find out number of unique categories from returned venues
t_venues.groupby('Neighbourhood').count()
print('There are {} uniques categories.'.format(len(t_venues['Venue Category'].unique())))

There are 197 uniques categories.


*Find Top 10 Venues of each Neighbourhood*

In [18]:
# one hot encoding
t_onehot = pd.get_dummies(t_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
t_onehot['Neighbourhood'] = t_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [t_onehot.columns[-1]] + list(t_onehot.columns[:-1])
t_onehot = t_onehot[fixed_columns]

t_grouped = t_onehot.groupby('Neighbourhood').mean().reset_index()

In [19]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [20]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = t_grouped['Neighbourhood']

for ind in np.arange(t_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(t_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Seafood Restaurant,Coffee Shop,Cocktail Bar,Beer Bar,Farmers Market,Fish Market,Tailor Shop,Jazz Club,Breakfast Spot,Liquor Store
1,"Brockton, Parkdale Village, Exhibition Place",Café,Breakfast Spot,Nightclub,Coffee Shop,Performing Arts Venue,Climbing Gym,Restaurant,Italian Restaurant,Intersection,Stadium
2,"Business reply mail Processing Centre, South C...",Skate Park,Burrito Place,Fast Food Restaurant,Farmers Market,Brewery,Auto Workshop,Restaurant,Pizza Place,Gym / Fitness Center,Comic Shop
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport,Harbor / Marina,Coffee Shop,Plane,Rental Car Location,Sculpture Garden,Bar,Boat or Ferry
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Office,Sandwich Place,Bubble Tea Shop,Portuguese Restaurant,Poke Place,Pizza Place,Yoga Studio


*Cluster Neighbourhoods*

In [21]:
# set number of clusters
kclusters = 5

t_grouped_clustering = t_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(t_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [22]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

t_merged = pd.merge(left=df2, right=neighbourhoods_venues_sorted, how='left', left_on='Neighbourhood', right_on='Neighbourhood')

t_merged.dropna(axis=0, inplace = True)

t_merged.head() 

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Coffee Shop,Bakery,Park,Breakfast Spot,Theater,Restaurant,Café,Pub,Chocolate Shop,Yoga Studio
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0.0,Coffee Shop,Sushi Restaurant,Yoga Studio,Beer Bar,Sandwich Place,Burrito Place,Café,Portuguese Restaurant,College Auditorium,Nightclub
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0.0,Café,Theater,Coffee Shop,Burrito Place,Hotel,Electronics Store,Sandwich Place,Japanese Restaurant,Diner,Burger Joint
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0.0,Gastropub,Café,Farmers Market,Coffee Shop,Gym,Food Truck,Hotel,Italian Restaurant,Japanese Restaurant,Jazz Club
19,M4E,East Toronto,The Beaches,43.676357,-79.293031,4.0,Trail,Health Food Store,Neighborhood,Pub,Yoga Studio,Cuban Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store


In [29]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(t_merged['Latitude'], t_merged['Longitude'], t_merged['Neighbourhood'], t_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

*screenshot of map in case Github does not show*

In [31]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://raw.githubusercontent.com/tylerhys/capstone/tylerhys-SCNSHOTS/cluster.PNG")

*Examine Clusters*

Cluster 1

In [24]:
t_merged.loc[t_merged['Cluster Labels'] == 0, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,0.0,Coffee Shop,Bakery,Park,Breakfast Spot,Theater,Restaurant,Café,Pub,Chocolate Shop,Yoga Studio
4,Downtown Toronto,0.0,Coffee Shop,Sushi Restaurant,Yoga Studio,Beer Bar,Sandwich Place,Burrito Place,Café,Portuguese Restaurant,College Auditorium,Nightclub
9,Downtown Toronto,0.0,Café,Theater,Coffee Shop,Burrito Place,Hotel,Electronics Store,Sandwich Place,Japanese Restaurant,Diner,Burger Joint
15,Downtown Toronto,0.0,Gastropub,Café,Farmers Market,Coffee Shop,Gym,Food Truck,Hotel,Italian Restaurant,Japanese Restaurant,Jazz Club
20,Downtown Toronto,0.0,Seafood Restaurant,Coffee Shop,Cocktail Bar,Beer Bar,Farmers Market,Fish Market,Tailor Shop,Jazz Club,Breakfast Spot,Liquor Store
24,Downtown Toronto,0.0,Coffee Shop,Café,Italian Restaurant,Office,Sandwich Place,Bubble Tea Shop,Portuguese Restaurant,Poke Place,Pizza Place,Yoga Studio
25,Downtown Toronto,0.0,Grocery Store,Café,Park,Baby Store,Coffee Shop,Nightclub,Italian Restaurant,Candy Store,Restaurant,Donut Shop
30,Downtown Toronto,0.0,Coffee Shop,Café,Pizza Place,Restaurant,Steakhouse,Bakery,Speakeasy,Smoke Shop,Hotel,Seafood Restaurant
31,West Toronto,0.0,Bakery,Pharmacy,Supermarket,Bank,Bar,Brewery,Pool,Café,Playground,Middle Eastern Restaurant
36,Downtown Toronto,0.0,Plaza,Café,Park,Hotel,Dance Studio,Deli / Bodega,Sporting Goods Shop,Basketball Stadium,Lake,Supermarket


Cluster 2

In [25]:
t_merged.loc[t_merged['Cluster Labels'] == 1, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
62,Central Toronto,1.0,Garden,Home Service,Yoga Studio,Deli / Bodega,Electronics Store,Eastern European Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store


Cluster 3

In [26]:
t_merged.loc[t_merged['Cluster Labels'] == 2, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
83,Central Toronto,2.0,Tennis Court,Restaurant,Dance Studio,Eastern European Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner,Dessert Shop


Cluster 4

In [27]:
t_merged.loc[t_merged['Cluster Labels'] == 3, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
61,Central Toronto,3.0,Park,Swim School,Bus Line,Business Service,Escape Room,Eastern European Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store
68,Central Toronto,3.0,Park,Jewelry Store,Trail,Sushi Restaurant,Bus Line,Yoga Studio,Deli / Bodega,Donut Shop,Dog Run,Distribution Center
91,Downtown Toronto,3.0,Park,Playground,Trail,Cuban Restaurant,Eastern European Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store,Diner


Cluster 5

In [28]:
t_merged.loc[t_merged['Cluster Labels'] == 4, t_merged.columns[[1] + list(range(5, t_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
19,East Toronto,4.0,Trail,Health Food Store,Neighborhood,Pub,Yoga Studio,Cuban Restaurant,Donut Shop,Dog Run,Distribution Center,Discount Store


*Evaluation of Clusters*

Cluster 1 contains the most neighbourhoods, while having mostly eateries as their 1st Most Common Venue. As for the other 4 clusters,they seem to be more of suburbs, having their 1st Most Common Venue as outdoor venues such as Parks or Gardens.