# 1. Web Scraping

In [1]:
# import the necessary libraries
import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

from geopy.geocoders import Nominatim

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium 

To build upon the sample code given, I'll traverse one row vertically, and then move on to the next column.

Otherwise, the final data frame will only contain the postal codes from the first column of the table off of the Wikipedia page.

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
presoup = requests.get(url).text
soup = BeautifulSoup(presoup)

table_contents = []
table = soup.find('table')
for row in table.findAll('tr'):
    for col in row.findAll('td'):
        cell = {}
        if col.span.text == 'Not assigned':
            pass
        else:
            cell['Postal Code'] = col.p.text[:3]
            cell['Borough'] = col.p.span.text.split('(')[0]
            cell['Neighborhood'] = (((((col.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
            table_contents.append(cell)

In [3]:
# print(table_contents)
df = pd.DataFrame(table_contents)

df['Borough'] = df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

# 2. Setting up the data frame

I'll take the intersection between my neighborhood data frame and the data frame read from the lat-lon CSV.

In [4]:
latslons = pd.read_csv('./Geospatial_Coordinates.csv')

df1 = df.merge(latslons, how = 'inner', indicator = False)

display(df1.head())

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


# 3. Geocoder

Let's get started with the geospatial data analysis and visualization.

In [5]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent = "toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geographical coordinates of Toronto are 43.6534817, -79.3839347.


In [6]:
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


Create the map to display the neighborhoods in the data frame.

In [7]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location = [latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df1['Latitude'], df1['Longitude'], df1['Borough'], df1['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(map_toronto)  
    
map_toronto

We seem to have many neighborhoods from Downtown Toronto. I'll filter my data frame down to these points to keep this notebook more succinct.

In [8]:
dt_tor = df1[df1['Borough'] == 'Downtown Toronto'].reset_index(drop = True)

In [9]:
address = 'Downtown Toronto, Toronto'

geolocator = Nominatim(user_agent = "dt_toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Downtown Toronto are 43.6541737, -79.38081162653639.


In [10]:
dt_tor.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383


In [11]:
# create map of Manhattan using latitude and longitude values
dt_toronto = folium.Map(location = [latitude, longitude], zoom_start = 11)

# add markers to map
for lat, lng, label in zip(dt_tor['Latitude'], dt_tor['Longitude'], dt_tor['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = 'blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7,
        parse_html = False).add_to(dt_toronto)  
    
dt_toronto

The next step is to use Foursquare API and cluster the neighborhoods in our data frame.

In [12]:
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: ...
CLIENT_SECRET: ...


In [13]:
print(dt_tor.Neighborhood[:3])

0    Regent Park, Harbourfront
1     Garden District, Ryerson
2               St. James Town
Name: Neighborhood, dtype: object


Let's explore what's around the University of Toronto.

In [14]:
uni_string = dt_tor.loc[10, 'Neighborhood'].split(',')[0]

neighborhood_latitude = dt_tor.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = dt_tor.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = uni_string

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of University of Toronto are 43.6542599, -79.3606359.


In [15]:
LIMIT = 100
radius = 500 # 500 meter search radius

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

results = requests.get(url).json()

# the list of nearby venues
venues = results['response']['groups'][0]['items']

nearby_venues = pd.json_normalize(venues) # flatten JSON

In [16]:
display((nearby_venues.head()))
print()
print("{} venues found within a 500-meter radius around the University of Tokyo.".format(nearby_venues.shape[0]))

Unnamed: 0,referralId,reasons.count,reasons.items,venue.id,venue.name,venue.location.address,venue.location.crossStreet,venue.location.lat,venue.location.lng,venue.location.labeledLatLngs,...,venue.location.cc,venue.location.city,venue.location.state,venue.location.country,venue.location.formattedAddress,venue.categories,venue.photos.count,venue.photos.groups,venue.venuePage.id,venue.location.neighborhood
0,e-0-54ea41ad498e9a11e9e13308-0,0,"[{'summary': 'This spot is popular', 'type': '...",54ea41ad498e9a11e9e13308,Roselle Desserts,362 King St E,Trinity St,43.653447,-79.362017,"[{'label': 'display', 'lat': 43.65344672305267...",...,CA,Toronto,ON,Canada,"[362 King St E (Trinity St), Toronto ON M5A 1K...","[{'id': '4bf58dd8d48988d16a941735', 'name': 'B...",0,[],,
1,e-0-53b8466a498e83df908c3f21-1,0,"[{'summary': 'This spot is popular', 'type': '...",53b8466a498e83df908c3f21,Tandem Coffee,368 King St E,at Trinity St,43.653559,-79.361809,"[{'label': 'display', 'lat': 43.65355870959944...",...,CA,Toronto,ON,Canada,"[368 King St E (at Trinity St), Toronto ON, Ca...","[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",0,[],,
2,e-0-574c229e498ebb5c6b257902-2,0,"[{'summary': 'This spot is popular', 'type': '...",574c229e498ebb5c6b257902,Cooper Koo Family YMCA,461 Cherry St,,43.653249,-79.358008,"[{'label': 'display', 'lat': 43.65324910177244...",...,CA,Toronto,ON,Canada,"[461 Cherry St, Toronto ON M5A 0H7, Canada]","[{'id': '52e81612bcbc57f1066b7a37', 'name': 'D...",0,[],,
3,e-0-50760559e4b0e8c7babe2497-3,0,"[{'summary': 'This spot is popular', 'type': '...",50760559e4b0e8c7babe2497,Body Blitz Spa East,497 King Street East,btwn Sackville St and Sumach St,43.654735,-79.359874,"[{'label': 'display', 'lat': 43.65473505045365...",...,CA,Toronto,ON,Canada,[497 King Street East (btwn Sackville St and S...,"[{'id': '4bf58dd8d48988d1ed941735', 'name': 'S...",0,[],,
4,e-0-5612b1cc498e3dd742af0dc8-4,0,"[{'summary': 'This spot is popular', 'type': '...",5612b1cc498e3dd742af0dc8,Impact Kitchen,573 King St E,at St Lawrence St,43.656369,-79.35698,"[{'label': 'display', 'lat': 43.65636850543279...",...,CA,Toronto,ON,Canada,"[573 King St E (at St Lawrence St), Toronto ON...","[{'id': '4bf58dd8d48988d1c4941735', 'name': 'R...",0,[],,



44 venues found within a 500-meter radius around the University of Tokyo.


Let's clean up the data frame; nearby_venues is full of unnecessary information.

In [17]:
# helper function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [18]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.rename(columns = {'name': 'Venue Name', 'categories': 'Category',
                                'lat': 'Venue Latitude', 'lng': 'Venue Longitude'}, inplace = True)
nearby_venues.head()

Unnamed: 0,Venue Name,Category,Venue Latitude,Venue Longitude
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698


I'm going to use the same function as the New York notebook to construct my final data frame for the clustering process.

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [20]:
dt_tor_venues = getNearbyVenues(names = dt_tor['Neighborhood'],
                                   latitudes = dt_tor['Latitude'],
                                   longitudes = dt_tor['Longitude']
                                  )

Regent Park, Harbourfront
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


In [21]:
dt_tor_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


Everything seems to be fine up to this point. We can start working on clustering the neighborhoods.

# 4. Analyzing the neighborhoods

In [22]:
# one hot encoding
tor_onehot = pd.get_dummies(dt_tor_venues[['Venue Category']], prefix = "", prefix_sep = "")

# add neighborhood column back to dataframe
tor_onehot['Neighborhood'] = dt_tor_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [tor_onehot.columns[-1]] + list(tor_onehot.columns[:-1])
tor_onehot = tor_onehot[fixed_columns]

tor_onehot.head()

Unnamed: 0,Yoga Studio,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Thai Restaurant,Theater,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We have 204 different categories in this data frame. Let's see what's the most popular activity for each neighborhood.

In [23]:
tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()
for hood in tor_grouped['Neighborhood']:
    print("----" + hood + "----")
    temp = tor_grouped[tor_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending = False).reset_index(drop=True).head(1))
    print('\n')

----Berczy Park----
          venue  freq
0  Cocktail Bar  0.05


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
             venue  freq
0  Airport Service  0.17


----Central Bay Street----
         venue  freq
0  Coffee Shop  0.16


----Christie----
           venue  freq
0  Grocery Store  0.25


----Church and Wellesley----
         venue  freq
0  Coffee Shop  0.08


----Commerce Court, Victoria Hotel----
         venue  freq
0  Coffee Shop  0.15


----First Canadian Place, Underground city----
         venue  freq
0  Coffee Shop  0.11


----Garden District, Ryerson----
            venue  freq
0  Clothing Store  0.09


----Harbourfront East, Union Station, Toronto Islands----
         venue  freq
0  Coffee Shop  0.13


----Kensington Market, Chinatown, Grange Park----
  venue  freq
0  Café  0.07


----Regent Park, Harbourfront----
         venue  freq
0  Coffee Shop  0.16


----Richmond, Adelaide, King----
        

I'm borrowing the function in the New York notebook to rank the 10 most popular categories per neighborhood.

In [24]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns = columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Cocktail Bar,Bakery,Coffee Shop,Restaurant,Cheese Shop,Seafood Restaurant,Pub,Beer Bar,Farmers Market,Pharmacy
1,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Service,Airport Lounge,Airport Terminal,Plane,Airport,Boutique,Boat or Ferry,Sculpture Garden,Coffee Shop,Bar
2,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Burger Joint,Salad Place,Bubble Tea Shop,Japanese Restaurant,Sculpture Garden,Restaurant
3,Christie,Grocery Store,Café,Park,Coffee Shop,Candy Store,Nightclub,Restaurant,Baby Store,Italian Restaurant,Miscellaneous Shop
4,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant,Hotel,Mediterranean Restaurant,Men's Store,Fast Food Restaurant,Pub


# 5. Cluster the neighborhoods

I'll use K-Means to cluster the neighborhoods into 5 groups.

In [25]:
# set number of clusters
kclusters = 5

tor_grouped_clustering = tor_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters = kclusters, random_state = 0).fit(tor_grouped_clustering)

# check cluster labels generated for each row in the data frame
kmeans.labels_[0:10] 

array([0, 3, 0, 1, 0, 0, 0, 0, 0, 4])

Let's create a new data frame that includes the cluster as well as the top 10 venues for each neighborhood.

In [26]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

tor_merged = dt_tor

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
tor_merged = tor_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

tor_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Pub,Park,Bakery,Breakfast Spot,Theater,Café,Farmers Market,Chocolate Shop,Cosmetics Shop
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Clothing Store,Coffee Shop,Middle Eastern Restaurant,Cosmetics Shop,Bubble Tea Shop,Italian Restaurant,Japanese Restaurant,Café,Movie Theater,Lingerie Store
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Café,Coffee Shop,Cosmetics Shop,Restaurant,Cocktail Bar,Beer Bar,Japanese Restaurant,Creperie,Lingerie Store,Moroccan Restaurant
3,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Cocktail Bar,Bakery,Coffee Shop,Restaurant,Cheese Shop,Seafood Restaurant,Pub,Beer Bar,Farmers Market,Pharmacy
4,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383,0,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Burger Joint,Salad Place,Bubble Tea Shop,Japanese Restaurant,Sculpture Garden,Restaurant


In [27]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighborhood'], tor_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius = 5,
        popup = label,
        color = rainbow[cluster-1],
        fill = True,
        fill_color = rainbow[cluster-1],
        fill_opacity = 0.7).add_to(map_clusters)
       
map_clusters

5 cluster K-means picked 1 big cluster and made the other clusters look like outliers. Let's delve a bit deeper to see what's going on and what the algorithm picked up in each cluster.

# 6. Examine Clusters


### Cluster 1

In [28]:
tor_merged.loc[tor_merged['Cluster Labels'] == 0, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,0,Coffee Shop,Pub,Park,Bakery,Breakfast Spot,Theater,Café,Farmers Market,Chocolate Shop,Cosmetics Shop
1,Downtown Toronto,0,Clothing Store,Coffee Shop,Middle Eastern Restaurant,Cosmetics Shop,Bubble Tea Shop,Italian Restaurant,Japanese Restaurant,Café,Movie Theater,Lingerie Store
2,Downtown Toronto,0,Café,Coffee Shop,Cosmetics Shop,Restaurant,Cocktail Bar,Beer Bar,Japanese Restaurant,Creperie,Lingerie Store,Moroccan Restaurant
3,Downtown Toronto,0,Cocktail Bar,Bakery,Coffee Shop,Restaurant,Cheese Shop,Seafood Restaurant,Pub,Beer Bar,Farmers Market,Pharmacy
4,Downtown Toronto,0,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Burger Joint,Salad Place,Bubble Tea Shop,Japanese Restaurant,Sculpture Garden,Restaurant
6,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Thai Restaurant,Hotel,Clothing Store,Deli / Bodega,Gym,Sushi Restaurant,Pizza Place
7,Downtown Toronto,0,Coffee Shop,Aquarium,Café,Restaurant,Hotel,Fried Chicken Joint,Brewery,Scenic Lookout,History Museum,Baseball Stadium
8,Downtown Toronto,0,Coffee Shop,Hotel,Café,Restaurant,Bakery,Seafood Restaurant,Japanese Restaurant,Salad Place,Italian Restaurant,Lounge
9,Downtown Toronto,0,Coffee Shop,Restaurant,Café,Hotel,Gym,Deli / Bodega,Bakery,Seafood Restaurant,Italian Restaurant,Japanese Restaurant
14,Downtown Toronto,0,Café,Coffee Shop,Bakery,Pizza Place,Park,Pub,Italian Restaurant,Restaurant,Pet Store,Deli / Bodega


This is by large the biggest cluster and is dominated by cafés and coffee shops. This sums up the downtown pretty well (notice the pubs, bars, restaurants, and hotels).

### Cluster 2

In [29]:
tor_merged.loc[tor_merged['Cluster Labels'] == 1, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,Downtown Toronto,1,Grocery Store,Café,Park,Coffee Shop,Candy Store,Nightclub,Restaurant,Baby Store,Italian Restaurant,Miscellaneous Shop


This cluster seems to capture the areas where the people of Downtown Toronto do their grocery shopping.

### Cluster 3

In [30]:
tor_merged.loc[tor_merged['Cluster Labels'] == 2, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Downtown Toronto,2,Park,Playground,Trail,Yoga Studio,Moroccan Restaurant,Martial Arts School,Mediterranean Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant


This cluster seems to capture residential areas as the most common venues are parks, playgrounds, and trails.

### Cluster 4

In [31]:
tor_merged.loc[tor_merged['Cluster Labels'] == 3, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Downtown Toronto,3,Airport Service,Airport Lounge,Airport Terminal,Plane,Airport,Boutique,Boat or Ferry,Sculpture Garden,Coffee Shop,Bar


This is the cluster on the Centre Island; everything is related to the airport or the ferry service.

### Cluster 5

In [32]:
tor_merged.loc[tor_merged['Cluster Labels'] == 4, tor_merged.columns[[1] + list(range(5, tor_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Downtown Toronto,4,Café,Bar,Bakery,Japanese Restaurant,Bookstore,Sandwich Place,College Gym,Beer Store,Restaurant,College Arts Building
11,Downtown Toronto,4,Café,Bar,Coffee Shop,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Caribbean Restaurant,Mexican Restaurant,Grocery Store,Park,Gaming Cafe


This cluster is the most similar to Cluster 1. One can also see that it's physically the closest to Cluster 1.

The difference seems to be that cafés and bars are more popular in this cluster. Foreign cuisine also seems to be more popular in this cluster.

# 7. Acknowledgements

The helper functions in this notebook were taken from the last course of the IBM Data Science specialization.