## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import requests
import pandas as pd
import numpy as np
from lxml import etree
from bs4 import BeautifulSoup as bsoup
import os
import json
#!conda install -c conda-forge geocoder
import geocoder

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.6.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [2]:
# The code was removed by Watson Studio for sharing.

In [3]:
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [4]:
raw_random_wikipedia_page = requests.get(wikipedia_link)

In [5]:
page = raw_random_wikipedia_page.text
#print(page)

In [6]:
soup = bsoup(page, "lxml")
tablePostalCode = soup.find_all("table")[0]
rows = tablePostalCode.find_all("tr")
listPostalCode = []
for row in rows:
    tds = row.find_all("td")
    for td in tds:
        if (td.p.span.find("i") is not None) and ("Not assigned" in td.p.span.i.text):
            #print("Ignored")
            continue
        spanText = td.p.span.text
        #print(spanText)
        code = td.p.b.text
        aTags = td.find_all("a")
        borough = spanText[0 : spanText.find("(")]
        neighborhood = spanText[spanText.find("(") + 1: len(spanText) - 1]
        listPostalCode.append({"PostalCode": code, "Borough": borough, "Neighborhood": neighborhood})
listPostalCode

[{'Borough': 'North York', 'Neighborhood': 'Parkwoods', 'PostalCode': 'M3A'},
 {'Borough': 'North York',
  'Neighborhood': 'Victoria Village',
  'PostalCode': 'M4A'},
 {'Borough': 'Downtown Toronto',
  'Neighborhood': 'Regent Park / Harbourfront',
  'PostalCode': 'M5A'},
 {'Borough': 'North York',
  'Neighborhood': 'Lawrence Manor / Lawrence Heights',
  'PostalCode': 'M6A'},
 {'Borough': "Queen's Park",
  'Neighborhood': "Queen's Park",
  'PostalCode': 'M7A'},
 {'Borough': 'Etobicoke',
  'Neighborhood': 'Islington Avenue',
  'PostalCode': 'M9A'},
 {'Borough': 'Scarborough',
  'Neighborhood': 'Malvern / Rouge',
  'PostalCode': 'M1B'},
 {'Borough': 'North York',
  'Neighborhood': 'Don Mills)Nort',
  'PostalCode': 'M3B'},
 {'Borough': 'East York',
  'Neighborhood': 'Parkview Hill / Woodbine Gardens',
  'PostalCode': 'M4B'},
 {'Borough': 'Downtown Toronto',
  'Neighborhood': 'Garden District, Ryerson',
  'PostalCode': 'M5B'},
 {'Borough': 'North York', 'Neighborhood': 'Glencairn', 'PostalC

In [7]:
columns = ["PostalCode", "Borough", "Neighborhood"]
dfPostalCode = pd.DataFrame.from_records(data=listPostalCode, columns=columns)
dfPostalCode.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Queen's Park,Queen's Park


Explain my work and any assumptions I made: Firstly, I used etree lib from lxml to scrap Postal Code table content, then pd.read_html lib to convert html content to data frame as code below:

root = etree.XML(page) 

tableTag = etree.XPath("//table") 

postalCodeTorontoElems = tableTag(root)[0] 

listDfTorontoPostalCode = pd.read_html(etree.tostring(postalCodeTorontoElems,method='html')) 

listCol = list(listDfTorontoPostalCode[0].columns) 

print(listDfTorontoPostalCode[0].shape) 

dfTorontoPostalCode = listDfTorontoPostalCode[0]

I realized that there were some difficult cases to seperate which is borough or neighborhood, because the format/structure of cells in Wiki Postal Code table was not the same while all content in each cell of data frame is text and no spaces. For example, cell(1,7), cell(3,3), cell(2,8), cell(3,8).

Thus, I used etree lib to read and seperate each element that I was interested in. But it's not a good way.

Then, it's realy luck to me when I saw the note in this assignment. I used BeautifulSoup lib as recommendation in note of assignment. It's really easy and make my task on the fly!

I supposed that the text before character "(" is represented the borough, and all text in "()" is represented the list of neighborhoods in borough.

In [8]:
dfPostalCode.shape

(103, 3)

## Make calls to the Google Geocoding API to get the latitude and longitude coordinates of the postal codes in dataframe

In [9]:
# The code was removed by Watson Studio for sharing.

In [10]:
def updateLatLngForTorontoPostalCode(dfPostalCode, loadAPI = False):
    if os.path.isfile('TorontoPostalCode.csv') and loadAPI == False:
        print("Load data from csv")
        dfPostalCode = pd.read_csv('TorontoPostalCode.csv')
    else:
        for row in list(range(0,dfPostalCode.shape[0])):
            #print("At index: {0}, Postal Code: {1}".format(row, dfPostalCode.iloc[row,0]))
            url = URL.format(API_KEY, dfPostalCode.iloc[row,0])
            #print("Get url: {0}".format(url))
            response = requests.get(url).json() # get response
            if response["status"] == "ZERO_RESULTS":
                print("NO RESULT at row index {0}: row".format(row))
                continue
            else:
                geographical_data = response["results"][0]["geometry"]["location"] # get geographical coordinates
                latitude = geographical_data['lat']
                longitude = geographical_data['lng']
                dfPostalCode.iloc[row, dfPostalCode.columns.get_loc('Latitude')] = latitude
                dfPostalCode.iloc[row, dfPostalCode.columns.get_loc('Longitude')] = longitude
    return dfPostalCode

In [11]:
def updateLatLngForTorontoPostalCodeUseGeocoder(dfPostalCode, loadAPI = False):
    if os.path.isfile('TorontoPostalCode.csv') and loadAPI == False:
        print("Load data from csv")
        dfPostalCode = pd.read_csv('TorontoPostalCode.csv')
    else:
        latitude = []
        longitude = []

        for pc in dfPostalCode["PostalCode"].values:
            coordsLatLng = None
            while(coordsLatLng is None):
                g = geocoder.google('{}, Toronto, Ontario'.format(pc))
                coordsLatLng = g.latlng

            latitude.append(coordsLatLng[0])
            longitude.append(coordsLatLng[1])

        dfPostalCode['Latitude'] = latitude
        dfPostalCode['Longitude'] = longitude

    return dfPostalCode

In [12]:
# Useless
#dfPostalCode = updateLatLngForTorontoPostalCode(dfPostalCode, True)
# Using
dfPostalCode = updateLatLngForTorontoPostalCodeUseGeocoder(dfPostalCode, True)
dfPostalCode

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.654260,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,Malvern / Rouge,43.806686,-79.194353
7,M3B,North York,Don Mills)Nort,43.745906,-79.352188
8,M4B,East York,Parkview Hill / Woodbine Gardens,43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [13]:
dfPostalCode.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [14]:
# Backup to use later instead of making request to Google Geoencoding API
fnPostCodeCSV = 'TorontoPostalCode.csv'
dfPostalCode.to_csv(fnPostCodeCSV, sep=',', encoding='utf-8', index=False)

In [15]:
# Check csv again
dfTest = pd.read_csv('TorontoPostalCode.csv')
dfTest.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## Use geopy library to get the latitude and longitude values of Toronto City.

In [16]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [17]:
# create map of Toronto using latitude and longitude values
mapToronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(dfPostalCode['Latitude'], dfPostalCode['Longitude'], dfPostalCode['Borough'], dfPostalCode['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        #parse_html=False
    ).add_to(mapToronto)  
    
mapToronto

## I will perform segment and cluster only the boroughs that contain the word Toronto.

In [18]:
dfTorontoBoroughs = dfPostalCode[dfPostalCode.Borough.str.contains("Toronto")].reset_index(drop=True)
dfTorontoBoroughs

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
5,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
6,M6G,Downtown Toronto,Christie,43.669542,-79.422564
7,M5H,Downtown Toronto,Richmond / Adelaide / King,43.650571,-79.384568
8,M6H,West Toronto,Dufferin / Dovercourt Village,43.669005,-79.442259
9,M4J,East YorkEast Toronto,The Danforth East,43.685347,-79.338106


In [20]:
# Still create map of latitude and longitude values of Toronto city
mapTorontoBorough = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers of borough that contains word Toronto to map
for lat, lng, borough, neighborhood in zip(dfTorontoBoroughs['Latitude'], dfTorontoBoroughs['Longitude'], dfTorontoBoroughs['Borough'], dfTorontoBoroughs['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        #parse_html=False
    ).add_to(mapTorontoBorough)  
    
mapTorontoBorough

## Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

In [21]:
# The code was removed by Watson Studio for sharing.

In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, limit)
            
        # make the GET request
        response = requests.get(url).json()["response"]
        if "groups" not in response:
            print("No venues at Neighborhood {0} ({1}, {2})".format(name, lat, lng))
            continue
        results = response['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Now, let's get 100 venues that are in Toronto boroughs within a radius of 500 meters.

In [23]:
dfBoroughTorontoVenues = getNearbyVenues(dfTorontoBoroughs['Neighborhood'], dfTorontoBoroughs['Latitude'], dfTorontoBoroughs['Longitude'], radius=500)
dfBoroughTorontoVenues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Regent Park / Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,Regent Park / Harbourfront,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,Regent Park / Harbourfront,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
3,Regent Park / Harbourfront,43.65426,-79.360636,Cooper Koo YMCA,43.653191,-79.357947,Gym / Fitness Center
4,Regent Park / Harbourfront,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


How many venues were returned for each neighborhood?

In [24]:
dfBoroughTorontoVenues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,57,57,57,57,57,57
Brockton / Parkdale Village / Exhibition Place,22,22,22,22,22,22
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport,13,13,13,13,13,13
Central Bay Street,88,88,88,88,88,88
Christie,16,16,16,16,16,16
Church and Wellesley,80,80,80,80,80,80
Commerce Court / Victoria Hotel,100,100,100,100,100,100
Davisville,35,35,35,35,35,35
Davisville North,8,8,8,8,8,8
Dufferin / Dovercourt Village,16,16,16,16,16,16


How many unique categories can be curated from all the returned venues?

In [25]:
print('There are {} uniques categories.'.format(len(dfBoroughTorontoVenues['Venue Category'].unique())))

There are 225 uniques categories.


## Analyze Each Neighborhood

In [26]:
# one hot encoding
onehotBoroughToronto = pd.get_dummies(dfBoroughTorontoVenues[['Venue Category']], prefix="", prefix_sep="")

# found that there was column name "Neighborhood" in this data frame, so I rename it to Venue Neighborhood
onehotBoroughToronto.rename(columns={'Neighborhood': 'Venue Neighborhood'}, inplace=True)

# get list of column name before adding neighborhood column
fixed_columns = onehotBoroughToronto.columns

# add neighborhood column back to dataframe
onehotBoroughToronto['Neighborhood'] = dfBoroughTorontoVenues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = ["Neighborhood"] + list(fixed_columns)
onehotBoroughToronto = onehotBoroughToronto[fixed_columns]

onehotBoroughToronto.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store,Yoga Studio
0,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Regent Park / Harbourfront,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [27]:
onehotBoroughTorontoGrouped = onehotBoroughToronto.groupby('Neighborhood').mean().reset_index()
onehotBoroughTorontoGrouped

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Brockton / Parkdale Village / Exhibition Place,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455
2,CN Tower / King and Spadina / Railway Lands / ...,0.0,0.0,0.076923,0.076923,0.076923,0.153846,0.153846,0.153846,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011364,...,0.0,0.0,0.0,0.011364,0.0,0.0,0.0,0.011364,0.0,0.011364
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Church and Wellesley,0.0125,0.0125,0.0,0.0,0.0,0.0,0.0,0.0,0.025,...,0.0,0.0,0.0,0.0125,0.0125,0.0,0.0,0.0,0.0,0.0125
6,Commerce Court / Victoria Hotel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0
7,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Davisville North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Dufferin / Dovercourt Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's print each neighborhood along with the top 5 most common venues

In [28]:
num_top_venues = 5

for hood in onehotBoroughTorontoGrouped['Neighborhood']:
    print("----"+hood+"----")
    temp = onehotBoroughTorontoGrouped[onehotBoroughTorontoGrouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
          venue  freq
0   Coffee Shop  0.07
1  Cocktail Bar  0.05
2    Restaurant  0.05
3    Steakhouse  0.04
4          Café  0.04


----Brockton / Parkdale Village / Exhibition Place----
                    venue  freq
0             Coffee Shop  0.14
1          Breakfast Spot  0.09
2                    Café  0.09
3             Yoga Studio  0.05
4  Furniture / Home Store  0.05


----CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport----
              venue  freq
0   Airport Service  0.15
1  Airport Terminal  0.15
2    Airport Lounge  0.15
3             Plane  0.08
4     Boat or Ferry  0.08


----Central Bay Street----
                 venue  freq
0          Coffee Shop  0.16
1                 Café  0.06
2   Italian Restaurant  0.06
3  Japanese Restaurant  0.03
4         Burger Joint  0.03


----Christie----
               venue  freq
0               Café  0.19
1      Grocery Store  0.19
2               P

Let's put that into a pandas dataframe

In [29]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [30]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venueBoroughTorontoSorted = pd.DataFrame(columns=columns)
venueBoroughTorontoSorted['Neighborhood'] = onehotBoroughTorontoGrouped['Neighborhood']

for ind in np.arange(onehotBoroughTorontoGrouped.shape[0]):
    venueBoroughTorontoSorted.iloc[ind, 1:] = return_most_common_venues(onehotBoroughTorontoGrouped.iloc[ind, :], num_top_venues)

venueBoroughTorontoSorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Seafood Restaurant,Farmers Market,Cheese Shop,Steakhouse,Sushi Restaurant,Beer Bar,Bakery
1,Brockton / Parkdale Village / Exhibition Place,Coffee Shop,Café,Breakfast Spot,Bar,Grocery Store,Furniture / Home Store,Italian Restaurant,Falafel Restaurant,Convenience Store,Performing Arts Venue
2,CN Tower / King and Spadina / Railway Lands / ...,Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Airport,Airport Food Court,Airport Gate,Boat or Ferry,Plane,Sculpture Garden
3,Central Bay Street,Coffee Shop,Italian Restaurant,Café,Bubble Tea Shop,Japanese Restaurant,Burger Joint,Ice Cream Shop,Bar,Bakery,Spa
4,Christie,Grocery Store,Café,Park,Athletics & Sports,Italian Restaurant,Diner,Nightclub,Convenience Store,Restaurant,Baby Store
5,Church and Wellesley,Japanese Restaurant,Gay Bar,Sushi Restaurant,Coffee Shop,Restaurant,Burger Joint,Gastropub,Smoke Shop,Pub,American Restaurant
6,Commerce Court / Victoria Hotel,Coffee Shop,Hotel,Café,Restaurant,American Restaurant,Gastropub,Deli / Bodega,Seafood Restaurant,Steakhouse,Gym
7,Davisville,Sandwich Place,Pizza Place,Dessert Shop,Coffee Shop,Italian Restaurant,Seafood Restaurant,Café,Sushi Restaurant,Farmers Market,Diner
8,Davisville North,Hotel,Grocery Store,Sandwich Place,Park,Clothing Store,Food & Drink Shop,Breakfast Spot,Burger Joint,Yoga Studio,Dim Sum Restaurant
9,Dufferin / Dovercourt Village,Bakery,Pharmacy,Athletics & Sports,Music Venue,Middle Eastern Restaurant,Café,Discount Store,Brewery,Liquor Store,Park


## Cluster Neighborhoods

Run k-means to cluster the neighborhood into 5 clusters.

In [33]:
# set number of clusters
kclusters = 5

boroughTorontoGroupedClustering = onehotBoroughTorontoGrouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(boroughTorontoGroupedClustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [35]:
dfPostalCodeMerged = dfTorontoBoroughs
dfPostalCodeMerged

# add clustering labels
dfPostalCodeMerged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dfPostalCodeMerged = dfPostalCodeMerged.join(venueBoroughTorontoSorted.set_index('Neighborhood'), on='Neighborhood')

dfPostalCodeMerged.head() # check the last columns!Finally, let's visualize the resulting clusters

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636,0,Coffee Shop,Park,Café,Bakery,Theater,Breakfast Spot,Mexican Restaurant,Pub,Italian Restaurant,Spa
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Japanese Restaurant,Italian Restaurant,Cosmetics Shop,Ramen Restaurant,Bar,Pizza Place,Tea Room
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Restaurant,Café,Hotel,Clothing Store,Park,Cocktail Bar,Japanese Restaurant,Cosmetics Shop,Gastropub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Coffee Shop,Boutique,Pub,Venue Neighborhood,Dance Studio,Electronics Store,Eastern European Restaurant,Donut Shop,Dog Run,Discount Store
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Restaurant,Seafood Restaurant,Farmers Market,Cheese Shop,Steakhouse,Sushi Restaurant,Beer Bar,Bakery


Finally, let's visualize the resulting clusters

In [39]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dfPostalCodeMerged['Latitude'], dfPostalCodeMerged['Longitude'], dfPostalCodeMerged['Neighborhood'], dfPostalCodeMerged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [40]:
dfPostalCodeMerged.loc[dfPostalCodeMerged['Cluster Labels'] == 1, dfPostalCodeMerged.columns[[2] + list(range(5, dfPostalCodeMerged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
24,The Annex / North Midtown / Yorkville,1,Coffee Shop,Sandwich Place,Café,Pizza Place,Park,Pub,Burger Joint,Jewish Restaurant,Indian Restaurant,BBQ Joint
33,St. James Town / Cabbagetown,1,Restaurant,Coffee Shop,Pizza Place,Pub,Bakery,Indian Restaurant,Market,Italian Restaurant,Café,Japanese Restaurant
