## Segmenting and Clustering Neighborhoods in Toronto

In [2]:
import requests
import pandas as pd
import numpy as np
from lxml import etree
from bs4 import BeautifulSoup as bsoup
import os
import json

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.6.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following NEW packages will be INSTALLED:

    altair:  2.2.2-py_0 conda-forge
    branca:  0.3.0-py_0 conda-forge
    folium:  0.6.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge

branca-0.3.0-p 100% |################################| Time: 0:00:00 789.74 kB/s
vincent-0.4.4- 100% |################################| Time: 0:00:00 907.02 kB/s
altair-2.2.2-p 100% |################################| Time: 0:00:00 910.31 kB/s
folium-0.6.0-p 100% |################################| Time: 0:00:00   1.67 MB/s


In [3]:
# The code was removed by Watson Studio for sharing.

In [4]:
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [5]:
raw_random_wikipedia_page = requests.get(wikipedia_link)

In [6]:
page = raw_random_wikipedia_page.text
#print(page)

In [7]:
soup = bsoup(page, "lxml")
tablePostalCode = soup.find_all("table")[0]
rows = tablePostalCode.find_all("tr")
listPostalCode = []
for row in rows:
    tds = row.find_all("td")
    for td in tds:
        if (td.p.span.find("i") is not None) and ("Not assigned" in td.p.span.i.text):
            continue
        spanText = td.p.span.text
        code = td.p.b.text
        aTags = td.find_all("a")
        if len(aTags) <= 0:
            continue
        borough = aTags[0].text
        listNeighborhood = []
        if len(aTags) > 1:
            neighborhood = spanText[spanText.find("(") + 1: len(spanText) - 1]
        else:
            neighborhood = borough
        listPostalCode.append({"PostalCode": code, "Borough": borough, "Neighborhood": neighborhood})
#listPostalCode

In [8]:
columns = ["PostalCode", "Borough", "Neighborhood"]
dfPostalCode = pd.DataFrame.from_records(data=listPostalCode, columns=columns)
dfPostalCode.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Queen's Park,Queen's Park


Explain my work and any assumptions I made:
Firstly, I used etree lib from lxml to scrap Postal Code table content, then pd.read_html lib to convert html content to data frame as code below:

root = etree.XML(page)

tableTag = etree.XPath("//table")

postalCodeTorontoElems = tableTag(root)[0]

listDfTorontoPostalCode = pd.read_html(etree.tostring(postalCodeTorontoElems,method='html'))

listCol = list(listDfTorontoPostalCode[0].columns)

print(listDfTorontoPostalCode[0].shape)

dfTorontoPostalCode = listDfTorontoPostalCode[0]

I realized that there were some difficult cases to seperate which is borough or neighborhood, because the format/structure of cells in Wiki Postal Code table was not the same while all content in each cell of data frame is text and no spaces. For example, cell(1,7), cell(3,3), cell(2,8), cell(3,8).

Thus, I used etree lib to read and seperate each element that I was interested in. But it's not a good way.

Then, it's realy luck to me when I saw the note in this assignment. I used BeautifulSoup lib as recommendation in note of assignment. It's really easy and make my task on the fly!

I supposed that the first a tag is represented the borough, and all text in "()" is represented the list of neighborhoods in borough.

In [9]:
dfPostalCode.shape

(101, 3)

## Make calls to the Google Geocoding API to get the latitude and longitude coordinates of the postal codes in dataframe

In [10]:
# The code was removed by Watson Studio for sharing.

In [11]:
def updateLatLngForTorontoPostalCode(dfPostalCode):
    if os.path.isfile('TorontoPostalCode.csv'):
        print("Load data from csv")
        dfPostalCode = pd.read_csv('TorontoPostalCode.csv')
    else:
        print("Make request to Google Geocoding API")
        for row in list(range(0,dfPostalCode.shape[0])):
            #print("At index: {0}, Postal Code: {1}".format(row, dfPostalCode.iloc[row,0]))
            url = URL.format(API_KEY, dfPostalCode.iloc[row,0])
            #print("Get url: {0}".format(url))
            response = requests.get(url).json() # get response
            if response["status"] == "ZERO_RESULTS":
                print("NO RESULT at row index {0}: row".format(row))
                continue
            else:
                geographical_data = response["results"][0]["geometry"]["location"] # get geographical coordinates
                latitude = geographical_data['lat']
                longitude = geographical_data['lng']
                dfPostalCode.iloc[row, dfPostalCode.columns.get_loc('Latitude')] = latitude
                dfPostalCode.iloc[row, dfPostalCode.columns.get_loc('Longitude')] = longitude
    return dfPostalCode

In [12]:
dfPostalCode = updateLatLngForTorontoPostalCode(dfPostalCode)

Make request to Google Geocoding API
NO RESULT at row index 1: row
NO RESULT at row index 2: row
NO RESULT at row index 3: row
NO RESULT at row index 4: row
NO RESULT at row index 5: row
NO RESULT at row index 6: row
NO RESULT at row index 7: row
NO RESULT at row index 9: row
NO RESULT at row index 10: row
NO RESULT at row index 11: row
NO RESULT at row index 12: row
NO RESULT at row index 15: row
NO RESULT at row index 16: row
NO RESULT at row index 17: row
NO RESULT at row index 18: row
NO RESULT at row index 19: row
NO RESULT at row index 21: row
NO RESULT at row index 22: row
NO RESULT at row index 23: row
NO RESULT at row index 24: row
NO RESULT at row index 25: row
NO RESULT at row index 26: row
NO RESULT at row index 29: row
NO RESULT at row index 32: row
NO RESULT at row index 33: row
NO RESULT at row index 37: row
NO RESULT at row index 38: row
NO RESULT at row index 41: row
NO RESULT at row index 42: row
NO RESULT at row index 44: row
NO RESULT at row index 48: row
NO RESULT 

In [13]:
dfPostalCode.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,32.363577,-90.152413
1,M4A,North York,Victoria Village,0.0,0.0
2,M5A,Downtown Toronto,Regent Park / Harbourfront,0.0,0.0
3,M6A,North York,Lawrence Manor / Lawrence Heights,0.0,0.0
4,M7A,Queen's Park,Queen's Park,0.0,0.0


In [14]:
# Backup to use later instead of making request to Google Geoencoding API
fnPostCodeCSV = 'TorontoPostalCode.csv'
dfPostalCode.to_csv(fnPostCodeCSV, sep=',', encoding='utf-8', index=False)

In [15]:
# Check csv again
dfTest = pd.read_csv('TorontoPostalCode.csv')
dfTest.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,32.363577,-90.152413
1,M4A,North York,Victoria Village,0.0,0.0
2,M5A,Downtown Toronto,Regent Park / Harbourfront,0.0,0.0
3,M6A,North York,Lawrence Manor / Lawrence Heights,0.0,0.0
4,M7A,Queen's Park,Queen's Park,0.0,0.0


## Use geopy library to get the latitude and longitude values of Toronto City.

In [16]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


In [17]:
# create map of Toronto using latitude and longitude values
mapToronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(dfPostalCode['Latitude'], dfPostalCode['Longitude'], dfPostalCode['Borough'], dfPostalCode['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        #parse_html=False
    ).add_to(mapToronto)  
    
mapToronto

## I will perform segment and cluster only the boroughs that contain the word Toronto and have specified coordinate (latitude, longitude).

In [18]:
print("The number of boroughs that contain the word Toronto before removed the ones have unspecified coordinate: {}".format(dfPostalCode[dfPostalCode.Borough.str.contains("Toronto")].shape[0]))
dfTorontoBoroughs = dfPostalCode[(dfPostalCode.Borough.str.contains("Toronto")) & (dfPostalCode.Latitude != 0) & (dfPostalCode.Longitude != 0)].reset_index(drop=True)
dfTorontoBoroughs

The number of boroughs that contain the word Toronto before removed the ones have unspecified coordinate: 19


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5E,Downtown Toronto,Downtown Toronto,43.644771,-79.373306
1,M5H,Downtown Toronto,Richmond / Adelaide / King,43.651729,-79.381389
2,M5J,Downtown Toronto,Harbourfront East / Union Station / Toronto Is...,43.640816,-79.381752
3,M5T,Downtown Toronto,Kensington Market / Chinatown / Grange Park,45.406597,-71.969598
4,M5V,Downtown Toronto,CN Tower / King and Spadina / Railway Lands / ...,43.645711,-79.392732
5,M5W,Downtown Toronto,Downtown Toronto,43.646435,-79.374846
6,M4X,Downtown Toronto,St. James Town / Cabbagetown,43.667967,-79.367675
7,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316


In [19]:
# I recognized that there was coordinate of neighborhood Downtown Toronto is redundant, so I decided remove one of them from data frame
# I decided remove neighborhood "Kensington Market...", because it didn't have marker on the map as well as no any venues got from FOURSQUARE API
dfTorontoBoroughs = dfTorontoBoroughs[(dfTorontoBoroughs.PostalCode != "M5W") & (dfTorontoBoroughs.PostalCode != "M5T")].reset_index(drop=True)
dfTorontoBoroughs

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5E,Downtown Toronto,Downtown Toronto,43.644771,-79.373306
1,M5H,Downtown Toronto,Richmond / Adelaide / King,43.651729,-79.381389
2,M5J,Downtown Toronto,Harbourfront East / Union Station / Toronto Is...,43.640816,-79.381752
3,M5V,Downtown Toronto,CN Tower / King and Spadina / Railway Lands / ...,43.645711,-79.392732
4,M4X,Downtown Toronto,St. James Town / Cabbagetown,43.667967,-79.367675
5,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316


In [20]:
# Still create map of latitude and longitude values of Toronto city
mapTorontoBorough = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers of borough that contains word Toronto to map
for lat, lng, borough, neighborhood in zip(dfTorontoBoroughs['Latitude'], dfTorontoBoroughs['Longitude'], dfTorontoBoroughs['Borough'], dfTorontoBoroughs['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        #parse_html=False
    ).add_to(mapTorontoBorough)  
    
mapTorontoBorough

## Next, we are going to start utilizing the Foursquare API to explore the neighborhoods and segment them.

In [21]:
# The code was removed by Watson Studio for sharing.

In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, limit)
            
        # make the GET request
        response = requests.get(url).json()["response"]
        if "groups" not in response:
            print("No venues at Neighborhood {0} ({1}, {2})".format(name, lat, lng))
            continue
        results = response['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## Now, let's get 100 venues that are in Toronto boroughs within a radius of 500 meters.

In [23]:
dfBoroughTorontoVenues = getNearbyVenues(dfTorontoBoroughs['Neighborhood'], dfTorontoBoroughs['Latitude'], dfTorontoBoroughs['Longitude'], radius=500)
dfBoroughTorontoVenues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Downtown Toronto,43.644771,-79.373306,LCBO,43.642944,-79.37244,Liquor Store
1,Downtown Toronto,43.644771,-79.373306,The Keg Steakhouse + Bar,43.646676,-79.374822,Steakhouse
2,Downtown Toronto,43.644771,-79.373306,Sony Centre for the Performing Arts,43.646292,-79.376022,Concert Hall
3,Downtown Toronto,43.644771,-79.373306,Hockey Hall Of Fame (Hockey Hall of Fame),43.646974,-79.377323,Museum
4,Downtown Toronto,43.644771,-79.373306,Sukhothai,43.648487,-79.374547,Thai Restaurant


How many venues were returned for each neighborhood?

In [24]:
dfBoroughTorontoVenues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport,100,100,100,100,100,100
Church and Wellesley,80,80,80,80,80,80
Downtown Toronto,57,57,57,57,57,57
Harbourfront East / Union Station / Toronto Islands,100,100,100,100,100,100
Richmond / Adelaide / King,100,100,100,100,100,100
St. James Town / Cabbagetown,50,50,50,50,50,50


How many unique categories can be curated from all the returned venues?

In [25]:
print('There are {} uniques categories.'.format(len(dfBoroughTorontoVenues['Venue Category'].unique())))

There are 155 uniques categories.


## Analyze Each Neighborhood

In [26]:
# one hot encoding
onehotBoroughToronto = pd.get_dummies(dfBoroughTorontoVenues[['Venue Category']], prefix="", prefix_sep="")

# found that there was column name "Neighborhood" in this data frame, so I rename it to Venue Neighborhood
onehotBoroughToronto.rename(columns={'Neighborhood': 'Venue Neighborhood'}, inplace=True)

# get list of column name before adding neighborhood column
fixed_columns = onehotBoroughToronto.columns

# add neighborhood column back to dataframe
onehotBoroughToronto['Neighborhood'] = dfBoroughTorontoVenues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = ["Neighborhood"] + list(fixed_columns)
onehotBoroughToronto = onehotBoroughToronto[fixed_columns]

onehotBoroughToronto.head()

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,American Restaurant,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bagel Shop,Bakery,...,Tea Room,Thai Restaurant,Theater,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Whisky Bar,Wine Bar,Women's Store,Yoga Studio
0,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Downtown Toronto,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


Let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [27]:
onehotBoroughTorontoGrouped = onehotBoroughToronto.groupby('Neighborhood').mean().reset_index()
onehotBoroughTorontoGrouped

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,American Restaurant,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Bagel Shop,Bakery,...,Tea Room,Thai Restaurant,Theater,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Whisky Bar,Wine Bar,Women's Store,Yoga Studio
0,CN Tower / King and Spadina / Railway Lands / ...,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.0,0.0,...,0.0,0.02,0.02,0.0,0.03,0.01,0.0,0.0,0.0,0.02
1,Church and Wellesley,0.0125,0.0125,0.025,0.0,0.0,0.0125,0.0,0.0,0.0,...,0.0125,0.0,0.0125,0.0,0.0125,0.0125,0.0,0.0,0.0,0.0125
2,Downtown Toronto,0.0,0.0,0.0,0.0,0.017544,0.0,0.0,0.017544,0.035088,...,0.017544,0.017544,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Harbourfront East / Union Station / Toronto Is...,0.0,0.0,0.01,0.03,0.01,0.0,0.0,0.0,0.01,...,0.01,0.0,0.01,0.02,0.01,0.0,0.01,0.01,0.0,0.0
4,Richmond / Adelaide / King,0.0,0.0,0.03,0.0,0.01,0.0,0.02,0.0,0.02,...,0.0,0.01,0.02,0.0,0.02,0.0,0.0,0.01,0.01,0.0
5,St. James Town / Cabbagetown,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02


Let's print each neighborhood along with the top 5 most common venues

In [28]:
num_top_venues = 5

for hood in onehotBoroughTorontoGrouped['Neighborhood']:
    print("----"+hood+"----")
    temp = onehotBoroughTorontoGrouped[onehotBoroughTorontoGrouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----CN Tower / King and Spadina / Railway Lands / Harbourfront West / Bathurst Quay / South Niagara / Island airport----
                venue  freq
0          Restaurant  0.07
1         Coffee Shop  0.06
2      Sandwich Place  0.04
3               Hotel  0.04
4  Italian Restaurant  0.04


----Church and Wellesley----
                 venue  freq
0  Japanese Restaurant  0.06
1          Coffee Shop  0.05
2              Gay Bar  0.05
3     Sushi Restaurant  0.05
4           Restaurant  0.04


----Downtown Toronto----
          venue  freq
0   Coffee Shop  0.07
1    Restaurant  0.05
2  Cocktail Bar  0.05
3        Bakery  0.04
4   Cheese Shop  0.04


----Harbourfront East / Union Station / Toronto Islands----
         venue  freq
0  Coffee Shop  0.14
1        Hotel  0.05
2  Pizza Place  0.04
3         Café  0.04
4     Aquarium  0.03


----Richmond / Adelaide / King----
            venue  freq
0     Coffee Shop  0.07
1  Clothing Store  0.07
2            Café  0.05
3           Hotel  0.04
4 

Let's put that into a pandas dataframe

In [29]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [30]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venueBoroughTorontoSorted = pd.DataFrame(columns=columns)
venueBoroughTorontoSorted['Neighborhood'] = onehotBoroughTorontoGrouped['Neighborhood']

for ind in np.arange(onehotBoroughTorontoGrouped.shape[0]):
    venueBoroughTorontoSorted.iloc[ind, 1:] = return_most_common_venues(onehotBoroughTorontoGrouped.iloc[ind, :], num_top_venues)

venueBoroughTorontoSorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,CN Tower / King and Spadina / Railway Lands / ...,Restaurant,Coffee Shop,Italian Restaurant,Sandwich Place,Hotel,French Restaurant,Vegetarian / Vegan Restaurant,Gym,Bar,Yoga Studio
1,Church and Wellesley,Japanese Restaurant,Gay Bar,Coffee Shop,Sushi Restaurant,Restaurant,Burger Joint,Café,Gastropub,Fast Food Restaurant,Smoke Shop
2,Downtown Toronto,Coffee Shop,Cocktail Bar,Restaurant,Steakhouse,Seafood Restaurant,Cheese Shop,Farmers Market,Bakery,Sushi Restaurant,Café
3,Harbourfront East / Union Station / Toronto Is...,Coffee Shop,Hotel,Pizza Place,Café,Sports Bar,Aquarium,Scenic Lookout,Italian Restaurant,Brewery,Bar
4,Richmond / Adelaide / King,Clothing Store,Coffee Shop,Café,Hotel,Italian Restaurant,Gastropub,American Restaurant,Steakhouse,Plaza,Restaurant
5,St. James Town / Cabbagetown,Coffee Shop,Restaurant,Pizza Place,Indian Restaurant,Italian Restaurant,Pub,Bakery,Market,Café,Yoga Studio


## Cluster Neighborhoods

Run k-means to cluster the neighborhood into 2 clusters.

In [31]:
# set number of clusters
kclusters = 2

boroughTorontoGroupedClustering = onehotBoroughTorontoGrouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(boroughTorontoGroupedClustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 1, 0, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [32]:
# Because there were only 5 in 7 neighborhoods have venue information got from FOURSQUARE API I decided filter data frame to be suitable to cluster
dfPostalCodeMerged = dfTorontoBoroughs
dfPostalCodeMerged

# add clustering labels
dfPostalCodeMerged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
dfPostalCodeMerged = dfPostalCodeMerged.join(venueBoroughTorontoSorted.set_index('Neighborhood'), on='Neighborhood')

dfPostalCodeMerged.head() # check the last columns!Finally, let's visualize the resulting clusters

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5E,Downtown Toronto,Downtown Toronto,43.644771,-79.373306,0,Coffee Shop,Cocktail Bar,Restaurant,Steakhouse,Seafood Restaurant,Cheese Shop,Farmers Market,Bakery,Sushi Restaurant,Café
1,M5H,Downtown Toronto,Richmond / Adelaide / King,43.651729,-79.381389,1,Clothing Store,Coffee Shop,Café,Hotel,Italian Restaurant,Gastropub,American Restaurant,Steakhouse,Plaza,Restaurant
2,M5J,Downtown Toronto,Harbourfront East / Union Station / Toronto Is...,43.640816,-79.381752,0,Coffee Shop,Hotel,Pizza Place,Café,Sports Bar,Aquarium,Scenic Lookout,Italian Restaurant,Brewery,Bar
3,M5V,Downtown Toronto,CN Tower / King and Spadina / Railway Lands / ...,43.645711,-79.392732,0,Restaurant,Coffee Shop,Italian Restaurant,Sandwich Place,Hotel,French Restaurant,Vegetarian / Vegan Restaurant,Gym,Bar,Yoga Studio
4,M4X,Downtown Toronto,St. James Town / Cabbagetown,43.667967,-79.367675,0,Coffee Shop,Restaurant,Pizza Place,Indian Restaurant,Italian Restaurant,Pub,Bakery,Market,Café,Yoga Studio


Finally, let's visualize the resulting clusters

In [33]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(dfPostalCodeMerged['Latitude'], dfPostalCodeMerged['Longitude'], dfPostalCodeMerged['Neighborhood'], dfPostalCodeMerged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [34]:
dfPostalCodeMerged.loc[dfPostalCodeMerged['Cluster Labels'] == 1, dfPostalCodeMerged.columns[[2] + list(range(5, dfPostalCodeMerged.shape[1]))]]

Unnamed: 0,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Richmond / Adelaide / King,1,Clothing Store,Coffee Shop,Café,Hotel,Italian Restaurant,Gastropub,American Restaurant,Steakhouse,Plaza,Restaurant
