# Segmenting and Clustering Neighborhoods in Toronto

In [165]:
# Installing necessary packages
!pip install bs4
!pip install lxml



### Import of necessary files

In [166]:
from bs4 import BeautifulSoup

import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.5.12

Please update conda by running

    $ conda update -n base conda



# All requested packages already installed.

Solving environment: done


  current version: 4.5.11
  latest version: 4.5.12

Please update conda by running

    $ conda update -n base conda



# All requested packages already installed.

Libraries imported.


### Parsing of wiki page

In [167]:
website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":867606113,"wgRevisionId":867606113,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

### Accessing only Table Data

In [168]:
soup.table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

### Setting Dataframe columns

In [169]:
column_names = ['Postcode', 'Borough', 'Neighborhood'] 

### Setting Dataframe

In [170]:
df = pd.DataFrame(columns=column_names)

Postcode=[]
Borough=[]
Neighborhood=[]

### Assigning values in List

In [171]:
for row in soup.table.findAll('tr'):
    cells = row.findAll('td')
    if len(cells)==3: #Only extract table body not heading
        Postcode.append(cells[0].find(text=True))
        Borough.append(cells[1].find(text=True))
        Neighborhood.append(cells[2].find(text=True))

### Assinging list values into Dataframe

In [172]:
# print(Postcode)
df['Postcode']=Postcode
df['Borough']=Borough
df['Neighborhood']=Neighborhood

### Accessing the dataframe

In [173]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Viewing the shape of Dataframe

In [174]:
df.shape

(289, 3)

### Removing the Non assinged rows of Borough column

In [175]:
#Removing not assigned rows
df.drop(df[df['Borough'] == 'Not assigned'].index, inplace=True)

### Removing the non assigned values of Neighborhood and assigned borough values

In [176]:
df.loc[df.Neighborhood == 'Not assigned\n', "Neighborhood"] = df.Borough

### Accessing the dataframe

In [177]:
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### New shape of dataframe as

In [178]:
df.shape

(212, 3)

### Count of duplicate values

In [179]:
#count of unique Postcode
len(df.Postcode.unique())

103

### Grouping and assinging the appended values of Neighborhood of same Postcode

In [180]:
df = df.groupby(['Postcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df.columns = ['Postcode', 'Borough', 'Neighborhood']

### New DataFrame shape as

In [181]:
df.shape

(103, 3)

### Reading values of Latitude and Logitude from csv

In [182]:
df_latlon = pd.read_csv('http://cocl.us/Geospatial_data')
df_latlon.columns = ['Postcode', 'Latitude', 'Longitude']

### merging of dataframe with Latitude and Longitude

In [183]:
df = pd.merge(df, df_latlon, on=['Postcode'], how='inner')

In [184]:
df

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park\n, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West\n",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West\n",43.692657,-79.264848


In [185]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


### Use geopy library to get the latitude and longitude values of New York City.

In [186]:
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))


  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto are 43.653963, -79.387207.


### Create a map of Toronto with neighborhoods superimposed on top

In [187]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(df, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### However, for illustration purposes, let's simplify the above map and segment and cluster only the neighborhoods in East Toronto. So let's slice the original dataframe and create a new dataframe of the East Toronto data.

In [188]:
east_toronto_data = df[df['Borough'] == 'East Toronto'].reset_index(drop=True)
east_toronto_data.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West\n, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West\n, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District\n,43.659526,-79.340923
4,M7Y,East Toronto,Business reply mail Processing Centre969 Easte...,43.662744,-79.321558


### Get the geographical coordinates of East Toronto

In [189]:
address = 'East Toronto, Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of east_toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of east_toronto are 43.653963, -79.387207.


### Let's visualizat East Toronto the neighborhoods in it

In [190]:
# create map of Manhattan using latitude and longitude values
map_east_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(east_toronto_data['Latitude'], east_toronto_data['Longitude'], east_toronto_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_east_toronto)  
    
map_east_toronto

### Define Foursquare Credentials and Version¶

In [191]:
CLIENT_ID = 'JMRORPLT5G4P1SHF3L1JRRXSFADNRKJT32PPH0FPCZSKUKW3' # your Foursquare ID
CLIENT_SECRET = 'LSVDKT4B2IDQWKYBI3YNJVSCJVUMHBKK30XZFBL4CS0KVV1V' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JMRORPLT5G4P1SHF3L1JRRXSFADNRKJT32PPH0FPCZSKUKW3
CLIENT_SECRET:LSVDKT4B2IDQWKYBI3YNJVSCJVUMHBKK30XZFBL4CS0KVV1V


### Get the neighborhood's name.

In [192]:
east_toronto_data.loc[0, 'Neighborhood']

'The Beaches'

### Get the neighborhood's latitude and longitude values.

In [193]:
neighborhood_latitude = east_toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = east_toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = east_toronto_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


### top 100 venues that are in The Beaches within a radius of 500 meters

In [194]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url


'https://api.foursquare.com/v2/venues/explore?&client_id=JMRORPLT5G4P1SHF3L1JRRXSFADNRKJT32PPH0FPCZSKUKW3&client_secret=LSVDKT4B2IDQWKYBI3YNJVSCJVUMHBKK30XZFBL4CS0KVV1V&v=20180605&ll=43.67635739999999,-79.2930312&radius=500&limit=100'

### Send the GET request and examine the resutls

In [195]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5c1de2e44c1f677d709a6931'},
 'response': {'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 5,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b8daea1f964a520480833e3',
       'name': 'Grover Pub and Grub',
       'location': {'address': '676 Kingston Rd.',
        'crossStreet': 'at Main St.',
        'lat': 43.679181434941015,
        'lng': -79.29721535878515,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.679181434941015,
          'lng': -79.29721535878515}],
    

### get_category_type function from the Foursquare lab.

In [196]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### clean the json and structure it into a pandas dataframe.

In [197]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Grover Pub and Grub,Pub,43.679181,-79.297215
1,Starbucks,Coffee Shop,43.678798,-79.298045
2,Guru Raghavendra Ji,Astrologer,43.680187,-79.292337
3,Upper Beaches,Neighborhood,43.680563,-79.292869
4,Skaut Design,Furniture / Home Store,43.680344,-79.29054


### how many venues were returned by Foursquare?

In [198]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


### function to repeat the same process to all the neighborhoods in East Toronto

In [199]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### code to run the above function on each neighborhood and create a new dataframe

In [200]:
east_toronto_venues = getNearbyVenues(names=east_toronto_data['Neighborhood'],
                                   latitudes=east_toronto_data['Latitude'],
                                   longitudes=east_toronto_data['Longitude']
                                  )



The Beaches
The Danforth West
, Riverdale
The Beaches West
, India Bazaar
Studio District

Business reply mail Processing Centre969 Eastern



### check the size of the resulting dataframe

In [201]:
print(east_toronto_venues.shape)
east_toronto_venues.head()

(128, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
1,The Beaches,43.676357,-79.293031,Starbucks,43.678798,-79.298045,Coffee Shop
2,The Beaches,43.676357,-79.293031,Guru Raghavendra Ji,43.680187,-79.292337,Astrologer
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,The Beaches,43.676357,-79.293031,Skaut Design,43.680344,-79.29054,Furniture / Home Store


### how many venues were returned for each neighborhood

In [202]:
east_toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Business reply mail Processing Centre969 Eastern\n,18,18,18,18,18,18
Studio District\n,40,40,40,40,40,40
The Beaches,5,5,5,5,5,5
"The Beaches West\n, India Bazaar",23,23,23,23,23,23
"The Danforth West\n, Riverdale",42,42,42,42,42,42


### how many unique categories can be curated from all the returned venues

In [203]:
print('There are {} uniques categories.'.format(len(east_toronto_venues['Venue Category'].unique())))

There are 68 uniques categories.


### Analyse Each Neighborhood

In [204]:
# one hot encoding
east_toronto_onehot = pd.get_dummies(east_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
east_toronto_onehot['Neighborhood'] = east_toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [east_toronto_onehot.columns[-1]] + list(east_toronto_onehot.columns[:-1])
east_toronto_onehot = east_toronto_onehot[fixed_columns]

east_toronto_onehot.head()

Unnamed: 0,Yoga Studio,American Restaurant,Astrologer,Auto Workshop,Bakery,Bank,Bar,Board Shop,Bookstore,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Food & Drink Shop,Fruit & Vegetable Store,Furniture / Home Store,Garden,Garden Center,Gastropub,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Health Food Store,Ice Cream Shop,Indian Restaurant,Italian Restaurant,Juice Bar,Latin American Restaurant,Light Rail Station,Liquor Store,Middle Eastern Restaurant,Movie Theater,Music Store,Neighborhood,New American Restaurant,Park,Pet Store,Pizza Place,Pub,Recording Studio,Restaurant,Sandwich Place,Seafood Restaurant,Skate Park,Smoke Shop,Spa,Stationery Store,Steakhouse,Sushi Restaurant,Trail
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,The Beaches,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Examine New Dataframe set

In [205]:
east_toronto_onehot.shape

(128, 68)

### group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [206]:
east_toronto_grouped = east_toronto_onehot.groupby('Neighborhood').mean().reset_index()
east_toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,American Restaurant,Astrologer,Auto Workshop,Bakery,Bank,Bar,Board Shop,Bookstore,Brewery,Bubble Tea Shop,Burger Joint,Burrito Place,Café,Caribbean Restaurant,Cheese Shop,Chinese Restaurant,Clothing Store,Coffee Shop,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Coworking Space,Dessert Shop,Diner,Farmers Market,Fast Food Restaurant,Fish & Chips Shop,Fish Market,Food & Drink Shop,Fruit & Vegetable Store,Furniture / Home Store,Garden,Garden Center,Gastropub,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Health Food Store,Ice Cream Shop,Indian Restaurant,Italian Restaurant,Juice Bar,Latin American Restaurant,Light Rail Station,Liquor Store,Middle Eastern Restaurant,Movie Theater,Music Store,New American Restaurant,Park,Pet Store,Pizza Place,Pub,Recording Studio,Restaurant,Sandwich Place,Seafood Restaurant,Skate Park,Smoke Shop,Spa,Stationery Store,Steakhouse,Sushi Restaurant,Trail
0,Business reply mail Processing Centre969 Easte...,0.055556,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.055556,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.055556,0.0,0.055556,0.055556,0.0,0.0,0.055556,0.055556,0.055556,0.0,0.0,0.0,0.0
1,Studio District\n,0.025,0.05,0.0,0.0,0.05,0.025,0.025,0.0,0.025,0.025,0.0,0.0,0.0,0.1,0.0,0.025,0.025,0.025,0.075,0.025,0.0,0.025,0.0,0.025,0.0,0.025,0.0,0.0,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.025,0.0,0.025,0.0,0.05,0.025,0.025,0.0,0.0,0.025,0.0,0.025,0.025,0.025,0.0,0.0,0.0,0.0,0.0,0.025,0.025,0.0,0.0,0.0,0.025,0.0,0.0,0.0
2,The Beaches,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"The Beaches West\n, India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.043478,0.043478,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.043478,0.043478,0.0,0.043478,0.0,0.0,0.086957,0.043478,0.043478,0.043478,0.0,0.0,0.086957,0.0,0.0,0.0,0.0,0.0,0.043478,0.043478,0.0
4,"The Danforth West\n, Riverdale",0.02381,0.02381,0.0,0.0,0.02381,0.0,0.0,0.0,0.047619,0.02381,0.02381,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.02381,0.0,0.02381,0.02381,0.0,0.0,0.0,0.0,0.0,0.02381,0.02381,0.0,0.0,0.0,0.238095,0.02381,0.0,0.0,0.02381,0.071429,0.02381,0.047619,0.02381,0.0,0.0,0.02381,0.0,0.0,0.0,0.0,0.0,0.0,0.02381,0.02381,0.0,0.02381,0.0,0.0,0.0,0.0,0.02381,0.0,0.0,0.0,0.02381


### New Shape

In [207]:
east_toronto_grouped.shape

(5, 68)

### each neighborhood along with the top 5 most common venues

In [208]:
num_top_venues = 5

for hood in east_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = east_toronto_grouped[east_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Business reply mail Processing Centre969 Eastern
----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.06
2          Comic Shop  0.06
3                 Spa  0.06
4          Smoke Shop  0.06


----Studio District
----
                venue  freq
0                Café  0.10
1         Coffee Shop  0.08
2  Italian Restaurant  0.05
3              Bakery  0.05
4           Gastropub  0.05


----The Beaches----
                    venue  freq
0             Coffee Shop   0.2
1              Astrologer   0.2
2  Furniture / Home Store   0.2
3                     Pub   0.2
4               Juice Bar   0.0


----The Beaches West
, India Bazaar----
            venue  freq
0            Park  0.09
1  Sandwich Place  0.09
2     Coffee Shop  0.04
3    Liquor Store  0.04
4   Movie Theater  0.04


----The Danforth West
, Riverdale----
                venue  freq
0    Greek Restaurant  0.24
1         Coffee Shop  0.07
2      Ice Cream Shop  0.07
3           Bookstore  0.05

### function to sort the venues in descending order.

In [209]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### new dataframe and display the top 10 venues for each neighborhood.

In [210]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = east_toronto_grouped['Neighborhood']

for ind in np.arange(east_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(east_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Business reply mail Processing Centre969 Easte...,Light Rail Station,Garden,Pizza Place,Auto Workshop,Brewery,Burrito Place,Comic Shop,Farmers Market,Fast Food Restaurant,Garden Center
1,Studio District\n,Café,Coffee Shop,Bakery,Italian Restaurant,Gastropub,American Restaurant,Cheese Shop,Chinese Restaurant,Gym / Fitness Center,Convenience Store
2,The Beaches,Furniture / Home Store,Coffee Shop,Astrologer,Pub,Coworking Space,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Trail
3,"The Beaches West\n, India Bazaar",Sandwich Place,Park,Light Rail Station,Gym,Ice Cream Shop,Italian Restaurant,Burrito Place,Burger Joint,Coffee Shop,Liquor Store
4,"The Danforth West\n, Riverdale",Greek Restaurant,Ice Cream Shop,Coffee Shop,Italian Restaurant,Bookstore,Diner,Indian Restaurant,Health Food Store,Grocery Store,Furniture / Home Store


### Run k-means to cluster the neighborhood into 3 clusters.

In [211]:
# set number of clusters
kclusters = 3

east_toronto_grouped_clustering = east_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(east_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 2, 1, 0], dtype=int32)

### New Dataframe of top 10 venues for each neighborhood

In [212]:
east_toronto_merged = east_toronto_data

# add clustering labels
east_toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
east_toronto_merged = east_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

east_toronto_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,1,Furniture / Home Store,Coffee Shop,Astrologer,Pub,Coworking Space,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Trail
1,M4K,East Toronto,"The Danforth West\n, Riverdale",43.679557,-79.352188,1,Greek Restaurant,Ice Cream Shop,Coffee Shop,Italian Restaurant,Bookstore,Diner,Indian Restaurant,Health Food Store,Grocery Store,Furniture / Home Store
2,M4L,East Toronto,"The Beaches West\n, India Bazaar",43.668999,-79.315572,2,Sandwich Place,Park,Light Rail Station,Gym,Ice Cream Shop,Italian Restaurant,Burrito Place,Burger Joint,Coffee Shop,Liquor Store
3,M4M,East Toronto,Studio District\n,43.659526,-79.340923,1,Café,Coffee Shop,Bakery,Italian Restaurant,Gastropub,American Restaurant,Cheese Shop,Chinese Restaurant,Gym / Fitness Center,Convenience Store
4,M7Y,East Toronto,Business reply mail Processing Centre969 Easte...,43.662744,-79.321558,0,Light Rail Station,Garden,Pizza Place,Auto Workshop,Brewery,Burrito Place,Comic Shop,Farmers Market,Fast Food Restaurant,Garden Center


###  visualize the resulting clusters

In [213]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(east_toronto_merged['Latitude'], east_toronto_merged['Longitude'], east_toronto_merged['Neighborhood'], east_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Cluster 1

In [214]:
east_toronto_merged.loc[east_toronto_merged['Cluster Labels'] == 0, east_toronto_merged.columns[[1] + list(range(5, east_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,East Toronto,0,Light Rail Station,Garden,Pizza Place,Auto Workshop,Brewery,Burrito Place,Comic Shop,Farmers Market,Fast Food Restaurant,Garden Center


### Cluser 3

In [215]:
east_toronto_merged.loc[east_toronto_merged['Cluster Labels'] == 1, east_toronto_merged.columns[[1] + list(range(5, east_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,1,Furniture / Home Store,Coffee Shop,Astrologer,Pub,Coworking Space,Comfort Food Restaurant,Comic Shop,Convenience Store,Cosmetics Shop,Trail
1,East Toronto,1,Greek Restaurant,Ice Cream Shop,Coffee Shop,Italian Restaurant,Bookstore,Diner,Indian Restaurant,Health Food Store,Grocery Store,Furniture / Home Store
3,East Toronto,1,Café,Coffee Shop,Bakery,Italian Restaurant,Gastropub,American Restaurant,Cheese Shop,Chinese Restaurant,Gym / Fitness Center,Convenience Store


### Cluster 3

In [216]:
east_toronto_merged.loc[east_toronto_merged['Cluster Labels'] == 2, east_toronto_merged.columns[[1] + list(range(5, east_toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,East Toronto,2,Sandwich Place,Park,Light Rail Station,Gym,Ice Cream Shop,Italian Restaurant,Burrito Place,Burger Joint,Coffee Shop,Liquor Store
