In [1]:
#PART 1 - CREATE DATAFRAME OF TORONTO NEIGHBOURHOODS

#importing Libraries and updating pandas to 1.0.3 to avoid attribute errors scraping html
!pip install beautifulsoup4 
!pip install lxml # parser
!pip install html5lib
!pip install requests
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
!pip install pandas==1.0.3

Collecting pandas==1.0.3
  Downloading pandas-1.0.3-cp37-cp37m-manylinux1_x86_64.whl (10.0 MB)
[K     |████████████████████████████████| 10.0 MB 17.1 MB/s eta 0:00:01
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.0.5
    Uninstalling pandas-1.0.5:
      Successfully uninstalled pandas-1.0.5
Successfully installed pandas-1.0.3


In [2]:
#Read in the source html data table
df_source=pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')[0]
df_source.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [3]:
#Start formating the data by deleting the first row with index 0

df_1 = df_source.drop(df_source.index[0])
df_1.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"


In [4]:
#Deleting rows that cells with a borough that is Not assigned.
df_2 = df_1[df_1.Borough != 'Not assigned']
df_2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [5]:
#If a cell has a borough but a Not assigned neighbourhood, then the neighbourhood will be the same as the borough.
pd.options.mode.chained_assignment = None

df_2.Neighbourhood[df_2.Neighbourhood == "Not assigned"] = df_2.Borough

df_2.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
# Combining Neighbourhoods with the same Boroughs

df_3 = df_2.groupby(['Postal Code','Borough'])['Neighbourhood'].agg(lambda x: tuple(x)).reset_index()
df_3.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"(Malvern, Rouge,)"
1,M1C,Scarborough,"(Rouge Hill, Port Union, Highland Creek,)"
2,M1E,Scarborough,"(Guildwood, Morningside, West Hill,)"
3,M1G,Scarborough,"(Woburn,)"
4,M1H,Scarborough,"(Cedarbrae,)"
5,M1J,Scarborough,"(Scarborough Village,)"
6,M1K,Scarborough,"(Kennedy Park, Ionview, East Birchmount Park,)"
7,M1L,Scarborough,"(Golden Mile, Clairlea, Oakridge,)"
8,M1M,Scarborough,"(Cliffside, Cliffcrest, Scarborough Village We..."
9,M1N,Scarborough,"(Birch Cliff, Cliffside West,)"


In [7]:
#In the last cell of the notebook, use the .shape method to print the number of rows of your dataframe.
df_3.shape

(103, 3)

In [8]:
#PART 2 - CREATE GEOCODED DATAFRAME & LOCATION MAP

#Obtain the latitude and the longitude coordinates of each neighborhood. 
# Use the Geocoder package or the csv file to create the following dataframe due to package can be very unreliable
url = 'http://cocl.us/Geospatial_data'
df_gs=pd.read_csv(url)
df_gs.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
#check the shape of the csv file - needs to be (103, 3) to match df3 format.
df_gs.shape

(103, 3)

In [10]:
df_4 = df_3.join(df_gs.set_index('Postal Code'), on='Postal Code')
df_4.head
#df_4 = df_3.groupby(['Postal Code','Borough'])['Neighbourhood'].agg(lambda x: tuple(x)).reset_index()
#df_4.head(11)

<bound method NDFrame.head of     Postal Code      Borough  \
0           M1B  Scarborough   
1           M1C  Scarborough   
2           M1E  Scarborough   
3           M1G  Scarborough   
4           M1H  Scarborough   
..          ...          ...   
98          M9N         York   
99          M9P    Etobicoke   
100         M9R    Etobicoke   
101         M9V    Etobicoke   
102         M9W    Etobicoke   

                                         Neighbourhood   Latitude  Longitude  
0                                    (Malvern, Rouge,)  43.806686 -79.194353  
1            (Rouge Hill, Port Union, Highland Creek,)  43.784535 -79.160497  
2                 (Guildwood, Morningside, West Hill,)  43.763573 -79.188711  
3                                            (Woburn,)  43.770992 -79.216917  
4                                         (Cedarbrae,)  43.773136 -79.239476  
..                                                 ...        ...        ...  
98                              

In [11]:
#Answer to Part 2 - Joined Dataframe
df_4

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"(Malvern, Rouge,)",43.806686,-79.194353
1,M1C,Scarborough,"(Rouge Hill, Port Union, Highland Creek,)",43.784535,-79.160497
2,M1E,Scarborough,"(Guildwood, Morningside, West Hill,)",43.763573,-79.188711
3,M1G,Scarborough,"(Woburn,)",43.770992,-79.216917
4,M1H,Scarborough,"(Cedarbrae,)",43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,"(Weston,)",43.706876,-79.518188
99,M9P,Etobicoke,"(Westmount,)",43.696319,-79.532242
100,M9R,Etobicoke,"(Kingsview Village, St. Phillips, Martin Grove...",43.688905,-79.554724
101,M9V,Etobicoke,"(South Steeles, Silverstone, Humbergate, James...",43.739416,-79.588437


In [12]:
!pip install folium
print("folium installed")

Collecting folium
  Downloading folium-0.11.0-py2.py3-none-any.whl (93 kB)
[K     |████████████████████████████████| 93 kB 4.4 MB/s  eta 0:00:01
[?25hCollecting branca>=0.3.0
  Downloading branca-0.4.1-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
folium installed


In [13]:
!pip install geocoder
from geopy.geocoders import Nominatim

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 9.1 MB/s  eta 0:00:01
[?25hCollecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [14]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Coordinates {}, {}.'.format(latitude, longitude))

Coordinates 43.6534817, -79.3839347.


In [15]:
import folium

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_4['Latitude'], df_4['Longitude'], df_4['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [19]:
# PART 3 - CLUSTER THE MOST COMMON VENUES IN TORONTO USING FOURSQUARE 


#Foursquare details and version
CLIENT_ID =  '3NHSKBGWJT5CPZXQFNRF2G2K54RDXYHXUBE00DI3EUTO3ZWM'  #Foursquare Client ID
CLIENT_SECRET = '1M1EC2JDNQQBIIVTAUXYRXG3L4CP2WOY1BA2TGTV5LN45JOZ' #Foursquare Client Secret
VERSION = '20201214' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3NHSKBGWJT5CPZXQFNRF2G2K54RDXYHXUBE00DI3EUTO3ZWM
CLIENT_SECRET:1M1EC2JDNQQBIIVTAUXYRXG3L4CP2WOY1BA2TGTV5LN45JOZ


In [23]:
neighbourhood_latitude = df_4.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = df_4.loc[0, 'Longitude'] # neighbourhood longitude value

neighbourhood_name = df_4.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of ('Malvern, Rouge',) are 43.806686299999996, -79.19435340000001.


In [24]:
# Now, let's get the top 50 venues that are within a radius of 1000 meters.

LIMIT = 50 # number of venues returned by Foursquare API
radius = 1000 # define radius (metres)

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=3NHSKBGWJT5CPZXQFNRF2G2K54RDXYHXUBE00DI3EUTO3ZWM&client_secret=1M1EC2JDNQQBIIVTAUXYRXG3L4CP2WOY1BA2TGTV5LN45JOZ&v=20201214&ll=43.806686299999996,-79.19435340000001&radius=1000&limit=50'

In [25]:
# Send the GET request to Foursquare to see the results

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5fe02e4b3577fa053cde2d0a'},
 'response': {'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 16,
  'suggestedBounds': {'ne': {'lat': 43.81568630900001,
    'lng': -79.18190576146081},
   'sw': {'lat': 43.797686290999984, 'lng': -79.20680103853921}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4d669cba83865481c948fa53',
       'name': 'Images Salon & Spa',
       'location': {'address': '8130 Sheppard Ave E',
        'crossStreet': 'Morningside Ave',
        'lat': 43.80228301948931,
        'lng': -79.19856472801668,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80228301948931,
          'lng': -79.19856472801668}],
       

In [26]:
# Now to get the category of the venue

def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [48]:
import json
from pandas import json_normalize

# Need to prepare the json file and structure it into a df.
venues = results['response']['groups'][0]['items']
   
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,Images Salon & Spa,Spa,43.802283,-79.198565
1,Harvey's,Restaurant,43.80002,-79.198307
2,Wendy's,Fast Food Restaurant,43.802008,-79.19808
3,RBC Royal Bank,Bank,43.798782,-79.19709
4,Wendy’s,Fast Food Restaurant,43.807448,-79.199056
5,Caribbean Wave,Caribbean Restaurant,43.798558,-79.195777
6,Staples Morningside,Paper / Office Supplies Store,43.800285,-79.196607
7,Tim Hortons,Coffee Shop,43.802,-79.198169
8,Fortress Technology Inc.,Hardware Store,43.801677,-79.19413
9,Bus Stop: 85 & 116,Bus Station,43.802198,-79.199389


In [42]:
# Total venues that were returned 

print(' There were {} venues were returned by Foursquare within a 1000m radius of Malvern, Rouge.'.format(nearby_venues.shape[0]))

 There were 16 venues were returned by Foursquare within a 1000m radius of Malvern, Rouge.


In [46]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [49]:
toronto_venues = getNearbyVenues(names=df_4['Neighbourhood'],
                                   latitudes=df_4['Latitude'],
                                   longitudes=df_4['Longitude']
                                  )

('Malvern, Rouge',)
('Rouge Hill, Port Union, Highland Creek',)
('Guildwood, Morningside, West Hill',)
('Woburn',)
('Cedarbrae',)
('Scarborough Village',)
('Kennedy Park, Ionview, East Birchmount Park',)
('Golden Mile, Clairlea, Oakridge',)
('Cliffside, Cliffcrest, Scarborough Village West',)
('Birch Cliff, Cliffside West',)
('Dorset Park, Wexford Heights, Scarborough Town Centre',)
('Wexford, Maryvale',)
('Agincourt',)
("Clarks Corners, Tam O'Shanter, Sullivan",)
("Milliken, Agincourt North, Steeles East, L'Amoreaux East",)
("Steeles West, L'Amoreaux West",)
('Upper Rouge',)
('Hillcrest Village',)
('Fairview, Henry Farm, Oriole',)
('Bayview Village',)
('York Mills, Silver Hills',)
('Willowdale, Newtonbrook',)
('Willowdale, Willowdale East',)
('York Mills West',)
('Willowdale, Willowdale West',)
('Parkwoods',)
('Don Mills',)
('Don Mills',)
('Bathurst Manor, Wilson Heights, Downsview North',)
('Northwood Park, York University',)
('Downsview',)
('Downsview',)
('Downsview',)
('Downsview',

In [50]:
print(toronto_venues.shape)
toronto_venues.head()

(3375, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"(Malvern, Rouge,)",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"(Malvern, Rouge,)",43.806686,-79.194353,Harvey's,43.80002,-79.198307,Restaurant
2,"(Malvern, Rouge,)",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
3,"(Malvern, Rouge,)",43.806686,-79.194353,RBC Royal Bank,43.798782,-79.19709,Bank
4,"(Malvern, Rouge,)",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant


In [51]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(Agincourt,)",41,41,41,41,41,41
"(Alderwood, Long Branch,)",24,24,24,24,24,24
"(Bathurst Manor, Wilson Heights, Downsview North,)",29,29,29,29,29,29
"(Bayview Village,)",14,14,14,14,14,14
"(Bedford Park, Lawrence Manor East,)",40,40,40,40,40,40
...,...,...,...,...,...,...
"(Willowdale, Willowdale West,)",11,11,11,11,11,11
"(Woburn,)",9,9,9,9,9,9
"(Woodbine Heights,)",27,27,27,27,27,27
"(York Mills West,)",20,20,20,20,20,20


In [52]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,"(Malvern, Rouge,)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"(Malvern, Rouge,)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"(Malvern, Rouge,)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"(Malvern, Rouge,)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"(Malvern, Rouge,)",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
toronto_onehot.shape

(3375, 313)

In [54]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,"(Agincourt,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.02439,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"(Alderwood, Long Branch,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"(Bathurst Manor, Wilson Heights, Downsview Nor...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"(Bayview Village,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"(Bedford Park, Lawrence Manor East,)",0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.0,...,0.0,0.025,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.0


In [55]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [57]:
# Get Top 5 common venues for the neighbourhoods

import numpy as np
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# Make columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# Make a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"(Agincourt,)",Chinese Restaurant,Shopping Mall,Bakery,Restaurant,Pizza Place
1,"(Alderwood, Long Branch,)",Discount Store,Pizza Place,Convenience Store,Intersection,Pharmacy
2,"(Bathurst Manor, Wilson Heights, Downsview Nor...",Bank,Pizza Place,Coffee Shop,Mobile Phone Shop,Restaurant
3,"(Bayview Village,)",Grocery Store,Gas Station,Japanese Restaurant,Bank,Trail
4,"(Bedford Park, Lawrence Manor East,)",Coffee Shop,Italian Restaurant,Sandwich Place,Bank,Thai Restaurant


In [58]:
#Cluster the neighbourhoods

#import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32)

In [63]:
#Merge the df with the Top 5 then cluster for each neighbourhood

# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_4

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighbourhood
toronto_merged = toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() 

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"(Malvern, Rouge,)",43.806686,-79.194353,0.0,Coffee Shop,Fast Food Restaurant,Trail,Paper / Office Supplies Store,Bakery
1,M1C,Scarborough,"(Rouge Hill, Port Union, Highland Creek,)",43.784535,-79.160497,2.0,Breakfast Spot,Playground,Italian Restaurant,Burger Joint,Park
2,M1E,Scarborough,"(Guildwood, Morningside, West Hill,)",43.763573,-79.188711,1.0,Pizza Place,Bank,Coffee Shop,Restaurant,Fast Food Restaurant
3,M1G,Scarborough,"(Woburn,)",43.770992,-79.216917,1.0,Coffee Shop,Park,Mobile Phone Shop,Chinese Restaurant,Fast Food Restaurant
4,M1H,Scarborough,"(Cedarbrae,)",43.773136,-79.239476,1.0,Bakery,Pharmacy,Gas Station,Bank,Coffee Shop


In [64]:
toronto_merged[toronto_merged['Cluster Labels'].isnull()]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
16,M1X,Scarborough,"(Upper Rouge,)",43.836125,-79.205636,,,,,,


In [65]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

toronto_merged_nonan = toronto_merged.dropna(subset=['Cluster Labels'])

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged_nonan['Latitude'], toronto_merged_nonan['Longitude'], toronto_merged_nonan['Neighbourhood'], toronto_merged_nonan['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [72]:
#Cluster 0 - Red

toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 0, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Scarborough,0.0,Coffee Shop,Fast Food Restaurant,Trail,Paper / Office Supplies Store,Bakery


In [74]:
# Cluster 1 - Purple

toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 1, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,Scarborough,1.0,Pizza Place,Bank,Coffee Shop,Restaurant,Fast Food Restaurant
3,Scarborough,1.0,Coffee Shop,Park,Mobile Phone Shop,Chinese Restaurant,Fast Food Restaurant
4,Scarborough,1.0,Bakery,Pharmacy,Gas Station,Bank,Coffee Shop
5,Scarborough,1.0,Ice Cream Shop,Coffee Shop,Women's Store,Sandwich Place,Pizza Place
6,Scarborough,1.0,Coffee Shop,Discount Store,Chinese Restaurant,Pizza Place,Grocery Store
7,Scarborough,1.0,Intersection,Pizza Place,Bus Line,Bakery,Coffee Shop
8,Scarborough,1.0,Pizza Place,Ice Cream Shop,Beach,Sports Bar,Hardware Store
10,Scarborough,1.0,Furniture / Home Store,Coffee Shop,Asian Restaurant,Chinese Restaurant,Pharmacy
11,Scarborough,1.0,Pizza Place,Grocery Store,Middle Eastern Restaurant,Burger Joint,Bakery
12,Scarborough,1.0,Chinese Restaurant,Shopping Mall,Bakery,Restaurant,Pizza Place


In [75]:
#Cluster 2 - Blue

toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 2, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,Scarborough,2.0,Breakfast Spot,Playground,Italian Restaurant,Burger Joint,Park
9,Scarborough,2.0,Park,Thai Restaurant,Gym Pool,Café,General Entertainment
18,North York,2.0,Coffee Shop,Clothing Store,Juice Bar,Bank,Japanese Restaurant
21,North York,2.0,Korean Restaurant,Café,Middle Eastern Restaurant,Pizza Place,Coffee Shop
22,North York,2.0,Korean Restaurant,Sushi Restaurant,Pizza Place,Ramen Restaurant,Grocery Store
23,North York,2.0,Restaurant,Park,Coffee Shop,Convenience Store,Gas Station
26,North York,2.0,Restaurant,Japanese Restaurant,Gym,Coffee Shop,Pizza Place
27,North York,2.0,Restaurant,Japanese Restaurant,Gym,Coffee Shop,Pizza Place
34,North York,2.0,Coffee Shop,Portuguese Restaurant,Gym / Fitness Center,Playground,Park
36,East York,2.0,Park,Coffee Shop,Pizza Place,Café,Sandwich Place


In [76]:
#Cluster 3 - Green

toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 3, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
102,Etobicoke,3.0,Coffee Shop,Hotel,Zoo,Dumpling Restaurant,Eastern European Restaurant


In [77]:
#Cluster 4 - Orange

toronto_merged_nonan.loc[toronto_merged_nonan['Cluster Labels'] == 4, toronto_merged_nonan.columns[[1] + list(range(5, toronto_merged_nonan.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
20,North York,4.0,Park,Pool,Zoo,Fabric Shop,Dumpling Restaurant
91,Etobicoke,4.0,Park,Gym / Fitness Center,Eastern European Restaurant,Italian Restaurant,Shopping Mall
