In [96]:
# Importing modules to work with

import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

#These 2 are useless, but I'm leaving them here for illustration purposes
import geocoder
import geopy

#This is for the visualizations
import folium

#This is for clustering
from sklearn.cluster import KMeans

In [2]:
#Converting Wikipedia page to an html code
html = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(html).text
soup = BeautifulSoup(r, 'lxml')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XkmcCApAIDAAADLj3gkAAAAY","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":935851093,"wgRevisionId":935851093,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communi

In [3]:
#Now we need to select the table of interest within the HTML

table = soup.find('table',{'class':'wikitable sortable'})
print(table)

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Harbourfront</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights</a>
</td></tr>
<tr>
<td>M6A</td>
<td><a href="/wiki/North

In [4]:
#Initializng empty list to be filled with data from each column
postcode = []
borough = []
hood = []

#Looping through each row within table having all 3 values (to exclude headers) and populating our list
for row in table.findAll("tr"):
    cells = row.findAll('td')
    if len(cells) == 3: 
    #Only extract text, because some of the values in the table are also links with tags
        postcode.append(cells[0].find(text=True))
        borough.append(cells[1].find(text=True))
        hood.append(cells[2].find(text=True))

In [5]:
#Some values in hood list have added newline symbols, it needs to be cleaned
hood =  [x.replace('\n','') for x in hood ]

#Creating a dataframe out of the list for convinience
table_df = pd.DataFrame(postcode,columns = ['Postcode'])
table_df['Borough'] = borough
table_df['Neighborhoods'] = hood

table_df.head()

Unnamed: 0,Postcode,Borough,Neighborhoods
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
#Treating Not Assigned values as NaN 
table_df.replace('Not assigned', np.nan, inplace=True)

#Checking if there are NaN Neighborhoods that have assigned Boroughs
table_df[table_df['Neighborhoods'].isna() == True][table_df['Borough'].isna() == False]

#Even though it only shows one, let's do it in more automated way
table_df.loc[table_df['Neighborhoods'].isna() == True, 'Neighborhoods'] = table_df['Borough']

#Now we can drop NaN values, because they are of no use to us in further analysis
table_df.dropna(inplace=True)
table_df.reset_index(drop=True)

  """


Unnamed: 0,Postcode,Borough,Neighborhoods
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [7]:
#Now we need to group neighborhoods by boroughs
hoods_grouped = table_df.copy()
hoods_grouped = hoods_grouped.groupby(['Postcode','Borough'])['Neighborhoods'].apply(lambda hoods: ', '.join(hoods)).to_frame()
hoods_grouped.reset_index(inplace=True)

In [8]:
hoods_grouped.shape
#And this concludes the first part of the assignment
hoods_grouped.head()

Unnamed: 0,Postcode,Borough,Neighborhoods
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Now at this point I tried to use **geocode** package to retrieve location information. Unfortunatelly, I kept getting service errors.

I was still interested in retrieving that data, as opposed to just getting it from the file. So I've tried different package as well. **GeoPy** with Nominatim geocoder seemed to work, but only on certain Postcodes. It may be due to limitations on map coverage in OpenStreetMap project it uses behind the hood or due to something else, but only some postcodes could be resolved.

Which is why, eventually, I decided to clean up failed code and just use the file provided by Coursera.

In [9]:
#Let's add the file with locations
columns = ['postcode', 'latitude','longitude']
csv_url = "https://cocl.us/Geospatial_data"

loc_df = pd.read_csv(csv_url, names = columns, header=0)

In [10]:
#Just checking to see the datatypes are correct
loc_df.head()
loc_df.dtypes

postcode      object
latitude     float64
longitude    float64
dtype: object

In [133]:
#Now we merge two dataframes to have a single df with all the necessary data
#merged = hoods_grouped.join(loc_df.set_index('postcode'), on='Postcode')
merged.head()

Unnamed: 0,Postcode,Borough,Neighborhoods,latitude,longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


At this point onwards we will create an interactive map of Toronto with the Boroughs and neighboorhods we have to further explore it. 
Firstly, we define what Toronto is. And then we map the actual data within the area of Toronto using Folium map.

In [12]:
#Defining what the area will be and where Toronto is
address = 'Toronto, Ontario'

geolocator = geopy.Nominatim(user_agent="toronto_walker")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [13]:
# create map of Toronto using latitude and longitude values
toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

#add markers to map
#each marker represents borough and not the individual neighborhood, because we do not have specific coordinates for neighborhoods

for lat, lng, borough, neighborhood in zip(merged['latitude'], merged['longitude'], merged['Borough'], merged['Neighborhoods']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto)  
    
toronto

Now it's time to employ Foursquare API to do some area exploring

In [14]:
#Defining API credentials
CLIENT_ID = 'VR0U1WLKIP3ZMFOBPHGN1KHDPULY4AICGYLTLMVZOLTUBAWL' # your Foursquare ID
CLIENT_SECRET = '03URU1P3WMPEEJHKEMHH5RH55TSKP3MGF4XQM3NN3BRF3HEX' # your Foursquare Secret
VERSION = '20200201' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: VR0U1WLKIP3ZMFOBPHGN1KHDPULY4AICGYLTLMVZOLTUBAWL
CLIENT_SECRET:03URU1P3WMPEEJHKEMHH5RH55TSKP3MGF4XQM3NN3BRF3HEX


In [15]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

The purpose of this notebook is to try and compare what neighborhood would be the best match for coffee-lovers and freelancers. 

In [23]:
#Borrowing and slightly modifying the function from the lab

def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100):
    
    """This function is to populate required category of venues for the selected boroughs"""
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            limit)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Postcode Latitude', 
                  'Postcode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [37]:
toronto_venues = getNearbyVenues(names=merged['Postcode'],
                                   latitudes=merged['latitude'],
                                   longitudes=merged['longitude']
                                  )

M1B
M1C
M1E
M1G
M1H
M1J
M1K
M1L
M1M
M1N
M1P
M1R
M1S
M1T
M1V
M1W
M1X
M2H
M2J
M2K
M2L
M2M
M2N
M2P
M2R
M3A
M3B
M3C
M3H
M3J
M3K
M3L
M3M
M3N
M4A
M4B
M4C
M4E
M4G
M4H
M4J
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5M
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6A
M6B
M6C
M6E
M6G
M6H
M6J
M6K
M6L
M6M
M6N
M6P
M6R
M6S
M7A
M7R
M7Y
M8V
M8W
M8X
M8Y
M8Z
M9A
M9B
M9C
M9L
M9M
M9N
M9P
M9R
M9V
M9W


In [132]:
toronto_venues.head()

Unnamed: 0,Postcode,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1B,43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,M1C,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,M1E,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,M1E,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [26]:
toronto_venues.groupby('Postcode').count()

Unnamed: 0_level_0,Postcode Latitude,Postcode Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M1B,2,2,2,2,2,2
M1C,1,1,1,1,1,1
M1E,8,8,8,8,8,8
M1G,4,4,4,4,4,4
M1H,8,8,8,8,8,8
M1J,1,1,1,1,1,1
M1K,6,6,6,6,6,6
M1L,10,10,10,10,10,10
M1M,2,2,2,2,2,2
M1N,4,4,4,4,4,4


In [27]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 267 uniques categories.


In [60]:
#Now we add postcode, boroughs and other info
toronto_supreme = pd.merge(toronto_venues,temp_tor, how='inner', on='Postcode')
#Then we delete duplicated columns
toronto_supreme.drop(['latitude', 'longitude'], axis=1, inplace=True)


In [126]:
#One hot encoding the categories of the venues
toronto_onehot = pd.get_dummies(toronto_supreme[['Venue Category']], prefix="", prefix_sep="")
#And adding back the indentifiying columns
toronto_onehot[['Postcode','Borough','Neighborhoods']] = toronto_supreme[['Postcode','Borough','Neighborhoods']]
#Then we rearrange the columns so that the columns with id info will be in front
cols = toronto_onehot.columns.tolist()
cols = cols[-3:] + cols[:-3]
toronto_onehot = toronto_onehot[cols] 

In [127]:
toronto_onehot.head()

Unnamed: 0,Postcode,Borough,Neighborhoods,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,Scarborough,"Rouge, Malvern",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1B,Scarborough,"Rouge, Malvern",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1E,Scarborough,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
#Now let's group the categories by the Borough
toronto_grouped = toronto_onehot.groupby('Postcode').mean().reset_index()
toronto_grouped

Unnamed: 0,Postcode,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,M1B,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
1,M1C,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
2,M1E,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,M1G,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,M1H,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
5,M1J,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
6,M1K,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
7,M1L,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
8,M1M,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.500000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
9,M1N,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


In [90]:
#This function returns top n venues

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [129]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Postcode']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
borough_venues_sorted = pd.DataFrame(columns=columns)
borough_venues_sorted['Postcode'] = toronto_grouped['Postcode']

for ind in np.arange(toronto_grouped.shape[0]):
    borough_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

borough_venues_sorted.head()

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Print Shop,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
1,M1C,Bar,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Farmers Market
2,M1E,Electronics Store,Rental Car Location,Spa,Pizza Place,Breakfast Spot,Intersection,Mexican Restaurant,Medical Center,Discount Store,Dim Sum Restaurant
3,M1G,Coffee Shop,Korean Restaurant,Convenience Store,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,M1H,Hakka Restaurant,Fried Chicken Joint,Bank,Athletics & Sports,Thai Restaurant,Caribbean Restaurant,Gas Station,Bakery,Donut Shop,Doner Restaurant


In [130]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Postcode', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 0, 3, 3, 3, 1, 3, 3, 0, 3])

In [135]:
# add clustering labels and coordinates for future visualisation
borough_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
borough_venues_sorted.insert(2, 'Lat', merged['latitude'])
borough_venues_sorted.insert(3, 'Lon', merged['longitude'])
borough_venues_sorted.head() 
#borough_venues_sorted.info

Unnamed: 0,Cluster Labels,Postcode,Lat,Lon,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,3,M1B,43.806686,-79.194353,Fast Food Restaurant,Print Shop,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
1,0,M1C,43.784535,-79.160497,Bar,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Farmers Market
2,3,M1E,43.763573,-79.188711,Electronics Store,Rental Car Location,Spa,Pizza Place,Breakfast Spot,Intersection,Mexican Restaurant,Medical Center,Discount Store,Dim Sum Restaurant
3,3,M1G,43.770992,-79.216917,Coffee Shop,Korean Restaurant,Convenience Store,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,3,M1H,43.773136,-79.239476,Hakka Restaurant,Fried Chicken Joint,Bank,Athletics & Sports,Thai Restaurant,Caribbean Restaurant,Gas Station,Bakery,Donut Shop,Doner Restaurant


In [136]:
borough_venues_sorted.info

<bound method DataFrame.info of     Cluster Labels Postcode        Lat        Lon 1st Most Common Venue  \
0                3      M1B  43.806686 -79.194353  Fast Food Restaurant   
1                0      M1C  43.784535 -79.160497                   Bar   
2                3      M1E  43.763573 -79.188711     Electronics Store   
3                3      M1G  43.770992 -79.216917           Coffee Shop   
4                3      M1H  43.773136 -79.239476      Hakka Restaurant   
5                1      M1J  43.744734 -79.239476            Playground   
6                3      M1K  43.727929 -79.262029      Department Store   
7                3      M1L  43.711112 -79.284577              Bus Line   
8                0      M1M  43.716316 -79.239476   American Restaurant   
9                3      M1N  43.692657 -79.264848                  Café   
10               3      M1P  43.757410 -79.273304     Indian Restaurant   
11               3      M1R  43.750072 -79.295849        Sandwich Pl

In [137]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(borough_venues_sorted['Lat'], borough_venues_sorted['Lon'], borough_venues_sorted['Postcode'], borough_venues_sorted['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [150]:
#Now let's see what makes up a cluster

borough_venues_sorted.loc[borough_venues_sorted['Cluster Labels'] == 0, borough_venues_sorted.columns[[1] + list(range(4, borough_venues_sorted.shape[1]))]]

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,M1C,Bar,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Farmers Market
8,M1M,American Restaurant,Motel,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Yoga Studio
19,M2L,Cafeteria,Yoga Studio,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant
30,M3M,Food Truck,Baseball Field,Home Service,Yoga Studio,Dessert Shop,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
61,M5N,Garden,Yoga Studio,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop,Deli / Bodega
89,M8Y,Locksmith,Baseball Field,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant,Farmers Market
94,M9M,Baseball Field,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Farmers Market
99,M9W,Rental Car Location,Bar,Drugstore,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop


In [151]:
borough_venues_sorted.loc[borough_venues_sorted['Cluster Labels'] == 1, borough_venues_sorted.columns[[1] + list(range(4, borough_venues_sorted.shape[1]))]]

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
5,M1J,Playground,Yoga Studio,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
46,M4T,Restaurant,Playground,Yoga Studio,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant


In [152]:
borough_venues_sorted.loc[borough_venues_sorted['Cluster Labels'] == 2, borough_venues_sorted.columns[[1] + list(range(4, borough_venues_sorted.shape[1]))]]

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
91,M9B,Golf Course,Yoga Studio,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore


In [153]:
borough_venues_sorted.loc[borough_venues_sorted['Cluster Labels'] == 3, borough_venues_sorted.columns[[1] + list(range(4, borough_venues_sorted.shape[1]))]]

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Fast Food Restaurant,Print Shop,Yoga Studio,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
2,M1E,Electronics Store,Rental Car Location,Spa,Pizza Place,Breakfast Spot,Intersection,Mexican Restaurant,Medical Center,Discount Store,Dim Sum Restaurant
3,M1G,Coffee Shop,Korean Restaurant,Convenience Store,Yoga Studio,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
4,M1H,Hakka Restaurant,Fried Chicken Joint,Bank,Athletics & Sports,Thai Restaurant,Caribbean Restaurant,Gas Station,Bakery,Donut Shop,Doner Restaurant
6,M1K,Department Store,Chinese Restaurant,Bus Station,Coffee Shop,Convenience Store,Hobby Shop,Diner,Discount Store,Dog Run,Doner Restaurant
7,M1L,Bus Line,Bakery,Soccer Field,Ice Cream Shop,Metro Station,Bus Station,Intersection,Park,Gay Bar,Department Store
9,M1N,Café,Skating Rink,General Entertainment,College Stadium,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Deli / Bodega
10,M1P,Indian Restaurant,Pet Store,Chinese Restaurant,Vietnamese Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
11,M1R,Sandwich Place,Auto Garage,Breakfast Spot,Bakery,Middle Eastern Restaurant,Yoga Studio,Diner,Discount Store,Dog Run,Doner Restaurant
12,M1S,Lounge,Skating Rink,Latin American Restaurant,Clothing Store,Breakfast Spot,Drugstore,Diner,Discount Store,Dog Run,Doner Restaurant


In [154]:
borough_venues_sorted.loc[borough_venues_sorted['Cluster Labels'] == 4, borough_venues_sorted.columns[[1] + list(range(4, borough_venues_sorted.shape[1]))]]

Unnamed: 0,Postcode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
14,M1V,Park,Playground,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
21,M2P,Park,Bank,Convenience Store,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Dessert Shop,Drugstore
23,M3A,Park,Food & Drink Shop,Pool,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
28,M3K,Park,Snack Place,Airport,Bus Stop,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
38,M4J,Park,Convenience Store,Coffee Shop,Rental Car Location,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
42,M4N,Park,Swim School,Bus Line,Dog Run,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Doner Restaurant,Event Space
48,M4W,Park,Trail,Playground,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop
71,M6C,Park,Trail,Field,Hockey Arena,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
72,M6E,Park,Women's Store,Market,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Dumpling Restaurant
77,M6L,Park,Basketball Court,Bakery,Construction & Landscaping,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore


<h2> In conclusion </h2>


    Clustering revealed that there are, indeed, strong separation between different clusters in Toronto. 
    Judging by the top 10 places one can observe that cluster 1 (or 0, if you count the way python does) is generally good for places to eat and buy groceries;
    Cluster 2 (or 1) contains postcodes most suited to have fun with your kids, have a yoga session and so on - a few places to have a bite too.
    Cluster 3 (or 2) would be the least pronounced of them all. I would assume it has something to do with the golf course being so dominant in the area. Golf courses are big: generally, where they are - not much space will be left to other places.
    Cluster 4 (or 3) is a busy busy place: banks, intersections and fast food places - to eat on the go while you shop
    Cluster 5 (or 4) is my favorite. Rent a place somewhere within that quarter if you love parks and long walks, because this is where most green spaces are concentrated!