# Explore and cluster the neighborhoods in Toronto with geographical visualization
### Please skip through the first and second sections and go to the third section for more details.
#### Built upon two previous notebooks: 1) scraping information from Wikipedia and transforming the data format, and 2) obtaining geographical coordinates.
#### Author: Ruoyu Yan

# Section 1
### Install packages

In [2]:
# bs4 supplies BeautifulSoup, which is imported below and used to parse the article HTML.
!pip install bs4
# NOTE(review): the visible parsing code selects 'html.parser', not lxml — confirm lxml is still needed.
!pip install lxml



### Import packages

In [9]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen
import urllib.request

### Obtain the Wikipedia article as a local copy.

In [40]:
# Download the Wikipedia page once and keep a local copy, so the parsing cell can
# be re-run without going back to the network.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Use the response as a context manager so the HTTP connection is always closed,
# even if reading or decoding fails.
with urllib.request.urlopen(url) as response:
    wiki_article = response.read().decode()

# NOTE(review): the file is written with the platform default encoding, the same
# way it is read back when parsing; pass encoding='utf-8' in BOTH places if this
# ever has to run on a non-UTF-8 locale.
with open('List_of_postal_codes_of_Canada:_M.html', 'w') as fo:
    fo.write(wiki_article)

### Extract table and load into pandas dataframe

In [41]:

# Load the saved article (closing the file afterwards) and collect every
# sortable table on the page.
with open('List_of_postal_codes_of_Canada:_M.html') as fi:
    wiki_article = fi.read()
soup = BeautifulSoup(wiki_article, 'html.parser')
tables = soup.find_all('table', class_='sortable')

# Pick the table whose header cells are exactly the three columns we need.
expected_header = ['Postcode', 'Borough', 'Neighborhood']
for table in tables:
    header = [th.text.strip() for th in table.find_all('th')]
    if header == expected_header:
        break
else:
    # Without this guard the loop would fall through silently and the last
    # table on the page (or nothing at all) would be parsed instead.
    raise ValueError(
        'No sortable table with header {} found in the article.'.format(expected_header))

# Write one table row per line to a semicolon-delimited text file.
# NOTE: the '; ' separator leaves a leading space on Borough/Neighborhood once the
# file is read back with delimiter=';'. Later cells compare against ' Not assigned'
# and strip that first character, so the format is kept on purpose.
with open('List_of_postal_codes_of_Canada:_M.txt', 'w') as fo:
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without <td> cells (e.g. the header row) carry no data.
            continue
        Postcode, Borough, Neighborhood = [td.text.strip() for td in tds[:3]]

        print('; '.join([Postcode, Borough, Neighborhood]), file=fo)

df = pd.read_table('List_of_postal_codes_of_Canada:_M.txt', delimiter = ';', header = None)
df.columns = ['PostCode', 'Borough', 'Neighborhood']
df.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Obtain dataframe that has assigned borough; ignore a borough that is Not assigned; if a neighborhood is not assigned but its borough is assigned, use the borough as the value for neighborhood.

In [60]:
# Values read from the text file carry a leading space (the file is written with
# '; ' but parsed with delimiter=';'), hence the space in the sentinel below.
NOT_ASSIGNED = ' Not assigned'

# Ignore rows whose borough is not assigned. reset_index returns a fresh frame, so
# nothing is mutated in place on a filtered slice of df.
assigned = df[df['Borough'] != NOT_ASSIGNED].reset_index(drop=True)

# Assign the borough value to the neighborhood if the neighborhood is not assigned.
neighborhood = assigned['Neighborhood'].where(
    assigned['Neighborhood'] != NOT_ASSIGNED, assigned['Borough'])

df1 = pd.DataFrame({
    'PostCode': assigned['PostCode'],
    'Borough': assigned['Borough'],
    'Neighborhood': neighborhood,
})
df1.head()


Unnamed: 0,PostCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


### The following block is to facilitate data grouping for later sections.

In [61]:
# Drop the first character (the leading space) of every borough / neighborhood
# value and append ':' as a separator, so the values of one postal code can be
# concatenated and split apart again when the rows are grouped.
borough_list = [str(borough)[1:] + ':' for borough in df1['Borough']]
Neighborhood_list = [str(hood)[1:] + ':' for hood in df1['Neighborhood']]
PostCode_list = df1['PostCode'].tolist()

df2 = pd.DataFrame(
    list(zip(PostCode_list, borough_list, Neighborhood_list)),
    columns=['PostCode', 'Borough', 'Neighborhood'],
)

df2.head()


Unnamed: 0,PostCode,Borough,Neighborhood
0,M3A,North York:,Parkwoods:
1,M4A,North York:,Victoria Village:
2,M5A,Downtown Toronto:,Harbourfront:
3,M6A,North York:,Lawrence Heights:
4,M6A,North York:,Lawrence Manor:


### Group dataframe by PostCode and combine neighborhood values.

In [62]:
# Summing string columns concatenates them, so each postal code ends up with one
# ':'-terminated string per column (e.g. 'Rouge:Malvern:').
new_df = df2.groupby('PostCode').sum()

PostCode_list = new_df.index.tolist()

# Every piece of the concatenated borough string is expected to be the same
# borough; take the alphabetically first non-empty piece.
borough_list = [
    min(piece for piece in str(joined).split(':') if piece)
    for joined in new_df['Borough']
]

# Join the individual names with ', ' directly. Formatting through str(list) and
# deleting quote characters would also delete apostrophes that belong to a name.
Neighborhood_list = [
    ', '.join(str(joined).split(':')[:-1])   # [:-1] drops the empty piece after the last ':'
    for joined in new_df['Neighborhood']
]

df3 = pd.DataFrame([PostCode_list, borough_list, Neighborhood_list]).T
df3.columns = ['PostCode', 'Borough', 'Neighborhood']
df3

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


### Use the `.shape` attribute to print the number of rows in the dataframe

In [63]:
# Report the (rows, columns) pair, then the row count on its own.
n_rows, n_cols = df3.shape
print((n_rows, n_cols))
print('There are %d rows in the dataframe' % n_rows)

(103, 3)
There are 103 rows in the dataframe


# Section 2
## Download coordinate data of Postal Code

In [64]:
# Read the coordinate table straight from the URL; the next step uses its
# 'Postal Code', 'Latitude' and 'Longitude' columns.
df_co = pd.read_csv('https://cocl.us/Geospatial_data')
df_co.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Add latitude and longitude to the table.

In [68]:
# Attach the coordinates with a left merge on the postal code. A merge keeps every
# row of df3 paired with its own coordinates; collecting matches into parallel
# lists would shift all following rows if a code were missing from, or listed
# twice in, df_co.
coordinates = df_co.rename(columns={'Postal Code': 'PostCode'})[
    ['PostCode', 'Latitude', 'Longitude']]

Final_df = df3.merge(
    coordinates,
    on='PostCode',
    how='left',               # postal codes without coordinates get NaN instead of vanishing
    validate='many_to_one',   # fail loudly if df_co lists a postal code more than once
)
Final_df = Final_df[['PostCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']]
Final_df

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395
...,...,...,...,...,...
98,M9N,York,Weston,43.7069,-79.5182
99,M9P,Etobicoke,Westmount,43.6963,-79.5322
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.6889,-79.5547
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.7394,-79.5884


# Section 3

### Import libraries

In [79]:
import requests # library to handle requests

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Extract latitude and longitude values for downtown Toronto

In [85]:
# Take the first row whose borough is 'Downtown Toronto' and read both
# coordinates from that single row.
downtown_first_row = Final_df[Final_df['Borough'] == 'Downtown Toronto'].iloc[0]
lat = downtown_first_row.Latitude
long = downtown_first_row.Longitude

print('The geograpical coordinate of downtown Toronto City are {}, {}.'.format(lat, long))

The geograpical coordinate of downtown Toronto City are 43.6795626, -79.37752940000001.


In [87]:
# create map of Toronto using latitude and longitude values
# (centred on the `lat` / `long` pair taken from the first Downtown Toronto row)
map_toronto = folium.Map(location=[lat, long], zoom_start=10)

# Last expression of the cell, so the notebook renders the map.
map_toronto


In [94]:
# Distinct borough names, listed to choose which boroughs to analyse below.
unique_borough = Final_df['Borough'].unique().tolist()
unique_borough

['Scarborough',
 'North York',
 'East York',
 'East Toronto',
 'Central Toronto',
 'Downtown Toronto',
 'York',
 'West Toronto',
 'Mississauga',
 'Etobicoke',
 "Queen's Park"]

In [110]:
# Keep only the four boroughs whose name contains the word "Toronto".
toronto_boroughs = ['East Toronto', 'Downtown Toronto', 'West Toronto', 'Central Toronto']
in_toronto = Final_df['Borough'].isin(toronto_boroughs)
df_to = Final_df[in_toronto]
df_to.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.6764,-79.293
41,M4K,East Toronto,"The Danforth West, Riverdale",43.6796,-79.3522
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.669,-79.3156
43,M4M,East Toronto,Studio District,43.6595,-79.3409
44,M4N,Central Toronto,Lawrence Park,43.728,-79.3888


In [121]:
# create map of Toronto using latitude and longitude values of newly selected regions
map_to = folium.Map(location=[lat, long], zoom_start=11.5)

# add markers to map
# The loop variables deliberately avoid the name `lat`: reusing it here would
# overwrite the notebook-level downtown latitude that other cells read as the
# map centre.
for marker_lat, marker_lng, borough, postcode in zip(df_to['Latitude'], df_to['Longitude'],
                                                     df_to['Borough'], df_to['PostCode']):
    # Popup text: "<postal code>, <borough>"
    label = '{}, {}'.format(postcode, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [marker_lat, marker_lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_to)

map_to

In [None]:
### Define Foursquare credentials and version

In [119]:
# Placeholder values to be filled in by the reader; the actual info was entered
# in a hidden cell.
CLIENT_ID = ''  # your Foursquare ID
CLIENT_SECRET = ''  # your Foursquare Secret
VERSION = '20200111'  # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: {}'.format(CLIENT_ID))
print('CLIENT_SECRET:{}'.format(CLIENT_SECRET))

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


In [120]:
# @hidden_cell
# Credentials are read from the environment (falling back to an interactive prompt)
# instead of being hard-coded, so they are never stored in the notebook source or
# echoed into its saved outputs.
# NOTE(review): the key pair that used to be hard-coded in this cell has been saved
# with the notebook and should be treated as exposed — revoke / rotate it.
import os
from getpass import getpass

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare CLIENT_ID: ')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare CLIENT_SECRET: ')  # your Foursquare Secret
VERSION = '20200111' # Foursquare API version

# Confirm that both values are present without printing them.
print('Your credentails:')
print('CLIENT_ID: ' + ('set' if CLIENT_ID else 'MISSING'))
print('CLIENT_SECRET:' + ('set' if CLIENT_SECRET else 'MISSING'))

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


### Get the top 100 venues that are in downtown Toronto with postal code M5C within a radius of 300 meters.

In [134]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 300 # define radius

# Look the M5C row up once and read both coordinates from it.
m5c_row = df_to[df_to['PostCode']=='M5C'].iloc[0]
neighborhood_latitude = m5c_row.Latitude
neighborhood_longitude = m5c_row.Longitude

url_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
query_values = (VERSION, neighborhood_latitude, neighborhood_longitude, radius, LIMIT)

# `url` carries the real credentials and is what the request cell uses.
url = url_template.format(CLIENT_ID, CLIENT_SECRET, *query_values)

# Display the same URL with the credentials replaced by placeholders, so the saved
# cell output does not leak the API secret.
url_template.format('<CLIENT_ID>', '<CLIENT_SECRET>', *query_values)


'https://api.foursquare.com/v2/venues/explore?&client_id=<CLIENT_ID>&client_secret=<CLIENT_SECRET>&v=20200111&ll=43.6514939,-79.3754179&radius=300&limit=100'

In [136]:
# Give the request a timeout so a stalled connection cannot hang the notebook, and
# raise on HTTP error statuses instead of handing an error payload to later cells.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
# results # uncomment if you want to see the results

### Obtain results from Foursquare, load data into pandas and prepare dataframe format.

In [143]:
from pandas.io.json import json_normalize
# function that extracts the category of the venue
def Category(row):
    """Return the name of the first category listed for a venue row.

    The category list is read from ``row['categories']``; when that key is
    absent it is read from ``row['venue.categories']`` instead.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must support ``row[key]`` lookup and raise ``KeyError`` for a missing key.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the category
    list is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        cat_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and is allowed to propagate.
        cat_list = row['venue.categories']

    if len(cat_list) == 0:
        return None
    return cat_list[0]['name']
    
# The venue records sit under response -> groups[0] -> items.
venues = results['response']['groups'][0]['items']

# Flatten the nested JSON into dotted column names and keep the four columns of interest.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues)[filtered_columns]

# Replace each row's category list by the name of its first category.
nearby_venues['venue.categories'] = nearby_venues.apply(Category, axis=1)

# Keep only the part of every column name after the last dot.
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()



Unnamed: 0,name,categories,lat,lng
0,Gyu-Kaku Japanese BBQ,Japanese Restaurant,43.651422,-79.375047
1,Terroni,Italian Restaurant,43.650927,-79.375602
2,Crepe TO,Creperie,43.650063,-79.374587
3,Pearl Diver,Gastropub,43.651481,-79.3736
4,GoodLife Fitness Toronto 137 Yonge Street,Gym,43.651242,-79.378068


In [144]:
# One row per venue, so the row count is the number of venues returned.
print('{} venues were returned by Foursquare.'.format(len(nearby_venues)))

32 venues were returned by Foursquare.


### Repeat the same process for the other postal codes (note: `getNearbyVenues` defaults to a 500 m radius rather than the 300 m used above)

In [148]:
# This function is from applied data science course materials, which will be utilized by this assignment.
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Relies on the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and
    ``LIMIT`` values. Each name is printed as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences (consumed with ``zip``): a label, latitude and
        longitude per location.
    radius : number, default 500
        Search radius sent to the API as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'PostCode',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but still has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    column_names = ['PostCode',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request; time out instead of hanging, and surface HTTP
        # errors here rather than as a KeyError on the payload below
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # return only relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue may come back with an empty category list.
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor also works when no venue was found,
    # where assigning seven names to an empty frame would raise.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

In [149]:
# Collect venues for every selected postal code, using the function's default radius.
to_venues = getNearbyVenues(df_to['PostCode'], df_to['Latitude'], df_to['Longitude'])

M4E
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6G
M6H
M6J
M6K
M6P
M6R
M6S
M7A
M7Y


### Check the size of the returned dataframe

In [150]:
# Print (rows, columns) explicitly, because only the last expression of a cell is
# displayed automatically.
print(to_venues.shape)
to_venues.head()

(1720, 7)


Unnamed: 0,PostCode,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M4E,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,M4K,43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


### Check how many venues were returned for each postal code

In [151]:
# Non-null count of every column per postal code, i.e. how many venues each code returned.
to_venues.groupby('PostCode').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
PostCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M4E,4,4,4,4,4,4
M4K,43,43,43,43,43,43
M4L,20,20,20,20,20,20
M4M,42,42,42,42,42,42
M4N,4,4,4,4,4,4
M4P,8,8,8,8,8,8
M4R,21,21,21,21,21,21
M4S,35,35,35,35,35,35
M4T,3,3,3,3,3,3
M4V,15,15,15,15,15,15


In [152]:
# Number of distinct values in the 'Venue Category' column.
print('There are {} uniques categories.'.format(len(to_venues['Venue Category'].unique())))

There are 236 uniques categories.


### Analyze each postal code area. Perform one hot encoding on venue_category for each postal code.

In [189]:
# One indicator column per venue category; the empty prefix keeps the column
# names equal to the category names.
to_onehot = pd.get_dummies(to_venues[['Venue Category']], prefix="", prefix_sep="")

# Re-attach the postal code of every venue row (it is appended as the last column).
to_onehot['PostCode'] = to_venues['PostCode']

# Reorder so that the just-appended last column comes first.
last_column = to_onehot.columns[-1:].tolist()
other_columns = to_onehot.columns[:-1].tolist()
fixed_columns = last_column + other_columns
to_onehot = to_onehot[fixed_columns]

to_onehot.head()

Unnamed: 0,PostCode,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4K,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [190]:
# The shape of the one hot table: one row per venue; one 'PostCode' column plus
# one indicator column per category.
to_onehot.shape

(1720, 237)

### Group rows by PostCode and take the mean of the frequency of occurrence of each category

In [191]:
# The mean of a 0/1 indicator column within a postal code is the share of that
# code's venues falling in the category. reset_index turns PostCode back into a
# regular first column.
to_grouped = to_onehot.groupby('PostCode').mean().reset_index()
to_grouped

Unnamed: 0,PostCode,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,...,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02381,0.0,0.0,0.0,0.02381
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619
7,M4S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0


In [192]:
# The new size is one row per postal code, with the same columns as the one hot table.
to_grouped.shape

(39, 237)

### Print each postal code area and its top 5 most common venues

In [193]:
num_top_venues = 5

for postcode in to_grouped['PostCode']:
    print("----" + postcode + "----")

    # Transpose the single row of this postal code into a (venue, freq) table.
    profile = to_grouped[to_grouped['PostCode'] == postcode].T.reset_index()
    profile.columns = ['venue', 'freq']
    profile = profile.iloc[1:]  # the first row holds the PostCode value itself

    # Frequencies come out of the transpose as objects; convert, then round for display.
    profile = profile.assign(freq=profile['freq'].astype(float)).round({'freq': 2})

    ranked = profile.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----M4E----
                     venue  freq
0        Health Food Store  0.25
1                      Pub  0.25
2                    Trail  0.25
3             Neighborhood  0.25
4  New American Restaurant  0.00


----M4K----
                    venue  freq
0        Greek Restaurant  0.21
1             Coffee Shop  0.09
2          Ice Cream Shop  0.07
3      Italian Restaurant  0.07
4  Furniture / Home Store  0.05


----M4L----
                venue  freq
0                Park  0.10
1      Sandwich Place  0.10
2         Pizza Place  0.05
3  Italian Restaurant  0.05
4   Fish & Chips Shop  0.05


----M4M----
                 venue  freq
0                 Café  0.10
1          Coffee Shop  0.07
2              Brewery  0.05
3   Italian Restaurant  0.05
4  American Restaurant  0.05


----M4N----
         venue  freq
0         Park  0.25
1     Bus Line  0.25
2         Lake  0.25
3  Swim School  0.25
4      Airport  0.00


----M4P----
               venue  freq
0  Convenience Store  0.12
1     

### Create the new dataframe and display the top 10 venues for each postal code area.

In [194]:
# The following function is from applied data science capstone course material.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of ``row`` is skipped (it holds the postal code in this
    notebook); the remaining entries are sorted in descending order and the index
    labels of the first ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index[:num_top_venues]
    return top_labels.values

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: '1st', '2nd', '3rd', then '<n>th' for every later rank.
columns = ['PostCode']
for ind in np.arange(num_top_venues):
    # Explicit bounds check instead of a bare `except:` as control flow, so an
    # unrelated error can no longer be silently turned into a 'th' column.
    if ind < len(indicators):
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    else:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
postcode_venues_sorted = pd.DataFrame(columns=columns)
postcode_venues_sorted['PostCode'] = to_grouped['PostCode']

# Fill every row with the top categories of the corresponding postal code.
for ind in np.arange(to_grouped.shape[0]):
    postcode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(to_grouped.iloc[ind, :], num_top_venues)

postcode_venues_sorted.head()

Unnamed: 0,PostCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Pub,Health Food Store,Trail,Neighborhood,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Diner,Falafel Restaurant
1,M4K,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Restaurant,Bookstore,Furniture / Home Store,Indian Restaurant,Fruit & Vegetable Store,Juice Bar
2,M4L,Park,Sandwich Place,Ice Cream Shop,Pub,Brewery,Burrito Place,Fast Food Restaurant,Fish & Chips Shop,Italian Restaurant,Steakhouse
3,M4M,Café,Coffee Shop,Bakery,Italian Restaurant,Brewery,American Restaurant,Gastropub,Yoga Studio,Fish Market,Bookstore
4,M4N,Park,Lake,Swim School,Bus Line,Yoga Studio,Dog Run,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


### Cluster areas using kmeans. 

In [199]:
kclusters = 3

# Drop the postcode column so that only the category frequencies are clustered.
# The `columns=` keyword is used because pandas 2.x no longer accepts the axis as
# a positional argument (drop('PostCode', 1)).
to_grouped_clustering = to_grouped.drop(columns='PostCode')

# Perform k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0, init = 'k-means++', max_iter = 10000).fit(to_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

### Create a new dataframe that includes the cluster as well as the top 10 venues for each postal code area.

In [200]:
# add clustering labels
# Any existing 'Cluster Labels' column is dropped first so this cell can be re-run:
# DataFrame.insert raises ValueError when the column is already present.
postcode_venues_sorted = postcode_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
postcode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the cluster label and top venues onto the selected postal codes (df_to),
# matching on PostCode, so each row also carries its borough and coordinates
to_merged = df_to.join(postcode_venues_sorted.set_index('PostCode'), on='PostCode')

to_merged.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,M4E,East Toronto,The Beaches,43.6764,-79.293,0,Pub,Health Food Store,Trail,Neighborhood,Event Space,Ethiopian Restaurant,Empanada Restaurant,Electronics Store,Diner,Falafel Restaurant
41,M4K,East Toronto,"The Danforth West, Riverdale",43.6796,-79.3522,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Restaurant,Bookstore,Furniture / Home Store,Indian Restaurant,Fruit & Vegetable Store,Juice Bar
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.669,-79.3156,0,Park,Sandwich Place,Ice Cream Shop,Pub,Brewery,Burrito Place,Fast Food Restaurant,Fish & Chips Shop,Italian Restaurant,Steakhouse
43,M4M,East Toronto,Studio District,43.6595,-79.3409,0,Café,Coffee Shop,Bakery,Italian Restaurant,Brewery,American Restaurant,Gastropub,Yoga Studio,Fish Market,Bookstore
44,M4N,Central Toronto,Lawrence Park,43.728,-79.3888,0,Park,Lake,Swim School,Bus Line,Yoga Studio,Dog Run,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


### Visualize clusters on the map

In [204]:
# create map, centred on the first Downtown Toronto row of the clustered table.
# The centre is looked up here rather than read from the notebook-level `lat` /
# `long`, because marker loops that reuse the name `lat` overwrite that value.
downtown = to_merged[to_merged['Borough'] == 'Downtown Toronto'].iloc[0]
map_clusters = folium.Map(location=[downtown['Latitude'], downtown['Longitude']], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for marker_lat, marker_lon, poi, cluster in zip(to_merged['Latitude'], to_merged['Longitude'],
                                                to_merged['PostCode'], to_merged['Cluster Labels']):
    if pd.isna(cluster):
        # A postal code with no match in the venue table has no label after the
        # join; it cannot be colored, so it is left off the map.
        continue
    # The join turns the label column into floats when any label is missing;
    # list indexing needs an int.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [marker_lat, marker_lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters