In [141]:
import csv
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

print('Libraries imported.')

Libraries imported.


### Part 1 of Segmenting and Clustering Neighborhoods in Toronto

#### Code to scrape the Wikipedia page 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [142]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook on an HTTP error instead of parsing an error page.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

# One list of stripped <td> texts per <tr> of the first table on the page.
# A row without any <td> cell yields an empty list (the first entry of the
# recorded output); it is kept here on purpose so table_list is unchanged.
rows = soup.table.find_all('tr')
table_list = [[cell.text.strip() for cell in row.find_all('td')] for row in rows]
table_list

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', 'Downtown Toronto', "Queen's Park"],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M9B', 'Etobicoke', 'Martin Grove'],
 ['M9B

#### Transform the data into a pandas dataframe.

#### The dataframe will consist of three columns: Postcode, Borough, and Neighbourhood.

In [143]:
# Load the scraped rows into a three-column frame; a row with fewer than three
# cells (such as the empty first row) ends up with missing values.
column_names = ["Postcode", "Borough", "Neighbourhood"]
df = pd.DataFrame(table_list, columns=column_names)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
...,...,...,...
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West
286,M8Z,Etobicoke,South of Bloor


#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [144]:
# Idempotent clean-up -- re-running this cell never discards additional rows.
#  1. "Not assigned" placeholders become NaN.
#  2. Rows without a borough are dropped. This also removes the empty first row
#     produced by the scrape, so no positional `drop(df.index[0])` is needed.
#  3. A neighbourhood that is still missing inherits the name of its borough.
#     `fillna` inside `assign` replaces the chained in-place `replace` on a
#     column, which may operate on a temporary copy and leave df untouched.
df = (
    df.replace(to_replace="Not assigned", value=np.nan)
      .dropna(axis=0, subset=['Borough'])
      .assign(Neighbourhood=lambda frame: frame['Neighbourhood'].fillna(frame['Borough']))
)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
...,...,...,...
282,M8Z,Etobicoke,Kingsway Park South West
283,M8Z,Etobicoke,Mimico NW
284,M8Z,Etobicoke,The Queensway West
285,M8Z,Etobicoke,Royal York South West


#### More than one neighbourhood can exist in one postal code area. For example, in the scraped table M6A is listed twice and has two neighbourhoods: Lawrence Heights and Lawrence Manor. Such rows are combined into one row with the neighbourhoods separated by a comma, as shown for M6A in the resulting table below.

In [145]:
# Give every row the comma-separated list of all neighbourhoods that share its
# postcode, then collapse the rows that have become identical.
joined_names = df.groupby('Postcode')['Neighbourhood'].transform(
    lambda names: ', '.join(names)
)
df['Neighbourhood'] = joined_names.to_numpy()
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


#### Use the `.shape` attribute to display the dimensions (rows, columns) of the dataframe.

In [146]:
# Dimensions of the cleaned dataframe as (rows, columns).
df.shape

(103, 3)

### Part 2 of Segmenting and Clustering Neighborhoods in Toronto

#### Get the latitude and the longitude coordinates of each neighborhood.

In [147]:
# Read the CSV that maps each postal code to a latitude/longitude pair
# (columns "Postal Code", "Latitude", "Longitude" in the output below).
geo_df = pd.read_csv('https://cocl.us/Geospatial_data')
geo_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


#### Build a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, latitude and the longitude coordinates of each neighborhood.

In [148]:
# Attach the coordinates to every postcode (inner join on the postal code) and
# discard the duplicate key column that the join brings along.
df = (
    df.merge(geo_df, left_on='Postcode', right_on='Postal Code')
      .drop(columns=['Postal Code'])
)
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509


### Part 3 of Segmenting and Clustering Neighborhoods in Toronto

In [149]:
# Look up the coordinates of Toronto; they are used as the centre of the maps.
address = 'Toronto'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on `None.latitude`.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [150]:
# Distinct borough names present in the merged dataframe.
df['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [152]:
# Base map centred on the geocoded Toronto coordinates.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One small blue circle per row of df; its popup reads "neighbourhood, borough".
marker_rows = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood'], df['Postcode'])
for lat, lng, borough, neighbourhood, postcode in marker_rows:
    popup = folium.Popup('{}, {}'.format(neighbourhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_Toronto)

map_Toronto

In [153]:
# Foursquare API credentials are read from the environment so that the secrets
# are never stored in the notebook source or in its saved output.
# NOTE(review): the keys that were previously hardcoded here should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
VERSION = '20180604'  # sent as the `v` query parameter of the API requests

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment '
        'variables before running this notebook.')

# Confirm that credentials are available without echoing their values.
print('Foursquare credentials loaded from the environment (values not shown).')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


In [154]:
# Query the Foursquare "explore" endpoint once per row of df and collect one
# record per returned venue.
radius = 500   # passed as the `radius` query parameter
LIMIT = 100    # passed as the `limit` query parameter
EXPLORE_URL = 'https://api.foursquare.com/v2/venues/explore'
VENUE_COLUMNS = ['Post Code',
                 'Borough',
                 'Neighbourhood',
                 'Borough Latitude',
                 'Borough Longitude',
                 'Venue',
                 'Venue Latitude',
                 'Venue Longitude',
                 'Venue Category']

venues_list = []
for lat, lng, borough, neighbourhood, postcode in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood'], df['Postcode']):
    # Let requests build and encode the query string instead of formatting the
    # credentials into the URL by hand; the timeout prevents a hung request and
    # raise_for_status() surfaces HTTP errors instead of a later KeyError.
    response = requests.get(
        EXPLORE_URL,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    response.raise_for_status()
    results = response.json()['response']['groups'][0]['items']

    for v in results:
        venue = v['venue']
        # A venue without any category would raise IndexError on `[0]`;
        # record it with a missing category instead.
        categories = venue.get('categories') or []
        venues_list.append((
            postcode,
            borough,
            neighbourhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name'] if categories else None))

# Passing the column names to the constructor also works when no venue was
# returned at all (assigning `.columns` afterwards fails on an empty frame).
nearby_venues = pd.DataFrame(venues_list, columns=VENUE_COLUMNS)
nearby_venues

Unnamed: 0,Post Code,Borough,Neighbourhood,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M3A,North York,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.332140,Park
1,M3A,North York,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,M4A,North York,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,M4A,North York,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,M4A,North York,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
...,...,...,...,...,...,...,...,...,...
2231,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999,RONA,43.629393,-79.518320,Hardware Store
2232,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999,Koala Tan Tanning Salon & Sunless Spa,43.631370,-79.519006,Tanning Salon
2233,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999,Value Village,43.631269,-79.518238,Thrift / Vintage Store
2234,M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen...",43.628841,-79.520999,Kingsway Boxing Club,43.627254,-79.526684,Gym


In [155]:
# (number of venue records, number of columns)
nearby_venues.shape

(2236, 9)

In [156]:
# Number of distinct venue categories across all collected venues.
nearby_venues['Venue Category'].nunique(dropna=False)

267

In [157]:
# One-hot encode the venue category of every venue record and keep the
# neighbourhood name as the first column.
category_indicators = pd.get_dummies(nearby_venues[['Venue Category']], prefix="", prefix_sep="")
venue_category_dummies = pd.concat(
    [nearby_venues[['Neighbourhood']], category_indicators],
    axis=1,
)

venue_category_dummies

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2231,"Kingsway Park South West, Mimico NW, The Queen...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2232,"Kingsway Park South West, Mimico NW, The Queen...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2233,"Kingsway Park South West, Mimico NW, The Queen...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2234,"Kingsway Park South West, Mimico NW, The Queen...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [158]:
# (venue records, 1 name column + one indicator column per category)
venue_category_dummies.shape

(2236, 268)

In [159]:
# Average the indicator columns per neighbourhood: each value is the share of
# that neighbourhood's venue records falling into the category.
grouped = venue_category_dummies.groupby('Neighbourhood', as_index=False).mean()
grouped

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.02,0.0,0.000000,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,0.00,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,0.00,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,0.00,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Willowdale West,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,0.00,0.0
96,Woburn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,0.00,0.0
97,"Woodbine Gardens, Parkview Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.000000,0.0,0.0,0.00,0.0,0.0,0.00,0.0
98,Woodbine Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,0.00,0.0,0.111111,0.0,0.0,0.00,0.0,0.0,0.00,0.0


In [160]:
# (distinct neighbourhood names, 1 name column + category columns)
grouped.shape

(100, 268)

In [161]:
num_top_venues = 5

# For every neighbourhood, print its category frequencies (rounded to two
# decimals) sorted from most to least frequent, limited to the top entries.
for _, hood_row in grouped.iterrows():
    hood = hood_row['Neighbourhood']
    print("----"+hood+"----")
    # Everything after the first (name) column is a category frequency.
    frequencies = hood_row.iloc[1:].astype(float).round(2)
    temp = pd.DataFrame(
        {'venue': frequencies.index, 'freq': frequencies.values},
        columns=['venue', 'freq'],
    )
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

----Adelaide, King, Richmond----
             venue  freq
0      Coffee Shop  0.07
1       Restaurant  0.05
2             Café  0.04
3  Thai Restaurant  0.04
4       Steakhouse  0.03


----Agincourt----
                       venue  freq
0             Breakfast Spot  0.25
1                     Lounge  0.25
2             Clothing Store  0.25
3  Latin American Restaurant  0.25
4          Accessories Store  0.00


----Agincourt North, L'Amoreaux East, Milliken, Steeles East----
                             venue  freq
0                             Park   0.5
1                       Playground   0.5
2                Accessories Store   0.0
3               Mexican Restaurant   0.0
4  Molecular Gastronomy Restaurant   0.0


----Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown----
                 venue  freq
0        Grocery Store  0.22
1          Pizza Place  0.22
2             Pharmacy  0.11
3  Fried Chicken Joint  0.11
4        

                        venue  freq
0                        Park  0.25
1  Construction & Landscaping  0.25
2            Basketball Court  0.25
3                      Bakery  0.25
4           Accessories Store  0.00


----East Birchmount Park, Ionview, Kennedy Park----
                venue  freq
0      Discount Store  0.25
1    Department Store  0.25
2         Coffee Shop  0.25
3   Convenience Store  0.25
4  Miscellaneous Shop  0.00


----East Toronto----
                venue  freq
0                Park  0.50
1   Convenience Store  0.25
2       Metro Station  0.25
3   Accessories Store  0.00
4  Mexican Restaurant  0.00


----Emery, Humberlea----
                             venue  freq
0                   Baseball Field   1.0
1                Accessories Store   0.0
2               Mexican Restaurant   0.0
3  Molecular Gastronomy Restaurant   0.0
4       Modern European Restaurant   0.0


----Fairview, Henry Farm, Oriole----
                  venue  freq
0        Clothing Store  0.13

               venue  freq
0          Cafeteria   1.0
1  Accessories Store   0.0
2              Motel   0.0
3             Market   0.0
4     Massage Studio   0.0


----St. James Town----
                venue  freq
0         Coffee Shop  0.07
1                Café  0.06
2          Restaurant  0.05
3               Hotel  0.04
4  Italian Restaurant  0.04


----Stn A PO Boxes 25 The Esplanade----
                venue  freq
0         Coffee Shop  0.12
1                Café  0.04
2          Restaurant  0.04
3               Hotel  0.03
4  Seafood Restaurant  0.03


----Studio District----
                venue  freq
0                Café  0.10
1         Coffee Shop  0.07
2              Bakery  0.05
3             Brewery  0.05
4  Italian Restaurant  0.05


----The Annex, North Midtown, Yorkville----
                           venue  freq
0                           Café  0.14
1                 Sandwich Place  0.14
2                    Coffee Shop  0.09
3                 Cosmetics Shop  0.05


In [162]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (the callers pass a row whose
    first entry is the neighbourhood name); the remaining entries are sorted
    in descending order of value.

    Parameters
    ----------
    row : pandas.Series
        First entry is ignored, the rest are sortable values indexed by label.
    num_top_venues : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the top entries, highest value first. The relative
        order of entries with equal values is not specified.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [163]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    # Ranks 1-3 use their own ordinal suffix, every later rank uses "th".
    # The explicit bounds check replaces a bare `except:`, which would also
    # have hidden any unrelated error raised while building the name.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Build the frame in one step (one row of top categories per neighbourhood)
# instead of filling an empty frame row by row through iloc.
top_venues = [
    return_most_common_venues(grouped.iloc[position, :], num_top_venues)
    for position in range(grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:], index=grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighbourhood', grouped['Neighbourhood'])

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Restaurant,Thai Restaurant,Café,Steakhouse
1,Agincourt,Lounge,Latin American Restaurant,Breakfast Spot,Clothing Store,Doner Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Doner Restaurant,Dessert Shop,Dim Sum Restaurant
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Fried Chicken Joint,Pharmacy,Fast Food Restaurant
4,"Alderwood, Long Branch",Pizza Place,Gym,Athletics & Sports,Coffee Shop,Pharmacy


In [164]:
# Display the table of most common venue categories per neighbourhood.
neighborhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Restaurant,Thai Restaurant,Café,Steakhouse
1,Agincourt,Lounge,Latin American Restaurant,Breakfast Spot,Clothing Store,Doner Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Doner Restaurant,Dessert Shop,Dim Sum Restaurant
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Fried Chicken Joint,Pharmacy,Fast Food Restaurant
4,"Alderwood, Long Branch",Pizza Place,Gym,Athletics & Sports,Coffee Shop,Pharmacy
...,...,...,...,...,...,...
95,Willowdale West,Grocery Store,Pizza Place,Discount Store,Home Service,Butcher
96,Woburn,Coffee Shop,Korean Restaurant,Yoga Studio,Donut Shop,Diner
97,"Woodbine Gardens, Parkview Hill",Pizza Place,Bank,Gym / Fitness Center,Breakfast Spot,Intersection
98,Woodbine Heights,Skating Rink,Curling Ice,Park,Pharmacy,Diner


In [165]:
# set number of clusters
clusters = 5

clustering = grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=clusters, random_state=0).fit(clustering)

# check cluster labels generated for each row in the dataframe
#kmeans.labels_[0:10] 

In [166]:
# Cluster label assigned to each row of the clustering matrix, in row order.
kmeans.labels_

array([0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 1, 3, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 4, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int32)

In [167]:
# Number of labels -- one per row that was clustered.
len(kmeans.labels_)

100

In [168]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

merged = df

merged = merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
merged.dropna(inplace=True)
merged

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Food & Drink Shop,Dog Run,Dessert Shop,Dim Sum Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Hockey Arena,Coffee Shop,Intersection,Pizza Place,Portuguese Restaurant
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636,0.0,Coffee Shop,Pub,Café,Bakery,Park
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,0.0,Clothing Store,Furniture / Home Store,Accessories Store,Coffee Shop,Carpet Store
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,0.0,Coffee Shop,Park,Yoga Studio,Discount Store,Portuguese Restaurant
...,...,...,...,...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,1.0,Park,River,Pool,Discount Store,Department Store
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,0.0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558,0.0,Light Rail Station,Yoga Studio,Auto Workshop,Comic Shop,Park
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509,3.0,Baseball Field,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner


In [170]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lng, borough, neighbourhood, postcode, cluster in zip(merged['Latitude'], merged['Longitude'], merged['Borough'], merged['Neighbourhood'],merged['Postcode'], merged['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(neighbourhood) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [176]:
# Rows of cluster 0: borough (column 1) plus every column from position 5 on.
merged[merged['Cluster Labels'] == 0].iloc[:, [1] + list(range(5, merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,North York,0.0,Hockey Arena,Coffee Shop,Intersection,Pizza Place,Portuguese Restaurant
2,Downtown Toronto,0.0,Coffee Shop,Pub,Café,Bakery,Park
3,North York,0.0,Clothing Store,Furniture / Home Store,Accessories Store,Coffee Shop,Carpet Store
4,Downtown Toronto,0.0,Coffee Shop,Park,Yoga Studio,Discount Store,Portuguese Restaurant
6,Scarborough,0.0,Fast Food Restaurant,Print Shop,Yoga Studio,Doner Restaurant,Dim Sum Restaurant
...,...,...,...,...,...,...,...
96,Downtown Toronto,0.0,Coffee Shop,Park,Restaurant,Pub,Bakery
97,Downtown Toronto,0.0,Coffee Shop,Café,Restaurant,Gym,Hotel
99,Downtown Toronto,0.0,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Gay Bar,Restaurant
100,East Toronto,0.0,Light Rail Station,Yoga Studio,Auto Workshop,Comic Shop,Park


In [177]:
# Rows of cluster 1: borough (column 1) plus every column from position 5 on.
merged[merged['Cluster Labels'] == 1].iloc[:, [1] + list(range(5, merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,North York,1.0,Park,Food & Drink Shop,Dog Run,Dessert Shop,Dim Sum Restaurant
16,York,1.0,Trail,Park,Field,Hockey Arena,Yoga Studio
21,York,1.0,Park,Women's Store,Market,Doner Restaurant,Dim Sum Restaurant
35,East York,1.0,Park,Convenience Store,Metro Station,Dessert Shop,Dim Sum Restaurant
40,North York,1.0,Park,Airport,Bus Stop,Doner Restaurant,Dim Sum Restaurant
49,North York,1.0,Park,Bakery,Construction & Landscaping,Basketball Court,Doner Restaurant
61,Central Toronto,1.0,Park,Lake,Swim School,Bus Line,Dog Run
66,North York,1.0,Park,Bank,Bar,Convenience Store,Doner Restaurant
68,Central Toronto,1.0,Park,Sushi Restaurant,Jewelry Store,Trail,Distribution Center
91,Downtown Toronto,1.0,Park,Playground,Trail,Distribution Center,Department Store


In [178]:
# Rows of cluster 2: borough (column 1) plus every column from position 5 on.
merged[merged['Cluster Labels'] == 2].iloc[:, [1] + list(range(5, merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
11,Etobicoke,2.0,Golf Course,Yoga Studio,Doner Restaurant,Dessert Shop,Dim Sum Restaurant


In [179]:
# Rows of cluster 3: borough (column 1) plus every column from position 5 on.
merged[merged['Cluster Labels'] == 3].iloc[:, [1] + list(range(5, merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
53,North York,3.0,Food Truck,Baseball Field,Home Service,Yoga Studio,Dessert Shop
57,North York,3.0,Baseball Field,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner
101,Etobicoke,3.0,Baseball Field,Yoga Studio,Doner Restaurant,Dim Sum Restaurant,Diner


In [180]:
# Rows of cluster 4: borough (column 1) plus every column from position 5 on.
merged[merged['Cluster Labels'] == 4].iloc[:, [1] + list(range(5, merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
32,Scarborough,4.0,Playground,Yoga Studio,Doner Restaurant,Dessert Shop,Dim Sum Restaurant
83,Central Toronto,4.0,Playground,Trail,Yoga Studio,Distribution Center,Department Store
85,Scarborough,4.0,Park,Playground,Doner Restaurant,Dessert Shop,Dim Sum Restaurant
