In [2]:
from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd
import json

import matplotlib.cm as cm
import matplotlib.colors as colors
import folium

from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim

In [43]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [44]:
response = requests.get(url)
print(response)
data = response.text
soup = BeautifulSoup(data,'html.parser')

<Response [200]>


In [45]:
S_no = 0
d = dict()

table = soup.find('table')
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    #print(row)
    S_no += 1
    d[S_no] = row

In [46]:
d[4]

['M3A\n', 'North York\n', 'Parkwoods\n']

In [47]:
df = pd.DataFrame.from_dict(d,orient='index',columns=['Postal_Code','Borough','Neighborhood'])
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
1,,,
2,M1A\n,Not assigned\n,Not assigned\n
3,M2A\n,Not assigned\n,Not assigned\n
4,M3A\n,North York\n,Parkwoods\n
5,M4A\n,North York\n,Victoria Village\n


In [48]:
df = df.replace('\n','', regex=True)

In [49]:
df.drop(df.index[df['Borough'] == 'Not assigned'], inplace = True)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
0,,,
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,"Regent Park, Harbourfront"
4,M6A,North York,"Lawrence Manor, Lawrence Heights"


In [50]:
print('Shape of a dataframe before dropping the 1st row is',df.shape)

Shape of a dataframe before dropping the 1st row is (104, 3)


In [51]:
df.drop(df.index[0], inplace = True)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [53]:
print('Shape of a dataframe after dropping the 1st row is',df.shape)

Shape of a dataframe after dropping the 1st row is (103, 3)


In [55]:
df['latitude'] = None
df['longitude'] = None
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


In [61]:
df['new'] = pd.DataFrame(df['Neighborhood'].str.split(',', n = 1, expand = True)[0])
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood,latitude,longitude,new
0,M3A,North York,Parkwoods,,,Parkwoods
1,M4A,North York,Victoria Village,,,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,,Regent Park
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,,Lawrence Manor
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,,Queen's Park


In [62]:
df['Neighborhood'] = df['new']
del df['new']
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,Regent Park,,
3,M6A,North York,Lawrence Manor,,
4,M7A,Downtown Toronto,Queen's Park,,


In [63]:
for i in range(len(df)):
    address = df.iloc[i,2]+' , Toronto, Ontario'
    geolocator = Nominatim(user_agent="ny_explorer",timeout=3) #timeout=3, dealing with timeout error
    location = geolocator.geocode(address)
    try:
        latitude = location.latitude
        longitude = location.longitude
        df.iloc[i,3] = latitude
        df.iloc[i,4] = longitude
    except:
        print(df.iloc[i,2])
        df.iloc[i,3] = 'NA'
        df.iloc[i,4] = 'NA'

Caledonia-Fairbanks
Central Bay Street
Del Ray
Canada Post Gateway Processing Centre
Stn A PO Boxes
Business reply mail Processing Centre


In [64]:
df.shape

(103, 5)

In [65]:
df.drop(df.index[df['latitude'] == 'NA'], inplace = True)
df = df.reset_index(drop=True)
df.shape

(97, 5)

In [66]:
df.head()

Unnamed: 0,Postal_Code,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7611,-79.3241
1,M4A,North York,Victoria Village,43.7327,-79.3112
2,M5A,Downtown Toronto,Regent Park,43.6607,-79.3605
3,M6A,North York,Lawrence Manor,43.7221,-79.4375
4,M7A,Downtown Toronto,Queen's Park,43.6597,-79.3903


In [None]:
#Different method of getting coordinates
'''import sys
!{sys.executable} -m pip install geocoder
import geocoder
from geopy.geocoders import arcgis as arcgis'''

In [None]:
'''code = 'M5A'
g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
lat_lng_coords = g.latlng
lat_lng_coords'''

In [67]:
print('Dataframe has {} Borough and {} neighborhood.'.format(len(df.Borough.unique()),df.shape[0]))

Dataframe has 9 Borough and 97 neighborhood.


In [68]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="can_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [70]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11,tiles='stamen toner')

for lat, lng, borough, neighborhood in zip(df['latitude'], df['longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto.save('map.html')
map_toronto

##Define Foursquare Credentials and Version

In [71]:
CLIENT_ID = 'PYANEEI0VEF3DT0UYBXNCSB2JR0U230EJF0NYRKSZMNBSL1H' # your Foursquare ID
CLIENT_SECRET = 'OASGQ1NIHPEOJZI3MOSTE4E0KDX0ZY1R0JIPHL3OP5LEXVUV' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: PYANEEI0VEF3DT0UYBXNCSB2JR0U230EJF0NYRKSZMNBSL1H
CLIENT_SECRET:OASGQ1NIHPEOJZI3MOSTE4E0KDX0ZY1R0JIPHL3OP5LEXVUV


In [73]:
#Top 100 venues that are within a radius of 2000 meter
radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(df['latitude'], df['longitude'], df['Neighborhood']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))
venues_df = pd.DataFrame(venues,columns=['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'])
venues_df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Parkwoods,43.761124,-79.324059,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.761124,-79.324059,LCBO,43.757774,-79.314257,Liquor Store
2,Parkwoods,43.761124,-79.324059,Fat Bastard Burrito Co,43.774146,-79.322276,Burrito Place
3,Parkwoods,43.761124,-79.324059,Lara's Restaurant,43.772751,-79.332812,Middle Eastern Restaurant
4,Parkwoods,43.761124,-79.324059,Graydon Hall Manor,43.763923,-79.342961,Event Space


In [74]:
venues_df.groupby(["Neighborhood"]).count() #Number of venues returned for each neighbor remember max limit was 100

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,100,100,100,100,100,100
Alderwood,100,100,100,100,100,100
Bathurst Manor,100,100,100,100,100,100
Bayview Village,79,79,79,79,79,79
Bedford Park,84,84,84,84,84,84
...,...,...,...,...,...,...
Willowdale,300,300,300,300,300,300
Woburn,64,64,64,64,64,64
Woodbine Heights,92,92,92,92,92,92
York Mills,86,86,86,86,86,86


In [75]:
print('There are {} unique categories.'.format(len(venues_df['VenueCategory'].unique())))  

There are 319 unique categories.


In [77]:
df.shape

(97, 5)

In [78]:
venues_df['VenueCategory'].shape #It should be 9700(97 x 100) if all neighbours have 100 top venues data

(8209,)

One-hot encoding

In [94]:
# one hot encoding
tr_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tr_onehot['Neighborhoods'] = venues_df['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [tr_onehot.columns[-1]] + list(tr_onehot.columns[:-1])
tr_onehot = tr_onehot[fixed_columns]

print(tr_onehot.shape)
tr_onehot.head()

(8209, 320)


Unnamed: 0,Neighborhoods,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Antique Shop,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Automotive Shop,BBQ Joint,Baby Store,Badminton Court,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Court,Basketball Stadium,Beach,Beach Bar,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bike Shop,Bistro,Bookstore,Botanical Garden,Boutique,Bowling Alley,...,Stables,Stadium,Steakhouse,Storage Facility,Street Art,Supermarket,Supplement Shop,Sushi Restaurant,Szechuan Restaurant,Taco Place,Taiwanese Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Tech Startup,Tennis Court,Tennis Stadium,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train Station,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [95]:
# group rows by neighborhood taking the mean of the frequency of occurrence of each category
tr_grouped = tr_onehot.groupby(["Neighborhoods"]).mean().reset_index()
print(tr_grouped.shape)
tr_grouped

(89, 320)


Unnamed: 0,Neighborhoods,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Antique Shop,Aquarium,Arcade,Argentinian Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Automotive Shop,BBQ Joint,Baby Store,Badminton Court,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Court,Basketball Stadium,Beach,Beach Bar,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bike Shop,Bistro,Bookstore,Botanical Garden,Boutique,Bowling Alley,...,Stables,Stadium,Steakhouse,Storage Facility,Street Art,Supermarket,Supplement Shop,Sushi Restaurant,Szechuan Restaurant,Taco Place,Taiwanese Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Tech Startup,Tennis Court,Tennis Stadium,Thai Restaurant,Theater,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train Station,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo,Zoo Exhibit
0,Agincourt,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.010000,0.010000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.020000,0.030000,0.010000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.010000,0.00,0.00000,0.0,0.0,0.020000,0.0,0.0,0.00000,...,0.0,0.0,0.000000,0.0,0.0,0.040000,0.0,0.010000,0.0,0.00,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,0.010000,0.00,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.020000,0.0,0.00,0.00,0.010000,0.0,0.000000,0.00,0.0,0.0
1,Alderwood,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.020000,0.020000,0.010000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.010000,0.00,0.00000,0.0,0.0,0.010000,0.0,0.0,0.00000,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.020000,0.0,0.00,0.0,0.00,0.0,0.010000,0.0,0.000000,0.0,0.000000,0.00,0.0,0.000000,0.0,0.020000,0.0,0.000000,0.000000,0.0,0.00,0.00,0.01,0.010000,0.0,0.00,0.00,0.020000,0.0,0.000000,0.00,0.0,0.0
2,Bathurst Manor,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.020000,0.020000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.010000,0.000000,0.030000,0.000000,0.0,0.0,0.0,0.0,0.0,0.03,0.000000,0.01,0.00000,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.000000,0.0,0.0,0.010000,0.0,0.010000,0.0,0.02,0.0,0.02,0.0,0.000000,0.0,0.000000,0.0,0.010000,0.00,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.00,0.04,0.00,0.010000,0.0,0.01,0.01,0.000000,0.0,0.000000,0.01,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.012658,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.012658,0.037975,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.012658,0.00,0.00000,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.000000,0.0,0.0,0.012658,0.0,0.000000,0.0,0.00,0.0,0.00,0.0,0.000000,0.0,0.012658,0.0,0.025316,0.00,0.0,0.000000,0.0,0.012658,0.0,0.012658,0.012658,0.0,0.00,0.00,0.00,0.000000,0.0,0.00,0.00,0.012658,0.0,0.000000,0.00,0.0,0.0
4,Bedford Park,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.011905,0.000000,0.0,0.0,0.011905,0.011905,0.0,0.023810,0.059524,0.011905,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.00,0.00000,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.011905,0.0,0.0,0.000000,0.0,0.059524,0.0,0.00,0.0,0.00,0.0,0.011905,0.0,0.011905,0.0,0.023810,0.00,0.0,0.000000,0.0,0.011905,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.000000,0.0,0.00,0.00,0.011905,0.0,0.000000,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,Willowdale,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.000000,0.0,0.0,0.010000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.010000,0.010000,0.010000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.010000,0.00,0.00000,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.010000,0.0,0.0,0.010000,0.0,0.030000,0.0,0.00,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,0.020000,0.01,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.01,0.00,0.00,0.020000,0.0,0.00,0.00,0.010000,0.0,0.000000,0.00,0.0,0.0
85,Woburn,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.015625,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.062500,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.031250,0.00,0.03125,0.0,0.0,0.015625,0.0,0.0,0.00000,...,0.0,0.0,0.000000,0.0,0.0,0.015625,0.0,0.000000,0.0,0.00,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.00,0.0,0.015625,0.0,0.015625,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.015625,0.0,0.00,0.00,0.000000,0.0,0.015625,0.00,0.0,0.0
86,Woodbine Heights,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.032609,0.000000,0.010870,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.032609,0.032609,0.021739,0.000000,0.0,0.0,0.0,0.0,0.0,0.00,0.010870,0.00,0.00000,0.0,0.0,0.000000,0.0,0.0,0.01087,...,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.010870,0.0,0.00,0.0,0.00,0.0,0.000000,0.0,0.000000,0.0,0.021739,0.00,0.0,0.010870,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.000000,0.0,0.00,0.00,0.000000,0.0,0.000000,0.00,0.0,0.0
87,York Mills,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.011628,0.000000,0.0,0.0,0.011628,0.000000,0.0,0.011628,0.046512,0.046512,0.000000,0.011628,0.0,0.0,0.0,0.0,0.0,0.00,0.000000,0.00,0.00000,0.0,0.0,0.000000,0.0,0.0,0.00000,...,0.0,0.0,0.011628,0.0,0.0,0.011628,0.0,0.058140,0.0,0.00,0.0,0.00,0.0,0.011628,0.0,0.011628,0.0,0.034884,0.00,0.0,0.000000,0.0,0.011628,0.0,0.000000,0.000000,0.0,0.00,0.00,0.00,0.000000,0.0,0.00,0.00,0.000000,0.0,0.000000,0.00,0.0,0.0


In [96]:
len(df.Neighborhood.unique())

89

**Create a new DataFrame for Shopping Mall data only**

In [97]:
df_mall = tr_grouped[["Neighborhoods","Shopping Mall"]]
print(df_mall.shape)
df_mall.head()

(89, 2)


Unnamed: 0,Neighborhoods,Shopping Mall
0,Agincourt,0.02
1,Alderwood,0.01
2,Bathurst Manor,0.0
3,Bayview Village,0.012658
4,Bedford Park,0.0


Cluster Neighborhoods


In [98]:
# set number of clusters
kclusters = 3

tr_clustering = df_mall.drop(["Neighborhoods"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tr_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 1], dtype=int32)

In [99]:
tr_merged = df_mall.copy()

# add clustering labels
tr_merged["Cluster Labels"] = kmeans.labels_

In [100]:
tr_merged.rename(columns={"Neighborhoods": "Neighborhood"}, inplace=True)
tr_merged.head()

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels
0,Agincourt,0.02,1
1,Alderwood,0.01,1
2,Bathurst Manor,0.0,0
3,Bayview Village,0.012658,1
4,Bedford Park,0.0,0


In [101]:
tr_merged = tr_merged.join(df.set_index("Neighborhood"), on="Neighborhood")

print(tr_merged.shape)
tr_merged.head() 

(97, 7)


Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Postal_Code,Borough,latitude,longitude
0,Agincourt,0.02,1,M1S,Scarborough,43.7854,-79.2785
1,Alderwood,0.01,1,M8W,Etobicoke,43.6017,-79.5452
2,Bathurst Manor,0.0,0,M3H,North York,43.6655,-79.4119
3,Bayview Village,0.012658,1,M2K,North York,43.7692,-79.3767
4,Bedford Park,0.0,0,M5M,North York,43.7374,-79.4109


In [102]:
# sort the results by Cluster Labels
tr_merged.sort_values(["Cluster Labels"], inplace=True)
tr_merged

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Postal_Code,Borough,latitude,longitude
44,Little Portugal,0.000000,0,M6J,West Toronto,43.6474,-79.4311
72,The Beaches,0.000000,0,M4E,East Toronto,43.671,-79.2967
62,Roselawn,0.000000,0,M5N,Central Toronto,43.7082,-79.4123
30,Harbourfront East,0.000000,0,M5J,Downtown Toronto,43.6401,-79.3801
31,High Park,0.000000,0,M6P,West Toronto,43.6539,-79.4669
...,...,...,...,...,...,...,...
56,Parkview Hill,0.010000,1,M4B,East York,43.6535,-79.3839
58,Queen's Park,0.020000,1,M7A,Downtown Toronto,43.6597,-79.3903
61,Rosedale,0.010000,1,M4W,Downtown Toronto,43.6784,-79.3807
33,Humber Summit,0.052632,2,M9L,North York,43.7601,-79.5718


In [103]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11,tiles='stamen toner')

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tr_merged['latitude'], tr_merged['longitude'], tr_merged['Neighborhood'], tr_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters.save('cluster.html')
map_clusters

Examine the Clusters

#### Cluster 0

In [104]:
tr_merged.loc[tr_merged['Cluster Labels'] == 0]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Postal_Code,Borough,latitude,longitude
44,Little Portugal,0.0,0,M6J,West Toronto,43.6474,-79.4311
72,The Beaches,0.0,0,M4E,East Toronto,43.671,-79.2967
62,Roselawn,0.0,0,M5N,Central Toronto,43.7082,-79.4123
30,Harbourfront East,0.0,0,M5J,Downtown Toronto,43.6401,-79.3801
31,High Park,0.0,0,M6P,West Toronto,43.6539,-79.4669
32,Hillcrest Village,0.0,0,M2H,North York,43.6817,-79.4257
34,Humberlea,0.0,0,M9M,North York,43.7213,-79.5332
35,Humewood-Cedarvale,0.0,0,M6C,York,43.6883,-79.4281
36,India Bazaar,0.0,0,M4L,East Toronto,43.6722,-79.3235
38,Kennedy Park,0.0,0,M1K,Scarborough,43.7249,-79.254


#### Cluster 1

In [105]:
tr_merged.loc[tr_merged['Cluster Labels'] == 1]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Postal_Code,Borough,latitude,longitude
75,Thorncliffe Park,0.01,1,M4H,East York,43.7046,-79.3454
84,Willowdale,0.01,1,M2R,North York,43.7615,-79.4109
66,South Steeles,0.02,1,M9V,Etobicoke,43.8162,-79.3145
76,Toronto Dominion Centre,0.01,1,M5K,Downtown Toronto,43.6474,-79.3814
84,Willowdale,0.01,1,M2N,North York,43.7615,-79.4109
77,University of Toronto,0.02,1,M5S,Downtown Toronto,43.6635,-79.3978
69,Studio District,0.01,1,M4M,East Toronto,43.6496,-79.3907
84,Willowdale,0.01,1,M2M,North York,43.7615,-79.4109
70,Summerhill West,0.01,1,M4V,Central Toronto,43.6817,-79.3905
67,St. James Town,0.01,1,M5C,Downtown Toronto,43.6694,-79.3727


#### Cluster 2

In [106]:
tr_merged.loc[tr_merged['Cluster Labels'] == 2]

Unnamed: 0,Neighborhood,Shopping Mall,Cluster Labels,Postal_Code,Borough,latitude,longitude
33,Humber Summit,0.052632,2,M9L,North York,43.7601,-79.5718
60,Richmond,0.033333,2,M5H,Downtown Toronto,43.8126,-79.2634


###Quick Result


*   Cluster 1 has medium level of competition.
*   Cluster 0 has least competition with no present mall situated.
*   Cluster 2 has highest level of competition.

