In [64]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

# !conda install -c conda-forge folium=0.5.0 --yes
import folium

### Load table data into dataframe

In [2]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
df = df[0]

### Clean dataframe

In [3]:
df.drop(df[df['Borough'] == 'Not assigned'].index, inplace=True)

### Dataframe shape

In [4]:
df.shape

(103, 3)

### Get geographical coordinates for the postal codes

In [5]:
coord_df = pd.read_csv('http://cocl.us/Geospatial_data')

In [6]:
df = df.merge(coord_df, left_on='Postal Code', right_on='Postal Code')

In [7]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Explore and cluster neighbourhoods

In [8]:
# The code was removed by Watson Studio for sharing.

In [9]:
import json
import requests
from pandas.io.json import json_normalize

#### Use only Boroughs with "Toronto" in their name

In [10]:
df = df[df['Borough'].str.contains('Toronto')].reset_index()
df

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,19,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


#### Test Foursquare API with a single postal code

In [11]:
df.loc[0, 'Postal Code']

'M5A'

In [12]:
test_lat = df.loc[0, 'Latitude']
test_long = df.loc[0, 'Longitude']
test_postal_code = df.loc[0, 'Postal Code']

In [13]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    test_lat,
    test_long,
    500,
    LIMIT
)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=2MW505KVIAFPKCPCOM5CLYVZVCNIUNT5FM1RSCGODZEILNFZ&client_secret=FH33H1PV040WUKHXCSMACQ31PYQT5VFCAE2XKNJE04EETM55&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100'

In [14]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5f33422986af52694585313b'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 45,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

#### Create function for getting venues based on geographical coordinates

In [15]:
def getVenues(postal_codes, latitudes, longitudes, radius=500):
    venues_list = []
    
    for postal_code, lat, lng in zip(postal_codes, latitudes, longitudes):        
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        venues_list.append([(
            postal_code,
            lat,
            lng,
            v['venue']['name'],
            v['venue']['location']['lat'],
            v['venue']['location']['lng'],
            v['venue']['categories'][0]['name']) for v in results])
        
        nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
        nearby_venues.columns = [
            'Neighborhood',
            'Neighborhood Latitude', 
            'Neighborhood Longitude', 
            'Venue', 
            'Venue Latitude', 
            'Venue Longitude', 
            'Venue Category'
        ]
    
    print("Retrieved venues!")
    return(nearby_venues)

In [16]:
toronto_venues = getVenues(
    postal_codes=df['Postal Code'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude']
)

Retrieved venues!


In [17]:
print(toronto_venues.shape)
toronto_venues.head()

(1631, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [18]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
M4E,5,5,5,5,5,5
M4K,41,41,41,41,41,41
M4L,21,21,21,21,21,21
M4M,41,41,41,41,41,41
M4N,3,3,3,3,3,3
M4P,8,8,8,8,8,8
M4R,18,18,18,18,18,18
M4S,37,37,37,37,37,37
M4T,2,2,2,2,2,2
M4V,16,16,16,16,16,16


#### One-hot encoding

In [21]:
venues_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix='', prefix_sep='')
venues_onehot['Neighborhood'] = toronto_venues['Neighborhood']
fixed_columns = [venues_onehot.columns[-1]] + list(venues_onehot.columns[:-1])
venues_onehot = venues_onehot[fixed_columns]

print(venues_onehot.shape)
venues_onehot.head()

(1631, 234)


Unnamed: 0,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Group (by neighborhood) and analyze most common venues

In [22]:
venues_grouped = venues_onehot.groupby('Neighborhood').mean().reset_index()
print(venues_grouped.shape)
venues_grouped

(39, 234)


Unnamed: 0,Neighborhood,Yoga Studio,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Theater,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.02439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02439,...,0.0,0.0,0.0,0.02439,0.0,0.0,0.0,0.0,0.0,0.0
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.02439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04878,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02439,0.0
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,M4S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0


In [49]:
def most_common_venues(row, num_top_venues):
    row_cats = row.iloc[1:]
    row_cats_sorted = row_cats.sort_values(ascending=False)
    
    return row_cats_sorted.index.values[0:num_top_venues]

In [50]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for i in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(i + 1, indicators[i]))
    except:
        columns.append('{}th Most Common Venues'.format(i + 1))

venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Neighborhood'] = venues_grouped['Neighborhood']

for i in np.arange(venues_grouped.shape[0]):
    venues_sorted.iloc[i, 1:] = most_common_venues(venues_grouped.iloc[i, :], num_top_venues)

venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venues,5th Most Common Venues,6th Most Common Venues,7th Most Common Venues,8th Most Common Venues,9th Most Common Venues,10th Most Common Venues
0,M4E,Asian Restaurant,Pub,Trail,Health Food Store,Dog Run,Dessert Shop,Diner,Discount Store,Distribution Center,Women's Store
1,M4K,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Restaurant,Ice Cream Shop,Dessert Shop,Brewery,Bubble Tea Shop,Café
2,M4L,Park,Sandwich Place,Gym,Liquor Store,Fish & Chips Shop,Ice Cream Shop,Fast Food Restaurant,Italian Restaurant,Brewery,Restaurant
3,M4M,Café,Coffee Shop,Gastropub,Bakery,American Restaurant,Brewery,Yoga Studio,Latin American Restaurant,Ice Cream Shop,Fish Market
4,M4N,Park,Swim School,Bus Line,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


### Cluster neighborhoods

#### Use k-means to cluster 

In [51]:
k = 5

venues_grouped_clustering = venues_grouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters=k, random_state=0).fit(venues_grouped_clustering)

kmeans.labels_[0:10]

array([2, 2, 2, 2, 3, 2, 2, 2, 0, 2], dtype=int32)

#### Merged common venues and neighborhoods together
#### Also performed renaming to be able to join on Postal Code

In [66]:
# venues_sorted
venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
venues_sorted.rename(columns={'Neighborhood': 'Postal Code'}, inplace=True)
toronto_merged = df
toronto_merged.rename(columns={'Neighbourhood': 'Neighborhood'}, inplace=True)
toronto_merged = toronto_merged.join(venues_sorted.set_index('Postal Code'), on='Postal Code')

toronto_merged.head()

Unnamed: 0,index,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venues,5th Most Common Venues,6th Most Common Venues,7th Most Common Venues,8th Most Common Venues,9th Most Common Venues,10th Most Common Venues
0,2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,2,Coffee Shop,Bakery,Café,Pub,Park,Theater,Breakfast Spot,Restaurant,Event Space,Shoe Store
1,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,2,Coffee Shop,Diner,Distribution Center,Portuguese Restaurant,Persian Restaurant,Park,Mexican Restaurant,Japanese Restaurant,Italian Restaurant,Hobby Shop
2,9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,2,Clothing Store,Coffee Shop,Café,Japanese Restaurant,Cosmetics Shop,Bubble Tea Shop,Italian Restaurant,Lingerie Store,Middle Eastern Restaurant,Ramen Restaurant
3,15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,2,Café,Coffee Shop,Restaurant,Clothing Store,Cosmetics Shop,Cocktail Bar,American Restaurant,Gym,Park,Beer Bar
4,19,M4E,East Toronto,The Beaches,43.676357,-79.293031,2,Asian Restaurant,Pub,Trail,Health Food Store,Dog Run,Dessert Shop,Diner,Discount Store,Distribution Center,Women's Store


### Visualize the clusters

In [78]:
toronto_lat = 43.653
toronto_long = -79.383
map_clusters = folium.Map(location=[toronto_lat, toronto_long], zoom_start=12)

x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

marker_colors = []
for lat, long, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'],
                                   toronto_merged['Postal Code'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=8,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7
    ).add_to(map_clusters)

map_clusters