Segmenting and Clustering Neighborhoods in Toronto

Importing pandas

In [1]:
import pandas as pd

Scrapping the tables from wikipedia.

In [2]:
table = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", match = 'Postcode')

print(table)

[    Postcode           Borough  \
0        M1A      Not assigned   
1        M2A      Not assigned   
2        M3A        North York   
3        M4A        North York   
4        M5A  Downtown Toronto   
5        M6A        North York   
6        M6A        North York   
7        M7A  Downtown Toronto   
8        M8A      Not assigned   
9        M9A      Queen's Park   
10       M1B       Scarborough   
11       M1B       Scarborough   
12       M2B      Not assigned   
13       M3B        North York   
14       M4B         East York   
15       M4B         East York   
16       M5B  Downtown Toronto   
17       M5B  Downtown Toronto   
18       M6B        North York   
19       M7B      Not assigned   
20       M8B      Not assigned   
21       M9B         Etobicoke   
22       M9B         Etobicoke   
23       M9B         Etobicoke   
24       M9B         Etobicoke   
25       M9B         Etobicoke   
26       M1C       Scarborough   
27       M1C       Scarborough   
28       M1C 

Assigning the table as a dataframe

In [3]:
df = table[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
df2 = df[~df.Borough.str.contains("Not assigned")]

In [5]:
df3 = df2.groupby(['Postcode', 'Borough'], sort = False).agg( ', '.join)

In [6]:
df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,"Lawrence Heights, Lawrence Manor"
M7A,Downtown Toronto,Queen's Park


In [7]:
df3.shape

(103, 1)

In [8]:
df3.reset_index(inplace = True)

End of the part 1

In [9]:
df_geospatial = pd.read_csv('http://cocl.us/Geospatial_data')
df_geospatial.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
df_geospatial.rename(columns={"Postal Code": "Postcode"}, inplace = True)
df_geospatial.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [11]:
new_df = pd.merge(df3, df_geospatial, on='Postcode')
new_df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Queen's Park,Not assigned,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


End of the part 2

## Importing required libraries for further analysis

In [12]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.21.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

In order to define an instance of the geocoder, we need to define a user_agent.

In [13]:
address = 'Toronto, ON, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto, ON, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto, ON, Canada are 43.653963, -79.387207.


Create a map of Toronto

In [14]:
new_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


In [15]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# adding markers to map
for lat, lng, borough, neighborhood in zip(new_df['Latitude'], new_df['Longitude'], new_df['Borough'], new_df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Accessing foursquare

In [16]:
CLIENT_ID = 'MWMUH45FTFS1QDSXIUYX5M3VC5AT50PC214STFH5N23CILCN' # your Foursquare ID
CLIENT_SECRET = 'EESSX2LZUJ1B14HRBUEYO214QTS2RMYZH53CMO243BWFZZLU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

Let's get the location of the Neighbourhood in 21st row.

In [19]:
new_df.loc[20, 'Neighbourhood']

'Berczy Park'

Location of Berczy Park

In [21]:
neighbourhood_latitude = new_df.loc[20, 'Latitude']
neighbourhood_longitude = new_df.loc[20, 'Longitude']

neighbourhood_name = new_df.loc[20, 'Neighbourhood']

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Berczy Park are 43.644770799999996, -79.3733064.


Getting the 50 top venues within 1 km from the Berczy Park

In [22]:
limit = 50
radius = 1000

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    limit)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=MWMUH45FTFS1QDSXIUYX5M3VC5AT50PC214STFH5N23CILCN&client_secret=EESSX2LZUJ1B14HRBUEYO214QTS2RMYZH53CMO243BWFZZLU&v=20180605&ll=43.644770799999996,-79.3733064&radius=1000&limit=50'

In [23]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e41c6a89da7ee001b7f9bfe'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Downtown Toronto',
  'headerFullLocation': 'Downtown Toronto, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 243,
  'suggestedBounds': {'ne': {'lat': 43.65377080900001,
    'lng': -79.36089236171087},
   'sw': {'lat': 43.635770790999985, 'lng': -79.38572043828914}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b56a44ff964a5206e1728e3',
       'name': 'The Keg Steakhouse + Bar',
       'location': {'address': '26 The Esplanade',
        'lat': 43.64667637593993,
        'lng': -79.37482154865866,
        'labeledLatLngs': [{'label': 'displa

Extracting category of the place

In [24]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Transforming into pd dataframe

In [25]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Keg Steakhouse + Bar,Steakhouse,43.646676,-79.374822
1,LCBO,Liquor Store,43.642944,-79.37244
2,Fresh On Front,Vegetarian / Vegan Restaurant,43.647815,-79.374453
3,Hockey Hall Of Fame (Hockey Hall of Fame),Museum,43.646974,-79.377323
4,St. Lawrence Market (South Building),Farmers Market,43.648743,-79.371597


In [26]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

50 venues were returned by Foursquare.


In [28]:
count = nearby_venues.groupby(['categories']).count() 
print(count)

                               name  lat  lng
categories                                   
Art Gallery                       1    1    1
Bakery                            2    2    2
Basketball Stadium                1    1    1
Beach                             1    1    1
Beer Bar                          3    3    3
Bistro                            1    1    1
Café                              3    3    3
Cheese Shop                       1    1    1
Cocktail Bar                      2    2    2
Coffee Shop                       5    5    5
Concert Hall                      1    1    1
Cosmetics Shop                    1    1    1
Creperie                          1    1    1
Deli / Bodega                     1    1    1
Farmers Market                    2    2    2
Fish Market                       1    1    1
Food Truck                        1    1    1
Fountain                          1    1    1
French Restaurant                 1    1    1
Gastropub                         

Most common venue in top 50 are the Coffee Shop

In [32]:
coffee_venue = nearby_venues[nearby_venues['categories']=='Coffee Shop'].reset_index(drop=True)

#neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)

In [33]:
coffee_venue

Unnamed: 0,name,categories,lat,lng
0,Starbucks,Coffee Shop,43.644285,-79.369771
1,Mos Mos,Coffee Shop,43.64164,-79.377552
2,Pilot Coffee Roasters,Coffee Shop,43.645018,-79.380415
3,Everyday Gourmet (Teas & Coffees),Coffee Shop,43.648757,-79.371645
4,Starbucks,Coffee Shop,43.647261,-79.378599


In [37]:
map_toronto2 = folium.Map(location=[latitude, longitude], zoom_start=14)

# adding markers to map
for lat, lng, name in zip(coffe_venue['lat'], coffe_venue['lng'], coffee_venue['name']):
    label = '{}'.format(name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto2)  
    
map_toronto2

End of task 3