# Applied Data Science - Capstone Project

## Install required Python packages

In [43]:
!conda install -c conda-forge geopy --yes 
!conda install -c conda-forge folium --yes 
!conda install -c conda-forge pyquery --yes

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.



## Get Wiki page containing Toronto Boroughs/Neighborhoods
### Note: using pandas.io.hmtl to get the wiki table into pandas DataFrame

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(res.content,'lxml')
wikitables = soup.find_all('table') 
Toronto = pd.read_html(str(wikitables[0]), index_col=None, header=0)[0]
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [2]:
# TO VERIFY THAT BOTH METHODS PRODUCE SIMILAR DIMENSIONS
Toronto.shape

(289, 3)

## Alternative way to read in the content and produce a dataframe using pandas.io.html
### Note: Results are consistent between the two methods

In [3]:
import requests
import numpy as np
import pandas as pd
from pandas.io.html import read_html

# Define the wiki page url var
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Issue HTTP request to get the URL content
req = requests.get(WIKI_URL)
# Use pandas read_html to read in the content
wikitables = read_html(WIKI_URL, index_col=None, header=0, attrs={"class":["sortable","wikitable"]})
# Get pandas dataframe
Toronto = wikitables[0]
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [4]:
# TO VERIFY THAT BOTH METHODS PRODUCE SIMILAR DIMENSIONS
Toronto.shape

(289, 3)

## Data cleaning

In [5]:
# Empty entries to np.nan to drop them in the next step
Toronto['Borough'].replace('', np.nan, inplace=True)
# Drop np.nan to remove rows not containing meaningful data
Toronto.dropna(subset=['Borough'], inplace=True)
# Leave behind rows containing 'Not assigned' in 'Borough'
Toronto = Toronto[Toronto['Borough'] != 'Not assigned']

## Data processing - 'Not assigned' to value

In [6]:
# Iterate over the dataframe and fix 'Not assigned' for column 'Neighborhood'
for i, _ in Toronto.iterrows():
    if Toronto.loc[i]['Neighbourhood'] == 'Not assigned': Toronto.loc[i]['Neighborhood'] = Toronto.loc[i]['Borough']

## Dataframe shape

In [7]:
# Check datafame shape
Toronto.shape

(212, 3)

## Number of rows in the dataframe

In [8]:
# Print the number of rows in the dataframe
print('Number of rows in Toronto dataframe: {}'.format(Toronto.shape[0]))

Number of rows in Toronto dataframe: 212


In [9]:
!conda install -c conda-forge geocoder --yes

Solving environment: done

# All requested packages already installed.



## API compensator - at times geocoder.google returns None for the same postal code
### Create a dictionary of all the postal codes to add to the dictionary at the next step

In [10]:
TPS = Toronto['Postcode'].unique()
len(TPS)

103

## Get offline cache to support throttled Geocoder API

In [11]:
!wget -q --no-check-certificate -O 'latitude.pickle' 'https://docs.google.com/uc?export=download&id=1PdEOkPErrpBtDgSlDwczIv_KLlpY-YcO'
!wget -q --no-check-certificate -O 'longitude.pickle' 'https://docs.google.com/uc?export=download&id=1XujA04dCARQnlxu-X2ItOVcYQz0MMQh9'

In [12]:
!ls -l *.pickle

-rw-rw-r-- 1 unixdev unixdev 1965 Sep 12 11:35 latitude.pickle
-rw-rw-r-- 1 unixdev unixdev 1965 Sep 12 11:35 longitude.pickle
-rw-rw-r-- 1 unixdev unixdev 2694 Sep 12 09:27 toronto_boroughs.pickle


In [13]:
import pickle

with open('latitude.pickle', 'rb') as flat:
    latitude = pickle.load(flat)
with open('longitude.pickle', 'rb') as flon:
    longitude = pickle.load(flon)

In [14]:
import geocoder
import time

for postcode in TPS:
    # When offline cache is available use it to avoid Geocoder Google API throttling
    if postcode in latitude.keys(): 
        continue
    while True:
        g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
        lat_lng_coords = g.latlng
        if lat_lng_coords == None:
            print('Trottled response to {}'.format(postcode))
            time.sleep(5)
            continue
        break
    
    if lat_lng_coords != None:
        latitude[postcode] = lat_lng_coords[0]
        longitude[postcode] = lat_lng_coords[1]
print('Successfully populated geo locations')

Successfully populated geo locations


In [15]:
lat = []
lon = []
for i, _ in Toronto.iterrows():
    lat.append(latitude[Toronto.loc[i]['Postcode']])
    lon.append(longitude[Toronto.loc[i]['Postcode']])
                        

In [16]:
Toronto = Toronto.assign(Latitude = lat, Longitude=lon)
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
5,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
6,M6A,North York,Lawrence Heights,43.718518,-79.464763


In [17]:
Toronto = Toronto.reset_index(drop=True)
Toronto.to_csv('Toronto.csv')
Toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


## Additional imports

In [18]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
from geopy.geocoders import Nominatim
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [19]:
address = 'Toronto, Ontario'

geolocator = Nominatim()
location = geolocator.geocode(address)
T_lat = location.latitude
T_lon = location.longitude
print('The geograpical coordinate of Toronto, ON, Canada are {}, {}.'.format(T_lat, T_lon))



The geograpical coordinate of Toronto, ON, Canada are 43.653963, -79.387207.


# Create a map of Toronto with neighbourhoods

In [21]:
T_map = folium.Map(location=[T_lat, T_lon], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto['Latitude'], Toronto['Longitude'], Toronto['Borough'], Toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(T_map)  
    
T_map

# Prepare Foursquare credentials

In [22]:
CLIENT_ID = '0C4ANZOALOMRDOJZU2BUYUKB2LOF3N5ADH4A3WJ3UDS3YIJD'
CLIENT_SECRET = 'MIJAG0GZX3JF2RXLGPCOOH25QZ3NBF2MLFYQJUG2DD5LCNM3'
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0C4ANZOALOMRDOJZU2BUYUKB2LOF3N5ADH4A3WJ3UDS3YIJD
CLIENT_SECRET:MIJAG0GZX3JF2RXLGPCOOH25QZ3NBF2MLFYQJUG2DD5LCNM3


## Explore West Toronto

In [23]:
HighPark = Toronto[Toronto['Borough'] == 'West Toronto']
HighPark.reset_index(drop=True)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M6H,West Toronto,Dovercourt Village,43.669005,-79.442259
1,M6H,West Toronto,Dufferin,43.669005,-79.442259
2,M6J,West Toronto,Little Portugal,43.647927,-79.41975
3,M6J,West Toronto,Trinity,43.647927,-79.41975
4,M6K,West Toronto,Brockton,43.636847,-79.428191
5,M6K,West Toronto,Exhibition Place,43.636847,-79.428191
6,M6K,West Toronto,Parkdale Village,43.636847,-79.428191
7,M6P,West Toronto,High Park,43.661608,-79.464763
8,M6P,West Toronto,The Junction South,43.661608,-79.464763
9,M6R,West Toronto,Parkdale,43.64896,-79.456325


Retrieve West Toronto Neighbourhood

In [24]:
HP_lat = HighPark['Latitude'].values[0] # neighborhood latitude value
print(type(HP_lat))
HP_lon = HighPark['Longitude'].values[0] # neighborhood longitude value

HP_name = HighPark['Neighbourhood'].values[0] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(HP_name, 
                                                               HP_lat, 
                                                               HP_lon))

<class 'numpy.float64'>
Latitude and longitude values of Dovercourt Village are 43.6690051, -79.4422593.


Prepare the Foursquare URL request

In [25]:
radius = 500
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    HP_lat, 
    HP_lon, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=0C4ANZOALOMRDOJZU2BUYUKB2LOF3N5ADH4A3WJ3UDS3YIJD&client_secret=MIJAG0GZX3JF2RXLGPCOOH25QZ3NBF2MLFYQJUG2DD5LCNM3&v=20180605&ll=43.6690051,-79.4422593&radius=500&limit=100'

Issue the Foursquare request and check the response

In [26]:
results = requests.get(url).json()
# results
if results != None:
    print("Request successfully processed")

Request successfully processed


In [27]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Parse Foursquare results and get venues

In [28]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,The Greater Good Bar,Bar,43.669409,-79.439267
1,Parallel,Middle Eastern Restaurant,43.669543,-79.438701
2,Planet Fitness Toronto Galleria,Gym / Fitness Center,43.667588,-79.442574
3,FreshCo,Supermarket,43.667918,-79.440754
4,Happy Bakery & Pastries,Bakery,43.66705,-79.441791


Print number of venues returned by Foursquare

In [29]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

17 venues were returned by Foursquare.


Define the function to get nearby venues

In [30]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [51]:
# Get the nearby venues
HP_venues = getNearbyVenues(names=HighPark['Neighbourhood'],
                                   latitudes=HighPark['Latitude'],
                                   longitudes=HighPark['Longitude']
                                  )

Dovercourt Village
Dufferin
Little Portugal
Trinity
Brockton
Exhibition Place
Parkdale Village
High Park
The Junction South
Parkdale
Roncesvalles
Runnymede
Swansea


Print the venues shape and content

In [32]:
print(HP_venues.shape)
HP_venues.head()

(373, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Dovercourt Village,43.669005,-79.442259,The Greater Good Bar,43.669409,-79.439267,Bar
1,Dovercourt Village,43.669005,-79.442259,Parallel,43.669543,-79.438701,Middle Eastern Restaurant
2,Dovercourt Village,43.669005,-79.442259,Planet Fitness Toronto Galleria,43.667588,-79.442574,Gym / Fitness Center
3,Dovercourt Village,43.669005,-79.442259,FreshCo,43.667918,-79.440754,Supermarket
4,Dovercourt Village,43.669005,-79.442259,Happy Bakery & Pastries,43.66705,-79.441791,Bakery


Count venues by neighbourhood

In [52]:
HP_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brockton,22,22,22,22,22,22
Dovercourt Village,18,18,18,18,18,18
Dufferin,18,18,18,18,18,18
Exhibition Place,22,22,22,22,22,22
High Park,23,23,23,23,23,23
Little Portugal,64,64,64,64,64,64
Parkdale,13,13,13,13,13,13
Parkdale Village,22,22,22,22,22,22
Roncesvalles,13,13,13,13,13,13
Runnymede,35,35,35,35,35,35


Find out the number of unique venues

In [34]:
print('There are {} uniques categories.'.format(len(HP_venues['Venue Category'].unique())))

There are 85 uniques categories.


Perform one-hot encoding on venue categories

In [53]:
# one hot encoding
HP_onehot = pd.get_dummies(HP_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
HP_onehot['Neighborhood'] = HP_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [HP_onehot.columns[-1]] + list(HP_onehot.columns[:-1])
HP_onehot = HP_onehot[fixed_columns]

HP_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bank,Bar,Bookstore,Boutique,Breakfast Spot,Brewery,Burger Joint,Burrito Place,Café,Cajun / Creole Restaurant,Caribbean Restaurant,Climbing Gym,Cocktail Bar,Coffee Shop,Convenience Store,Cuban Restaurant,Cupcake Shop,Dessert Shop,Diner,Discount Store,Dog Run,Eastern European Restaurant,Falafel Restaurant,Fast Food Restaurant,Fish & Chips Shop,Flea Market,Food,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Gastropub,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Ice Cream Shop,Indie Movie Theater,Italian Restaurant,Juice Bar,Korean Restaurant,Latin American Restaurant,Liquor Store,Mac & Cheese Joint,Malay Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Movie Theater,Music Venue,New American Restaurant,Park,Performing Arts Venue,Pet Store,Pharmacy,Piano Bar,Pizza Place,Pub,Record Shop,Restaurant,Salon / Barbershop,Sandwich Place,Smoke Shop,Smoothie Shop,Southern / Soul Food Restaurant,Sports Bar,Stadium,Supermarket,Sushi Restaurant,Tapas Restaurant,Tea Room,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Dovercourt Village,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Dovercourt Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Dovercourt Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Dovercourt Village,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,Dovercourt Village,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [54]:
HP_onehot.shape

(372, 87)

Find out the stats per Neighbourhood

In [37]:
HP_grouped = HP_onehot.groupby('Neighborhood').mean().reset_index()
HP_grouped

Unnamed: 0,Neighborhood,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bakery,Bank,Bar,Bookstore,Boutique,Breakfast Spot,Brewery,Burger Joint,Burrito Place,Café,Cajun / Creole Restaurant,Caribbean Restaurant,Climbing Gym,Cocktail Bar,Coffee Shop,Convenience Store,Cuban Restaurant,Cupcake Shop,Dessert Shop,Diner,Discount Store,Dog Run,Eastern European Restaurant,Falafel Restaurant,Fast Food Restaurant,Fish & Chips Shop,Flea Market,Food,French Restaurant,Fried Chicken Joint,Furniture / Home Store,Gastropub,Gift Shop,Gourmet Shop,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Ice Cream Shop,Indie Movie Theater,Italian Restaurant,Juice Bar,Korean Restaurant,Latin American Restaurant,Liquor Store,Mac & Cheese Joint,Malay Restaurant,Men's Store,Mexican Restaurant,Middle Eastern Restaurant,Miscellaneous Shop,Movie Theater,Music Venue,New American Restaurant,Park,Performing Arts Venue,Pet Store,Pharmacy,Piano Bar,Pizza Place,Pub,Record Shop,Restaurant,Salon / Barbershop,Sandwich Place,Smoothie Shop,Southern / Soul Food Restaurant,Sports Bar,Stadium,Supermarket,Sushi Restaurant,Tapas Restaurant,Tea Room,Thai Restaurant,Theater,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar,Yoga Studio
0,Brockton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.095238,0.0,0.0,0.047619,0.095238,0.0,0.047619,0.047619,0.0,0.142857,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.047619,0.047619,0.047619,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Dovercourt Village,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.058824,0.058824,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.0,0.058824,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Dufferin,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.058824,0.058824,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.058824,0.0,0.058824,0.0,0.058824,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Exhibition Place,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.095238,0.0,0.0,0.047619,0.095238,0.0,0.047619,0.047619,0.0,0.142857,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.047619,0.047619,0.047619,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,High Park,0.0,0.043478,0.0,0.043478,0.0,0.0,0.043478,0.0,0.086957,0.043478,0.0,0.0,0.0,0.0,0.0,0.086957,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.043478,0.043478,0.043478,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086957,0.0,0.0,0.0,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043478,0.0,0.0,0.0,0.0,0.0
5,Little Portugal,0.015625,0.0,0.015625,0.0,0.03125,0.015625,0.03125,0.0,0.125,0.0,0.015625,0.0,0.015625,0.0,0.0,0.0625,0.0,0.0,0.0,0.03125,0.046875,0.0,0.015625,0.015625,0.0,0.015625,0.0,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,0.015625,0.0,0.015625,0.0,0.0,0.015625,0.015625,0.0,0.015625,0.015625,0.015625,0.0,0.0,0.015625,0.015625,0.03125,0.0,0.0,0.015625,0.0,0.015625,0.015625,0.015625,0.0,0.0,0.0,0.0,0.03125,0.015625,0.015625,0.046875,0.015625,0.0,0.0,0.015625,0.015625,0.0,0.0,0.0,0.015625,0.0,0.0,0.015625,0.015625,0.03125,0.015625,0.015625
6,Parkdale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.076923,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.153846,0.0,0.076923,0.0,0.076923,0.0,0.0,0.076923,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Parkdale Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.095238,0.0,0.0,0.047619,0.095238,0.0,0.047619,0.047619,0.0,0.142857,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.047619,0.047619,0.047619,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Roncesvalles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.076923,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.153846,0.0,0.076923,0.0,0.076923,0.0,0.0,0.076923,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.153846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Runnymede,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.026316,0.026316,0.026316,0.0,0.0,0.0,0.0,0.026316,0.078947,0.0,0.0,0.0,0.0,0.105263,0.0,0.0,0.0,0.026316,0.052632,0.0,0.0,0.0,0.026316,0.0,0.026316,0.0,0.026316,0.026316,0.0,0.0,0.026316,0.0,0.026316,0.0,0.0,0.026316,0.0,0.0,0.026316,0.052632,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316,0.0,0.052632,0.026316,0.0,0.052632,0.0,0.026316,0.026316,0.0,0.0,0.0,0.0,0.052632,0.0,0.026316,0.0,0.0,0.026316,0.0,0.0,0.0


In [38]:
HP_grouped.shape

(13, 86)

Iterater over the grouped venues and calculate venue frequency

In [39]:
num_top_venues = 5

for hood in HP_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = HP_grouped[HP_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Brockton----
            venue  freq
0     Coffee Shop  0.14
1            Café  0.10
2  Breakfast Spot  0.10
3             Gym  0.05
4         Stadium  0.05


----Dovercourt Village----
                       venue  freq
0                Supermarket  0.12
1                     Bakery  0.12
2  Middle Eastern Restaurant  0.06
3       Fast Food Restaurant  0.06
4                       Café  0.06


----Dufferin----
                       venue  freq
0                Supermarket  0.12
1                     Bakery  0.12
2  Middle Eastern Restaurant  0.06
3       Fast Food Restaurant  0.06
4                       Café  0.06


----Exhibition Place----
            venue  freq
0     Coffee Shop  0.14
1            Café  0.10
2  Breakfast Spot  0.10
3             Gym  0.05
4         Stadium  0.05


----High Park----
                venue  freq
0  Mexican Restaurant  0.09
1                 Bar  0.09
2                Café  0.09
3               Diner  0.04
4        Antique Shop  0.04


----Little

Helper function to retrieve the most common venues

In [40]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Get the 10 top venues per neighbourhood

In [55]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = HP_grouped['Neighborhood']

for ind in np.arange(HP_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(HP_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Brockton,Coffee Shop,Café,Breakfast Spot,Gym,Burrito Place,Grocery Store,Furniture / Home Store,Gym / Fitness Center,Italian Restaurant,Falafel Restaurant
1,Dovercourt Village,Bakery,Supermarket,Brewery,Fast Food Restaurant,Music Venue,Park,Pet Store,Pharmacy,Discount Store,Liquor Store
2,Dufferin,Bakery,Supermarket,Brewery,Fast Food Restaurant,Music Venue,Park,Pet Store,Pharmacy,Discount Store,Liquor Store
3,Exhibition Place,Coffee Shop,Café,Breakfast Spot,Gym,Burrito Place,Grocery Store,Furniture / Home Store,Gym / Fitness Center,Italian Restaurant,Falafel Restaurant
4,High Park,Mexican Restaurant,Café,Bar,Bookstore,Music Venue,Diner,Park,Cajun / Creole Restaurant,Fast Food Restaurant,Italian Restaurant
5,Little Portugal,Bar,Café,Restaurant,Coffee Shop,French Restaurant,Men's Store,Cocktail Bar,Pizza Place,Bakery,Asian Restaurant
6,Parkdale,Gift Shop,Coffee Shop,Breakfast Spot,Burger Joint,Dessert Shop,Piano Bar,Movie Theater,Bar,Dog Run,Eastern European Restaurant
7,Parkdale Village,Coffee Shop,Café,Breakfast Spot,Gym,Burrito Place,Grocery Store,Furniture / Home Store,Gym / Fitness Center,Italian Restaurant,Falafel Restaurant
8,Roncesvalles,Gift Shop,Coffee Shop,Breakfast Spot,Burger Joint,Dessert Shop,Piano Bar,Movie Theater,Bar,Dog Run,Eastern European Restaurant
9,Runnymede,Coffee Shop,Café,Diner,Italian Restaurant,Restaurant,Sushi Restaurant,Pizza Place,Indie Movie Theater,Falafel Restaurant,Pub


## Clustering

In [42]:
# set number of clusters
kclusters = 5

HP_grouped_clustering = HP_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(HP_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 3, 3, 1, 4, 0, 2, 1, 2, 0], dtype=int32)

In [43]:
HP_merged = HighPark

# add clustering labels
HP_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
HP_merged = HP_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')

HP_merged.head() # check the last columns!

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
52,M6H,West Toronto,Dovercourt Village,43.669005,-79.442259,1,Bakery,Supermarket,Brewery,Fast Food Restaurant,Music Venue,Park,Pet Store,Pharmacy,Discount Store,Liquor Store
53,M6H,West Toronto,Dufferin,43.669005,-79.442259,3,Bakery,Supermarket,Brewery,Fast Food Restaurant,Music Venue,Park,Pet Store,Pharmacy,Discount Store,Liquor Store
64,M6J,West Toronto,Little Portugal,43.647927,-79.41975,3,Bar,Café,Restaurant,Coffee Shop,French Restaurant,Men's Store,Cocktail Bar,Pizza Place,Bakery,Asian Restaurant
65,M6J,West Toronto,Trinity,43.647927,-79.41975,1,Bar,Café,Restaurant,Coffee Shop,French Restaurant,Men's Store,Cocktail Bar,Pizza Place,Bakery,Asian Restaurant
76,M6K,West Toronto,Brockton,43.636847,-79.428191,4,Coffee Shop,Café,Breakfast Spot,Gym,Burrito Place,Grocery Store,Furniture / Home Store,Gym / Fitness Center,Italian Restaurant,Falafel Restaurant


## West Toronto Map with Neighbourhoods 

In [57]:
# create map
map_clusters = folium.Map(location=[T_lat, T_lon], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(HP_merged['Latitude'], HP_merged['Longitude'], HP_merged['Neighbourhood'], HP_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Clusters

### Cluster 1

In [47]:
HP_merged.loc[HP_merged['Cluster Labels'] == 0, HP_merged.columns[[1] + list(range(5, HP_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
77,West Toronto,0,Coffee Shop,Café,Breakfast Spot,Gym,Burrito Place,Grocery Store,Furniture / Home Store,Gym / Fitness Center,Italian Restaurant,Falafel Restaurant
134,West Toronto,0,Gift Shop,Coffee Shop,Breakfast Spot,Burger Joint,Dessert Shop,Piano Bar,Movie Theater,Bar,Dog Run,Eastern European Restaurant
135,West Toronto,0,Gift Shop,Coffee Shop,Breakfast Spot,Burger Joint,Dessert Shop,Piano Bar,Movie Theater,Bar,Dog Run,Eastern European Restaurant
146,West Toronto,0,Coffee Shop,Café,Diner,Italian Restaurant,Restaurant,Sushi Restaurant,Pizza Place,Indie Movie Theater,Falafel Restaurant,Pub


In [48]:
HP_merged.loc[HP_merged['Cluster Labels'] == 1, HP_merged.columns[[1] + list(range(5, HP_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
52,West Toronto,1,Bakery,Supermarket,Brewery,Fast Food Restaurant,Music Venue,Park,Pet Store,Pharmacy,Discount Store,Liquor Store
65,West Toronto,1,Bar,Café,Restaurant,Coffee Shop,French Restaurant,Men's Store,Cocktail Bar,Pizza Place,Bakery,Asian Restaurant
124,West Toronto,1,Mexican Restaurant,Café,Bar,Bookstore,Music Venue,Diner,Park,Cajun / Creole Restaurant,Fast Food Restaurant,Italian Restaurant


In [49]:
HP_merged.loc[HP_merged['Cluster Labels'] == 3, HP_merged.columns[[1] + list(range(5, HP_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
53,West Toronto,3,Bakery,Supermarket,Brewery,Fast Food Restaurant,Music Venue,Park,Pet Store,Pharmacy,Discount Store,Liquor Store
64,West Toronto,3,Bar,Café,Restaurant,Coffee Shop,French Restaurant,Men's Store,Cocktail Bar,Pizza Place,Bakery,Asian Restaurant


In [50]:
HP_merged.loc[HP_merged['Cluster Labels'] == 4, HP_merged.columns[[1] + list(range(5, HP_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
76,West Toronto,4,Coffee Shop,Café,Breakfast Spot,Gym,Burrito Place,Grocery Store,Furniture / Home Store,Gym / Fitness Center,Italian Restaurant,Falafel Restaurant
145,West Toronto,4,Coffee Shop,Café,Diner,Italian Restaurant,Restaurant,Sushi Restaurant,Pizza Place,Indie Movie Theater,Falafel Restaurant,Pub


# Thank you for reviwing my work!