In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from pandas.io.json import json_normalize # Utility function json_normalize for flattening semi-structured JSON objects
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
import requests
from bs4 import BeautifulSoup

In [18]:
wikiPostalCodes = requests.get("https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=890001620").text
soup = BeautifulSoup(wikiPostalCodes,'lxml')

In [19]:
result = soup.prettify().splitlines()
print('\n'.join(result[:20] + result[-20:]))

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"Xn5DiQpAEJsAAFyItDYAAACS","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":946188405,"wgRevisionId":890001620,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with citations using unsupported parameters","Communications in Ontario","Postal codes in Canada",

In [20]:
My_table = soup.find('table',{'class':'wikitable sortable'})
My_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [21]:
rows = My_table.findAll('tr')
rows

[<tr>
 <th>Postcode</th>
 <th>Borough</th>
 <th>Neighbourhood
 </th></tr>, <tr>
 <td>M1A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M2A</td>
 <td>Not assigned</td>
 <td>Not assigned
 </td></tr>, <tr>
 <td>M3A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td></tr>, <tr>
 <td>M4A</td>
 <td><a href="/wiki/North_York" title="North York">North York</a></td>
 <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td></tr>, <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td></tr>, <tr>
 <td>M5A</td>
 <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
 <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
 </td></tr>, <tr>
 <td>M6A</td>
 <td

In [22]:
parsed_data = []


In [23]:
for row in rows:
    children = row.findChildren(recursive=False)
    row_text = []
    for child in children: 
        clean_text = child.text 
        clean_text = clean_text.split('&#91;')[0] # This is to discard reference/citation links
        clean_text = clean_text.split('&#160;')[-1] # This is to clean the header row of the sort icons
        clean_text = clean_text.strip()
        row_text.append(clean_text)
    parsed_data.append(row_text)

In [24]:
parsed_data[:5]


[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

In [25]:
# Define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood']

# Instantiate and populate the dataframe
df = pd.DataFrame(parsed_data[1:], columns=column_names)

# Examine the resulting dataframe
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [26]:
# Process the cells that have an assigned borough. Ignore cells with a borough that is not assigned.
df.drop(df[df['Borough']=='Not assigned'].index, inplace=True)
df.reset_index(inplace=True, drop=True)
print("The new number of rows in dataframe after dropping unassigned boroughs:", df.shape[0])

The new number of rows in dataframe after dropping unassigned boroughs: 211


In [27]:
# The neighborhood will be the same as the borough if a cell has a borough but a Not assigned neighborhood.
df['Neighborhood'].where(df['Neighborhood'] != 'Not assigned', df['Borough'], inplace=True)

In [28]:
# More than one neighborhood can exist in one postal code area. 
# Combined the rows into one row with the neighborhoods separated with a comma
df=df.groupby("PostalCode").agg(lambda x:','.join(set(x)))

In [29]:
df=df.reset_index()


In [30]:
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek"
2,M1E,Scarborough,"Morningside,West Hill,Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [31]:
print("The number of rows in dataframe:", df.shape[0])
df.shape

The number of rows in dataframe: 103


(103, 3)

In [32]:
df_copy = df.copy() # Make a copy of the dataframe


In [33]:
geodata = pd.read_csv('https://cocl.us/Geospatial_data')
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
geodata.rename(index=str, columns={"Postal Code":"PostalCode"},inplace=True)
geodata.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [35]:
df = df_copy.merge(geodata, how='inner', on='PostalCode')

In [36]:
df.head()


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,West Hill,Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [39]:
# Check how many boroughs and neighborhoods there are
print('The dataframe has',len(df['Borough'].unique()), 'boroughs and',df.shape[0], 'neighborhoods')

The dataframe has 11 boroughs and 103 neighborhoods


In [40]:
import folium # Import Folium visualization library


In [41]:
# Segment and Cluster by Downtown Toronto
tor_data = df[df['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
tor_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937


In [43]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


In [44]:
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [45]:
# Create a map of Downtown Toronto using Latitude and Longitude values
map_tor = folium.Map(location=[latitude, longitude], zoom_start=13)

# Add markers to map
for lat, lng, label in zip(tor_data['Latitude'], tor_data['Longitude'], tor_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)  
    
map_tor

In [46]:
# Next, I am going to start utilizing the Foursquare API to explore the neighborhoods and segment them.
CLIENT_ID = 'RDZDO5MSITL4N20HVKR2WPZ1RHFP3JVEI1OZHZLTRJC1MYMX' # Foursquare ID
CLIENT_SECRET = 'WB3GMUZTGTJCTXN0CSCJGG54F0UBXRE1IWC5WMDJ1KSCAM4X' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Credentails:
CLIENT_ID: RDZDO5MSITL4N20HVKR2WPZ1RHFP3JVEI1OZHZLTRJC1MYMX
CLIENT_SECRET:WB3GMUZTGTJCTXN0CSCJGG54F0UBXRE1IWC5WMDJ1KSCAM4X


In [47]:
# Get the neighborhoods' latitude and longitude values
neighborhood_latitude = tor_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = tor_data.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = tor_data.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Rosedale are 43.6795626, -79.37752940000001.


In [48]:
# Let's get the top 100 venues that are in Rosedale within a radius of 500 meters.
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?client_id=RDZDO5MSITL4N20HVKR2WPZ1RHFP3JVEI1OZHZLTRJC1MYMX&client_secret=WB3GMUZTGTJCTXN0CSCJGG54F0UBXRE1IWC5WMDJ1KSCAM4X&v=20180605&ll=43.6056466,-79.50132070000001&radius=500&limit=100'.format(CLIENT_ID, CLIENT_SECRET, VERSION, neighborhood_latitude, neighborhood_longitude, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=RDZDO5MSITL4N20HVKR2WPZ1RHFP3JVEI1OZHZLTRJC1MYMX&client_secret=WB3GMUZTGTJCTXN0CSCJGG54F0UBXRE1IWC5WMDJ1KSCAM4X&v=20180605&ll=43.6056466,-79.50132070000001&radius=500&limit=100'

In [49]:
# Send the GET request and examine the results
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e7e48af9fcb92001b08e4be'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 14,
  'suggestedBounds': {'ne': {'lat': 43.6101466045, 'lng': -79.49511771930959},
   'sw': {'lat': 43.6011465955, 'lng': -79.50752368069043}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b119977f964a520488023e3',
       'name': 'LCBO',
       'location': {'address': '2762 Lake Shore Blvd W',
        'crossStreet': 'btwn 1st & 2nd St',
        'lat': 43.60228082768786,
        'lng': -79.4993016827402,
        'labeledLatLngs': [{'label': 'display',
          'lat':

In [50]:
# All the information is in the items key. Let's borrow the get_category_type function from the Foursquare lab.
# Function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [53]:
# Clean the json and structure in into a pandas dataframe.
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,LCBO,Liquor Store,43.602281,-79.499302
1,New Toronto Fish & Chips,Restaurant,43.601849,-79.503281
2,Domino's Pizza,Pizza Place,43.601583,-79.500905
3,Delicia Bakery & Pastry,Bakery,43.601403,-79.503012
4,Lucky Dice Restaurant,Café,43.601392,-79.503056
5,Popeyes Louisiana Kitchen,Fried Chicken Joint,43.602069,-79.4994
6,McDonald's,Fast Food Restaurant,43.602464,-79.498859
7,Shoppers Drug Mart,Pharmacy,43.601677,-79.502239
8,Sunbelt Rentals Aerial Work Platform,Rental Service,43.607654,-79.505362
9,Sunbelt Rentals,Business Service,43.607654,-79.505362


In [52]:
# How many values were returned by Foursquare?
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

14 venues were returned by Foursquare.


In [54]:
# Use the function from the lab to repeat the same process to all the neighborhoods in Downtown Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [58]:
# Run the above function on each neighborhood and create a new dataframe called tor_venues.
tor_venues = getNearbyVenues(names=tor_data['Neighborhood'],
                                   latitudes=tor_data['Latitude'],
                                   longitudes=tor_data['Longitude']
                                  )

Rosedale
Cabbagetown,St. James Town
Church and Wellesley
Regent Park,Harbourfront
Ryerson,Garden District
St. James Town
Berczy Park
Central Bay Street
King,Adelaide,Richmond
Harbourfront East,Toronto Islands,Union Station
Toronto Dominion Centre,Design Exchange
Commerce Court,Victoria Hotel
Harbord,University of Toronto
Chinatown,Kensington Market,Grange Park
Harbourfront West,South Niagara,CN Tower,Island airport,Railway Lands,King and Spadina,Bathurst Quay
Stn A PO Boxes 25 The Esplanade
Underground city,First Canadian Place
Christie


In [59]:
# Check the size of the resulting dataframe
print(tor_venues.shape)
tor_venues.head()

(1277, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"Cabbagetown,St. James Town",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


In [60]:

# Find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(tor_venues['Venue Category'].unique())))

There are 209 uniques categories.


In [61]:
# tor_venues['Venue Category'] = 'Restaurant' 
tor_venuestest = tor_venues[tor_venues['Venue Category'].str.contains('staurant')]

In [62]:
# Find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(tor_venues['Venue Category'].unique())))

There are 209 uniques categories.


In [63]:
# Analyze each neighborhood
# one hot encoding
tor_onehot = pd.get_dummies(tor_venuestest[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
tor_onehot['Neighborhood'] = tor_venuestest['Neighborhood']

# move neighborhood column to the first column
fixed_columns = [tor_onehot.columns[-1]] + list(tor_onehot.columns[:-1])
tor_onehot = tor_onehot[fixed_columns]

tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()
tor_grouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,American Restaurant,Asian Restaurant,Belgian Restaurant,Brazilian Restaurant,Caribbean Restaurant,Chinese Restaurant,Colombian Restaurant,Comfort Food Restaurant,Doner Restaurant,Dumpling Restaurant,Eastern European Restaurant,Empanada Restaurant,Ethiopian Restaurant,Falafel Restaurant,Fast Food Restaurant,Filipino Restaurant,French Restaurant,German Restaurant,Gluten-free Restaurant,Greek Restaurant,Hotpot Restaurant,Indian Restaurant,Italian Restaurant,Japanese Restaurant,Korean Restaurant,Latin American Restaurant,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Molecular Gastronomy Restaurant,New American Restaurant,Polish Restaurant,Ramen Restaurant,Restaurant,Seafood Restaurant,Sushi Restaurant,Taiwanese Restaurant,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,0.181818,0.0,0.0,0.090909,0.0,0.090909,0.0
1,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.2,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.1,0.1,0.0,0.0,0.0
2,Central Bay Street,0.0,0.038462,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.038462,0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.038462,0.153846,0.115385,0.038462,0.0,0.038462,0.0,0.115385,0.038462,0.0,0.0,0.0,0.038462,0.038462,0.038462,0.038462,0.0,0.076923,0.0,0.038462,0.0
3,"Chinatown,Kensington Market,Grange Park",0.0,0.0,0.0,0.034483,0.0,0.034483,0.068966,0.0,0.068966,0.034483,0.103448,0.0,0.034483,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,0.034483,0.0,0.034483,0.034483,0.0,0.0,0.0,0.103448,0.0,0.0,0.0,0.0,0.0,0.034483,0.0,0.0,0.0,0.0,0.034483,0.0,0.137931,0.172414
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
tor_grouped.shape


(16, 44)

In [68]:
tor_onehot.shape

(320, 44)

In [69]:
# Let's print each neighborhood along with the top 10 most common venues
num_top_venues = 10

for hood in tor_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = tor_grouped[tor_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                           venue  freq
0             Seafood Restaurant  0.18
1                     Restaurant  0.18
2    Eastern European Restaurant  0.09
3  Vegetarian / Vegan Restaurant  0.09
4                Thai Restaurant  0.09
5               Greek Restaurant  0.09
6        Comfort Food Restaurant  0.09
7            Japanese Restaurant  0.09
8              French Restaurant  0.09
9     Modern European Restaurant  0.00


----Cabbagetown,St. James Town----
                        venue  freq
0                  Restaurant   0.2
1          Italian Restaurant   0.2
2           Indian Restaurant   0.1
3         Japanese Restaurant   0.1
4             Thai Restaurant   0.1
5        Caribbean Restaurant   0.1
6          Chinese Restaurant   0.1
7        Taiwanese Restaurant   0.1
8  Modern European Restaurant   0.0
9           Korean Restaurant   0.0


----Central Bay Street----
                        venue  freq
0          Italian Restaurant  0.15
1         Japanes

In [70]:
# Let's put that into a pandas dataframe
# Use the function to sort the venues in descending order
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [71]:
# Create the new dataframe and display the top 15 venues for each neighborhood
num_top_venues = 15
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(16)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,Berczy Park,Seafood Restaurant,Restaurant,Comfort Food Restaurant,Greek Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Japanese Restaurant,French Restaurant,Eastern European Restaurant,Dumpling Restaurant,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Vietnamese Restaurant
1,"Cabbagetown,St. James Town",Restaurant,Italian Restaurant,Thai Restaurant,Taiwanese Restaurant,Indian Restaurant,Caribbean Restaurant,Chinese Restaurant,Japanese Restaurant,Vietnamese Restaurant,Eastern European Restaurant,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Comfort Food Restaurant
2,Central Bay Street,Italian Restaurant,Japanese Restaurant,Middle Eastern Restaurant,Chinese Restaurant,Thai Restaurant,Modern European Restaurant,French Restaurant,Vegetarian / Vegan Restaurant,Indian Restaurant,Korean Restaurant,Mediterranean Restaurant,Falafel Restaurant,Sushi Restaurant,American Restaurant,Ramen Restaurant
3,"Chinatown,Kensington Market,Grange Park",Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Dumpling Restaurant,Chinese Restaurant,Comfort Food Restaurant,Belgian Restaurant,Caribbean Restaurant,Doner Restaurant,Empanada Restaurant,Filipino Restaurant,Italian Restaurant,Japanese Restaurant,Hotpot Restaurant,Thai Restaurant
4,Christie,Italian Restaurant,Restaurant,Vietnamese Restaurant,Dumpling Restaurant,French Restaurant,Filipino Restaurant,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Eastern European Restaurant,Doner Restaurant,Gluten-free Restaurant,Comfort Food Restaurant,Colombian Restaurant
5,Church and Wellesley,Japanese Restaurant,Sushi Restaurant,Restaurant,Mediterranean Restaurant,Vietnamese Restaurant,Mexican Restaurant,American Restaurant,Caribbean Restaurant,Chinese Restaurant,Ethiopian Restaurant,Fast Food Restaurant,Indian Restaurant,Italian Restaurant,Afghan Restaurant,Theme Restaurant
6,"Commerce Court,Victoria Hotel",Restaurant,American Restaurant,Italian Restaurant,Seafood Restaurant,Japanese Restaurant,Thai Restaurant,Vegetarian / Vegan Restaurant,Latin American Restaurant,Asian Restaurant,French Restaurant,Gluten-free Restaurant,New American Restaurant,Greek Restaurant,Vietnamese Restaurant,Empanada Restaurant
7,"Harbord,University of Toronto",Restaurant,Italian Restaurant,Japanese Restaurant,Comfort Food Restaurant,Chinese Restaurant,French Restaurant,Sushi Restaurant,Brazilian Restaurant,Empanada Restaurant,Filipino Restaurant,Fast Food Restaurant,American Restaurant,Asian Restaurant,Falafel Restaurant,Ethiopian Restaurant
8,"Harbourfront East,Toronto Islands,Union Station",Italian Restaurant,Restaurant,Sushi Restaurant,Seafood Restaurant,Vegetarian / Vegan Restaurant,Indian Restaurant,New American Restaurant,Chinese Restaurant,Japanese Restaurant,Mexican Restaurant,Vietnamese Restaurant,Dumpling Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant
9,"King,Adelaide,Richmond",Restaurant,Thai Restaurant,American Restaurant,Asian Restaurant,Sushi Restaurant,Seafood Restaurant,Vegetarian / Vegan Restaurant,Colombian Restaurant,Mediterranean Restaurant,Gluten-free Restaurant,Greek Restaurant,Japanese Restaurant,Latin American Restaurant,Fast Food Restaurant,Brazilian Restaurant


In [72]:
# Run k-means to cluster the neighborhood into 5 clusters
# set number of clusters
kclusters = 5

tor_grouped_clustering = tor_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(tor_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 4, 4, 2, 4, 1, 0, 0, 1])

In [73]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

tor_merged = tor_data

# merge to add latitude/longitude for each neighborhood
tor_merged = tor_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

tor_merged.head() # check the columns

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,,,,,,,,,,,,,,,,
1,M4X,Downtown Toronto,"Cabbagetown,St. James Town",43.667967,-79.367675,0.0,Restaurant,Italian Restaurant,Thai Restaurant,Taiwanese Restaurant,Indian Restaurant,Caribbean Restaurant,Chinese Restaurant,Japanese Restaurant,Vietnamese Restaurant,Eastern European Restaurant,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Comfort Food Restaurant
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,4.0,Japanese Restaurant,Sushi Restaurant,Restaurant,Mediterranean Restaurant,Vietnamese Restaurant,Mexican Restaurant,American Restaurant,Caribbean Restaurant,Chinese Restaurant,Ethiopian Restaurant,Fast Food Restaurant,Indian Restaurant,Italian Restaurant,Afghan Restaurant,Theme Restaurant
3,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636,3.0,Restaurant,Mexican Restaurant,French Restaurant,Asian Restaurant,Vietnamese Restaurant,Dumpling Restaurant,Filipino Restaurant,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Eastern European Restaurant,Doner Restaurant,Gluten-free Restaurant,Comfort Food Restaurant
4,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937,4.0,Middle Eastern Restaurant,Japanese Restaurant,Italian Restaurant,Restaurant,Ramen Restaurant,Fast Food Restaurant,Modern European Restaurant,Ethiopian Restaurant,Chinese Restaurant,Mexican Restaurant,Vietnamese Restaurant,Seafood Restaurant,Sushi Restaurant,Thai Restaurant,American Restaurant


In [74]:
# Ignore/drop NaNs
tor_merged.dropna(axis=0, how='any',inplace=True)
tor_merged.reset_index(inplace=True, drop=True)
print("Number of rows after dropping NaNs:", len(tor_merged))
print("Number of NaNs:", tor_merged.isna().sum())

Number of rows after dropping NaNs: 16
Number of NaNs: PostalCode                0
Borough                   0
Neighborhood              0
Latitude                  0
Longitude                 0
Cluster Labels            0
1st Most Common Venue     0
2nd Most Common Venue     0
3rd Most Common Venue     0
4th Most Common Venue     0
5th Most Common Venue     0
6th Most Common Venue     0
7th Most Common Venue     0
8th Most Common Venue     0
9th Most Common Venue     0
10th Most Common Venue    0
11th Most Common Venue    0
12th Most Common Venue    0
13th Most Common Venue    0
14th Most Common Venue    0
15th Most Common Venue    0
dtype: int64


In [75]:
# Visualize the Clusters
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=13)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighborhood'], tor_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [76]:
tor_merged.groupby('Cluster Labels').count()


Unnamed: 0_level_0,PostalCode,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0.0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
1.0,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
2.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3.0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4.0,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4


In [77]:
# Cluster 1
tor_merged.loc[tor_merged['Cluster Labels'] == 0, tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]]


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,"Cabbagetown,St. James Town",Restaurant,Italian Restaurant,Thai Restaurant,Taiwanese Restaurant,Indian Restaurant,Caribbean Restaurant,Chinese Restaurant,Japanese Restaurant,Vietnamese Restaurant,Eastern European Restaurant,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Comfort Food Restaurant
8,"Harbourfront East,Toronto Islands,Union Station",Italian Restaurant,Restaurant,Sushi Restaurant,Seafood Restaurant,Vegetarian / Vegan Restaurant,Indian Restaurant,New American Restaurant,Chinese Restaurant,Japanese Restaurant,Mexican Restaurant,Vietnamese Restaurant,Dumpling Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant
11,"Harbord,University of Toronto",Restaurant,Italian Restaurant,Japanese Restaurant,Comfort Food Restaurant,Chinese Restaurant,French Restaurant,Sushi Restaurant,Brazilian Restaurant,Empanada Restaurant,Filipino Restaurant,Fast Food Restaurant,American Restaurant,Asian Restaurant,Falafel Restaurant,Ethiopian Restaurant


In [78]:
# Cluster 1
tor_merged.loc[tor_merged['Cluster Labels'] == 0, tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]]


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
0,"Cabbagetown,St. James Town",Restaurant,Italian Restaurant,Thai Restaurant,Taiwanese Restaurant,Indian Restaurant,Caribbean Restaurant,Chinese Restaurant,Japanese Restaurant,Vietnamese Restaurant,Eastern European Restaurant,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Comfort Food Restaurant
8,"Harbourfront East,Toronto Islands,Union Station",Italian Restaurant,Restaurant,Sushi Restaurant,Seafood Restaurant,Vegetarian / Vegan Restaurant,Indian Restaurant,New American Restaurant,Chinese Restaurant,Japanese Restaurant,Mexican Restaurant,Vietnamese Restaurant,Dumpling Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant
11,"Harbord,University of Toronto",Restaurant,Italian Restaurant,Japanese Restaurant,Comfort Food Restaurant,Chinese Restaurant,French Restaurant,Sushi Restaurant,Brazilian Restaurant,Empanada Restaurant,Filipino Restaurant,Fast Food Restaurant,American Restaurant,Asian Restaurant,Falafel Restaurant,Ethiopian Restaurant


In [79]:
# Cluster 3
tor_merged.loc[tor_merged['Cluster Labels'] == 2, tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
15,Christie,Italian Restaurant,Restaurant,Vietnamese Restaurant,Dumpling Restaurant,French Restaurant,Filipino Restaurant,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Eastern European Restaurant,Doner Restaurant,Gluten-free Restaurant,Comfort Food Restaurant,Colombian Restaurant


In [80]:
# Cluster 4
tor_merged.loc[tor_merged['Cluster Labels'] == 3, tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
2,"Regent Park,Harbourfront",Restaurant,Mexican Restaurant,French Restaurant,Asian Restaurant,Vietnamese Restaurant,Dumpling Restaurant,Filipino Restaurant,Fast Food Restaurant,Falafel Restaurant,Ethiopian Restaurant,Empanada Restaurant,Eastern European Restaurant,Doner Restaurant,Gluten-free Restaurant,Comfort Food Restaurant


In [81]:
# Cluster 5
tor_merged.loc[tor_merged['Cluster Labels'] == 4, tor_merged.columns[[2] + list(range(6, tor_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue
1,Church and Wellesley,Japanese Restaurant,Sushi Restaurant,Restaurant,Mediterranean Restaurant,Vietnamese Restaurant,Mexican Restaurant,American Restaurant,Caribbean Restaurant,Chinese Restaurant,Ethiopian Restaurant,Fast Food Restaurant,Indian Restaurant,Italian Restaurant,Afghan Restaurant,Theme Restaurant
3,"Ryerson,Garden District",Middle Eastern Restaurant,Japanese Restaurant,Italian Restaurant,Restaurant,Ramen Restaurant,Fast Food Restaurant,Modern European Restaurant,Ethiopian Restaurant,Chinese Restaurant,Mexican Restaurant,Vietnamese Restaurant,Seafood Restaurant,Sushi Restaurant,Thai Restaurant,American Restaurant
6,Central Bay Street,Italian Restaurant,Japanese Restaurant,Middle Eastern Restaurant,Chinese Restaurant,Thai Restaurant,Modern European Restaurant,French Restaurant,Vegetarian / Vegan Restaurant,Indian Restaurant,Korean Restaurant,Mediterranean Restaurant,Falafel Restaurant,Sushi Restaurant,American Restaurant,Ramen Restaurant
12,"Chinatown,Kensington Market,Grange Park",Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Mexican Restaurant,Dumpling Restaurant,Chinese Restaurant,Comfort Food Restaurant,Belgian Restaurant,Caribbean Restaurant,Doner Restaurant,Empanada Restaurant,Filipino Restaurant,Italian Restaurant,Japanese Restaurant,Hotpot Restaurant,Thai Restaurant
