In [1]:
import requests
from bs4 import BeautifulSoup

res = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
soup = BeautifulSoup(res.content, 'html.parser')

table_data = soup.select('table.wikitable.sortable')

for row in table_data:
    if len(row.get_text()) > 0:
        postal_codes = row.get_text().split('\n')

### Scrape postal codes in Toronto from wikipedia

Then make a list which has each elements in it.

In [2]:
postal_codes = [i for i in postal_codes if i != '']
postal_codes

['Postcode',
 'Borough',
 'Neighbourhood',
 'M1A',
 'Not assigned',
 'Not assigned',
 'M2A',
 'Not assigned',
 'Not assigned',
 'M3A',
 'North York',
 'Parkwoods',
 'M4A',
 'North York',
 'Victoria Village',
 'M5A',
 'Downtown Toronto',
 'Harbourfront',
 'M6A',
 'North York',
 'Lawrence Heights',
 'M6A',
 'North York',
 'Lawrence Manor',
 'M7A',
 'Downtown Toronto',
 "Queen's Park",
 'M8A',
 'Not assigned',
 'Not assigned',
 'M9A',
 'Etobicoke',
 'Islington Avenue',
 'M1B',
 'Scarborough',
 'Rouge',
 'M1B',
 'Scarborough',
 'Malvern',
 'M2B',
 'Not assigned',
 'Not assigned',
 'M3B',
 'North York',
 'Don Mills North',
 'M4B',
 'East York',
 'Woodbine Gardens',
 'M4B',
 'East York',
 'Parkview Hill',
 'M5B',
 'Downtown Toronto',
 'Ryerson',
 'M5B',
 'Downtown Toronto',
 'Garden District',
 'M6B',
 'North York',
 'Glencairn',
 'M7B',
 'Not assigned',
 'Not assigned',
 'M8B',
 'Not assigned',
 'Not assigned',
 'M9B',
 'Etobicoke',
 'Cloverdale',
 'M9B',
 'Etobicoke',
 'Islington',
 'M9B',

In [3]:
import numpy as np
postal_codes = np.array(postal_codes, dtype=object)

### Change postal_codes as a numpy array, then reshape it.

As each row of table has 3 components, reshape it which has 3 columns.

In [4]:
postal_codes = postal_codes.reshape(-1, 3)[1:,:]

In [5]:
postal_codes

array([['M1A', 'Not assigned', 'Not assigned'],
       ['M2A', 'Not assigned', 'Not assigned'],
       ['M3A', 'North York', 'Parkwoods'],
       ['M4A', 'North York', 'Victoria Village'],
       ['M5A', 'Downtown Toronto', 'Harbourfront'],
       ['M6A', 'North York', 'Lawrence Heights'],
       ['M6A', 'North York', 'Lawrence Manor'],
       ['M7A', 'Downtown Toronto', "Queen's Park"],
       ['M8A', 'Not assigned', 'Not assigned'],
       ['M9A', 'Etobicoke', 'Islington Avenue'],
       ['M1B', 'Scarborough', 'Rouge'],
       ['M1B', 'Scarborough', 'Malvern'],
       ['M2B', 'Not assigned', 'Not assigned'],
       ['M3B', 'North York', 'Don Mills North'],
       ['M4B', 'East York', 'Woodbine Gardens'],
       ['M4B', 'East York', 'Parkview Hill'],
       ['M5B', 'Downtown Toronto', 'Ryerson'],
       ['M5B', 'Downtown Toronto', 'Garden District'],
       ['M6B', 'North York', 'Glencairn'],
       ['M7B', 'Not assigned', 'Not assigned'],
       ['M8B', 'Not assigned', 'Not assigned'

### Delete the row which has unassigned borough.

In [6]:
assigned_postal_codes = np.array([])

for postal_code in postal_codes:
    if postal_code[1] != 'Not assigned':
        assigned_postal_codes = np.append(assigned_postal_codes.reshape(-1, 3), postal_code.reshape(-1, 3), axis=0)
        
assigned_postal_codes

array([['M3A', 'North York', 'Parkwoods'],
       ['M4A', 'North York', 'Victoria Village'],
       ['M5A', 'Downtown Toronto', 'Harbourfront'],
       ['M6A', 'North York', 'Lawrence Heights'],
       ['M6A', 'North York', 'Lawrence Manor'],
       ['M7A', 'Downtown Toronto', "Queen's Park"],
       ['M9A', 'Etobicoke', 'Islington Avenue'],
       ['M1B', 'Scarborough', 'Rouge'],
       ['M1B', 'Scarborough', 'Malvern'],
       ['M3B', 'North York', 'Don Mills North'],
       ['M4B', 'East York', 'Woodbine Gardens'],
       ['M4B', 'East York', 'Parkview Hill'],
       ['M5B', 'Downtown Toronto', 'Ryerson'],
       ['M5B', 'Downtown Toronto', 'Garden District'],
       ['M6B', 'North York', 'Glencairn'],
       ['M9B', 'Etobicoke', 'Cloverdale'],
       ['M9B', 'Etobicoke', 'Islington'],
       ['M9B', 'Etobicoke', 'Martin Grove'],
       ['M9B', 'Etobicoke', 'Princess Gardens'],
       ['M9B', 'Etobicoke', 'West Deane Park'],
       ['M1C', 'Scarborough', 'Highland Creek'],
       ['

### We got a numpy array which has information of postal codes!

Now let's combine the rows which have the same postal code. Then the neighborhoods will be separated with a comma.

**For example**, shown as below, in the row of postal code 'M6A'

'Lawrence Heights',  'Lawrence Manor' => 'Lawrence Heights, Lawrence Manor'

In [7]:
unique_postal_codes = assigned_postal_codes[0]

for postal_code in assigned_postal_codes[1:]:
    
    if postal_code[0] not in unique_postal_codes:
        
        unique_postal_codes = np.append(unique_postal_codes.reshape(-1, 3), postal_code.reshape(-1, 3), axis=0)
        
    else:
        for unique_postal_code in unique_postal_codes:
            if postal_code[0] == unique_postal_code[0]:
                unique_postal_code[2] = unique_postal_code[2] + ', ' + postal_code[2]

unique_postal_codes

array([['M3A', 'North York', 'Parkwoods'],
       ['M4A', 'North York', 'Victoria Village'],
       ['M5A', 'Downtown Toronto', 'Harbourfront'],
       ['M6A', 'North York', 'Lawrence Heights, Lawrence Manor'],
       ['M7A', 'Downtown Toronto', "Queen's Park"],
       ['M9A', 'Etobicoke', 'Islington Avenue'],
       ['M1B', 'Scarborough', 'Rouge, Malvern'],
       ['M3B', 'North York', 'Don Mills North'],
       ['M4B', 'East York', 'Woodbine Gardens, Parkview Hill'],
       ['M5B', 'Downtown Toronto', 'Ryerson, Garden District'],
       ['M6B', 'North York', 'Glencairn'],
       ['M9B', 'Etobicoke',
        'Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park'],
       ['M1C', 'Scarborough', 'Highland Creek, Rouge Hill, Port Union'],
       ['M3C', 'North York', 'Flemingdon Park, Don Mills South'],
       ['M4C', 'East York', 'Woodbine Heights'],
       ['M5C', 'Downtown Toronto', 'St. James Town'],
       ['M6C', 'York', 'Humewood-Cedarvale'],
       ['M9C', 'Etob

### Now, let's make a dataframe of postal code!

In [8]:
import pandas as pd
df_postal_code = pd.DataFrame(unique_postal_codes, columns=['PostalCode', 'Borough', 'Neighborhood'])
df_postal_code

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."


### See the size of dataframe

In [9]:
df_postal_code.shape

(103, 3)

### Import geospatial dataframe by using read_csv function of pandas

In [10]:
geospatial_data = pd.read_csv('Geospatial_Coordinates.csv')
geospatial_data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### Rename the postal code column of Geospatia_data as the same name of df_postal_code

In [11]:
geospatial_data.rename({'Postal Code':'PostalCode'}, axis='columns', inplace=True)

### Merge two dataframes by the key of 'PostalCode'

We can use merge function of pandas as we have common values in PostalCode column

In [12]:
df_postal_code_latlon = pd.merge(df_postal_code, geospatial_data, on='PostalCode')
df_postal_code_latlon

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business Reply Mail Processing Centre 969 Eastern,43.662744,-79.321558
101,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509


In [13]:
from pandas.io.json import json_normalize

In [14]:
CLIENT_ID = 'UUPY1FRBDNAYNQEEKEKGBQLTT320RYEMQKGUQZ1VYD2DZYCT'
CLIENT_SECRET = 'EMR1AYU0H0ZUUVUFXHHBNMTYTJAXTZVDSYZXNATJFVMSRXBJ'
VERSION = '20180323'
LIMIT = 30
print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

My credentails:
CLIENT_ID: UUPY1FRBDNAYNQEEKEKGBQLTT320RYEMQKGUQZ1VYD2DZYCT
CLIENT_SECRET:EMR1AYU0H0ZUUVUFXHHBNMTYTJAXTZVDSYZXNATJFVMSRXBJ


In [15]:
borough_latitude = df_postal_code_latlon.loc[0, 'Latitude']
borough_longitude = df_postal_code_latlon.loc[0, 'Longitude']

borough_name = df_postal_code_latlon.loc[0, 'Borough']

print('Latitude and longitude values of {} are {}, {}.'.format(borough_name,
                                                               borough_latitude,
                                                               borough_longitude))

Latitude and longitude values of North York are 43.7532586, -79.3296565.


### Now, let's get the top 100 venues that are in North York within a radius of 500 meters

First, let's create the GET request URL.

In [16]:
LIMIT = 100
RADIUS = 500

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID,
                                                                                                                           CLIENT_SECRET,
                                                                                                                           VERSION, 
                                                                                                                           borough_latitude,
                                                                                                                           borough_longitude,
                                                                                                                           RADIUS,
                                                                                                                           LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=UUPY1FRBDNAYNQEEKEKGBQLTT320RYEMQKGUQZ1VYD2DZYCT&client_secret=EMR1AYU0H0ZUUVUFXHHBNMTYTJAXTZVDSYZXNATJFVMSRXBJ&v=20180323&ll=43.7532586,-79.3296565&radius=500&limit=100'

Send the GET request and examine the results

In [17]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e5aecfddf2774001b8e62a2'},
  'headerLocation': 'Parkwoods - Donalda',
  'headerFullLocation': 'Parkwoods - Donalda, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.757758604500005,
    'lng': -79.32343823984928},
   'sw': {'lat': 43.7487585955, 'lng': -79.33587476015072}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4e8d9dcdd5fbbbb6b3003c7b',
       'name': 'Brookbanks Park',
       'contact': {},
       'location': {'address': 'Toronto',
        'lat': 43.751976046055574,
        'lng': -79.33214044722958,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.751976046055574,
          'lng': -79.33214044722958}],
        'distance': 245,
        

From the Foursquare lab, let's borrow **get_category_type** function.

In [18]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Let's make cleaned dataframe as we did in the last lab.

In [19]:
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues)

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Brookbanks Park,Park,43.751976,-79.33214
1,Careful & Reliable Painting,Construction & Landscaping,43.752622,-79.331957
2,Variety Store,Food & Drink Shop,43.751974,-79.333114


In borough North York, How many venues were returned by Foursquare?

In [20]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

3 venues were returned by Foursquare.


## Explore boroughs in Toronto

#### Let's create a function to repeat the same process to all the boroughs in Toronto.

I will borrow the fucntion from the lab we did.

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name,
            lat,
            lng,
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Borough Latitude', 
                  'Borough Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
toronto_venues = getNearbyVenues(names=df_postal_code_latlon['Borough'],
                                 latitudes=df_postal_code_latlon['Latitude'],
                                 longitudes=df_postal_code_latlon['Longitude'])

North York
North York
Downtown Toronto
North York
Downtown Toronto
Etobicoke
Scarborough
North York
East York
Downtown Toronto
North York
Etobicoke
Scarborough
North York
East York
Downtown Toronto
York
Etobicoke
Scarborough
East Toronto
Downtown Toronto
York
Scarborough
East York
Downtown Toronto
Downtown Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East York
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
West Toronto
Scarborough
North York
North York
East Toronto
Downtown Toronto
North York
North York
Scarborough
North York
North York
East Toronto
North York
York
North York
Scarborough
North York
North York
Central Toronto
Central Toronto
York
York
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Etobicoke
Scarborough
North York
Central Toronto
Central Toronto
West Toronto
Mississauga
Etobicoke
Scarborough
Central Toronto
Downtown Toronto
West Toron

Now we saved all data of nearby venues of each borough in a dataframe.

Let's take a look at it.

In [23]:
print(toronto_venues.shape)
toronto_venues.head()

(2240, 7)


Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,North York,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,North York,43.753259,-79.329656,Careful & Reliable Painting,43.752622,-79.331957,Construction & Landscaping
2,North York,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,North York,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,North York,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


Let's find out how many unique categories can be curated from all the returned venues.

In [24]:
print('There are {} unique categories!'.format(len(toronto_venues['Venue Category'].unique())))

There are 270 unique categories!


## Analyze Each Borough

Now we will encode the dataframe by using **get_dummies** function from **pandas**.

In [25]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add borough column back to dataframe
toronto_onehot['Borough'] = toronto_venues['Borough']

# move borough column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

print("Shape of the dataframe : ", toronto_onehot.shape)
toronto_onehot.head()

Shape of the dataframe :  (2240, 271)


Unnamed: 0,Borough,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,North York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Next, we will group rows by borough and by taking the mean of the frequency of occurrence of each category

In [26]:
toronto_grouped = toronto_onehot.groupby('Borough').mean().reset_index()
print('The size of the dataframe: ', toronto_grouped.shape)
toronto_grouped

The size of the dataframe:  (10, 271)


Unnamed: 0,Borough,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017857,...,0.0,0.008929,0.0,0.0,0.008929,0.0,0.0,0.0,0.0,0.008929
1,Downtown Toronto,0.0,0.000764,0.000764,0.000764,0.000764,0.001528,0.002292,0.001528,0.015279,...,0.002292,0.012223,0.001528,0.0,0.00382,0.0,0.006875,0.001528,0.000764,0.00382
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.024,...,0.0,0.0,0.0,0.0,0.0,0.0,0.008,0.0,0.0,0.016
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.013158,0.0,0.013158,0.0,0.0,0.0,0.013158
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013158,0.0,0.0
5,Mississauga,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076923,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,North York,0.003968,0.0,0.003968,0.0,0.0,0.0,0.0,0.0,0.007937,...,0.0,0.0,0.003968,0.003968,0.007937,0.0,0.0,0.003968,0.011905,0.0
7,Scarborough,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011236,...,0.0,0.0,0.0,0.0,0.011236,0.0,0.0,0.0,0.0,0.0
8,West Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.011696,0.0,0.0,0.011696,0.0,0.011696,0.0,0.0,0.011696
9,York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0


### Let's print each borough and top 5 common venues

In [27]:
num_top_venues = 5

for borough in toronto_grouped['Borough']:
    print("-----" + borough + "-----")
    temp = toronto_grouped[toronto_grouped['Borough'] == borough].T.reset_index()
    temp.columns = ['Venue', 'Frequency']
    temp = temp.iloc[1:]
    temp['Frequency'] = temp['Frequency'].astype(float)
    temp = temp.round({'Frequency':2})
    print(temp.sort_values('Frequency', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

-----Central Toronto-----
            Venue  Frequency
0     Coffee Shop       0.07
1  Sandwich Place       0.06
2            Café       0.05
3      Restaurant       0.04
4     Pizza Place       0.04


-----Downtown Toronto-----
                 Venue  Frequency
0          Coffee Shop       0.10
1                 Café       0.05
2           Restaurant       0.04
3               Bakery       0.03
4  Japanese Restaurant       0.03


-----East Toronto-----
                Venue  Frequency
0    Greek Restaurant       0.07
1         Coffee Shop       0.06
2  Italian Restaurant       0.05
3                Café       0.04
4             Brewery       0.04


-----East York-----
          Venue  Frequency
0          Park       0.05
1   Coffee Shop       0.05
2  Burger Joint       0.04
3   Pizza Place       0.04
4          Bank       0.04


-----Etobicoke-----
            Venue  Frequency
0     Pizza Place       0.11
1     Coffee Shop       0.07
2  Sandwich Place       0.07
3            Café     

### Put that in pandas dataframe

Make the function which can sort the venues in descending order.

In [28]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Then let's create the new dataframe and display the top 10 venues for each borough.

In [29]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Borough']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{} Most Common Venue'.format(ind+1))

# create a new dataframe
boroughs_venues_sorted = pd.DataFrame(columns=columns)
boroughs_venues_sorted['Borough'] = toronto_grouped['Borough']

for ind in np.arange(toronto_grouped.shape[0]):
    boroughs_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

boroughs_venues_sorted

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4 Most Common Venue,5 Most Common Venue,6 Most Common Venue,7 Most Common Venue,8 Most Common Venue,9 Most Common Venue,10 Most Common Venue
0,Central Toronto,Coffee Shop,Sandwich Place,Café,Pizza Place,Park,Clothing Store,Sushi Restaurant,Dessert Shop,Gym,Restaurant
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Bakery,Japanese Restaurant,Hotel,Italian Restaurant,Bar,Park,Seafood Restaurant
2,East Toronto,Greek Restaurant,Coffee Shop,Italian Restaurant,Café,Brewery,Ice Cream Shop,Park,Bakery,American Restaurant,Pub
3,East York,Coffee Shop,Park,Bank,Pharmacy,Pizza Place,Sporting Goods Shop,Burger Joint,Supermarket,Pet Store,Liquor Store
4,Etobicoke,Pizza Place,Coffee Shop,Sandwich Place,Grocery Store,Café,Fast Food Restaurant,Pharmacy,Gym,Beer Store,Discount Store
5,Mississauga,Coffee Shop,Hotel,Intersection,Gym,Middle Eastern Restaurant,Mediterranean Restaurant,American Restaurant,Sandwich Place,Burrito Place,Fried Chicken Joint
6,North York,Coffee Shop,Clothing Store,Japanese Restaurant,Pizza Place,Sandwich Place,Park,Restaurant,Grocery Store,Fast Food Restaurant,Bank
7,Scarborough,Breakfast Spot,Bakery,Chinese Restaurant,Fast Food Restaurant,Coffee Shop,Pizza Place,Pharmacy,Indian Restaurant,Bus Station,Discount Store
8,West Toronto,Bar,Café,Coffee Shop,Bakery,Restaurant,Italian Restaurant,Breakfast Spot,Grocery Store,Pizza Place,Mexican Restaurant
9,York,Park,Caribbean Restaurant,Field,Dog Run,Bus Line,Hockey Arena,Sandwich Place,Bar,Trail,Convenience Store


## Cluster Neighborhoods


Run k-means to cluster the boroughs.

In [30]:
from sklearn.cluster import KMeans

k = 5

toronto_grouped_clustering = toronto_grouped.drop('Borough', axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_grouped_clustering)

kmeans.labels_[0:10]

array([2, 2, 2, 0, 4, 3, 2, 0, 2, 1])

Let's create a new dataframe that includes as well as the top 10 common venues for each borough.

In [31]:
# add clustering labels
boroughs_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_postal_code_latlon

toronto_merged = toronto_merged.join(boroughs_venues_sorted.set_index('Borough'), on='Borough')

toronto_merged.head() # check the dataframe

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4 Most Common Venue,5 Most Common Venue,6 Most Common Venue,7 Most Common Venue,8 Most Common Venue,9 Most Common Venue,10 Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,Coffee Shop,Clothing Store,Japanese Restaurant,Pizza Place,Sandwich Place,Park,Restaurant,Grocery Store,Fast Food Restaurant,Bank
1,M4A,North York,Victoria Village,43.725882,-79.315572,2,Coffee Shop,Clothing Store,Japanese Restaurant,Pizza Place,Sandwich Place,Park,Restaurant,Grocery Store,Fast Food Restaurant,Bank
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,2,Coffee Shop,Café,Restaurant,Bakery,Japanese Restaurant,Hotel,Italian Restaurant,Bar,Park,Seafood Restaurant
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,2,Coffee Shop,Clothing Store,Japanese Restaurant,Pizza Place,Sandwich Place,Park,Restaurant,Grocery Store,Fast Food Restaurant,Bank
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,2,Coffee Shop,Café,Restaurant,Bakery,Japanese Restaurant,Hotel,Italian Restaurant,Bar,Park,Seafood Restaurant


## Visualization

In [32]:
#!conda install -c conda-forge folium=0.5.0 --yes

In [33]:
import folium

toronto_latitude = 43.651070
toronto_longitude = -79.347015

# creat map
map_clusters = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=10, tiles='Stamen Toner')

# # set color scheme for the clusters
# x = np.arange(kclusters)
# ys = [i + x + (i*x)**2 for i in range(kclusters)]
# colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
# rainbow = [colors.rgb2hex(i) for i in colors_array]

# # add markers to the map
# markers_colors = []
# for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Borough'], toronto_merged['Cluster Labels']):
#     label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
#     folium.CircleMarker(
#         [lat, lon],
#         radius=5,
#         popup=label,
#         color=rainbow[cluster-1],
#         fill=True,
#         fill_color=rainbow[cluster-1],
#         fill_opacity=0.7).add_to(map_clusters)

map_clusters