# Week 3 - Clustering in Toronto!

# Table of Contents

<div style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Get Neighborhoods Data from Wikipedia</a>

2. <a href="#item2">Get and Merge Geocode Data</a>

3. <a href="#item3">Analyze and Cluster the Data</a>

</font>
</div>

<a id='item1'></a>
## Part 1: Get the data from Wikipedia

In [63]:
import json # library to handle JSON files
import os # read API credentials from environment variables

import folium # map rendering library
import geocoder
import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests # library to handle requests
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from IPython.display import display # make printed dataframes look nice
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.cluster import KMeans # import k-means from clustering stage

# Passing None removes pandas' display limits, so displayed DataFrames show
# every column and every row instead of being truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
# Wikipedia page listing the Canadian postal codes that start with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# read_html parses the HTML tables found at the URL into a list of DataFrames
dfs = pd.read_html(url)

In [4]:
# The first parsed table holds the postal codes. Keep only the rows whose Borough is
# assigned (no row has a Borough but lacks a neighborhood) and renumber them from 0.
# No further processing is needed: the Wikipedia data is already in the desired form.
df = dfs[0].loc[lambda table: table['Borough'] != "Not assigned"].reset_index(drop=True)

# Report the dimensions, then preview the first rows
print(df.shape)
display(df.head())

(103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Now, add the lat/long coordinates to the dataset

### After several unsuccessful attempts to geocode the postal codes programmatically, I decided to download the coordinate data as a CSV file instead
### The code from those attempts is not needed for the analysis. Continue with **"Part 2"** below

<a id='item2'></a>
## Part 2: Get and merge the geocoding data

In [5]:
# Read the downloaded coordinates and attach them to the neighborhoods by postal code
df_geo = pd.read_csv("Geospatial_Coordinates.csv")
df_merged = df.merge(df_geo, on=['Postal Code'])
df_merged

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


<a id='item3'></a>
## Part 3: Analyze and cluster the data

In [9]:
# Keep only the boroughs whose name contains the word "Toronto", renumbering the rows from 0
is_toronto = df_merged['Borough'].str.contains("Toronto")
df_tor = df_merged.loc[is_toronto].reset_index(drop=True)
df_tor

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [18]:
# Credentials are read from the environment so they are never stored in the notebook.
# Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been exposed and should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Warn right away instead of letting the first API request fail with a less obvious error
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail.')

<br>
<br>

#### Begin by seeing the neighborhoods on a map

In [10]:
# Get coordinates for Toronto
address = 'Toronto, ON, Canada'

geolocator = Nominatim(user_agent="tor_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail here with a clear
# message rather than with an AttributeError on the next line
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [15]:
# Base map centred on the Toronto coordinates found above
map_tor = folium.Map(location=[latitude, longitude], zoom_start=12)

# One blue circle per neighborhood; its popup carries the neighborhood name
marker_rows = df_tor[['Latitude', 'Longitude', 'Neighborhood']].itertuples(index=False, name=None)
for lat, lng, neighborhood in marker_rows:
    label = folium.Popup(neighborhood, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_tor)

map_tor

<br>

#### Get first neighborhood's info

In [20]:
# Coordinates and name of the neighborhood stored under row label 0 of df_tor
neighborhood_latitude = df_tor.at[0, 'Latitude']
neighborhood_longitude = df_tor.at[0, 'Longitude']
neighborhood_name = df_tor.at[0, 'Neighborhood']

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of Regent Park, Harbourfront are 43.6542599, -79.3606359.


<br>

#### Set the GET request URL, show the results

In [None]:
radius = 500 # search radius sent to the API
LIMIT = 100 # maximum number of venues requested per call
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)

# The timeout keeps the cell from hanging on an unresponsive server, and
# raise_for_status() reports HTTP errors here instead of as a KeyError below
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

# Flatten the nested venue records into one row per venue
venues1 = pd.json_normalize(results['response']['venues'])
venues1

<br>

#### Define function to get the category from the JSON

In [24]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        #print(row['categories'][0]['name'])
        return row['categories'][0]['name']
    except:
        #print("Uncategorized")
        return "Uncategorized"
        

<br>

#### Organize the data

In [46]:
# Keep only the columns of interest and replace the raw 'categories' value with
# the category name. assign() builds a new DataFrame, so no column is ever set
# on a selection of venues1 (which can trigger SettingWithCopyWarning).
filtered_columns = ['name', 'categories', 'location.lat', 'location.lng']
venues = venues1.loc[:, filtered_columns].assign(
    categories=lambda frame: frame.apply(get_category_type, axis=1)
)

venues.head()

Unnamed: 0,name,categories,location.lat,location.lng
0,Oldtown Bodega,Café,43.653966,-79.360752
1,Sackville Playground,Park,43.654656,-79.359871
2,Tandem Coffee,Coffee Shop,43.653559,-79.361809
3,Terroni Sud Forno Produzione e Spaccio,Gourmet Shop,43.653903,-79.360018
4,TTC Streetcar #503 Kingston Rd,Moving Target,43.648099,-79.382932


<br>

#### Define function to get nearby venues

In [47]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare venue search once per neighborhood and collect the results.

    names, latitudes, longitudes: parallel iterables describing the neighborhoods.
    radius: search radius forwarded to the API with every request.

    Reads the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Returns a DataFrame with one row per venue and the columns 'Neighborhood',
    'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
    'Venue Longitude' and 'Venue Category'. The frame is empty, but still has
    these columns, when no venue is found.

    Raises requests.HTTPError when the API answers with an error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string; the timeout keeps one
        # unresponsive call from stalling the whole loop
        response = requests.get(
            'https://api.foursquare.com/v2/venues/search',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        response.raise_for_status()

        # A neighborhood with no venues contributes no rows instead of raising
        for venue in response.json()['response']['venues']:
            location = venue.get('location') or {}
            records.append((name,
                            lat,
                            lng,
                            venue.get('name'),
                            location.get('lat'),
                            location.get('lng'),
                            get_category_type(venue)))

    return pd.DataFrame(records, columns=columns)

<br>

#### Run function on our neighborhoods and see the results

In [48]:
# Collect the venues around every Toronto neighborhood (one API request per neighborhood)
tor_venues = getNearbyVenues(
    names=df_tor['Neighborhood'],
    latitudes=df_tor['Latitude'],
    longitudes=df_tor['Longitude'],
)

In [54]:
# Discard the venues for which no category name could be extracted
is_categorized = tor_venues["Venue Category"].ne("Uncategorized")
tor_venues = tor_venues.loc[is_categorized].reset_index(drop=True)
print(tor_venues.shape)
tor_venues.head(20)

(3284, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Oldtown Bodega,43.653966,-79.360752,Café
1,"Regent Park, Harbourfront",43.65426,-79.360636,Sackville Playground,43.654656,-79.359871,Park
2,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
3,"Regent Park, Harbourfront",43.65426,-79.360636,Terroni Sud Forno Produzione e Spaccio,43.653903,-79.360018,Gourmet Shop
4,"Regent Park, Harbourfront",43.65426,-79.360636,TTC Streetcar #503 Kingston Rd,43.648099,-79.382932,Moving Target
5,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
6,"Regent Park, Harbourfront",43.65426,-79.360636,Cam's Auto Service,43.654195,-79.360545,Automotive Shop
7,"Regent Park, Harbourfront",43.65426,-79.360636,TTC Streetcar #504 King St,43.646151,-79.396,Moving Target
8,"Regent Park, Harbourfront",43.65426,-79.360636,Gusto 501,43.65481,-79.359595,Italian Restaurant
9,"Regent Park, Harbourfront",43.65426,-79.360636,Globe and Mail Centre,43.65152,-79.364804,Building


<br>

#### Get some information about the dataset

In [55]:
# Number of venues per neighborhood, followed by the number of distinct venue categories
venue_counts = tor_venues.groupby('Neighborhood').count()
display(venue_counts)
category_total = tor_venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(category_total))

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,84,84,84,84,84,84
"Brockton, Parkdale Village, Exhibition Place",92,92,92,92,92,92
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",79,79,79,79,79,79
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",62,62,62,62,62,62
Central Bay Street,95,95,95,95,95,95
Christie,94,94,94,94,94,94
Church and Wellesley,90,90,90,90,90,90
"Commerce Court, Victoria Hotel",93,93,93,93,93,93
Davisville,79,79,79,79,79,79
Davisville North,71,71,71,71,71,71


There are 408 uniques categories.


<br>

#### Use data for neighborhoods analysis

In [56]:
# one hot encoding: one indicator column per venue category
tor_onehot = pd.get_dummies(tor_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called "Neighborhood" would be silently
# overwritten by the neighborhood column added below, so give it a distinct name
tor_onehot = tor_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood column as the first column, followed by the category columns
fixed_columns = ['Neighborhood'] + list(tor_onehot.columns)
tor_onehot = tor_onehot.assign(Neighborhood=tor_venues['Neighborhood'])[fixed_columns]

# mean of each indicator per neighborhood = share of that neighborhood's venues in the category
tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()

<br>

#### Get top five venues for each neighborhood

In [57]:
num_top_venues = 5

# Print, for every neighborhood, its most frequent venue categories
for hood in tor_grouped['Neighborhood']:
    print("----"+hood+"----")
    # Transpose the neighborhood's single row into a (venue, freq) table
    temp = tor_grouped[tor_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # Drop the leading 'Neighborhood' row, then convert and round. astype() returns
    # a new frame, so no column is assigned on a slice of temp
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                                      venue  freq
0                                    Office  0.11
1                                  Building  0.07
2  Residential Building (Apartment / Condo)  0.04
3                                   Parking  0.04
4                                       Pub  0.02


----Brockton, Parkdale Village, Exhibition Place----
                                      venue  freq
0                                    Office  0.26
1  Residential Building (Apartment / Condo)  0.21
2                              Tech Startup  0.07
3                           Conference Room  0.05
4                                  Building  0.04


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                venue  freq
0            Building  0.05
1  Light Rail Station  0.04
2              Office  0.04
3      Medical Center  0.03
4        Antique Shop  0.03


----CN Tower, King and Spadina, Railway Lands, Harbourfront 

In [59]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first entry of `row` is skipped before ranking (the caller passes rows
    whose first entry is the neighborhood name). The result is an array of
    index labels ordered from the largest value to the smallest.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [64]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: '1st', '2nd', '3rd', then 'Nth'.
# The suffix is chosen with an explicit bounds check rather than by catching
# whatever exception the list lookup happens to raise.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = tor_grouped['Neighborhood']

# fill the ranking columns of each row with that neighborhood's most common venue categories
for ind in np.arange(tor_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(tor_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Berczy Park,Office,Building,Residential Building (Apartment / Condo),Parking,Laundry Service,Food Truck,Assisted Living,Event Space,Hotel,Pub
1,"Brockton, Parkdale Village, Exhibition Place",Office,Residential Building (Apartment / Condo),Tech Startup,Conference Room,Building,Advertising Agency,Café,Coworking Space,Convenience Store,Bar
2,"Business reply mail Processing Centre, South C...",Building,Office,Light Rail Station,Butcher,Convenience Store,Theater,Medical Center,Athletics & Sports,Rental Car Location,Fast Food Restaurant
3,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Gate,Airport Service,Moving Target,Airport Terminal,Coffee Shop,General Travel,Airport Lounge,Park,Airport,Boat or Ferry
4,Central Bay Street,Hospital,Hospital Ward,Medical Center,Coffee Shop,Office,Pharmacy,Emergency Room,Mediterranean Restaurant,Residential Building (Apartment / Condo),Japanese Restaurant
