# Segmenting and Clustering Neighborhoods in Toronto

###### Import libraries 

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import requests 
import bs4
import folium
import json
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import time

###### Get Data from Wiki

In [6]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
if response.status_code == 200: 
    print ("Get Successful!")
else:
    print ("Get Failed! Check URL")

Get Successful!


###### Payload Received From Wiki

In [7]:
# print(response.text) # Commented out to avoid filling the notebook with the response text

###### Parse the response text using BeautifulSoap

In [8]:
resp_soap = bs4.BeautifulSoup(response.text, 'html.parser')
resp_soap.title

<title>List of postal codes of Canada: M - Wikipedia</title>

### Assumptions
 * Postal Code & Borough combination is expected have same value for all the nieighbourhoods with those combinations.
 * **_check_\__for_\__discrepancy_** is used to shout if there are any discrepancies found while processing the rows. 

In [9]:
def check_for_discrepancy(pos_dict,row_list):
    
    if row_list[0] in pos_dict and pos_dict[row_list[0]][0] != row_list[1]:
        print ("Warning! Existing Postal Codes and Borough combination is differeing from the new row")
        print (f"Existing Postal Code {row_list[0]} Borough {pos_dict[row_list[0]][0]}")
        print (f"New Postal Code {row_list[0]} Borough {row_list[1]}")

### Approach
* Very first table from the Response Text will be parsed to get all the Postal Codes
* **_get_\__list_\__of_\__postal_\__codes_** will parse all the rows.
* **_get_\__row_\__list_** will parse the columns.
* Postal Code Dictionary (_with postal code as a Key_) is maintained to store all the parsed postal codes. 
* If Borough i.e. Column #2 is "Not Assigned", it will skip adding the rows to the Postal Code Dictionary 
* If Nieighbourhood i.e. Column #3 is "Not Assigned" but if the Borough is valid, it will assign the Borough as its neighbourhood before adding the row to the Dictionary. 
* List is maintained to keep track of the Neighbourhoods that are having same postal code
* Another List is maintained to keep track of the rows that are having valid Borough but "Not assigned" Neighbourhood

In [10]:
def add_to_dictionary(pos_dict,row_list,more_nbh):
    
    if (row_list[0]) in pos_dict:
        more_nbh.append(row_list[0])
        pos_dict[row_list[0]][1] = pos_dict[row_list[0]][1] + ", " + row_list[2]
    else:
        pos_dict[row_list[0]] = [row_list[1],row_list[2]]

In [11]:
def get_row_list(row):
    
    row_list = []    
    for column in row:
        
        if isinstance(column,bs4.element.Tag): # Avoid processing NavigableString - Mostly newline char            
            row_list.append(column.get_text(strip=True))
            
    return row_list

In [12]:
def get_list_of_postal_codes(soap_msg_table):
    
    pos_dict = {}
    more_nbh = []
    na_nbh = []
    for row in soap_msg_table:
        
        if isinstance(row,bs4.element.Tag): # Avoid processing NavigableString - Mostly newline char
            row_list = get_row_list(row)
            
            if row_list[1].replace(' ','').lower() != 'notassigned' and row_list[2].replace(' ','').lower() == 'notassigned':
                na_nbh.append(row_list[0])
                row_list[2] = row_list[1]
                add_to_dictionary(pos_dict,row_list,more_nbh)
                
            elif row_list[1].replace(' ','').lower() != 'notassigned':                
                add_to_dictionary(pos_dict,row_list,more_nbh)
            
            # Check for discrepancy
            check_for_discrepancy(pos_dict,row_list)
            
    return pos_dict,more_nbh,na_nbh

###### Extracting & Parsing the very first table from the response text

In [13]:
resp_table = resp_soap.table
postal_code_list, more_nbh, na_nbh = get_list_of_postal_codes(resp_table.tbody)

###### Creating a DataFrame from the Parsed Dictionary. 
* _orient_ is used to parse the keys as Index
* Removing the first row as it is a header from the table
* Resetting the Index to get the DataFrame in an expected way. 
* Adding the expected column names to the DataFrame 

In [14]:
pos_df = pd.DataFrame.from_dict(postal_code_list,orient='index')
pos_df = pos_df[1:]
pos_df = pos_df.reset_index()
pos_df.columns = ['PostalCode','Borough','Neighborhood']

###### Postal Code with more than one neightbourhood - Eg : M5A is used

In [15]:
pos_df.loc[pos_df['PostalCode'] == 'M5A']['Neighborhood'].values

array(['Harbourfront, Regent Park'], dtype=object)

###### All Postal Codes that are having more than one neightbourhood

In [16]:
pos_df.loc[pos_df['PostalCode'].isin(set(more_nbh))]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M1B,Scarborough,"Rouge, Malvern"
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"
11,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
12,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
13,M3C,North York,"Flemingdon Park, Don Mills South"
17,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"


###### Postal Codes that were having "Not assigned" neighbourhood

In [17]:
pos_df.loc[pos_df['PostalCode'].isin(set(na_nbh))]

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M7A,Queen's Park,Queen's Park


###### Resultant DataFrame - Only first 10 rows are displayed

In [18]:
pos_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


###### Shape of the Resultant Dataframe

In [19]:
print(f"{pos_df.shape}")

(103, 3)


## Extracting & Merging Geospatial Data

<font color=red>**Note:**</font> I don't seem to be recieveing data that are consistent with the Geospatial Data Provided in the Assignment Section. So, i will be using the data file provided in the Assignment Section

In [20]:
!wget -q http://cocl.us/Geospatial_data

###### Read the data file into a DataFrame & Rename the columns to match the existing DataFrame

In [21]:
geo_data = pd.read_csv('Geospatial_data',header=0)
geo_data.columns = ['PostalCode','Latitude','Longitude']

In [22]:
geo_data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


###### Make sure all the postal codes are having the corresponding Latitude & Longitude values from Geospatial_data

In [23]:
pos_df.loc[~pos_df['PostalCode'].isin(geo_data['PostalCode'])].size

0

###### Merge the DataFrames based on Postal Codes

In [24]:
mgd_pos_df = pos_df.merge(right=geo_data,on='PostalCode')
mgd_pos_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


###### Shape of the Merged Dataframe

In [25]:
print(f"{mgd_pos_df.shape}")

(103, 5)


## Clustering the neighbourhoods 

In [26]:
print (f"There are {mgd_pos_df['Borough'].unique().size} unique Borough's from the merged DataFrame")

There are 11 unique Borough's from the merged DataFrame


### Approach
* **Only Borough's that contains the word "Toronto" will be analyzed below**. 
* https://nominatim.openstreetmap.org/ will be queried directly to get the Geocodes 
* foursquare api will be used to analyze the neighbourhoods. 

###### Get Geo codes for a specific location. 

In [27]:
def get_geo_codes(area):
    longitude = ""
    latitude = ""
    try:
        # Replace the space with +
        area_str = area.replace(" ","+")
        
        # Get the Geolocation for given area
        response = requests.get('https://nominatim.openstreetmap.org/search?q='+area_str+'&format=geojson&limit=1')
        if response.status_code == 200:
            print ("Get Successful")
            
            # Extract the Latitude & Longitude from the response text
            longitude,latitude = json.loads(response.text)['features'][0]['geometry']['coordinates']
            print(f"latitude & longitude values of {area} are {latitude},{longitude}")
        else:
            print ("Get Failed!")
    except:
        pass
    
    return latitude,longitude

###### Extract Latitude & Longitude for Toronto,ON

In [28]:
latitude,longitude = get_geo_codes('Toronto ON')

Get Successful
latitude & longitude values of Toronto ON are 43.653963,-79.387207


#### Create a map of Toronto with neighborhoods superimposed on top.

In [29]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(mgd_pos_df['Latitude'], mgd_pos_df['Longitude'], mgd_pos_df['Borough'], mgd_pos_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='green').add_to(map_toronto)  
    
map_toronto

###### Create a dataframe with Boroughs that has a word "Toronto" Note : Only this dataframe toronto_df will be analyzed futher.

In [83]:
toronto_df = mgd_pos_df
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [85]:
toronto_df.shape

(103, 5)

###### Set the Credentials for Foursquare API 

In [31]:
CLIENT_ID = 'KGU5MEU4ZHYXQPJT2Y1B2DNRFFTV1GSD4Y5KXV0WMSFFDLQO' # your Foursquare ID
CLIENT_SECRET = 'S30A30GKJ1SITKL3YOTXQAPTCJBYQGUMONMF4H4CNVJTAVOT' # your Foursquare Secret
VERSION = '20190531' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: KGU5MEU4ZHYXQPJT2Y1B2DNRFFTV1GSD4Y5KXV0WMSFFDLQO
CLIENT_SECRET:S30A30GKJ1SITKL3YOTXQAPTCJBYQGUMONMF4H4CNVJTAVOT


###### Get Nearby Venues from each Neighbourhood
* radius is set to 2000 to have enough datapoints for each neighbourhood
* LIMIT is set to 100 for all the neighbourhoods. However, there may be some cases with less number of records recived from the API

In [86]:
def getVenueLikes(id):
    
    url = 'https://api.foursquare.com/v2/venues/{}/likes?&client_id={}&client_secret={}&v={}'.format(
            id,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION)
    
    # make the GET request
    results = requests.get(url).json()["response"]['likes']

    return results['count']
    

In [87]:
getVenueLikes('4bd6e35d637ba5933ad3f870')

62

In [88]:
def getNearbyVenues(names, latitudes, longitudes, radius=10000):
    
    LIMIT = 500
    # Athletics & Sports {4f4528bc4b90abdf24c9de85}, Pool {4bf58dd8d48988d15e941735},
    
    CATEGORY = '4f4528bc4b90abdf24c9de85,4bf58dd8d48988d15e941735'
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT,
            CATEGORY)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng,
            v['venue']['id'],
            v['venue']['name'],
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['location']['distance'], 
            v['venue']['categories'][0]['name']) for v in results])
        
        #print (venues_list)
        #for v in venues_list:
        #    print ("v",v)
        #    for v1 in v:
        #        print ("v1", v1)
       #         count,summary = getVenueLikes(v1[3])
       #        v.append(count)
        #        v.append(summary)
            
        #break
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude',
                  'Venue Id',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Venue Distance', 
                  'Venue Category']
    
    return(nearby_venues)

###### Get Venues from each Neighbourhood

In [165]:
#toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
#                                   latitudes=toronto_df['Latitude'],
#                                   longitudes=toronto_df['Longitude']
#                                  )
#csv_export = toronto_venues.to_csv (r'toronto_dataframe.csv', index = None, header=True)
toronto_venues = pd.read_csv('toronto_dataframe.csv',header=0)
toronto_venues.head()

In [90]:
pd.DataFrame(toronto_venues.groupby('Venue Category').count()['Venue'])

Unnamed: 0_level_0,Venue
Venue Category,Unnamed: 1_level_1
Arcade,24
Athletics & Sports,864
Badminton Court,58
Baseball Field,212
Basketball Court,54
Boxing Gym,1
Building,57
Climbing Gym,125
College Gym,168
College Rec Center,44


In [109]:
toronto_venues['Venue Likes'].unique()

array([ 28,  11,  34,  15,  51,   7,  29,  43,  12,   3,  10,  33,   8,
        39,  38,  21,  14,  62,  54,  18,  25,   1,   6,  32,  19,   5,
         0,  16,  31,   2,   4,  17,  27,  22,  30,  45,  20, 126,  78,
        64,  60,   9,  23,  36, 672, 181,  57,  53,  41,  55,  86,  70,
       148,  26,  76,  42,  13,  47,  24,  73,  37,  68])

In [115]:
toronto_venues.sort_values(by=['Venue Category'],ascending=False)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue Likes,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category
6891,Westmount,43.696319,-79.532242,4cbb1b07035d236a7420de4e,20,Octopus Garden Yoga Centre,43.653438,-79.427108,9717,Yoga Studio
7320,"The Annex, North Midtown, Yorkville",43.672710,-79.405678,4cbb1b07035d236a7420de4e,20,Octopus Garden Yoga Centre,43.653438,-79.427108,2753,Yoga Studio
2353,Central Bay Street,43.657952,-79.387383,50bf6e53d86ce1965377956b,18,Y Yoga,43.638534,-79.389158,2166,Yoga Studio
7426,"Parkdale, Roncesvalles",43.648960,-79.456325,4efa4c786c25c411edcaf286,12,Yoga Tree Midtown,43.707642,-79.397472,8069,Yoga Studio
7423,"Parkdale, Roncesvalles",43.648960,-79.456325,4aeb5989f964a5203ec121e3,14,The Yoga Sanctuary,43.661499,-79.383636,6018,Yoga Studio
7419,"Parkdale, Roncesvalles",43.648960,-79.456325,4cbb1b07035d236a7420de4e,20,Octopus Garden Yoga Centre,43.653438,-79.427108,2405,Yoga Studio
7417,"Parkdale, Roncesvalles",43.648960,-79.456325,50bf6e53d86ce1965377956b,18,Y Yoga,43.638534,-79.389158,5533,Yoga Studio
7416,"Parkdale, Roncesvalles",43.648960,-79.456325,4e38175dd4c0dc7ad2ec326a,11,Power Yoga Canada Etobicoke,43.636592,-79.520312,5335,Yoga Studio
7412,"Parkdale, Roncesvalles",43.648960,-79.456325,4d90ca3cc1b1721e93666346,26,Yoga Tree Downtown,43.647902,-79.396086,4853,Yoga Studio
3322,"Northwood Park, York University",43.767980,-79.487262,4b68b3dcf964a52009882be3,10,Moksha Yoga North York,43.763250,-79.405468,6596,Yoga Studio


In [174]:
def identify_the_closest_nh(df):
    return df.sort_values(by=['Venue Distance'],ascending=True).iloc[0]

In [183]:
def venues_eliminate_duplicates(df):
    venue_list = df['Venue'].unique()
    print(venue_list.size)
    unique_venues = pd.DataFrame(columns=['Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue Id','Venue','Venue Latitude','Venue Longitude','Venue Distance','Venue Category'])
    for venue in venue_list:
        print (venue)
        closest = identify_the_closest_nh(df.loc[df['Venue'] == venue])
        unique_venues = unique_venues.append(closest)
    return unique_venues

In [184]:
unique_venues = venues_eliminate_duplicates(toronto_venues)

611
Monarch Park Stadium
Toronto Lawn Tennis Club
GoodLife Fitness North York Steeles and Woodbine
Toronto Cricket Skating and Curling Club
LA Fitness
Douglas Snow Aquatic Centre
North Beach Indoor Volleyball Academy
Granite Club
East York Gymnastics
GoodLife Fitness North York York Mills
GoodLife Fitness North York Sheppard East
Fitness Connection
Planet Fitness North Scarborough
GoodLife Fitness Toronto Mount Pleasant and Davisville
Barreworks
GoodLife Fitness North York Victoria Terrace
Rock Oasis
GoodLife Fitness Scarborough Cedarbrae Mall
GoodLife Fitness Toronto Dunfield
Mayfair Clubs
GoodLife Fitness North York Madison Centre
World Gym
Goodlife Fitness
GoodLife Fitness Scarborough Markington
GoodLife Fitness Scarborough Town Centre Women's Only
Crossfit Metric
E. T. Seton PARK Archery Range
Variety Village
TKTO - Toronto Knife Throwing Organization
Dennis R. Timbrell Resource Centre
Manulife Financial
GoodLife Fitness Toronto Yonge and St Clair
GoodLife Fitness North York Willow

Pool at 88 erskine ave
The Soccer Centre
Glendora Tennis Courts
Northern Karate
Don Valley Golf Course
Uplands Ski Centre
The National Golf Club of Canada
Yoga Tree Vaughan
The Sports Village
Chesswood Arena
Tribe Fitness
Elle Fitness and Social
Spinco
15 Iceboat Terrace Fitness Centre
Moksha Yoga Danforth
barre3 Toronto - Leslieville
Garment Factory
Toronto Yoga Mamas
Leaside Memorial Gardens
Leaside Curling Club
Ashbridges Bay Skatepark
Corktown District Lofts Gym
1 Cole Gym
Gym @ Jazz
Bay Club Gym
Myodetox Performance
Deck 27 Pool & Fitness Facility
Board of Trade Country Club
Canada's Wonderland Water Park (Splashworks)
Malton Community Centre
Emery Village Hockey Training Rinks
Jim Park Goalie School (at Westwood Arenas)
EnergyFitBox
The Archery Game
Dufferin Clark Community Centre
Airborne Trampoline Club
Real Training Club
Elite Spa
Gyro's Gymnastics
Harmony Fitness
Metro Place - swimming pool
CONTACT KICKS MARTIAL ARTS
Riverstone Golf & Country Club
Hot Yoga Wellness Internatio

In [187]:
unique_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category
4570,"The Beaches West, India Bazaar",43.668999,-79.315572,50a01c82e4b0a0263fcfdcb9,Monarch Park Stadium,43.678144,-79.324038,1225,Athletics & Sports
8910,Rosedale,43.679563,-79.377529,4ba5156cf964a520b6da38e3,Toronto Lawn Tennis Club,43.680667,-79.388559,896,Athletics & Sports
2570,Hillcrest Village,43.803762,-79.363452,4bb4ccb486a8d13a7ce1de7f,GoodLife Fitness North York Steeles and Woodbine,43.813541,-79.344064,1900,Gym
5356,"Bedford Park, Lawrence Manor East",43.733283,-79.419750,4b0eb057f964a520b75923e3,Toronto Cricket Skating and Curling Club,43.739660,-79.418732,714,Sports Club
676,Don Mills North,43.745906,-79.352188,4c18e819d4d9c9284e19f029,LA Fitness,43.747665,-79.347077,455,Gym / Fitness Center
5755,Willowdale South,43.770120,-79.408493,4b48c217f964a520b25526e3,Douglas Snow Aquatic Centre,43.767317,-79.414982,607,Pool
104,Victoria Village,43.725882,-79.315572,4b4fd4a9f964a520fd1627e3,North Beach Indoor Volleyball Academy,43.737191,-79.323714,1419,Gym / Fitness Center
5965,Lawrence Park,43.728020,-79.388790,4b22e0a0f964a520544f24e3,Granite Club,43.733043,-79.381986,782,Gym / Fitness Center
780,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,4b7aa47ef964a520fa342fe3,East York Gymnastics,43.710654,-79.309279,476,Gym / Fitness Center
6461,York Mills West,43.752758,-79.400049,56d484efcd10b98f5098c328,GoodLife Fitness North York York Mills,43.744604,-79.406346,1039,Gym


In [257]:
def get_attr_value(resp_dict,attribute):
    
    if attribute in resp_dict:
        return resp_dict[attribute]
    
    return ''

In [203]:
def has_attr(resp_dict,attribute):
    
    return attribute in resp_dict

In [202]:
def get_reasons_items(reasons_items):
    
    risummary = get_attr_value(reasons_items,'summary')
    ritype = get_attr_value(reasons_items,'type')
    rireasonName = get_attr_value(reasons_items,'reasonName')
    
    return (risummary,ritype,rireasonName)

In [446]:
def get_reasons(curr_resp_dict,reasons):
    
    curr_resp_dict['reasons_count'] = get_attr_value(reasons,'count')
    
    items = ()
    index = 1
    for item in reasons['items']:
        items = get_reasons_items(item)
        curr_resp_dict[f'reasons_item_summary_{index}'] = items[0]
        curr_resp_dict[f'reasons_item_type_{index}'] = items[1]
        curr_resp_dict[f'reasons_item_reasonname_{index}'] = items[2]
        break

In [438]:
def getVenueDetails(id):
    
    url = 'https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&v={}'.format(
            id,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION)
    
    # make the GET request
    #response_msg = requests.get(url).json()["response"]['venue']
    
    curr_resp_list = []
    curr_resp_dict = {'venue_id':id,'verified':'','tip_count':'',
                'likes':'','rating':'','ratingSignals':'','reasons_count':'',
                'reasons_item_summary_1':''}
    
    curr_resp_dict['verified'] = get_attr_value(response_msg,'verified')
    
    if has_attr(response_msg,'stats'):
        curr_resp_dict['tip_count'] = get_attr_value(response_msg['stats'],'tipCount')
        
    if has_attr(response_msg,'likes'):
        curr_resp_dict['likes'] = get_attr_value(response_msg['likes'],'count')
    
    curr_resp_dict['rating'] = get_attr_value(response_msg,'rating')
    curr_resp_dict['ratingSignals'] = get_attr_value(response_msg,'ratingSignals')
    
    if has_attr(response_msg,'reasons'):
        get_reasons(curr_resp_dict,response_msg['reasons'])
    
    return curr_resp_dict
    

In [439]:
def getDetailsforallVenues(venues):
    
    venues_list = []
    
    for venue_id in venues:
        venues_list.append(getVenueDetails(venue_id))
    
    #print("looks find",venues_list)
    det_venues_df = pd.DataFrame(venues_list)
    
    return det_venues_df

In [441]:
df_ts = getDetailsforallVenues(unique_venues['Venue Id'])
df_ts

Unnamed: 0,likes,rating,ratingSignals,reasons_count,reasons_item_reasonname_1,reasons_item_summary_1,reasons_item_type_1,tip_count,venue_id,verified
0,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,50a01c82e4b0a0263fcfdcb9,True
1,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4ba5156cf964a520b6da38e3,True
2,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4bb4ccb486a8d13a7ce1de7f,True
3,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b0eb057f964a520b75923e3,True
4,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4c18e819d4d9c9284e19f029,True
5,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b48c217f964a520b25526e3,True
6,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b4fd4a9f964a520fd1627e3,True
7,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b22e0a0f964a520544f24e3,True
8,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b7aa47ef964a520fa342fe3,True
9,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,56d484efcd10b98f5098c328,True


In [445]:
unique_venues.merge(df_ts,left_on='Venue Id',right_on='venue_id')

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category,likes,rating,ratingSignals,reasons_count,reasons_item_reasonname_1,reasons_item_summary_1,reasons_item_type_1,tip_count,venue_id,verified
0,"The Beaches West, India Bazaar",43.668999,-79.315572,50a01c82e4b0a0263fcfdcb9,Monarch Park Stadium,43.678144,-79.324038,1225,Athletics & Sports,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,50a01c82e4b0a0263fcfdcb9,True
1,Rosedale,43.679563,-79.377529,4ba5156cf964a520b6da38e3,Toronto Lawn Tennis Club,43.680667,-79.388559,896,Athletics & Sports,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4ba5156cf964a520b6da38e3,True
2,Hillcrest Village,43.803762,-79.363452,4bb4ccb486a8d13a7ce1de7f,GoodLife Fitness North York Steeles and Woodbine,43.813541,-79.344064,1900,Gym,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4bb4ccb486a8d13a7ce1de7f,True
3,"Bedford Park, Lawrence Manor East",43.733283,-79.419750,4b0eb057f964a520b75923e3,Toronto Cricket Skating and Curling Club,43.739660,-79.418732,714,Sports Club,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b0eb057f964a520b75923e3,True
4,Don Mills North,43.745906,-79.352188,4c18e819d4d9c9284e19f029,LA Fitness,43.747665,-79.347077,455,Gym / Fitness Center,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4c18e819d4d9c9284e19f029,True
5,Willowdale South,43.770120,-79.408493,4b48c217f964a520b25526e3,Douglas Snow Aquatic Centre,43.767317,-79.414982,607,Pool,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b48c217f964a520b25526e3,True
6,Victoria Village,43.725882,-79.315572,4b4fd4a9f964a520fd1627e3,North Beach Indoor Volleyball Academy,43.737191,-79.323714,1419,Gym / Fitness Center,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b4fd4a9f964a520fd1627e3,True
7,Lawrence Park,43.728020,-79.388790,4b22e0a0f964a520544f24e3,Granite Club,43.733043,-79.381986,782,Gym / Fitness Center,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b22e0a0f964a520544f24e3,True
8,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,4b7aa47ef964a520fa342fe3,East York Gymnastics,43.710654,-79.309279,476,Gym / Fitness Center,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,4b7aa47ef964a520fa342fe3,True
9,York Mills West,43.752758,-79.400049,56d484efcd10b98f5098c328,GoodLife Fitness North York York Mills,43.744604,-79.406346,1039,Gym,62,7.6,95,1,rawLikesReason,Lots of people like this place,general,24,56d484efcd10b98f5098c328,True


In [141]:
def sort_venue_by_distance(df):
    return df.sort_values(by=['Venue Distance'],ascending=True)

In [189]:
ocga = sort_venue_by_distance(toronto_venues.loc[toronto_venues['Venue']== 'Planet Fitness North Scarborough'])
ocga.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category
8337,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577,56945d1c498e11466e96405f,Planet Fitness North Scarborough,43.824095,-79.301411,1672,Gym
8813,L'Amoreaux West,43.799525,-79.318389,56945d1c498e11466e96405f,Planet Fitness North Scarborough,43.824095,-79.301411,3056,Gym
7638,Agincourt,43.7942,-79.262029,56945d1c498e11466e96405f,Planet Fitness North Scarborough,43.824095,-79.301411,4591,Gym
8039,"Clarks Corners, Sullivan, Tam O'Shanter",43.781638,-79.304302,56945d1c498e11466e96405f,Planet Fitness North Scarborough,43.824095,-79.301411,4732,Gym
2579,Hillcrest Village,43.803762,-79.363452,56945d1c498e11466e96405f,Planet Fitness North Scarborough,43.824095,-79.301411,5473,Gym


In [96]:
pd.DataFrame(toronto_venues.groupby('Neighborhood').count()['Venue'])

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
"Adelaide, King, Richmond",100
Agincourt,100
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",100
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",77
"Alderwood, Long Branch",100
"Bathurst Manor, Downsview North, Wilson Heights",100
Bayview Village,100
"Bedford Park, Lawrence Manor East",100
Berczy Park,100
"Birch Cliff, Cliffside West",100


In [144]:
toronto_venues.shape

(10077, 10)

In [156]:
extracted_venues = toronto_venues[['Venue','Venue Id','Venue Latitude','Venue Longitude']]
extracted_venues.shape

(10077, 4)

In [157]:
extracted_venues.loc[~extracted_venues.duplicated()]

Unnamed: 0,Venue,Venue Id,Venue Latitude,Venue Longitude
0,Monarch Park Stadium,50a01c82e4b0a0263fcfdcb9,43.678144,-79.324038
1,Toronto Lawn Tennis Club,4ba5156cf964a520b6da38e3,43.680667,-79.388559
2,GoodLife Fitness North York Steeles and Woodbine,4bb4ccb486a8d13a7ce1de7f,43.813541,-79.344064
3,Toronto Cricket Skating and Curling Club,4b0eb057f964a520b75923e3,43.739660,-79.418732
4,LA Fitness,4c18e819d4d9c9284e19f029,43.747665,-79.347077
5,Douglas Snow Aquatic Centre,4b48c217f964a520b25526e3,43.767317,-79.414982
6,North Beach Indoor Volleyball Academy,4b4fd4a9f964a520fd1627e3,43.737191,-79.323714
7,Granite Club,4b22e0a0f964a520544f24e3,43.733043,-79.381986
8,East York Gymnastics,4b7aa47ef964a520fa342fe3,43.710654,-79.309279
9,GoodLife Fitness North York York Mills,56d484efcd10b98f5098c328,43.744604,-79.406346


In [None]:
extracted_venues

In [162]:
glf = sort_venue_by_distance(toronto_venues.loc[toronto_venues['Venue Id'] == '4bd6e35d637ba5933ad3f870'])
glf.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue Likes,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category
6571,Davisville North,43.712751,-79.390197,4bd6e35d637ba5933ad3f870,62,GoodLife Fitness Toronto Dunfield,43.708093,-79.395981,696,Gym
7749,Davisville,43.704324,-79.38879,4bd6e35d637ba5933ad3f870,62,GoodLife Fitness Toronto Dunfield,43.708093,-79.395981,714,Gym
7148,North Toronto West,43.715383,-79.405678,4bd6e35d637ba5933ad3f870,62,GoodLife Fitness Toronto Dunfield,43.708093,-79.395981,1125,Gym
6072,Roselawn,43.711695,-79.416936,4bd6e35d637ba5933ad3f870,62,GoodLife Fitness Toronto Dunfield,43.708093,-79.395981,1733,Gym
6673,"Forest Hill North, Forest Hill West",43.696948,-79.411307,4bd6e35d637ba5933ad3f870,62,GoodLife Fitness Toronto Dunfield,43.708093,-79.395981,1749,Gym


In [161]:
mgd_pos_df.loc[mgd_pos_df['PostalCode'] == 'M4P']

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
67,M4P,Central Toronto,Davisville North,43.712751,-79.390197


###### Unique Venue Categories 

In [97]:
toronto_venues.shape

(10077, 10)

Sports Club


In [98]:
toronto_venues['Venue Category'].unique()

array(['Athletics & Sports', 'Gym', 'Sports Club', 'Gym / Fitness Center',
       'Pool', 'Yoga Studio', 'Climbing Gym', 'Office', 'Soccer Field',
       'Golf Course', 'Baseball Field', 'Park',
       'Residential Building (Apartment / Condo)', 'Community Center',
       'Tennis Court', 'Cycle Studio', 'Arcade', 'Badminton Court',
       'College Gym', 'Gym Pool', 'Basketball Court', 'College Stadium',
       'Soccer Stadium', 'College Rec Center', 'Plaza', 'Stadium',
       'Building', 'Martial Arts Dojo', 'Trail', 'Elementary School',
       'Event Space', 'Track', 'Curling Ice', 'Playground', 'Social Club',
       'Skating Rink', 'Hockey Arena', 'School', 'Paintball Field',
       'Swim School', 'Hotel', 'Other Great Outdoors', 'Field',
       'Skate Park', 'Physical Therapist', 'Hotel Pool', 'Water Park',
       'Hockey Field', 'Weight Loss Center', 'Boxing Gym', 'High School',
       'Pilates Studio', 'Recreation Center'], dtype=object)

In [99]:
toronto_venues.loc[toronto_venues['Venue Category'] == 'Housing Development']

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue Likes,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category


In [100]:
unexp = toronto_venues.loc[toronto_venues['Venue Category'].str.contains('Restaurant')]['Venue Category'].unique()

In [101]:
pd.DataFrame(toronto_venues.loc[~toronto_venues['Venue Category'].str.match('(Club|Sport|Gym|Stadium|Fit|Golf|Park)')]['Venue Category'].unique()).values

array([['Athletics & Sports'],
       ['Pool'],
       ['Yoga Studio'],
       ['Climbing Gym'],
       ['Office'],
       ['Soccer Field'],
       ['Baseball Field'],
       ['Residential Building (Apartment / Condo)'],
       ['Community Center'],
       ['Tennis Court'],
       ['Cycle Studio'],
       ['Arcade'],
       ['Badminton Court'],
       ['College Gym'],
       ['Basketball Court'],
       ['College Stadium'],
       ['Soccer Stadium'],
       ['College Rec Center'],
       ['Plaza'],
       ['Building'],
       ['Martial Arts Dojo'],
       ['Trail'],
       ['Elementary School'],
       ['Event Space'],
       ['Track'],
       ['Curling Ice'],
       ['Playground'],
       ['Social Club'],
       ['Skating Rink'],
       ['Hockey Arena'],
       ['School'],
       ['Paintball Field'],
       ['Swim School'],
       ['Hotel'],
       ['Other Great Outdoors'],
       ['Field'],
       ['Skate Park'],
       ['Physical Therapist'],
       ['Hotel Pool'],
       ['Water Pa

In [None]:
toronto_venues.loc[toronto_venues['Venue Category'] == 'Athletics & Sports']

In [None]:
toronto_venues.shape

In [None]:
toronto_gym = toronto_venues

In [None]:
# create map of Downtown Toronto using latitude and longitude values
map_toronto_gym = folium.Map(location=[latitude, longitude], zoom_start=10)

count = 0
# add markers to map
for lat, lng, venue, neighborhood in zip(toronto_gym['Venue Latitude'], toronto_gym['Venue Longitude'], toronto_gym['Venue'],toronto_gym['Neighborhood']):
    label = '{}, {}'.format(venue,neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='green').add_to(map_toronto_gym)  
    count += 1
    if count > 800:
        break
map_toronto_gym

In [None]:
toronto_res = toronto_venues.loc[toronto_venues['Venue Category'].str.contains('Restaurant')]
toronto_res.head()

In [None]:
# create map of Downtown Toronto using latitude and longitude values
map_toronto_res = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, venue, neighborhood in zip(toronto_res['Venue Latitude'], toronto_res['Venue Longitude'], toronto_res['Venue'],toronto_res['Neighborhood']):
    label = '{}, {}'.format(venue,neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='orange').add_to(map_toronto_res)  
    
map_toronto_res

###### Extracted Venues from each Neighbourhood

In [None]:
toronto_venues.head()

###### Count of the Venues Receied for each Neighborhood. 
###### Note: As highlighted above, API returned less venues for few neighbourhoods as it could only find those within the given radius

In [None]:
pd.DataFrame(toronto_venues.groupby('Neighborhood').count()['Venue'])

###### Analyze based on Venue Category

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

###### Shape of the Dataframe after the onehot encoding

In [None]:
toronto_onehot.shape

###### Group the Neighbourhoods with the corresponding encoded rows to see the most 

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

###### Shape of the Grouped Dataframe

In [None]:
toronto_grouped.shape

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

###### Get & Print the Top 5 most common venues 

In [None]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(10)

###### Cluster Neighborhoods


In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_
toronto_grouped_clustering.head()

###### Add clustering labels

In [None]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

###### Create a map with clusted labels (produced in different colors)

In [None]:
toronto_merged = toronto_merged.dropna()
toronto_merged.head()

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)+7))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[np.int(cluster-1)]).add_to(map_clusters)
       
map_clusters

In [None]:
toronto_merged[]