# Segmenting and Clustering Neighborhoods in Toronto

###### Import libraries 

In [104]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plot
import requests 
import bs4
import folium
import json
from pandas.io.json import json_normalize
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import time

###### Get Data from Wiki

In [2]:
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
if response.status_code == 200: 
    print ("Get Successful!")
else:
    print ("Get Failed! Check URL")

Get Successful!


###### Payload Received From Wiki

In [3]:
# print(response.text) # Commented out to avoid filling the notebook with the response text

###### Parse the response text using BeautifulSoap

In [4]:
resp_soap = bs4.BeautifulSoup(response.text, 'html.parser')
resp_soap.title

<title>List of postal codes of Canada: M - Wikipedia</title>

### Assumptions
 * Postal Code & Borough combination is expected have same value for all the nieighbourhoods with those combinations.
 * **_check_\__for_\__discrepancy_** is used to shout if there are any discrepancies found while processing the rows. 

In [5]:
def check_for_discrepancy(pos_dict,row_list):
    
    if row_list[0] in pos_dict and pos_dict[row_list[0]][0] != row_list[1]:
        print ("Warning! Existing Postal Codes and Borough combination is differeing from the new row")
        print (f"Existing Postal Code {row_list[0]} Borough {pos_dict[row_list[0]][0]}")
        print (f"New Postal Code {row_list[0]} Borough {row_list[1]}")

### Approach
* Very first table from the Response Text will be parsed to get all the Postal Codes
* **_get_\__list_\__of_\__postal_\__codes_** will parse all the rows.
* **_get_\__row_\__list_** will parse the columns.
* Postal Code Dictionary (_with postal code as a Key_) is maintained to store all the parsed postal codes. 
* If Borough i.e. Column #2 is "Not Assigned", it will skip adding the rows to the Postal Code Dictionary 
* If Nieighbourhood i.e. Column #3 is "Not Assigned" but if the Borough is valid, it will assign the Borough as its neighbourhood before adding the row to the Dictionary. 
* List is maintained to keep track of the Neighbourhoods that are having same postal code
* Another List is maintained to keep track of the rows that are having valid Borough but "Not assigned" Neighbourhood

In [6]:
def add_to_dictionary(pos_dict,row_list,more_nbh):
    
    if (row_list[0]) in pos_dict:
        more_nbh.append(row_list[0])
        pos_dict[row_list[0]][1] = pos_dict[row_list[0]][1] + ", " + row_list[2]
    else:
        pos_dict[row_list[0]] = [row_list[1],row_list[2]]

In [7]:
def get_row_list(row):
    
    row_list = []    
    for column in row:
        
        if isinstance(column,bs4.element.Tag): # Avoid processing NavigableString - Mostly newline char            
            row_list.append(column.get_text(strip=True))
            
    return row_list

In [8]:
def get_list_of_postal_codes(soap_msg_table):
    
    pos_dict = {}
    more_nbh = []
    na_nbh = []
    for row in soap_msg_table:
        
        if isinstance(row,bs4.element.Tag): # Avoid processing NavigableString - Mostly newline char
            row_list = get_row_list(row)
            
            if row_list[1].replace(' ','').lower() != 'notassigned' and row_list[2].replace(' ','').lower() == 'notassigned':
                na_nbh.append(row_list[0])
                row_list[2] = row_list[1]
                add_to_dictionary(pos_dict,row_list,more_nbh)
                
            elif row_list[1].replace(' ','').lower() != 'notassigned':                
                add_to_dictionary(pos_dict,row_list,more_nbh)
            
            # Check for discrepancy
            check_for_discrepancy(pos_dict,row_list)
            
    return pos_dict,more_nbh,na_nbh

###### Extracting & Parsing the very first table from the response text

In [9]:
resp_table = resp_soap.table
postal_code_list, more_nbh, na_nbh = get_list_of_postal_codes(resp_table.tbody)

###### Creating a DataFrame from the Parsed Dictionary. 
* _orient_ is used to parse the keys as Index
* Removing the first row as it is a header from the table
* Resetting the Index to get the DataFrame in an expected way. 
* Adding the expected column names to the DataFrame 

In [10]:
pos_df = pd.DataFrame.from_dict(postal_code_list,orient='index')
pos_df = pos_df[1:]
pos_df = pos_df.reset_index()
pos_df.columns = ['PostalCode','Borough','Neighborhood']

###### Postal Code with more than one neightbourhood - Eg : M5A is used

In [11]:
pos_df.loc[pos_df['PostalCode'] == 'M5A']['Neighborhood'].values

array(['Harbourfront, Regent Park'], dtype=object)

###### All Postal Codes that are having more than one neightbourhood

In [12]:
pos_df.loc[pos_df['PostalCode'].isin(set(more_nbh))]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M1B,Scarborough,"Rouge, Malvern"
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"
11,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
12,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
13,M3C,North York,"Flemingdon Park, Don Mills South"
17,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"


###### Postal Codes that were having "Not assigned" neighbourhood

In [13]:
pos_df.loc[pos_df['PostalCode'].isin(set(na_nbh))]

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M7A,Queen's Park,Queen's Park


###### Resultant DataFrame - Only first 10 rows are displayed

In [14]:
pos_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


###### Shape of the Resultant Dataframe

In [15]:
print(f"{pos_df.shape}")

(103, 3)


## Extracting & Merging Geospatial Data

<font color=red>**Note:**</font> I don't seem to be recieveing data that are consistent with the Geospatial Data Provided in the Assignment Section. So, i will be using the data file provided in the Assignment Section

In [16]:
!wget -q http://cocl.us/Geospatial_data

###### Read the data file into a DataFrame & Rename the columns to match the existing DataFrame

In [17]:
geo_data = pd.read_csv('Geospatial_data',header=0)
geo_data.columns = ['PostalCode','Latitude','Longitude']

In [18]:
geo_data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


###### Make sure all the postal codes are having the corresponding Latitude & Longitude values from Geospatial_data

In [19]:
pos_df.loc[~pos_df['PostalCode'].isin(geo_data['PostalCode'])].size

0

###### Merge the DataFrames based on Postal Codes

In [20]:
mgd_pos_df = pos_df.merge(right=geo_data,on='PostalCode')
mgd_pos_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


###### Shape of the Merged Dataframe

In [21]:
print(f"{mgd_pos_df.shape}")

(103, 5)


## Clustering the neighbourhoods 

In [22]:
print (f"There are {mgd_pos_df['Borough'].unique().size} unique Borough's from the merged DataFrame")

There are 11 unique Borough's from the merged DataFrame


### Approach
* **Only Borough's that contains the word "Toronto" will be analyzed below**. 
* https://nominatim.openstreetmap.org/ will be queried directly to get the Geocodes 
* foursquare api will be used to analyze the neighbourhoods. 

###### Get Geo codes for a specific location. 

In [23]:
def get_geo_codes(area):
    longitude = ""
    latitude = ""
    try:
        # Replace the space with +
        area_str = area.replace(" ","+")
        
        # Get the Geolocation for given area
        response = requests.get('https://nominatim.openstreetmap.org/search?q='+area_str+'&format=geojson&limit=1')
        if response.status_code == 200:
            print ("Get Successful")
            
            # Extract the Latitude & Longitude from the response text
            longitude,latitude = json.loads(response.text)['features'][0]['geometry']['coordinates']
            print(f"latitude & longitude values of {area} are {latitude},{longitude}")
        else:
            print ("Get Failed!")
    except:
        pass
    
    return latitude,longitude

###### Extract Latitude & Longitude for Toronto,ON

In [24]:
latitude,longitude = get_geo_codes('Toronto ON')

Get Successful
latitude & longitude values of Toronto ON are 43.653963,-79.387207


#### Create a map of Toronto with neighborhoods superimposed on top.

In [25]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(mgd_pos_df['Latitude'], mgd_pos_df['Longitude'], mgd_pos_df['Borough'], mgd_pos_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='green').add_to(map_toronto)  
    
map_toronto

###### Create a dataframe with Boroughs that has a word "Toronto" Note : Only this dataframe toronto_df will be analyzed futher.

In [26]:
toronto_df = mgd_pos_df
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


In [27]:
toronto_df.shape

(103, 5)

###### Set the Credentials for Foursquare API 

In [28]:
CLIENT_ID = 'KGU5MEU4ZHYXQPJT2Y1B2DNRFFTV1GSD4Y5KXV0WMSFFDLQO' # your Foursquare ID
CLIENT_SECRET = 'S30A30GKJ1SITKL3YOTXQAPTCJBYQGUMONMF4H4CNVJTAVOT' # your Foursquare Secret
VERSION = '20190531' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: KGU5MEU4ZHYXQPJT2Y1B2DNRFFTV1GSD4Y5KXV0WMSFFDLQO
CLIENT_SECRET:S30A30GKJ1SITKL3YOTXQAPTCJBYQGUMONMF4H4CNVJTAVOT


###### Get Nearby Venues from each Neighbourhood
* radius is set to 10000 to have enough datapoints for each neighbourhood
* LIMIT is set to 100 for all the neighbourhoods. However, there may be some cases with less number of records recived from the API

In [29]:
def getNearbyVenues(names, latitudes, longitudes, radius=10000):
    
    LIMIT = 100
    
    # Athletics & Sports {4f4528bc4b90abdf24c9de85}, Pool {4bf58dd8d48988d15e941735},    
    CATEGORY = '4f4528bc4b90abdf24c9de85,4bf58dd8d48988d15e941735'
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT,
            CATEGORY)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng,
            v['venue']['id'],
            v['venue']['name'],
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['location']['distance'], 
            v['venue']['categories'][0]['name']) for v in results])
        
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude',
                  'Venue Id',
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude',
                  'Venue Distance', 
                  'Venue Category']
    
    return(nearby_venues)

###### Get Venues from each Neighbourhood

In [30]:
#toronto_venues = getNearbyVenues(names=toronto_df['Neighborhood'],
#                                   latitudes=toronto_df['Latitude'],
#                                   longitudes=toronto_df['Longitude']
#                                  )
#csv_export = toronto_venues.to_csv (r'toronto_dataframe.csv', index = None, header=True)
toronto_venues = pd.read_csv('toronto_dataframe.csv',header=0)
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category
0,Parkwoods,43.753259,-79.329656,50a01c82e4b0a0263fcfdcb9,Monarch Park Stadium,43.678144,-79.324038,8373,Athletics & Sports
1,Parkwoods,43.753259,-79.329656,4ba5156cf964a520b6da38e3,Toronto Lawn Tennis Club,43.680667,-79.388559,9367,Athletics & Sports
2,Parkwoods,43.753259,-79.329656,4bb4ccb486a8d13a7ce1de7f,GoodLife Fitness North York Steeles and Woodbine,43.813541,-79.344064,6809,Gym
3,Parkwoods,43.753259,-79.329656,4b0eb057f964a520b75923e3,Toronto Cricket Skating and Curling Club,43.73966,-79.418732,7321,Sports Club
4,Parkwoods,43.753259,-79.329656,4c18e819d4d9c9284e19f029,LA Fitness,43.747665,-79.347077,1532,Gym / Fitness Center


In [31]:
def identify_the_closest_nh(df):
    return df.sort_values(by=['Venue Distance'],ascending=True).iloc[0]

In [32]:
def venues_eliminate_duplicates(df):
    venue_list = df['Venue'].unique()
    unique_venues = pd.DataFrame(columns=['Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue Id','Venue','Venue Latitude','Venue Longitude','Venue Distance','Venue Category'])
    for venue in venue_list:
        closest = identify_the_closest_nh(df.loc[df['Venue'] == venue])
        unique_venues = unique_venues.append(closest)
    
    print (f"{unique_venues.shape[0]} Unique venues are identified for further analysis")
    return unique_venues

In [33]:
toronto_unique_venues = venues_eliminate_duplicates(toronto_venues)

611 Unique venues are identified for further analysis


In [34]:
toronto_unique_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category
4570,"The Beaches West, India Bazaar",43.668999,-79.315572,50a01c82e4b0a0263fcfdcb9,Monarch Park Stadium,43.678144,-79.324038,1225,Athletics & Sports
8910,Rosedale,43.679563,-79.377529,4ba5156cf964a520b6da38e3,Toronto Lawn Tennis Club,43.680667,-79.388559,896,Athletics & Sports
2570,Hillcrest Village,43.803762,-79.363452,4bb4ccb486a8d13a7ce1de7f,GoodLife Fitness North York Steeles and Woodbine,43.813541,-79.344064,1900,Gym
5356,"Bedford Park, Lawrence Manor East",43.733283,-79.41975,4b0eb057f964a520b75923e3,Toronto Cricket Skating and Curling Club,43.73966,-79.418732,714,Sports Club
676,Don Mills North,43.745906,-79.352188,4c18e819d4d9c9284e19f029,LA Fitness,43.747665,-79.347077,455,Gym / Fitness Center


In [35]:
def get_attr_value(resp_dict,attribute):
    
    if attribute in resp_dict:
        return resp_dict[attribute]
    
    return ''

In [36]:
def has_attr(resp_dict,attribute):
    
    return attribute in resp_dict

In [37]:
def get_reasons(curr_resp_dict,reasons):
    
    curr_resp_dict['reasons_count'] = get_attr_value(reasons,'count')
    
    items = ()
    index = 1
    for item in reasons['items']:
        curr_resp_dict[f'reasons_item_summary_{index}'] = get_attr_value(item,'summary')
        break

In [38]:
def getVenueDetails(id):
    
    url = 'https://api.foursquare.com/v2/venues/{}?&client_id={}&client_secret={}&v={}'.format(
            id,
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION)
    
    # make the GET request
    response_msg = requests.get(url).json()["response"]['venue']
    
    curr_resp_list = []
    curr_resp_dict = {'venue_id':id,'verified':'','tip_count':'',
                'likes':'','rating':'','ratingSignals':'','reasons_count':'',
                'reasons_item_summary_1':''}
    
    curr_resp_dict['verified'] = get_attr_value(response_msg,'verified')
    
    if has_attr(response_msg,'stats'):
        curr_resp_dict['tip_count'] = get_attr_value(response_msg['stats'],'tipCount')
        
    if has_attr(response_msg,'likes'):
        curr_resp_dict['likes'] = get_attr_value(response_msg['likes'],'count')
    
    curr_resp_dict['rating'] = get_attr_value(response_msg,'rating')
    curr_resp_dict['ratingSignals'] = get_attr_value(response_msg,'ratingSignals')
    
    if has_attr(response_msg,'reasons'):
        get_reasons(curr_resp_dict,response_msg['reasons'])
    
    return curr_resp_dict
    

In [39]:
def getDetailsforallVenues(venues):
    
    venues_list = []
    
    for venue_id in venues:
        venues_list.append(getVenueDetails(venue_id))
    
    det_venues_df = pd.DataFrame(venues_list)
    
    return det_venues_df


In [280]:
### uncomment this after two days 
#toronto_venue_details = getDetailsforallVenues(toronto_unique_venues['Venue Id'])
#toronto_venue_details.head()

toronto_venue_details = pd.read_csv(r'toronto_venue_details_dataframe.csv',
                                names=['likes','rating','ratingSignals','reasons_count',
                                       'reasons_item_summary_1','tip_count','venue_id','verified'])

In [281]:
#toronto_venue_details_2.to_csv (r'toronto_venue_details_last_dataframe.csv', index = None, header=True)

In [282]:
#toronto_venue_details = toronto_venue_details_1.append(toronto_venue_details_2)

In [283]:
toronto_venue_details.dtypes

likes                       int64
rating                    float64
ratingSignals             float64
reasons_count               int64
reasons_item_summary_1     object
tip_count                   int64
venue_id                   object
verified                     bool
dtype: object

In [284]:
toronto_mvenues = toronto_unique_venues.merge(toronto_venue_details,left_on='Venue Id',right_on='venue_id').drop('venue_id',axis=1)

In [285]:
toronto_mvenues['reasons_count'].unique()

array([0, 1])

In [286]:
toronto_mvenues = toronto_mvenues.drop('reasons_count',axis=1)

In [287]:
toronto_mvenues.loc[~(toronto_mvenues['verified'] == True)].shape

(466, 15)

In [288]:
toronto_mvenues = toronto_mvenues.drop('verified',axis=1)

In [289]:
toronto_mvenues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category,likes,rating,ratingSignals,reasons_item_summary_1,tip_count
0,"The Beaches West, India Bazaar",43.668999,-79.315572,50a01c82e4b0a0263fcfdcb9,Monarch Park Stadium,43.678144,-79.324038,1225,Athletics & Sports,28,8.9,36.0,,7
1,Rosedale,43.679563,-79.377529,4ba5156cf964a520b6da38e3,Toronto Lawn Tennis Club,43.680667,-79.388559,896,Athletics & Sports,11,8.6,11.0,,0
2,Hillcrest Village,43.803762,-79.363452,4bb4ccb486a8d13a7ce1de7f,GoodLife Fitness North York Steeles and Woodbine,43.813541,-79.344064,1900,Gym,34,7.5,46.0,Lots of people like this place,10
3,"Bedford Park, Lawrence Manor East",43.733283,-79.41975,4b0eb057f964a520b75923e3,Toronto Cricket Skating and Curling Club,43.73966,-79.418732,714,Sports Club,15,7.3,22.0,,6
4,Don Mills North,43.745906,-79.352188,4c18e819d4d9c9284e19f029,LA Fitness,43.747665,-79.347077,455,Gym / Fitness Center,51,8.2,70.0,Lots of people like this place,20


In [290]:
toronto_mvenues.loc[np.isnan(toronto_mvenues['rating'])]

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category,likes,rating,ratingSignals,reasons_item_summary_1,tip_count
26,"Flemingdon Park, Don Mills South",43.725900,-79.340923,4f691346e4b0de8712c745c4,E. T. Seton PARK Archery Range,43.712605,-79.338759,1490,Athletics & Sports,1,,,,2
28,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,55895716498eaf69c53e1042,TKTO - Toronto Knife Throwing Organization,43.709966,-79.313411,485,Athletics & Sports,3,,,,4
29,"Flemingdon Park, Don Mills South",43.725900,-79.340923,4d95ec43e07ea35d4592b202,Dennis R. Timbrell Resource Centre,43.718096,-79.331728,1141,Gym / Fitness Center,1,,,,0
34,L'Amoreaux West,43.799525,-79.318389,4f5b773fe4b035e31387ad4d,Buckler Aquatics,43.803861,-79.339656,1775,Athletics & Sports,1,,,,2
35,Hillcrest Village,43.803762,-79.363452,54c96555498e4c70ef42acac,Cummer Park Community Centre Swimming Pool,43.799999,-79.371011,737,Pool,3,,,,1
37,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937,4be6b3a9cf200f470cc6143c,Leaside Pool,43.701204,-79.360992,4149,Pool,0,,,,2
38,"Silver Hills, York Mills",43.757490,-79.374714,52485fb5498e07742ed84570,Bedford Park Community Centre,43.728060,-79.400205,3864,Pool,1,,,,0
42,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577,4bda04093904a593bdad449e,L'Amoreaux Community Recreation Centre,43.812106,-79.305097,1685,Athletics & Sports,2,,,,4
43,Hillcrest Village,43.803762,-79.363452,4f2e83dbe4b062ad2c20dd96,Woodbrooke Estate,43.802067,-79.354347,755,Residential Building (Apartment / Condo),0,,,,0
45,Willowdale South,43.770120,-79.408493,4c33aab2ed37a59350bc6d03,Mitchell Field Community Centre,43.774931,-79.407887,537,Community Center,4,,,,0


In [291]:
toronto_mvenues['reasons_item_summary_1'].unique()

array([nan, 'Lots of people like this place'], dtype=object)

In [292]:
impCat = SimpleImputer(strategy='constant',fill_value='')
impCat.fit(toronto_mvenues[['reasons_item_summary_1','Venue Category']])

SimpleImputer(add_indicator=False, copy=True, fill_value='', missing_values=nan,
              strategy='constant', verbose=0)

In [293]:
x=impCat.transform(toronto_mvenues[['reasons_item_summary_1','Venue Category']])

In [294]:
x

array([['', 'Athletics & Sports'],
       ['', 'Athletics & Sports'],
       ['Lots of people like this place', 'Gym'],
       ...,
       ['', 'Hockey Arena'],
       ['', 'Skating Rink'],
       ['', 'Curling Ice']], dtype=object)

In [295]:
ordenc = OrdinalEncoder()

In [296]:
ordenc.fit(x)
ordcat_pd = pd.DataFrame(ordenc.transform(x),columns=['reasons_summary','encoded_venue_category'])

In [297]:
ordcat_pd['encoded_venue_category'].unique()

array([ 1., 17., 43., 18., 34.,  6., 26., 40., 16.,  3., 29., 36., 51.,
       10., 46., 12.,  0.,  2.,  7., 19.,  4.,  9., 41.,  8., 33., 44.,
        5., 25., 48., 13., 14., 47., 11., 32., 42., 39., 21., 37., 28.,
       45., 23., 27., 15., 38., 30., 24., 49., 22., 50., 20., 31., 35.])

In [298]:
toronto_mvenues = toronto_mvenues.merge(ordcat_pd,left_index=True,right_index=True)

In [299]:
toronto_mvenues.loc[toronto_mvenues['Venue Category'] == 'Gym']

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category,likes,rating,ratingSignals,reasons_item_summary_1,tip_count,reasons_summary,encoded_venue_category
2,Hillcrest Village,43.803762,-79.363452,4bb4ccb486a8d13a7ce1de7f,GoodLife Fitness North York Steeles and Woodbine,43.813541,-79.344064,1900,Gym,34,7.5,46.0,Lots of people like this place,10,1.0,17.0
9,York Mills West,43.752758,-79.400049,56d484efcd10b98f5098c328,GoodLife Fitness North York York Mills,43.744604,-79.406346,1039,Gym,12,8.5,13.0,,1,0.0,17.0
11,Parkwoods,43.753259,-79.329656,4b71ff80f964a52035692de3,Fitness Connection,43.727473,-79.341707,3029,Gym,10,7.4,12.0,,5,0.0,17.0
12,"Agincourt North, L'Amoreaux East, Milliken, St...",43.815252,-79.284577,56945d1c498e11466e96405f,Planet Fitness North Scarborough,43.824095,-79.301411,1672,Gym,10,8.4,11.0,,1,0.0,17.0
13,Davisville,43.704324,-79.388790,4aec60d6f964a5207cc621e3,GoodLife Fitness Toronto Mount Pleasant and Da...,43.698087,-79.387274,705,Gym,33,7.9,48.0,Lots of people like this place,9,1.0,17.0
15,"Maryvale, Wexford",43.750072,-79.295849,4b049809f964a520725522e3,GoodLife Fitness North York Victoria Terrace,43.742234,-79.313351,1655,Gym,39,6.9,61.0,Lots of people like this place,17,1.0,17.0
17,Woburn,43.770992,-79.216917,4bdaf4cb3904a5930902489e,GoodLife Fitness Scarborough Cedarbrae Mall,43.758303,-79.228533,1693,Gym,14,7.8,20.0,,9,0.0,17.0
18,Davisville North,43.712751,-79.390197,4bd6e35d637ba5933ad3f870,GoodLife Fitness Toronto Dunfield,43.708093,-79.395981,696,Gym,62,7.6,95.0,Lots of people like this place,24,1.0,17.0
20,Willowdale South,43.770120,-79.408493,4b00bfb8f964a520f84022e3,GoodLife Fitness North York Madison Centre,43.764895,-79.411936,644,Gym,54,7.4,85.0,Lots of people like this place,29,1.0,17.0
23,Scarborough Village,43.744734,-79.239476,50f48f2fe4b0eacffb119e25,GoodLife Fitness Scarborough Markington,43.742761,-79.213702,2084,Gym,28,7.8,38.0,,9,0.0,17.0


In [300]:
impQua = SimpleImputer(strategy='median',verbose=1)
impQua.fit(toronto_mvenues[['rating','ratingSignals']])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=1)

In [301]:
x1=impQua.transform(toronto_mvenues[['rating','ratingSignals']])
transformed_ratings = pd.DataFrame(x1,columns=['imp_rating','imp_rating_signals'])

In [302]:
toronto_mvenues = toronto_mvenues.merge(transformed_ratings,left_index=True,right_index=True)

In [303]:
toronto_mvenues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category,likes,rating,ratingSignals,reasons_item_summary_1,tip_count,reasons_summary,encoded_venue_category,imp_rating,imp_rating_signals
0,"The Beaches West, India Bazaar",43.668999,-79.315572,50a01c82e4b0a0263fcfdcb9,Monarch Park Stadium,43.678144,-79.324038,1225,Athletics & Sports,28,8.9,36.0,,7,0.0,1.0,8.9,36.0
1,Rosedale,43.679563,-79.377529,4ba5156cf964a520b6da38e3,Toronto Lawn Tennis Club,43.680667,-79.388559,896,Athletics & Sports,11,8.6,11.0,,0,0.0,1.0,8.6,11.0
2,Hillcrest Village,43.803762,-79.363452,4bb4ccb486a8d13a7ce1de7f,GoodLife Fitness North York Steeles and Woodbine,43.813541,-79.344064,1900,Gym,34,7.5,46.0,Lots of people like this place,10,1.0,17.0,7.5,46.0
3,"Bedford Park, Lawrence Manor East",43.733283,-79.41975,4b0eb057f964a520b75923e3,Toronto Cricket Skating and Curling Club,43.73966,-79.418732,714,Sports Club,15,7.3,22.0,,6,0.0,43.0,7.3,22.0
4,Don Mills North,43.745906,-79.352188,4c18e819d4d9c9284e19f029,LA Fitness,43.747665,-79.347077,455,Gym / Fitness Center,51,8.2,70.0,Lots of people like this place,20,1.0,18.0,8.2,70.0


In [304]:
def sort_venue_by_distance(df):
    return df.sort_values(by=['Venue Distance'],ascending=True)

In [305]:
ocga = sort_venue_by_distance(unique_venues.loc[unique_venues['Venue']== 'Planet Fitness North Scarborough'])
ocga.head()

NameError: name 'unique_venues' is not defined

###### Unique Venue Categories 

In [None]:
toronto_venues.shape

Sports Club


In [None]:
toronto_venues['Venue Category'].unique()

In [None]:
unique_venues.shape

In [None]:
toronto_gym = toronto_venues

In [None]:
# create map of Downtown Toronto using latitude and longitude values
map_toronto_gym = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, venue, neighborhood in zip(toronto_mvenues['Venue Latitude'], toronto_mvenues['Venue Longitude'], toronto_mvenues['Venue'],toronto_mvenues['Neighborhood']):
    label = '{}, {}'.format(venue,neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='green').add_to(map_toronto_gym)  

map_toronto_gym

In [None]:
# create map of Downtown Toronto using latitude and longitude values
map_toronto_res = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, venue, neighborhood in zip(toronto_mvenues['Venue Latitude'], toronto_mvenues['Venue Longitude'], toronto_mvenues['Venue'],toronto_mvenues['Neighborhood']):
    label = '{}, {}'.format(venue,neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color='orange').add_to(map_toronto_res)  
    
map_toronto_res

###### Extracted Venues from each Neighbourhood

In [None]:
toronto_venues.head()

###### Count of the Venues Receied for each Neighborhood. 
###### Note: As highlighted above, API returned less venues for few neighbourhoods as it could only find those within the given radius

In [None]:
pd.DataFrame(toronto_venues.groupby('Neighborhood').count()['Venue'])

###### Analyze based on Venue Category

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

###### Shape of the Dataframe after the onehot encoding

In [None]:
toronto_onehot.shape

###### Group the Neighbourhoods with the corresponding encoded rows to see the most 

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

###### Shape of the Grouped Dataframe

In [None]:
toronto_grouped.shape

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

###### Get & Print the Top 5 most common venues 

In [None]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(10)

###### Cluster Neighborhoods


In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_
toronto_grouped_clustering.head()

###### Add clustering labels

In [None]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

###### Create a map with clusted labels (produced in different colors)

In [None]:
toronto_merged = toronto_merged.dropna()
toronto_merged.head()

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)+7))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[np.int(cluster-1)]).add_to(map_clusters)
       
map_clusters

In [None]:
toronto_merged[]