# League of Toronto's Neighbourhoods in Sports Centers

###### Import libraries 

In [37]:
import json
import os
import time

import bs4
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests 
import seaborn as sns
from pandas.io.json import json_normalize
from scipy.spatial.distance import cdist
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
sns.set()

###### Get list of postal codes from Wiki (DS01)

In [38]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# A timeout is passed explicitly because requests otherwise waits indefinitely,
# which would hang the notebook on a stalled connection.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
if response.status_code == 200: 
    print ("Get Successful!")
else:
    print ("Get Failed! Check URL")

Get Successful!


###### Payload Received From Wiki

In [39]:
# print(response.text) # Commented out to avoid filling the notebook with the response text

###### Parse the response text using BeautifulSoup

In [40]:
# Build a parse tree from the downloaded HTML, then display the page's
# <title> element as a quick sanity check that the right page was parsed.
resp_soap = bs4.BeautifulSoup(markup=response.text, features='html.parser')
resp_soap.title

<title>List of postal codes of Canada: M - Wikipedia</title>

### Assumptions
 * Each Postal Code & Borough combination is expected to have the same value for all the neighbourhoods that share it.
 * **_check_\__for_\__discrepancy_** is used to warn about any discrepancies found while processing the rows. 

In [41]:
def check_for_discrepancy(pos_dict,row_list):
    """Warn when a row's borough disagrees with the one already stored.

    Parameters
    ----------
    pos_dict : dict
        Maps a postal code to a list whose first element is the borough.
    row_list : list
        Parsed row; element 0 is the postal code and element 1 the borough.

    Returns
    -------
    None
        The function only prints; nothing is modified.
    """
    postal_code = row_list[0]

    # Nothing to compare against until the postal code has been stored.
    if postal_code not in pos_dict:
        return

    stored_borough = pos_dict[postal_code][0]
    incoming_borough = row_list[1]
    if stored_borough != incoming_borough:
        print ("Warning! Existing Postal Codes and Borough combination is differeing from the new row")
        print (f"Existing Postal Code {postal_code} Borough {stored_borough}")
        print (f"New Postal Code {postal_code} Borough {incoming_borough}")

### Approach
* Very first table from the Response Text will be parsed to get all the Postal Codes
* **_get_\__list_\__of_\__postal_\__codes_** will parse all the rows.
* **_get_\__row_\__list_** will parse the columns.
* Postal Code Dictionary (_with postal code as a Key_) is maintained to store all the parsed postal codes. 
* If Borough i.e. Column #2 is "Not Assigned", it will skip adding the rows to the Postal Code Dictionary 
* If the Neighbourhood, i.e. Column #3, is "Not Assigned" but the Borough is valid, the Borough is used as the neighbourhood before the row is added to the Dictionary. 
* List is maintained to keep track of the Neighbourhoods that are having same postal code
* Another List is maintained to keep track of the rows that are having valid Borough but "Not assigned" Neighbourhood

In [42]:
def add_to_dictionary(pos_dict,row_list,more_nbh):
    """Store a parsed row in the postal-code dictionary.

    Parameters
    ----------
    pos_dict : dict
        Maps postal code -> ``[borough, neighbourhoods]``; updated in place.
    row_list : list
        ``[postal code, borough, neighbourhood]`` for the current row.
    more_nbh : list
        Receives the postal code each time an already-known code is seen
        again, i.e. the code covers more than one neighbourhood.
    """
    postal_code = row_list[0]

    if postal_code not in pos_dict:
        # First time this postal code is seen: create its entry.
        pos_dict[postal_code] = [row_list[1],row_list[2]]
        return

    # Known postal code: remember it and extend the comma-separated
    # neighbourhood string of the existing entry.
    more_nbh.append(postal_code)
    entry = pos_dict[postal_code]
    entry[1] = ", ".join([entry[1], row_list[2]])

In [43]:
def get_row_list(row):
    """Return the stripped text of every tag found directly inside ``row``.

    Children that are not ``bs4.element.Tag`` instances (NavigableString,
    mostly newline characters) are ignored.
    """
    return [
        cell.get_text(strip=True)
        for cell in row
        if isinstance(cell, bs4.element.Tag)
    ]

In [44]:
def get_list_of_postal_codes(soap_msg_table):
    """Parse the rows of a postal-code table into a dictionary.

    Parameters
    ----------
    soap_msg_table : iterable
        Children of the table body. Only ``bs4.element.Tag`` children are
        treated as rows; anything else (mostly newline strings) is skipped.

    Returns
    -------
    tuple
        ``(pos_dict, more_nbh, na_nbh)`` where

        * ``pos_dict`` maps postal code -> ``[borough, neighbourhoods]``,
        * ``more_nbh`` lists postal codes that appeared on more than one row,
        * ``na_nbh`` lists postal codes whose neighbourhood was
          "Not assigned" and was therefore replaced by the borough.

    Notes
    -----
    No row is treated as a header here: any row whose second cell is not
    "Not assigned" is stored, so a header row is stored too and has to be
    removed by the caller.
    """
    pos_dict = {}
    more_nbh = []
    na_nbh = []
    for row in soap_msg_table:

        # Avoid processing NavigableString - mostly newline characters
        if not isinstance(row, bs4.element.Tag):
            continue

        row_list = get_row_list(row)

        # A usable row needs postal code, borough and neighbourhood cells.
        # Skipping shorter rows avoids an IndexError on malformed input.
        if len(row_list) < 3:
            continue

        # Normalise once so "Not assigned", "not assigned" and
        # "NotAssigned" are all recognised.
        borough_key = row_list[1].replace(' ','').lower()
        neighbourhood_key = row_list[2].replace(' ','').lower()

        if borough_key != 'notassigned':
            if neighbourhood_key == 'notassigned':
                # Valid borough without a neighbourhood: use the borough name.
                na_nbh.append(row_list[0])
                row_list[2] = row_list[1]
            add_to_dictionary(pos_dict,row_list,more_nbh)

        # Check for discrepancy
        check_for_discrepancy(pos_dict,row_list)

    return pos_dict,more_nbh,na_nbh

###### Extracting & Parsing the very first table from the response text

In [45]:
# Take the first <table> of the parsed page and hand the children of its
# <tbody> to the row parser.
resp_table = resp_soap.find('table')
postal_code_list, more_nbh, na_nbh = get_list_of_postal_codes(resp_table.tbody)

###### Creating a DataFrame from the Parsed Dictionary. 
* _orient_ is used to parse the keys as Index
* Removing the first row as it is a header from the table
* Resetting the Index to get the DataFrame in an expected way. 
* Adding the expected column names to the DataFrame 

In [46]:
# orient='index' turns the dictionary keys (postal codes) into the index.
# The first row is dropped positionally, and reset_index() moves the postal
# codes into a regular column before the final column names are applied.
pos_df = (
    pd.DataFrame.from_dict(postal_code_list, orient='index')
    .iloc[1:]
    .reset_index()
)
pos_df.columns = ['PostalCode','Borough','Neighborhood']

###### Postal Code with more than one neighbourhood - e.g. M5A is used

In [47]:
# Select rows and column in a single .loc call instead of chaining two lookups.
pos_df.loc[pos_df['PostalCode'] == 'M5A', 'Neighborhood'].values

array(['Harbourfront, Regent Park'], dtype=object)

###### All Postal Codes that have more than one neighbourhood

In [48]:
# Rows whose postal code was seen on more than one table row.
pos_df[pos_df['PostalCode'].isin(set(more_nbh))]

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M1B,Scarborough,"Rouge, Malvern"
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"
11,M9B,Etobicoke,"Cloverdale, Islington, Martin Grove, Princess ..."
12,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
13,M3C,North York,"Flemingdon Park, Don Mills South"
17,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."
18,M1E,Scarborough,"Guildwood, Morningside, West Hill"


###### Postal Codes that were having "Not assigned" neighbourhood

In [49]:
# Rows whose neighbourhood was "Not assigned" and was replaced by the borough.
pos_df[pos_df['PostalCode'].isin(set(na_nbh))]

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M7A,Queen's Park,Queen's Park


###### Resultant DataFrame - Only first 10 rows are displayed

In [50]:
# Preview the first ten rows of the parsed postal-code table.
pos_df.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


###### Shape of the Resultant Dataframe

In [51]:
# (rows, columns) of the parsed postal-code table.
print(pos_df.shape)

(103, 3)


###### Extracting & Merging Geospatial Data (DS02)

**Note:** I don't seem to be receiving data that is consistent with the Geospatial Data provided in the Assignment Section. So, I will be using the data file provided in the Assignment Section.

In [52]:
!wget -q http://cocl.us/Geospatial_data

###### Read the data file into a DataFrame & Rename the columns to match the existing DataFrame

In [53]:
# header=0 consumes the file's own header row, and `names` replaces it with
# the column names used by the postal-code DataFrame.
geo_data = pd.read_csv(
    'Geospatial_data',
    header=0,
    names=['PostalCode','Latitude','Longitude'],
)

In [54]:
# Preview the first five rows of the geospatial data.
geo_data.head(n=5)

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


###### Make sure all the postal codes are having the corresponding Latitude & Longitude values from Geospatial_data

In [55]:
# Number of cells in the rows whose postal code is absent from geo_data;
# 0 means every postal code has coordinates.
pos_df[~pos_df['PostalCode'].isin(geo_data['PostalCode'])].size

0

###### Merge the DataFrames based on Postal Codes

In [56]:
# Default (inner) join of the postal-code table with its coordinates.
toronto_df = pd.merge(pos_df, geo_data, on='PostalCode')
toronto_df.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


###### Shape of the Merged Dataframe

In [57]:
# (rows, columns) of the merged table.
print(toronto_df.shape)

(103, 5)


In [58]:
# Count the distinct borough names present after the merge.
print (f"There are {len(toronto_df['Borough'].unique())} unique Borough's from the merged DataFrame")

There are 11 unique Borough's from the merged DataFrame


### Approach
* **Only Boroughs whose names contain the word "Toronto" will be analyzed below**. 
* https://nominatim.openstreetmap.org/ will be queried directly to get the Geocodes 
* The Foursquare API will be used to analyze the neighbourhoods. 

###### Get Geo codes for a specific location. 

In [59]:
def get_geo_codes(area):
    """Look up the latitude and longitude of ``area`` via Nominatim.

    Parameters
    ----------
    area : str
        Free-form place name, e.g. ``'Toronto ON'``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the first GeoJSON feature of
        the search result. Both values are empty strings when the request
        fails or the response holds no usable coordinates, so callers
        should check the result before using it.
    """
    longitude = ""
    latitude = ""

    try:
        # Let requests build the query string so that spaces and any other
        # special characters in ``area`` are encoded correctly. The timeout
        # keeps a stalled connection from hanging the notebook.
        response = requests.get(
            'https://nominatim.openstreetmap.org/search',
            params={'q': area, 'format': 'geojson', 'limit': 1},
            timeout=30,
        )
    except requests.RequestException as error:
        print (f"Get Failed! {error}")
        return latitude,longitude

    if response.status_code != 200:
        print ("Get Failed!")
        return latitude,longitude

    print ("Get Successful")

    try:
        # Extract the Latitude & Longitude from the response text.
        # GeoJSON stores coordinates as [longitude, latitude, ...].
        coordinates = json.loads(response.text)['features'][0]['geometry']['coordinates']
        longitude,latitude = coordinates[:2]
    except (ValueError, KeyError, IndexError, TypeError) as error:
        # Report the problem instead of silently swallowing it; the empty
        # strings still signal "not found" to the caller.
        print (f"Could not read coordinates for {area} from the response: {error!r}")
        return "",""

    print(f"latitude & longitude values of {area} are {latitude},{longitude}")
    return latitude,longitude

###### Extract Latitude & Longitude for Toronto,ON

In [60]:
# Coordinates used to centre the Toronto map.
latitude, longitude = get_geo_codes(area='Toronto ON')

Get Successful
latitude & longitude values of Toronto ON are 43.653963,-79.387207


#### Create a map of Toronto with neighborhoods superimposed on top.

In [61]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one green circle marker per postal-code row, labelled
# "<neighbourhood>, <borough>"
for lat, lng, borough, neighborhood in zip(
    toronto_df['Latitude'],
    toronto_df['Longitude'],
    toronto_df['Borough'],
    toronto_df['Neighborhood'],
):
    label = folium.Popup(f'{neighborhood}, {borough}', parse_html=True)
    folium.CircleMarker(
        location=[lat, lng],
        radius=4,
        popup=label,
        color='green',
    ).add_to(map_toronto)

map_toronto

###### Set the Credentials for Foursquare API  to get DS03

In [62]:
# Foursquare credentials are read from the environment so that real keys are
# never saved inside the notebook. The placeholder defaults keep the cell
# runnable when the variables are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'a') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'b') # your Foursquare Secret
VERSION = '20190531' # Foursquare API version

# Report only where each value came from: printing the values themselves
# would store them in the saved cell output.
print('Your credentails:')
print('CLIENT_ID: ' + ('<set from environment>' if 'FOURSQUARE_CLIENT_ID' in os.environ else '<placeholder>'))
print('CLIENT_SECRET:' + ('<set from environment>' if 'FOURSQUARE_CLIENT_SECRET' in os.environ else '<placeholder>'))

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


###### Get Nearby Venues from each Neighbourhood
* radius is set to 10000 to have enough datapoints for each neighbourhood
* LIMIT is set to 100 for all the neighbourhoods. However, the API may return fewer records for some of them

In [63]:
def getNearbyVenues(names, latitudes, longitudes, radius=10000):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel iterables giving the name and coordinates of every
        neighbourhood to query.
    radius : int, optional
        Value sent as the ``radius`` parameter of the request.

    Returns
    -------
    pandas.DataFrame
        One row per venue returned by the API, with the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue Id', 'Venue', 'Venue Latitude', 'Venue Longitude',
        'Venue Distance' and 'Venue Category'. The frame is empty, but
        still has these columns, when no venue was collected.

    Raises
    ------
    requests.HTTPError
        When the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.
    """
    LIMIT = 100

    # Athletics & Sports {4f4528bc4b90abdf24c9de85}, Pool {4bf58dd8d48988d15e941735}
    CATEGORY = '4f4528bc4b90abdf24c9de85,4bf58dd8d48988d15e941735'

    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue Id',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Distance',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests assemble and encode the query string; the timeout
        # keeps one stalled call from blocking the whole loop.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
                'categoryId': CATEGORY,
            },
            timeout=30,
        )
        # Fail with the HTTP error itself rather than a KeyError further
        # down when the body lacks the expected keys.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            location = venue['location']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['id'],
                venue['name'],
                location['lat'],
                location['lng'],
                location['distance'],
                # A venue with an empty category list gets None
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for an empty
    # result; assigning them afterwards raises on a frame without columns.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

###### Get Venues from each Neighbourhood

In [64]:
# Querying Foursquare for every neighbourhood is slow, so the result is cached
# on disk: the API is called only when the cached CSV is missing. The frame is
# always loaded from the CSV, so it looks the same on a cold and a warm run.
VENUES_CACHE_FILE = 'toronto_dataframe.csv'

if not os.path.exists(VENUES_CACHE_FILE):
    getNearbyVenues(names=toronto_df['Neighborhood'],
                    latitudes=toronto_df['Latitude'],
                    longitudes=toronto_df['Longitude']
                   ).to_csv(VENUES_CACHE_FILE, index=False, header=True)

toronto_venues = pd.read_csv(VENUES_CACHE_FILE,header=0)
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Id,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category
0,Parkwoods,43.753259,-79.329656,50a01c82e4b0a0263fcfdcb9,Monarch Park Stadium,43.678144,-79.324038,8373,Athletics & Sports
1,Parkwoods,43.753259,-79.329656,4ba5156cf964a520b6da38e3,Toronto Lawn Tennis Club,43.680667,-79.388559,9367,Athletics & Sports
2,Parkwoods,43.753259,-79.329656,4bb4ccb486a8d13a7ce1de7f,GoodLife Fitness North York Steeles and Woodbine,43.813541,-79.344064,6809,Gym
3,Parkwoods,43.753259,-79.329656,4b0eb057f964a520b75923e3,Toronto Cricket Skating and Curling Club,43.73966,-79.418732,7321,Sports Club
4,Parkwoods,43.753259,-79.329656,4c18e819d4d9c9284e19f029,LA Fitness,43.747665,-79.347077,1532,Gym / Fitness Center


In [65]:
def identify_the_closest_nh(df):
    """Return the row of ``df`` with the smallest 'Venue Distance'.

    The frame is sorted by 'Venue Distance' in ascending order and its
    first row is returned as a Series.
    """
    by_distance = df.sort_values(by='Venue Distance')
    nearest = by_distance.iloc[0]
    return nearest

In [66]:
def venues_eliminate_duplicates(df):
    """Keep one row per venue name: the one with the smallest distance.

    Parameters
    ----------
    df : pandas.DataFrame
        Venue table with at least the 'Venue' and 'Venue Distance' columns.

    Returns
    -------
    pandas.DataFrame
        One row for every distinct value of 'Venue', in order of first
        appearance. Each row keeps the index label it had in ``df``.

    Notes
    -----
    The rows are collected in a list and the frame is built once. This
    avoids ``DataFrame.append``, which was removed in pandas 2.0, and
    avoids re-copying the whole frame for every venue.
    """
    columns = ['Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue Id','Venue','Venue Latitude','Venue Longitude','Venue Distance','Venue Category']

    closest_rows = [
        identify_the_closest_nh(df.loc[df['Venue'] == venue])
        for venue in df['Venue'].unique()
    ]

    if closest_rows:
        unique_venues = pd.DataFrame(closest_rows)
        # Expected columns first, any additional input columns after them.
        extra_columns = [col for col in unique_venues.columns if col not in columns]
        unique_venues = unique_venues.reindex(columns=columns + extra_columns)
    else:
        unique_venues = pd.DataFrame(columns=columns)

    print (f"{unique_venues.shape[0]} Unique venues are identified for further analysis")
    return unique_venues

In [67]:
# Reduce the venue table to a single row per venue name.
toronto_unique_venues = venues_eliminate_duplicates(df=toronto_venues)

611 Unique venues are identified for further analysis


###### Change the data type of the Venue Distance feature

In [68]:
# Casting to float64 already yields a numeric column, so the extra
# pd.to_numeric wrapper is not needed.
toronto_unique_venues['Venue Distance'] = toronto_unique_venues['Venue Distance'].astype('float64')

In [70]:
def get_attr_value(resp_dict,attribute):
    """Return ``resp_dict[attribute]``, or ``''`` when the key is absent."""
    return resp_dict[attribute] if attribute in resp_dict else ''

In [71]:
def has_attr(resp_dict,attribute):
    """Tell whether ``attribute`` is present in ``resp_dict``."""
    is_present = attribute in resp_dict
    return is_present

In [69]:
def get_reasons(curr_resp_dict,reasons):
    
    curr_resp_dict['reasons_count'] = get_attr_value(reasons,'count')
    
    items = ()
    index = 1
    for item in reasons['items']:
        curr_resp_dict[f'reasons_item_summary_{index}'] = get_attr_value(item,'summary')
        break