# Coursera Capstone Project: Segmenting Neighborhoods

In this submission we are pulling neighborhood data from a table on wikipedia and several other sources to compare neighborhoods in Toronto.

In [340]:
import numpy as np #  to handle data
import pandas as pd #  for data analysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# import Matplotlib and modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import folium map library
!pip -q install folium
import folium

# import library to handle requests
import requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# import geopy to convert an address into latitude and longitude values
!pip -q install geopy
from geopy.geocoders import Nominatim

# import k-means from clustering stage
from sklearn.cluster import KMeans

print ("Libraries Imported")

Libraries Imported


## Import & Arrange Data

In [342]:

# Wikipedia page scrape
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#read the Wikipedia page - returns list of dataframes
dfs = pd.read_html(url, header=0)
#take the first dataframe from the returned list (it should be the only dataframe in the list)
df = dfs[0]
df.head(10)
print ("Table Imported")

Table Imported


#### Create new dataframe with records where Borough is not 'Not assigned'

In [343]:
df_assigned = df[df['Borough'] != 'Not assigned']
df_assigned.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Replace cells where the borough where neighborhood is 'Not assigned'

In [344]:

new_neigh = df_assigned['Neighborhood'].where(df_assigned['Neighborhood'] != 'Not assigned', other = df_assigned['Borough'], axis = 0)
#construct new dataframe using postcode and borough from the previous dataframe and neighborhood from the above list
df_replaced = pd.concat([df_assigned['Postal Code'], df_assigned['Borough'], new_neigh], axis = 1)
df_replaced.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Group the dataframe

In [345]:
toronto_neighborhoods = df_replaced.groupby(['Postal Code', 'Borough'])['Neighborhood'].apply(list).apply(lambda x: ', '.join(x)).to_frame()
toronto_neighborhoods.reset_index(inplace = True)
toronto_neighborhoods.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### Display the shape of the resulting dataframe

In [346]:
toronto_neighborhoods.shape

(103, 3)

#### Read the Canada public schools dataset

In [347]:
df = pd.read_excel('https://files.ontario.ca/opendata/publicly_funded_schools_xlsx_january_2019_en.xlsx')
df.head(3)

Unnamed: 0,Region,Board Number,Board Name,Board Type,Board Language,School Number,School Name,School Level,School Language,School Type,School Special Conditions Code,Suite,PO Box,Street,City,Province,Postal Code,Phone,Fax,Grade Range,Date Open,School Email,School Website,Board Website
0,Sudbury-North Bay Regional Office,B28010,Algoma DSB,Pub Dist Sch Brd (E/F),English,902344,Algoma Education Connection Secondary School,Secondary,English,Public,Alternative,,,550 Northern Ave,Sault Ste Marie,Ontario,P6B4J4,705-945-7194,705-945-7173,9-12,2010-09-07,,http://www.adsb.on.ca,www.adsb.on.ca
1,Sudbury-North Bay Regional Office,B28010,Algoma DSB,Pub Dist Sch Brd (E/F),English,19186,Anna McCrea Public School,Elementary,English,Public,Not applicable,,,250 Mark St,Sault Ste Marie,Ontario,P6A3M7,705-945-7106,705-945-7221,JK-8,1969-09-01,,http://www.adsb.on.ca/sites/schools/amc/defaul...,www.adsb.on.ca
2,Sudbury-North Bay Regional Office,B28010,Algoma DSB,Pub Dist Sch Brd (E/F),English,67679,Arthur Henderson Public School,Elementary,English,Public,Not applicable,,,2 Henderson Lane 2,Bruce Mines,Ontario,P0R1C0,705-785-3483,705-785-3220,JK-3,1969-09-01,,http://www.adsb.on.ca/sites/schools/art/defaul...,www.adsb.on.ca


#### Edit The Data

In [348]:
#keep postal code rows that are not null
df = df[df['Postal Code'].notna()]
#keep only rows where the postal code begins with M - these are Toronto postal codes
df = df[df['Postal Code'].str.startswith('M')]
#keep only first 3 characters of postal code
df['Postal Code'] = df['Postal Code'].str[:3]
#keep only public schools
df = df[df['School Type'] == 'Public']
df['School Level'] = df['School Level'].apply(lambda x: x + ' School')

#keep selected columns and store in dataframe called toronto_schools
toronto_schools = df[['School Level', 'School Name', 'Postal Code']]
toronto_schools.head()

Unnamed: 0,School Level,School Name,Postal Code
540,Elementary School,Collège français élémentaire,M5B
541,Secondary School,Collège français secondaire,M5B
542,Elementary School,École élémentaire Académie Alexandre-Dumas,M1E
546,Elementary School,École élémentaire Charles-Sauriol,M6N
551,Elementary School,École élémentaire Étienne-Brûlé,M2L


#### Read the csv file to find Lat and Long

In [349]:
coords = pd.read_csv('http://cocl.us/Geospatial_data')
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Join the Lat & Long to the neighborhood table

In [350]:
toronto_schools = toronto_schools.join(toronto_neighborhoods.set_index('Postal Code'), on = 'Postal Code')
toronto_schools.head()

Unnamed: 0,School Level,School Name,Postal Code,Borough,Neighborhood
540,Elementary School,Collège français élémentaire,M5B,Downtown Toronto,"Garden District, Ryerson"
541,Secondary School,Collège français secondaire,M5B,Downtown Toronto,"Garden District, Ryerson"
542,Elementary School,École élémentaire Académie Alexandre-Dumas,M1E,Scarborough,"Guildwood, Morningside, West Hill"
546,Elementary School,École élémentaire Charles-Sauriol,M6N,York,"Runnymede, The Junction North"
551,Elementary School,École élémentaire Étienne-Brûlé,M2L,North York,"York Mills, Silver Hills"


In [351]:
toronto_schools_count = toronto_schools.groupby('Postal Code').count().reset_index()
toronto_schools_count.head()

Unnamed: 0,Postal Code,School Level,School Name,Borough,Neighborhood
0,M1B,17,17,17,17
1,M1C,10,10,10,10
2,M1E,22,22,22,22
3,M1G,12,12,12,12
4,M1H,5,5,5,5


#### Join data from previous tables & add the percent occupied data points

In [352]:
toronto_neighborhoods = toronto_neighborhoods.join(toronto_population.set_index('Geographic code'), on = 'Postal Code')
#replace null values with 0
toronto_neighborhoods['Percent Occupied'].fillna(0, inplace=True)
toronto_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Percent Occupied
0,M1B,Scarborough,"Malvern, Rouge",0.306014
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",0.316454
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",0.365571
3,M1G,Scarborough,Woburn,0.328966
4,M1H,Scarborough,Cedarbrae,0.368494


In [353]:
toronto_neighborhoods = pd.merge(toronto_neighborhoods, coords, left_on = 'Postal Code', right_on = 'Postal Code')[['Postal Code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']]
toronto_neighborhoods.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Cluster the Neighborhoods
#### Join population data

In [354]:
toronto_neighborhoods = toronto_neighborhoods.join(toronto_population.set_index('Geographic code'), on = 'Postal Code')
#replace null values with 0
toronto_neighborhoods['Percent Occupied'].fillna(0, inplace=True)
toronto_neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Percent Occupied
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,0.306014
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,0.316454
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0.365571
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0.328966
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0.368494


In [355]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


#### Create map of Toronto using latitude and longitude values

In [356]:

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood, percent_occupied in zip(toronto_neighborhoods['Latitude'], toronto_neighborhoods['Longitude'], toronto_neighborhoods['Borough'], toronto_neighborhoods['Neighborhood'], toronto_neighborhoods['Percent Occupied']):
    label = '{}, {}'.format(neighborhood, percent_occupied)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [357]:
CLIENT_ID = 'HHOZ5R21N0A3ANASX1G5QWRS3HCEHTCWVNC2HO3M23RXFCUE' 
CLIENT_SECRET = 'X0O3ZDLT1CKKZLJ0ZFIE4RTMN2SUWPLD0Z130TSKYH134141'
VERSION = '20200619' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HHOZ5R21N0A3ANASX1G5QWRS3HCEHTCWVNC2HO3M23RXFCUE
CLIENT_SECRET:X0O3ZDLT1CKKZLJ0ZFIE4RTMN2SUWPLD0Z130TSKYH134141


In [358]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?client_id=HHOZ5R21N0A3ANASX1G5QWRS3HCEHTCWVNC2HO3M23RXFCUE&client_secret=X0O3ZDLT1CKKZLJ0ZFIE4RTMN2SUWPLD0Z130TSKYH134141&v=20200619&ll=43.65,-79.39&radius=500&limit=100'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?client_id=HHOZ5R21N0A3ANASX1G5QWRS3HCEHTCWVNC2HO3M23RXFCUE&client_secret=X0O3ZDLT1CKKZLJ0ZFIE4RTMN2SUWPLD0Z130TSKYH134141&v=20200619&ll=43.65,-79.39&radius=500&limit=100'