# Segmenting and Clustering Neighborhoods in Toronto

## Introduction

This notebook contains "Segmenting and Clustering Neighborhoods in Toronto" assignment as a part of Applied Data Science Capstone

In [41]:
import numpy as np
import pandas as pd
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

print('Numpy and pandas imported!')
!pip install geopy
!pip install folium
!pip install beautifulsoup4
print('beautifulsourp4 installed')

from bs4 import BeautifulSoup
import requests

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

Current Time = 07:29:41
Numpy and pandas imported!
beautifulsourp4 installed
Current Time = 07:29:47


In [42]:
#fetching the content from the url
response = requests.get(
    url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)

In [43]:
#using Beautifulsoap to read the response content 
soup = BeautifulSoup(response.content, 'html.parser')

In [44]:
#taking out the table content and storing it in local variable
table = soup.table
#create a dataframe out of table
df = pd.read_html(str(table))[0]

In [45]:
#The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
print(df.columns)
print('The dataframew has {} boroughs and {} neighbiourhoods'.format(len(df['Borough'].unique()),df.shape[0]))
df

Index(['Postal Code', 'Borough', 'Neighbourhood'], dtype='object')
The dataframew has 11 boroughs and 180 neighbiourhoods


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [46]:
df = df[df.Borough != 'Not assigned'].reset_index(drop=True)

In [47]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


More than one neighborhood can exist in one postal code area, two rows will be combined into one row with the neighborhoods separated with a comma
Stroing default data in toronto.

In [48]:
df.groupby(['Postal Code','Borough'])['Neighbourhood'].apply(', '.join).reset_index(drop=True)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


Replace neighbourhood with values 'Not Assigned' with value from borough column

In [49]:
mask = df['Neighbourhood'] == "Not assigned"
df.loc[mask, 'Neighbourhood'] = df.loc[mask, 'Borough']

In [52]:
df.head

<bound method NDFrame.head of     Postal Code           Borough  \
0           M3A        North York   
1           M4A        North York   
2           M5A  Downtown Toronto   
3           M6A        North York   
4           M7A  Downtown Toronto   
..          ...               ...   
98          M8X         Etobicoke   
99          M4Y  Downtown Toronto   
100         M7Y      East Toronto   
101         M8Y         Etobicoke   
102         M8Z         Etobicoke   

                                         Neighbourhood  
0                                            Parkwoods  
1                                     Victoria Village  
2                            Regent Park, Harbourfront  
3                     Lawrence Manor, Lawrence Heights  
4          Queen's Park, Ontario Provincial Government  
..                                                 ...  
98       The Kingsway, Montgomery Road, Old Mill North  
99                                Church and Wellesley  
100  Busines

In [53]:
df.shape

(103, 3)

Consuming CSV for latitude and longitude and merging it to the existing data frame

In [54]:
lat_long = pd.read_csv('http://cocl.us/Geospatial_data')

In [55]:
df1 = pd.merge(df, lat_long, on="Postal Code")

creating df with borough containing Toronto in it

In [78]:
toronto_data = df1[df1['Borough'].str.contains('Toronto',regex=False)].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [57]:
print('The dataframe has {} boroughs and {} Neighbourhood.'.format(
        len(toronto_data['Borough'].unique()),
        toronto_data.shape[0]
    )
)

The dataframe has 4 boroughs and 39 Neighbourhood.



Installing and Using geopy library to get the latitude and longitude values of Toronto
Installing folium for Visualisation 

In [58]:
from geopy.geocoders import Nominatim
print('Geopy installed and Nominatim imported')

Geopy installed and Nominatim imported


In [59]:
city = 'Toronto'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(city)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [60]:
import folium
print('Folium imported!')

Folium imported!


In [61]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

In [62]:
for latitude, longitude, borough, neighbourhood in zip(toronto_data.Latitude, toronto_data.Longitude, toronto_data.Borough, toronto_data.Neighbourhood):
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup= '{}, {}'.format(neighbourhood, borough),
        color='blue',
        fill=False,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
map_toronto

## Lets explore neighbourhood 

In [80]:
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


Lets split the neighbourhood and rename latitude and longitude according to postal codes

In [144]:
toronto_data1 = toronto_data.assign(Neighbourhood=df['Neighbourhood'].str.split(',')).explode('Neighbourhood').reset_index(drop=True)

In [145]:
toronto_data1

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Parkwoods,43.654260,-79.360636
1,M7A,Downtown Toronto,Victoria Village,43.662301,-79.389494
2,M5B,Downtown Toronto,Regent Park,43.657162,-79.378937
3,M5B,Downtown Toronto,Harbourfront,43.657162,-79.378937
4,M5C,Downtown Toronto,Lawrence Manor,43.651494,-79.375418
...,...,...,...,...,...
66,M4Y,Downtown Toronto,Little Portugal,43.665860,-79.383160
67,M4Y,Downtown Toronto,Trinity,43.665860,-79.383160
68,M7Y,East Toronto,Kennedy Park,43.662744,-79.321558
69,M7Y,East Toronto,Ionview,43.662744,-79.321558


Now we have the data frame with 71 Neighbourhoods.<br>
Lets add locators to them. To add latlang to each neighbourhood, I am using ArcGIS service <br> https://geocoder.readthedocs.io/providers/ArcGIS.html#geocoding

In [139]:
!pip install geocoder
import geocoder

print('Geocode installed and imported')

Geocode installed and imported


### Creating a new data frame with borough, neighbourhood and its latitude and longitude

In [172]:
toronto_n = pd.DataFrame(columns=['Borough','Neighbourhood','Latitude', 'Longitude'])
for borough, neighbourhood in zip(toronto_data1['Borough'], toronto_data1['Neighbourhood']):
    g = geocoder.arcgis('{}, {}'.format(neighbourhood,borough))
    toronto_n = toronto_n.append({'Borough': borough,
                                          'Neighbourhood': neighbourhood,
                                          'Latitude': g.latlng[0],
                                          'Longitude': g.latlng[1]}, ignore_index=True)
toronto_n


# geolocator = Nominatim(user_agent="to_explorer")
# location = geolocator.geocode(city)
# latitude = location.latitude
# longitude = location.longitude

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Downtown Toronto,Parkwoods,43.650110,-79.382900
1,Downtown Toronto,Victoria Village,43.731540,-79.314280
2,Downtown Toronto,Regent Park,43.659470,-79.355810
3,Downtown Toronto,Harbourfront,43.650110,-79.382900
4,Downtown Toronto,Lawrence Manor,43.722940,-79.431160
...,...,...,...,...
66,Downtown Toronto,Little Portugal,43.647550,-79.429050
67,Downtown Toronto,Trinity,43.650110,-79.382900
68,East Toronto,Kennedy Park,43.652812,-79.473314
69,East Toronto,Ionview,43.735844,-79.273020


In [173]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_n['Borough'].unique()),
        toronto_n.shape[0]
    )
)

The dataframe has 4 boroughs and 71 neighborhoods.


In [174]:
toronto_coordinates = geocoder.arcgis('Toronto')

#### Lets create toronto map and mark all the neighbourhoods


In [175]:
map_toronto = folium.Map(location=[toronto_coordinates.latlng[0], toronto_coordinates.latlng[1]], zoom_start=11)

# add markers to map
for lat, lng, nei, bor in zip(toronto_n['Latitude'], toronto_n['Longitude'], toronto_n['Neighbourhood'], toronto_n['Borough']):
    label = folium.Popup('{}, {}'.format(nei,bor), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto


### Define Foursquare Credentials and Version

In [176]:
# The code was removed by Watson Studio for sharing.