# Segmenting and Clustering Neighborhoods in Toronto

## Introduction

This notebook contains the "Segmenting and Clustering Neighborhoods in Toronto" assignment, completed as part of the Applied Data Science Capstone.

In [1]:
from io import StringIO

import numpy as np
import pandas as pd

print('Numpy and pandas imported!')


!pip install beautifulsoup4
print('beautifulsourp4 installed')

from bs4 import BeautifulSoup
import requests


Numpy and pandas imported!
beautifulsourp4 installed


In [2]:
#fetch the Wikipedia page that lists the Canadian postal codes starting with "M"
response = requests.get(
    url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,  #without a timeout the call can block indefinitely on a stalled connection
)
#stop here on a 4xx/5xx reply instead of handing an error page to the parser below
response.raise_for_status()

In [3]:
#parse the downloaded response body with BeautifulSoup, using the 'html.parser' backend
page_html = response.content
soup = BeautifulSoup(page_html, features='html.parser')

In [4]:
#take the first <table> element of the parsed page and keep it in a local variable
table = soup.find('table')
if table is None:
    #fail with a clear message rather than passing the text 'None' on to pandas
    raise ValueError('No <table> element found in the downloaded page')
#create a dataframe out of the table; the HTML text is wrapped in StringIO because
#newer pandas releases deprecate passing a literal HTML string to read_html
df = pd.read_html(StringIO(str(table)))[0]

In [5]:
#Inspect the scraped table: its column labels, a one-line summary, then the first rows
print(df.columns)
borough_count = len(df['Borough'].unique())
#NOTE(review): this is the number of table rows, not a count of distinct neighbourhoods
row_count = df.shape[0]
print('The dataframew has {} boroughs and {} neighbiourhoods'.format(borough_count, row_count))
df.head()


Index(['Postal Code', 'Borough', 'Neighbourhood'], dtype='object')
The dataframew has 11 boroughs and 180 neighbiourhoods


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
#keep only the rows whose borough is assigned, then renumber the index from 0
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].reset_index(drop=True)

In [7]:
#Preview the first five rows; rows whose borough is 'Not assigned' have been filtered out by now
df.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
#More than one neighbourhood can exist in one postal code area: rows that share a postal code
#and borough are combined into a single row, with the neighbourhood names joined by ', '.
#The grouped result is assigned back to df; without the assignment the grouping has no effect.
#sort=False keeps the groups in their order of first appearance, and reset_index() (without
#drop=True) turns the 'Postal Code' / 'Borough' group keys back into ordinary columns.
df = (
    df.groupby(['Postal Code', 'Borough'], sort=False)['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [9]:
#a neighbourhood recorded as 'Not assigned' takes the name of its borough instead
mask = df['Neighbourhood'].eq("Not assigned")
df['Neighbourhood'] = df['Neighbourhood'].where(~mask, df['Borough'])

In [10]:
#display the dataframe as it stands after the 'Not assigned' neighbourhood replacement
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [11]:
#(number of rows, number of columns) of the cleaned dataframe
df.shape

(103, 3)

In [12]:
#download the CSV that supplies the latitude and longitude values
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'
lat_long = pd.read_csv(GEOSPATIAL_CSV_URL)


In [14]:
#attach the coordinate columns to the postal code rows, matching on 'Postal Code'
#(inner join, the pandas default: only postal codes present in both frames are kept)
df1 = df.merge(lat_long, how="inner", on="Postal Code")
df1

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
