# Coursera Capstone Project - Segmenting and Clustering Neighborhoods in Toronto


In [2]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes
from bs4 import BeautifulSoup
import requests
import re

Extract Data from the Wikipedia page.

In [3]:
# Site URL
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Make a GET request to fetch the raw HTML content.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error status instead of parsing an error page
# as if it were the article.
response.raise_for_status()
html_content = response.text

# Parse HTML code for the entire page
soup = BeautifulSoup(html_content, "lxml")

Extract table headings from the dataset.
- iterate through the header row's HTML and build a list of clean headings

In [4]:
# Every <table> element on the page that carries the "wikitable" class;
# the first match is the one used below.
data = soup.find_all("table", attrs={"class": "wikitable"})
toronto = data[0]

# All <tr> rows of that table
table = toronto.find_all("tr")

# The first row is taken as the header row, the remaining rows as the contents
column_headers, table_rows = table[0], table[1:]

# Text of each <th> cell with trailing "\n" characters stripped
headings = [th.text.rstrip("\n") for th in column_headers.find_all("th")]
print(headings)


['Postal Code', 'Borough', 'Neighbourhood']


Extract content from the webpage table
- loop through all row entries
- row_item.text removes the tags from the entries
- the regex below removes \xa0, \n and commas from row_item.text
- \xa0 is a non-breaking space, \n is the newline, and the commas separate multiple neighbourhood names within a cell (so after cleaning, the names in a cell are separated only by spaces)

In [11]:
# Walk the remaining table rows and collect the cleaned text of every <td> cell.

# Characters stripped from each cell: "\xa0", "\n" and ","
cell_cleaner = re.compile("(\xa0)|(\n)|,")

# One inner list per table row, holding that row's cleaned cell strings
all_rows = [
    [cell_cleaner.sub("", row_item.text) for row_item in table_row.find_all("td")]
    for table_row in table_rows
]

Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [18]:
# Build the frame from the scraped rows, one column per extracted heading
df = pd.DataFrame(all_rows, columns=headings)

# Keep only the rows whose borough is assigned; df keeps its original row labels
borough_unassigned = df['Borough'] == 'Not assigned'
df = df.loc[~borough_unassigned].copy()

# Same rows as df, renumbered from 0
toronto = df.reset_index(drop=True)
toronto.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park Harbourfront
3,M6A,North York,Lawrence Manor Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue Humber Valley Village
6,M1B,Scarborough,Malvern Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill Woodbine Gardens
9,M5B,Downtown Toronto,Garden District Ryerson


If a cell has a borough but a 'Not assigned' neighbourhood, then the neighbourhood should be set to the borough name.
- The check below finds no rows that fit this criterion, so no replacement is needed

In [13]:
# Rows whose neighbourhood is still the literal placeholder 'Not assigned'
neighbourhood_unassigned = df['Neighbourhood'] == 'Not assigned'
not_assigned = df[neighbourhood_unassigned]
not_assigned

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [14]:
# Show the (row count, column count) of df
df.shape

(103, 3)

In [10]:
# Read the coordinates CSV (path is relative to the working directory) into a frame
postal_code = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
postal_code

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


Merge the postal data with the geospatial coordinate data

In [17]:
# Attach the coordinate columns to each postal-code row.
# The join key and join type are spelled out so the merge cannot silently
# start joining on a different set of columns if either frame gains another
# shared column name; an inner join keeps only postal codes present in both frames.
location_data = pd.merge(toronto, postal_code, on='Postal Code', how='inner')
location_data.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue Humber Valley Village,43.667856,-79.532242
6,M1B,Scarborough,Malvern Rouge,43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,Parkview Hill Woodbine Gardens,43.706397,-79.309937
9,M5B,Downtown Toronto,Garden District Ryerson,43.657162,-79.378937
