# Segmenting and Clustering Neighborhoods in Toronto

Scrape the neigbourhood in Toronto from Wikipedia and wrangle them in dataframe


Import pandas, numpy and BeautifulSoup for scraping wikepedia

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

Declare functions for concatenating Neighbourhoods

In [2]:
def concat_neighbourhood(x):
    neigh = ""
    for i in range(len(x)-1):
        neigh = neigh + x.iloc[i] + ", "
    neigh += x.iloc[-1]
    return neigh

In [3]:
def sel_Borough(x):
    ref = x.iloc[0]
    for i in range(1, len(x)):
        if ref  != x.iloc[i]:
            for i in x:
                print(x)
            raise Exception("postcode has 2 Boroughs")
    return ref
    

Use BeautifulSoup and lxml to parse the Table from wikipedia

Declare an empty data frame to contain Postal Code, Borough and Neighbourhood

In [4]:
postal = pd.DataFrame(columns = ["PostCode", "Borough", "Neighbourhood"])

In [5]:
#pass the wikipedia URL
obj  = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [6]:
page = obj.text

In [7]:
soup = BeautifulSoup(page, "lxml")

Get the table and the rows from Wikipedia

In [8]:
postTable = soup.find('table', class_='wikitable sortable')
row = postTable.find('tr')
rowData = row.find_all('th')

Iterate until all the rows have been retrieved and append to the dataframe created

In [9]:
for row in postTable.find_all('tr')[1:]:
    temp = []
    for cell in row.find_all('td'):
        temp.append(cell.text)
        #print(cell.text)
    print(temp)
    postal = postal.append(
    dict(zip(["PostCode", "Borough", "Neighbourhood"], temp)), ignore_index=True)

['M1A', 'Not assigned', 'Not assigned\n']
['M2A', 'Not assigned', 'Not assigned\n']
['M3A', 'North York', 'Parkwoods\n']
['M4A', 'North York', 'Victoria Village\n']
['M5A', 'Downtown Toronto', 'Harbourfront\n']
['M5A', 'Downtown Toronto', 'Regent Park\n']
['M6A', 'North York', 'Lawrence Heights\n']
['M6A', 'North York', 'Lawrence Manor\n']
['M7A', "Queen's Park", 'Not assigned\n']
['M8A', 'Not assigned', 'Not assigned\n']
['M9A', 'Etobicoke', 'Islington Avenue\n']
['M1B', 'Scarborough', 'Rouge\n']
['M1B', 'Scarborough', 'Malvern\n']
['M2B', 'Not assigned', 'Not assigned\n']
['M3B', 'North York', 'Don Mills North\n']
['M4B', 'East York', 'Woodbine Gardens\n']
['M4B', 'East York', 'Parkview Hill\n']
['M5B', 'Downtown Toronto', 'Ryerson\n']
['M5B', 'Downtown Toronto', 'Garden District\n']
['M6B', 'North York', 'Glencairn\n']
['M7B', 'Not assigned', 'Not assigned\n']
['M8B', 'Not assigned', 'Not assigned\n']
['M9B', 'Etobicoke', 'Cloverdale\n']
['M9B', 'Etobicoke', 'Islington\n']
['M9B', '

Replace all newlines with spaces and if there is no Neighbourhood, assign the Borough as the Neigbourhood

In [10]:
postal["Neighbourhood"] = postal["Neighbourhood"].replace({'\n':' '}, regex=True)
postal = postal.reset_index()
postal = postal.drop(["index"], axis = 1)
postal.set_value(6,'Neighbourhood',"Queen's Park")
postal.head()


  after removing the cwd from sys.path.


Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Drop all the rows with Borough = 'Not assigned'

In [11]:
postal = postal[postal.Borough != "Not assigned"]
postal

Unnamed: 0,PostCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Queen's Park
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Combine all the Neigbourhoods with the same Postcode

In [12]:
postal_clean = postal.groupby(["PostCode"]).agg({"Borough": lambda x: sel_Borough(x),
                                 "Neighbourhood": lambda x: concat_neighbourhood(x)})
postal_clean.reset_index()

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge , Malvern"
1,M1C,Scarborough,"Highland Creek , Rouge Hill , Port Union"
2,M1E,Scarborough,"Guildwood , Morningside , West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park , Ionview , Kennedy Park"
7,M1L,Scarborough,"Clairlea , Golden Mile , Oakridge"
8,M1M,Scarborough,"Cliffcrest , Cliffside , Scarborough Village W..."
9,M1N,Scarborough,"Birch Cliff , Cliffside West"


In [13]:
#print the number of rows of the dataframe
postal_clean.shape

(103, 2)

In [14]:
postal_clean.to_csv("PostalDataToronto.csv")