# Segmenting and Clustering Neighborhoods in Toronto - Part 1

In [35]:
import pandas as pd
import numpy as np
import json
import urllib.request
from bs4 import BeautifulSoup

# Use BeautifulSoap library to scrap the webpage and load the 

In [36]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [37]:
page = urllib.request.urlopen(url)

In [38]:
soup = BeautifulSoup(page, "lxml")

Saving the table lxml result in a variable.This has HTML like format

In [39]:
table=soup.find('table', class_='wikitable sortable')
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park / Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor / Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park / Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern / Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3B
</td>
<td>North York
</td>
<td>Don Mills
</td></tr>
<tr>
<td>M4B
</td>
<td>Ea

Looping through the table object and looking for table rows (tr) and cells (td).
The result of each cell is saved in temporary array

In [40]:
PostalCode=[]
Borough=[]
Neighbourhood=[]

for row in table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        PostalCode.append(cells[0].find(text=True))
        Borough.append(cells[1].find(text=True))
        Neighbourhood.append(cells[2].find(text=True))

# Create Dataframe

In [41]:
df=pd.DataFrame(PostalCode,columns=['PostalCode'])
df['Borough']=Borough
df['Neighbourhood']=Neighbourhood
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


# Data Cleaning steps

Sorting values in dataframe and checking shape size

In [42]:
df = df.sort_values(['PostalCode','Borough','Neighbourhood']).reset_index()
df.shape

(180, 4)

Remove newline special character - \n

In [43]:
df['PostalCode'].replace(r'\s+|\\n', ' ', regex=True, inplace=True)
df['Borough'].replace(r'\s+|\\n', ' ', regex=True, inplace=True)
df['Neighbourhood'].replace(r'\s+|\\n', ' ', regex=True, inplace=True)

#df['PostalCode'] = df['PostalCode'].astype(str)
#df['Borough'] = df['Borough'].astype(str)
#df['Neighbourhood'] = df['Neighbourhood'].astype(str)

df.head()

Unnamed: 0,index,PostalCode,Borough,Neighbourhood
0,0,M1A,Not assigned,
1,9,M1B,Scarborough,Malvern / Rouge
2,18,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
3,27,M1E,Scarborough,Guildwood / Morningside / West Hill
4,36,M1G,Scarborough,Woburn


Check how many 'Not assigned' boroughs are there

In [44]:
print(df['Borough'].value_counts())

Not assigned         77
North York           24
Downtown Toronto     19
Scarborough          17
Etobicoke            12
Central Toronto       9
West Toronto          6
East Toronto          5
East York             5
York                  5
Mississauga           1
Name: Borough, dtype: int64


Replace 'Not assigned' Borough by changing then to NaN and using dropna function

In [45]:
df = df.replace('Not assigned',np.nan, regex=True)
df.dropna(subset=["Borough"], axis=0, inplace=True)
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,index,PostalCode,Borough,Neighbourhood
0,9,M1B,Scarborough,Malvern / Rouge
1,18,M1C,Scarborough,Rouge Hill / Port Union / Highland Creek
2,27,M1E,Scarborough,Guildwood / Morningside / West Hill
3,36,M1G,Scarborough,Woburn
4,45,M1H,Scarborough,Cedarbrae


In [46]:
df.shape

(103, 4)

The assignment is asking to combine Neighbourhood names in the same PostalCode and Borough with comma value. However, the data is already grouping the Neighbourhoods with forward slash, therefore I have replaced the forward slash with comma.

In [47]:
df = df.groupby(['PostalCode','Borough'])['Neighbourhood'].apply(','.join).reset_index()
df.shape

(103, 3)

In [48]:
df['Neighbourhood'].replace(r' \/ ', ', ', regex=True, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Final result

In [51]:
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [50]:
df.shape

(103, 3)