# Segmenting and Clustering Neighborhoods in Toronto

In [20]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [21]:
# Scrape the Toronto postal-code table from Wikipedia into a raw dataframe.
# In the output shown below, the header text (Postcode, Borough, Neighbourhood)
# arrives as data row 0 and the columns carry the integer labels 0-2.

URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
Rq = requests.get(URL, timeout=30)
Rq.raise_for_status()  # stop on an HTTP error instead of parsing an error page
BS = BeautifulSoup(Rq.content, 'lxml')
table = BS.find_all('table')[0]  # first <table> element on the page
# Wrap the markup in StringIO: passing a literal HTML string to read_html
# is deprecated in pandas >= 2.1.
df = pd.read_html(StringIO(str(table)))[0]
# Work on an independent copy so later edits to df_pcode never touch df.
df_pcode = df.copy()

df_pcode.head()

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [22]:
# Drop the first row (it holds the header text) and rename the columns.
# The result is built from `df` instead of dropping in place, so re-running
# this cell does not raise KeyError on the already-removed label 0.

df_pcode = df.drop(index=0)
df_pcode.columns = ['Postcode', 'Borough', 'Neighborhood']
df_pcode.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [25]:
# Keep only the rows whose borough is assigned.
# na=True makes a missing borough count as unassigned, so such rows are
# dropped as well.

df_pcode1 = df_pcode[~df_pcode['Borough'].str.contains('Not assigned', na=True)]

# Renumber the rows from 0; drop=True discards the old index instead of
# inserting it as an extra column that then has to be removed.

df_pcode3 = df_pcode1.reset_index(drop=True)
df_pcode3.head()


Unnamed: 0,Postcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [26]:
# Merge rows that share a Postcode and Borough into a single row whose
# Neighborhood is the comma-separated list of the merged neighborhoods.

group_keys = ['Postcode', 'Borough']
neighborhoods_by_code = df_pcode3.groupby(group_keys)['Neighborhood'].agg(', '.join)
df_pcode4 = neighborhoods_by_code.reset_index()
df_pcode4.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [27]:
# If a row has a borough but a 'Not assigned' neighborhood, use the borough
# name as the neighborhood.
# A boolean mask with .loc covers every row whatever the frame's length
# (no hard-coded row count) and writes to df_pcode4 itself rather than
# through chained indexing, which can silently assign to a temporary copy.

unassigned = df_pcode4['Neighborhood'] == 'Not assigned'
df_pcode4.loc[unassigned, 'Neighborhood'] = df_pcode4.loc[unassigned, 'Borough']

df_pcode4.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [28]:
# Size of the cleaned dataframe as a (rows, columns) tuple
df_pcode4.shape

(103, 3)