# Segmenting and Clustering Neighborhoods in Toronto

Data is scraped from the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

### Libraries Used

In [3]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scrape data from wikipedia

In [21]:
url =  "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_content = requests.get(url).text

In [22]:
soup = BeautifulSoup(html_content, 'xml')

In [23]:
table = soup.find('table')

In [24]:
# Create a dataframe with 3 columns: PostalCode, Borough and Neighborhood
col = ['Postalcode','Borough','Neighborhood']
df = pd.DataFrame(columns = col)

In [25]:
# Get all data in PostalCode, Borough and Neighborhood 
for tr in table.find_all('tr'):
    row = []
    for td in tr.find_all('td'):
        row.append(td.text.strip())
    if len(row)==3:
        df.loc[len(df)] = row

In [26]:
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Data Cleaning

In [27]:
# Remove row with a borough that is Not assigned
df = df[df.Borough != 'Not assigned'].reset_index(drop = True)

In [28]:
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [46]:
# Group data
temp = df.groupby('Postalcode')['Neighborhood'].apply(lambda x: ', '.join(x))
temp = pd.DataFrame(temp).reset_index()
temp = temp.rename(columns = {'Neighborhood' : 'Grouped_Neighborhood'})
temp.head()

Unnamed: 0,Postalcode,Grouped_Neighborhood
0,M1B,"Malvern, Rouge"
1,M1C,"Rouge Hill, Port Union, Highland Creek"
2,M1E,"Guildwood, Morningside, West Hill"
3,M1G,Woburn
4,M1H,Cedarbrae


In [52]:
merge = pd.merge(df, temp, on='Postalcode')
merge = merge.drop('Neighborhood', axis = 1)
merge = merge.drop_duplicates()
merge = merge.rename(columns={'Group_Neighborhood':'Neighborhood'})

In [53]:
merge.head()

Unnamed: 0,Postalcode,Borough,Grouped_Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [54]:
merge.shape

(103, 3)