# Segmenting and Clustering Neighborhoods in Toronto

### Preprocessing

In [None]:
!conda install -c conda-forge beautifulsoup4
!conda install -c conda-forge lxml

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

Scrape the postal code table from Wikipedia using BeautifulSoup and split its text into a list of lines

In [2]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(WIKI_URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()
raw_html = response.text

soup = BeautifulSoup(raw_html, 'lxml')
# `soup.tbody` is the first <tbody> element in the document; its text is
# split into one list entry per line (line terminators are not kept).
raw_text = soup.tbody.text
raw_text = raw_text.splitlines(keepends=False)


Drop all the empty strings from the list

In [11]:
# Filter out the empty strings in a single pass. Calling list.remove() while
# looping over the same list shifts the remaining items under the iterator,
# so entries get skipped and one run can leave empty strings behind.
# Slice assignment keeps `raw_text` the same list object.
raw_text[:] = [line for line in raw_text if line != '']

print(len(raw_text))

870


Create a new dataframe and fit the data into it

In [12]:

column_names = ['PostalCode', 'Borough', 'Neighborhood']

# `raw_text` is a flat list of table cells, so every `n` consecutive entries
# form one table row. A trailing incomplete group, if any, is ignored.
n = len(column_names)
# Collect all rows first and build the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it each time.
rows = [raw_text[i * n:(i + 1) * n] for i in range(len(raw_text) // n)]

# The first row still holds the table's own header text; the cleaning cell
# below drops it by its index label 0.
neigh = pd.DataFrame(rows, columns=column_names)
neigh.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


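Clean the dataframe: drop the header row and the rows without an assigned borough, and use the borough name where the neighborhood is not assigned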

In [13]:
# NOTE: row 0 is dropped by index label, so this cell must run exactly once
# after the frame is built; re-running it would drop a real data row.
neigh = (
    neigh
    .drop(index=0)                                        # header row scraped from the table
    .replace(to_replace='Not assigned', value=np.nan)     # mark unassigned cells as missing
    .dropna(axis=0, subset=['Borough'])                   # discard rows with no borough
    .reset_index(drop=True)
)
# Where only the neighborhood is missing, fall back to the borough name.
# Assigning the result back (rather than calling fillna(inplace=True) on a
# column selection) still updates the frame when pandas copy-on-write is active.
neigh['Neighborhood'] = neigh['Neighborhood'].fillna(value=neigh['Borough'])

Combine the rows that share the same postal code

In [14]:
# Merge all rows that share a postal code into a single row. Neighborhood
# names are joined with commas, latest row first, and the borough of the
# last row is kept. Grouping (rather than comparing each row with the one
# directly above it) also merges duplicates that are not adjacent, and it
# gives the same result if the cell is run again.
neigh = neigh.groupby('PostalCode', sort=False, as_index=False).agg({
    'Borough': 'last',
    'Neighborhood': lambda names: ','.join(names.astype(str).iloc[::-1]),
})

In [15]:
# Keep only the last row of every postal code, then renumber the rows from 0.
is_superseded = neigh.duplicated(subset='PostalCode', keep='last')
neigh = neigh.loc[~is_superseded].reset_index(drop=True)

Results

In [16]:
# Preview the first rows of the cleaned, de-duplicated frame.
neigh.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Queen's Park,Queen's Park


In [17]:
# (number of rows, number of columns) of the final frame.
neigh.shape

(103, 3)