## Segmenting and Clustering Neighborhoods in Toronto
### Scraping data from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M to a structured pandas dataframe

Import Libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

Scrape the Canadian postal codes from the Wikipedia page using BeautifulSoup

In [2]:
# Download the Wikipedia page and parse it into a BeautifulSoup tree.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were
# the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text
Canada_zip = BeautifulSoup(source, 'lxml')

Build the DataFrame from the postal-code table HTML

In [8]:
# Build one row per table line, then collapse to one row per postal code.
# The column labels must match the keys used for each record and the
# 'Postalcode' spelling that the later cells rely on.
column_names = ['Postalcode', 'Borough', 'Neighborhood']

# The postal-code table is the first <table> inside the article body.
content = Canada_zip.find('div', class_='mw-parser-output')
if content is None or content.table is None:
    raise ValueError('Postal code table not found in the scraped page')
# Fall back to the <table> itself when the markup has no explicit <tbody>.
table = content.table.tbody or content.table

# Collect the rows first and build the frame once: growing a DataFrame row by
# row is quadratic, and DataFrame.append was removed in pandas 2.0.
records = []
for tr in table.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    if len(cells) < 3:
        # Rows without three data cells (e.g. a header row made of <th>)
        # carry no postal-code record, so skip them instead of emitting
        # placeholder values.
        continue
    postcode, borough, neighborhood = cells[:3]
    records.append({
        'Postalcode': postcode,
        'Borough': borough,
        'Neighborhood': neighborhood.replace(']', ''),
    })

toronto = pd.DataFrame(records, columns=column_names)

# Drop postal codes that have no borough assigned.
toronto = toronto[toronto['Borough'].str.lower() != 'not assigned'].reset_index(drop=True)

# A neighborhood marked as not assigned inherits the borough name. Use a
# single .loc assignment: chained indexing such as toronto.iloc[i][2] = ...
# writes to a temporary copy and leaves the frame unchanged.
unassigned = toronto['Neighborhood'].str.lower() == 'not assigned'
toronto.loc[unassigned, 'Neighborhood'] = toronto.loc[unassigned, 'Borough']

# One row per (Postalcode, Borough), neighborhoods joined with ', '.
df = toronto.groupby(['Postalcode', 'Borough'])['Neighborhood'].apply(', '.join).reset_index()
df.head()
                                                    

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Now let's clean some of the data

In [9]:
# Discard rows with missing values, then any row in which one of the three
# columns still holds the 'Unassigned' placeholder.
empty = 'Unassigned'
df = df.dropna()
is_placeholder = df.Postalcode.eq(empty) | df.Borough.eq(empty) | df.Neighborhood.eq(empty)
df = df[~is_placeholder]
df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [13]:
def neighborhood_list(grouped):
    """Return the group's 'Neighborhood' values, sorted and joined with ', '."""
    names = grouped['Neighborhood'].tolist()
    names.sort()
    return ', '.join(names)
                    
# Regroup by postal code and borough, then reduce each group to a single
# sorted, comma-separated 'Neighborhood' string.
grp = df.groupby(by=['Postalcode', 'Borough'])
df2 = (
    grp
    .apply(neighborhood_list)
    .reset_index(name='Neighborhood')
)

In [None]:
# Preview the regrouped frame. A bare `print` only evaluates to the builtin
# function object and shows nothing about the data.
df2.head()