## Segmenting and clustering neighbourhoods in Toronto

### Step 1: retrieve data

* Use `requests` to retrieve the page
* Create a "soup" using `BeautifulSoup`

In [1]:
import requests

# Wikipedia's list of Canadian postal codes beginning with "M"; this page is the
# raw data source for the rest of the notebook.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of handing an error
# page to the parser in the next step.
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
page = response.text

In [2]:
from bs4 import BeautifulSoup

# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=page, features='html.parser')

### Step 2: parsing data

* Find each row and each cell
* If borough is not assigned, do not append to table
* If neighbourhood is not assigned, use borough's name
* Join rows with identical postal code and borough, separating the neighbourhoods with ', '

In [3]:
import pandas as pd

# The postal-code listing is the table carrying the "wikitable" class.
table = soup.find('table', {'class': 'wikitable'})

# Collect the rows in a plain list and build the DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame on every
# iteration, and DataFrame.append no longer exists in pandas 2.x.
records = []
for row in table.find_all('tr'):
    cells = row.find_all('td')
    if len(cells) < 3:
        # Rows without three data cells (e.g. a header row) carry no record.
        continue
    # Strip surrounding whitespace from every cell so that a trailing newline
    # cannot defeat the 'Not assigned' comparison below.
    pc, bo, nb = (cell.text.strip() for cell in cells[:3])
    if bo != 'Not assigned':
        records.append({'PostalCode': pc, 'Borough': bo, 'Neighbourhood': nb})

# Passing the columns explicitly keeps the expected schema even if no row matched.
df = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighbourhood'])

# Remove any newline left inside a neighbourhood name.
df['Neighbourhood'] = df['Neighbourhood'].str.replace('\n', '')
df.shape

(211, 3)

In [4]:
# Rows whose neighbourhood is 'Not assigned' take the borough's name instead.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']

# Collapse rows that share a postal code and borough into a single row whose
# neighbourhoods are joined into one comma-separated string.
by_code = df.groupby(['PostalCode', 'Borough'])['Neighbourhood']
df = by_code.apply(', '.join).reset_index()
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [5]:
# Size of the table after rows sharing a postal code and borough were merged.
df.shape

(103, 3)

In [16]:
# Imported in this cell so the spot check runs on a fresh kernel instead of
# relying on a later cell having been executed first.
import geocoder

# Spot-check the ArcGIS geocoder on a single postal code.
geocoder.arcgis('M5A, Toronto, Ontario').latlng

[43.65512000000007, -79.36263979699999]

In [17]:
import time

import geocoder

# Retry policy for the geocoding service. An unbounded retry loop would hang
# the notebook forever if the service kept failing for one postal code.
MAX_ATTEMPTS = 10     # lookups per postal code before giving up
RETRY_DELAY_S = 1.0   # pause between attempts so the service is not hammered

latlist = []
lonlist = []

for pc in df['PostalCode']:
    print(pc)
    latlon = None
    for _ in range(MAX_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(pc))
        latlon = g.latlng
        if latlon:
            # A missing or empty result counts as a failed lookup; anything
            # else is taken to be the [latitude, longitude] pair.
            break
        time.sleep(RETRY_DELAY_S)
    else:
        # Every attempt failed: stop with a clear message rather than looping.
        raise RuntimeError(
            'Could not geocode {} after {} attempts'.format(pc, MAX_ATTEMPTS)
        )
    latlist.append(latlon[0])
    lonlist.append(latlon[1])

# Attach the coordinates; the lists are in the same order as the rows of df.
df['lat'] = latlist
df['lon'] = lonlist
df.head()

M1B
M1C
M1E
M1G
M1H
M1J
M1K
M1L
M1M
M1N
M1P
M1R
M1S
M1T
M1V
M1W
M1X
M2H
M2J
M2K
M2L
M2M
M2N
M2P
M2R
M3A
M3B
M3C
M3H
M3J
M3K
M3L
M3M
M3N
M4A
M4B
M4C
M4E
M4G
M4H
M4J
M4K
M4L
M4M
M4N
M4P
M4R
M4S
M4T
M4V
M4W
M4X
M4Y
M5A
M5B
M5C
M5E
M5G
M5H
M5J
M5K
M5L
M5M
M5N
M5P
M5R
M5S
M5T
M5V
M5W
M5X
M6A
M6B
M6C
M6E
M6G
M6H
M6J
M6K
M6L
M6M
M6N
M6P
M6R
M6S
M7A
M7R
M7Y
M8V
M8W
M8X
M8Y
M8Z
M9A
M9B
M9C
M9L
M9M
M9N
M9P
M9R
M9V
M9W


Unnamed: 0,PostalCode,Borough,Neighbourhood,lat,lon
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.78573,-79.15875
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175256
3,M1G,Scarborough,Woburn,43.768359,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


In [18]:
# Temporarily lift pandas' row and column display limits so the whole table is
# printed; the limits are restored when the block exits.
show_everything = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*show_everything):
    print(df)

    PostalCode           Borough  \
0          M1B       Scarborough   
1          M1C       Scarborough   
2          M1E       Scarborough   
3          M1G       Scarborough   
4          M1H       Scarborough   
5          M1J       Scarborough   
6          M1K       Scarborough   
7          M1L       Scarborough   
8          M1M       Scarborough   
9          M1N       Scarborough   
10         M1P       Scarborough   
11         M1R       Scarborough   
12         M1S       Scarborough   
13         M1T       Scarborough   
14         M1V       Scarborough   
15         M1W       Scarborough   
16         M1X       Scarborough   
17         M2H        North York   
18         M2J        North York   
19         M2K        North York   
20         M2L        North York   
21         M2M        North York   
22         M2N        North York   
23         M2P        North York   
24         M2R        North York   
25         M3A        North York   
26         M3B        North 