# Segmenting and Clustering Neighbourhoods in Toronto

## Data Scraping

#### Import Packages

In [148]:
import requests
from bs4 import BeautifulSoup

import numpy as np 
import pandas as pd
# Lift pandas' display limits so frames are shown with all columns and all rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

#### Get HTML Code

In [149]:
# Download the Wikipedia page that lists the "M" postal codes and parse it.
# The timeout keeps this cell from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of silently
# parsing an error page in the cells below.
response = requests.get(
    url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

#### Print 1st Row

In [150]:
# Peek at the row at index 1 of the page's first table to see how its cells
# are laid out (index 0 is presumably the header row -- confirm on the page).
all_rows = soup.table.tbody.find_all('tr')
all_rows[1]

<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>

#### Convert HTML Table to Dataframe

In [151]:
# Start from an empty frame that only carries the three column labels.
column_names = [
    'Postal Code',
    'Borough',
    'Neighborhood',
]
nbhds = pd.DataFrame(columns=column_names)

In [152]:
# Walk every <tr> of the page's first table and collect the text of its <td>
# cells. Rows that contain no <td> at all (e.g. a header row) are skipped.
table = soup.table.tbody

rows = []
for row in table.find_all('tr'):
    new_row = [column.get_text().strip('\n') for column in row.find_all('td')]
    if not new_row:
        continue
    # Fail loudly on a malformed row instead of letting pandas pad it.
    if len(new_row) != len(column_names):
        raise ValueError(
            f"expected {len(column_names)} cells per row, got {len(new_row)}: {new_row}"
        )
    print(new_row)
    rows.append(new_row)

# Build the frame once from the collected rows; growing a DataFrame one
# .loc[...] assignment at a time re-allocates on every row.
row_count = len(rows)
nbhds = pd.DataFrame(rows, columns=column_names)


['M1A', 'Not assigned', 'Not assigned']
['M2A', 'Not assigned', 'Not assigned']
['M3A', 'North York', 'Parkwoods']
['M4A', 'North York', 'Victoria Village']
['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront']
['M6A', 'North York', 'Lawrence Manor, Lawrence Heights']
['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"]
['M8A', 'Not assigned', 'Not assigned']
['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village']
['M1B', 'Scarborough', 'Malvern, Rouge']
['M2B', 'Not assigned', 'Not assigned']
['M3B', 'North York', 'Don Mills']
['M4B', 'East York', 'Parkview Hill, Woodbine Gardens']
['M5B', 'Downtown Toronto', 'Garden District, Ryerson']
['M6B', 'North York', 'Glencairn']
['M7B', 'Not assigned', 'Not assigned']
['M8B', 'Not assigned', 'Not assigned']
['M9B', 'Etobicoke', 'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale']
['M1C', 'Scarborough', 'Rouge Hill, Port Union, Highland Creek']
['M2C', 'Not assigned', 'Not assigned']
['M3C'

['M7Y', 'East Toronto', 'Business reply mail Processing Centre, South Central Letter Processing Plant Toronto']
['M8Y', 'Etobicoke', "Old Mill South, King's Mill Park, Sunnylea, Humber Bay, Mimico NE, The Queensway East, Royal York South East, Kingsway Park South East"]
['M9Y', 'Not assigned', 'Not assigned']
['M1Z', 'Not assigned', 'Not assigned']
['M2Z', 'Not assigned', 'Not assigned']
['M3Z', 'Not assigned', 'Not assigned']
['M4Z', 'Not assigned', 'Not assigned']
['M5Z', 'Not assigned', 'Not assigned']
['M6Z', 'Not assigned', 'Not assigned']
['M7Z', 'Not assigned', 'Not assigned']
['M8Z', 'Etobicoke', 'Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West']
['M9Z', 'Not assigned', 'Not assigned']


#### Drop rows whose 'Borough' is 'Not assigned'

In [153]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
# A single filter + reset_index chain replaces the two inplace=True mutations
# and produces the same frame.
nbhds = nbhds[nbhds.Borough != 'Not assigned'].reset_index(drop=True)

In [154]:
# Preview the first five rows after removing the unassigned boroughs.
nbhds.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Where 'Neighborhood' is 'Not assigned', use the 'Borough' as the 'Neighborhood'

In [155]:
# Rows that have an assigned borough but no neighbourhood name.
missing_nbhd = (nbhds.Neighborhood == 'Not assigned') & (nbhds.Borough != 'Not assigned')

# Assign through one .loc call on the frame itself. The chained form
# nbhds['Neighborhood'].loc[...] = ... writes to an intermediate Series, which
# raises chained-assignment warnings and does not update the frame when pandas
# Copy-on-Write is active.
nbhds.loc[missing_nbhd, 'Neighborhood'] = nbhds.loc[missing_nbhd, 'Borough']

In [156]:
# Preview the first five rows after filling in missing neighbourhood names.
nbhds.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [157]:
# (number of rows, number of columns) of the cleaned frame.
nbhds.shape

(103, 3)