## Segmenting and Clustering Neighborhoods in Toronto

### 1. Import libraries

In [92]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### 2. Web scraping using BeautifulSoup

In [93]:
# Wikipedia page that holds the table of Toronto ("M") postal codes.
http_src = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Bound the request so the cell cannot hang forever, and fail loudly on an
# HTTP error instead of silently parsing an error page.
response = requests.get(http_src, timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
# The postal-code table is taken to be the first <table> on the page.
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {http_src}")
table_rows = table.find_all('tr')

res = []
for tr in table_rows:
    td = tr.find_all('td')
    # Keep every cell, even blank ones, so each value stays in its own column.
    # Rows with no <td> text at all (such as a header row built from <th>)
    # are skipped.
    row = [i.text.strip() for i in td]
    if any(row):
        res.append(row)

# NOTE(review): this assumes each data row has exactly three cells
# (postcode, borough, neighbourhood) — confirm against the live page layout.
df = pd.DataFrame(res, columns=["Postcode", "Borough", "Neighbourhood"])
df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### 3. Remove the rows whose Borough is "Not assigned" from the dataframe

In [94]:
# Keep only the rows whose Borough is something other than "Not assigned".
df_pcodes = df.loc[df["Borough"].ne("Not assigned")]
df_pcodes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### 4. Group by Postcode and Borough and put neighbourhood in same line

In [95]:
# Collapse rows that share a (Postcode, Borough) pair into one row whose
# Neighbourhood is the comma-separated list of the group's neighbourhoods,
# then turn the group keys back into ordinary columns.
df_pcodes = (
    df_pcodes
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df_pcodes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 5. Copy the Borough into unassigned Neighbourhoods

In [96]:
# Where the Neighbourhood is still "Not assigned", fall back to the Borough name.
# The original cell read from `result`, which is never defined in this
# notebook; the frame built by the previous cells is `df_pcodes`.
unassigned = df_pcodes["Neighbourhood"] == "Not assigned"
df_pcodes.loc[unassigned, "Neighbourhood"] = df_pcodes.loc[unassigned, "Borough"]


### 6. Print the dataframe shape

In [97]:
# (rows, columns) of the cleaned postal-code frame. `result` is not defined
# anywhere in this notebook, so the frame built above is used instead.
df_pcodes.shape

(103, 3)