## Segmenting and Clustering Neighborhoods in Toronto

### 1.1 Import libraries

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### 1.2 Web scraping using BeautifulSoup

In [2]:
# Scrape the Toronto postal-code table from Wikipedia into the DataFrame `df`
# with columns Postcode / Borough / Neighbourhood.
http_src = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
response = requests.get(http_src, timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
table = soup.find('table')  # find() returns the first <table> in the document, or None
if table is None:
    raise ValueError(f"No <table> element found at {http_src}")
table_rows = table.find_all('tr')

res = []
for tr in table_rows:
    td = tr.find_all('td')
    # Keep the stripped text of every non-empty <td> cell. Rows that yield no
    # text at all (presumably the <th>-only header row) are skipped.
    row = [text for text in (cell.text.strip() for cell in td) if text]
    if row:
        res.append(row)

# NOTE(review): this assumes each data row has at most three non-empty cells;
# pandas raises if a row is wider than the column list. Confirm the page
# layout still matches before relying on this cell.
df = pd.DataFrame(res, columns=["Postcode", "Borough", "Neighbourhood"])
df.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### 1.3 Remove the rows whose Borough is "Not assigned" from the dataframe

In [3]:
# Keep only the rows whose Borough is something other than the
# "Not assigned" placeholder; the scraped frame `df` itself is left untouched.
has_borough = df["Borough"].ne("Not assigned")
df_pcodes = df.loc[has_borough]
df_pcodes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### 1.4 Group by Postcode and Borough and join the neighbourhoods of each group into a single comma-separated row

In [4]:
# Collapse the table to one row per (Postcode, Borough) pair: the neighbourhood
# names of each pair are concatenated into a single ", "-separated string.
joined_neighbourhoods = (
    df_pcodes
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
)
# reset_index() turns the (Postcode, Borough) group keys back into ordinary columns.
df_pcodes = joined_neighbourhoods.reset_index()
df_pcodes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 1.5 Copy the Borough into Neighbourhood where the Neighbourhood is "Not assigned"

In [5]:
# Rows whose Neighbourhood is the "Not assigned" placeholder take their
# Borough name as the Neighbourhood instead; all other rows are unchanged.
unassigned = df_pcodes["Neighbourhood"].eq("Not assigned")
df_pcodes.loc[unassigned, "Neighbourhood"] = df_pcodes.loc[unassigned, "Borough"]


### 1.6 Print the dataframe shape

In [6]:
# Display the (rows, columns) tuple of the cleaned postcode table as the cell output.
df_pcodes.shape

(103, 3)

### 2.1 Import the geospatial coordinates CSV from cocl.us

In [7]:
# Read the remote CSV of per-postcode coordinates into `df_geolist`.
path_csv = "http://cocl.us/Geospatial_data"
geo_columns = ["Postcode", "Latitude", "Longitude"]

df_geolist = pd.read_csv(path_csv)
# Relabel the columns positionally so the key column is called "Postcode",
# the same name used in df_pcodes.
df_geolist.columns = geo_columns
df_geolist.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 2.2 Merge the dataframes (add latitude and longitude)


In [9]:
# Attach Latitude/Longitude to every postcode row. The inner join keeps only
# postcodes that appear in both df_pcodes and df_geolist.
df_result = pd.merge(df_pcodes, df_geolist, how='inner', on='Postcode')
df_result.head()


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
