# Segmenting and Clustering Neighborhoods in Toronto

## 1. Scraping Neighborhood Data from the Wikipedia Page

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Scraping the Wikipedia page into a DataFrame

In [2]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps a full re-run from hanging indefinitely on a stalled connection.
req = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'lxml')
# Only the first <table> element on the page is parsed.
table = soup.find_all('table')[0]
# Newer pandas deprecates passing literal HTML to read_html, so the markup is
# wrapped in a file-like StringIO object.
df = pd.read_html(StringIO(str(table)))[0]

df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


### Removing rows with a 'Not assigned' borough

In [3]:
# Collect the index labels of every row whose borough is the
# 'Not assigned' placeholder ...
index_boroughNotAssigned = df.loc[df["Borough"] == 'Not assigned'].index
# ... and remove those rows from df itself (no new frame is created).
df.drop(index=index_boroughNotAssigned, inplace=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### Replacing 'Not assigned' neighborhoods with their borough name

In [4]:
# Index labels of the rows whose neighbourhood is still the 'Not assigned' placeholder.
index_neigbourhoodNotAssigned = df[df.Neighbourhood == 'Not assigned'].index
# Assign through a single .loc call. The chained form
# `df.Neighbourhood[rows] = ...` writes to an intermediate Series, which raises
# SettingWithCopyWarning and is not guaranteed to update df itself.
df.loc[index_neigbourhoodNotAssigned, 'Neighbourhood'] = df.loc[index_neigbourhoodNotAssigned, 'Borough']
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


### Combining neighborhoods that share a postcode into one row

In [5]:
# Collapse all rows sharing a (Postcode, Borough) pair into a single entry whose
# value is the group's neighbourhood names joined with ', '.
grouped = (
    df.groupby(["Postcode", "Borough"])["Neighbourhood"]
    .agg(', '.join)
)
# Turn the group keys back into ordinary columns of a DataFrame.
df_grouped = grouped.to_frame().reset_index()
df_grouped.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [6]:
# Report how many rows remain after combining neighbourhoods per postcode.
print("There are %d rows in combined data frame" % len(df_grouped))

There are 103 rows in combined data frame


## 2. Building the Neighborhood DataFrame with Geographical Coordinates

## 3. Exploring and Clustering Neighborhoods in Toronto