# Segmentation and Clustering of Neighbourhoods

### Import the required Python modules

In [1]:
import itertools
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

### Code to scrape the Wikipedia page of "List of postal codes of Canada"

In [5]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
req_canada=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
req_canada.raise_for_status()

# Parse the page and collect every <table> element it contains.
soup_canada=BeautifulSoup(req_canada.content,'lxml')
table_canada=soup_canada.find_all('table')

# read_html returns a list of DataFrames, one per table it can parse.
# Passing literal HTML as a plain string is deprecated since pandas 2.1, so the
# markup is wrapped in a file-like StringIO object.
df=pd.read_html(StringIO(str(table_canada)))

### Convert the first scraped table to a DataFrame using pandas

In [6]:
# `df` is the list of tables parsed from the page; the working frame is built
# from its first entry.
neighbor_canada = pd.DataFrame(data=df[0])
neighbor_canada.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Drop rows whose borough is "Not assigned"

In [7]:
# Collect the index labels of every row whose Borough is the placeholder
# "Not assigned", then remove those rows from the frame in place.
unassigned_rows = neighbor_canada.index[neighbor_canada['Borough'] == "Not assigned"]
neighbor_canada.drop(index=unassigned_rows, inplace=True)
neighbor_canada.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### If more than one neighborhood exists in one postal code area, these rows will be combined into one row with the neighborhoods separated with a comma

In [9]:
# One output row per (Postal Code, Borough) pair, kept in first-seen order
# (sort=False); the remaining column values of each group are joined with ', '.
by_postal_code = neighbor_canada.groupby(by=['Postal Code', 'Borough'], sort=False)
# reset_index() turns the two grouping keys back into ordinary columns.
df_canada = by_postal_code.agg(', '.join).reset_index()


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [11]:
# Rows whose Neighbourhood is the placeholder "Not assigned" take the value of
# their Borough instead. The boolean mask is computed once and reused for both
# the source and the target selection.
no_neighbourhood = df_canada['Neighbourhood'] == "Not assigned"
df_canada.loc[no_neighbourhood, 'Neighbourhood'] = df_canada.loc[no_neighbourhood, 'Borough']
df_canada.head(n=5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [12]:
# Show the (rows, columns) dimensions of the cleaned frame as a quick sanity check.
df_canada.shape

(103, 3)