# Segmentation and Clustering of Neighbourhoods in Toronto

### Import the required Python modules

In [1]:
import itertools
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

### Scrape the Wikipedia page "List of postal codes of Canada: M"

In [2]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page be parsed as if it were the data.
req_canada = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
req_canada.raise_for_status()

# Parse the HTML and collect every <table> element on the page.
soup_canada = BeautifulSoup(req_canada.content, 'lxml')
table_canada = soup_canada.find_all('table')

# read_html returns a list with one DataFrame per parsed table.
# Passing a literal HTML string is deprecated in recent pandas releases, so the
# markup is wrapped in a file-like StringIO object.
df = pd.read_html(StringIO(str(table_canada)))

### Convert the list of scraped tables to a DataFrame using pandas

In [3]:
# read_html produced a list of tables; the first entry is the postal-code table.
postal_table = df[0]
neighbor_canada = pd.DataFrame(postal_table)
neighbor_canada.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Drop rows whose borough is "Not assigned"

In [4]:
# Keep only the rows whose borough has been assigned. The surviving rows keep
# their original index labels, exactly as a label-based drop would leave them.
borough_missing = neighbor_canada['Borough'] == "Not assigned"
neighbor_canada = neighbor_canada.loc[~borough_missing].copy()
neighbor_canada.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### If more than one neighbourhood exists in one postal code area, combine those rows into a single row with the neighbourhoods separated by commas

In [5]:
# Collapse rows that share a postal code and borough into one row, joining each
# remaining column's values with ", ". sort=False keeps groups in order of first
# appearance, and reset_index() turns the group keys back into regular columns.
df_canada = (
    neighbor_canada
    .groupby(['Postal Code', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)


### If a row has a borough but a "Not assigned" neighbourhood, set the neighbourhood to the borough name

In [6]:
# Rows with no neighbourhood name inherit the name of their borough.
unnamed = df_canada['Neighbourhood'] == "Not assigned"
df_canada.loc[unnamed, 'Neighbourhood'] = df_canada.loc[unnamed, 'Borough']
df_canada.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [7]:
# Show the (row count, column count) of the cleaned postal-code table.
df_canada.shape

(103, 3)

## Part 2

### Load the geographical coordinates of each postal code

In [11]:
# The per-postal-code coordinates are read as a CSV straight from this link.
geospatial_csv = 'https://cocl.us/Geospatial_data'
latitude_longitude = pd.read_csv(geospatial_csv)


### Merge the latitude and longitude to the dataframe in separate columns

In [12]:
# Join on 'Postal Code' using the default inner join, so only postal codes
# present in both tables are kept.
df_with_coordinate = df_canada.merge(latitude_longitude, on='Postal Code')

### Select the Toronto neighbourhoods based on the "Borough" column

In [15]:
# Keep the rows whose borough name contains 'Toronto' (e.g. "Downtown Toronto").
in_toronto = df_with_coordinate['Borough'].str.contains('Toronto')
df_toronto = df_with_coordinate[in_toronto]
df_toronto

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
