# Segmenting and Clustering Neighborhoods in Toronto - TC

In [1]:
import pandas as pd
import numpy as np

## 1. Parse postal code data

#### Parse postal code data from Wikipedia:

In [2]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns every table found on the page; the first one is used,
# with its first row taken as the header.
wiki_tables = pd.read_html(url, header=0)
postal_codes = wiki_tables[0]

# Replace the scraped header names with the names used in the rest of the notebook.
postal_codes.columns = ['PostalCode', 'Borough', 'Neighborhood']

In [3]:
# Show the last five rows of the scraped table.
postal_codes.tail(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
284,M8Z,Etobicoke,Mimico NW
285,M8Z,Etobicoke,The Queensway West
286,M8Z,Etobicoke,Royal York South West
287,M8Z,Etobicoke,South of Bloor
288,M9Z,Not assigned,Not assigned


#### Remove rows whose borough is "Not assigned":

In [4]:
# Keep only the rows whose borough is assigned.
# The explicit .copy() makes the filtered result an independent frame, so the
# column assignment performed on it in the following cell does not trigger
# pandas' SettingWithCopyWarning.
postal_codes = postal_codes[postal_codes.Borough != 'Not assigned'].copy()

#### If a row has a borough but its neighborhood is "Not assigned", use the borough name as the neighborhood:

In [5]:
# Where the neighborhood is the 'Not assigned' placeholder, fall back to the
# borough name; otherwise keep the neighborhood unchanged.
postal_codes['Neighborhood'] = np.where(
    postal_codes['Neighborhood'].eq('Not assigned'),
    postal_codes['Borough'],
    postal_codes['Neighborhood'],
)

#### Combine neighborhoods with the same postal code:

In [6]:
# Collapse rows that share a (PostalCode, Borough) pair into a single row whose
# Neighborhood is the comma-separated list of that group's neighborhoods.
pc_comb = (
    postal_codes
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)
pc_comb.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Print the number of rows of the final dataframe:

In [7]:
# Report how many rows remain after combining neighborhoods.
print('The number of rows of the final dataframe is:', len(pc_comb))

The number of rows of the final dataframe is: 103


## 2. Retrieve coordinates

In [11]:
# Load the coordinates table from a CSV file in the working directory.
coords = pd.read_csv('Geospatial_Coordinates.csv')

# Preview the first five rows.
coords.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Re-display the combined postal-code table for comparison with the coordinates.
pc_comb.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [20]:
# Left-join the coordinates onto the combined postal-code table so every row of
# pc_comb is kept, then drop the duplicate key column brought in from coords.
df = (
    pc_comb
    .merge(coords, how='left', left_on='PostalCode', right_on='Postal Code')
    .drop(columns=['Postal Code'])
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
