# Segmenting and Clustering Neighborhoods in Toronto

- Muhammad Umar Khan

In [1]:
# import all the libraries 
import json
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

%matplotlib inline
# Confirmation that the import cell ran to completion (the libraries are imported here, not installed).
print('Packages installed  :)')

Packages installed  :)


In [2]:
# Download the Wikipedia page that lists the "M" postal codes and parse it with Beautiful Soup.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever if the server never answers.
result = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
result.raise_for_status()
# Name the parser explicitly so the result does not depend on which parsers happen to be
# installed (and so Beautiful Soup does not warn about having to guess one).
data_html = BeautifulSoup(result.content, 'html.parser')


#### The cells up to `df.head()` read the page's table and load it into a pandas DataFrame

In [3]:
# Re-parse the page into a separate tree; the parser is named explicitly so the
# result does not depend on which optional parsers are installed.
soup = BeautifulSoup(str(data_html), 'html.parser')

In [4]:
# First <table> element in the document (attribute access is shorthand for find()).
neigh = soup.table

In [5]:
# Detach the table from the parsed tree, then serialise it back to an HTML string.
neigh.extract()
table_str = str(neigh)

In [6]:
# Parse the HTML table into a DataFrame. The markup is wrapped in StringIO because
# passing literal HTML text to read_html is deprecated in recent pandas releases;
# a file-like object works across versions. read_html returns a list of tables,
# and the first (only) one is kept.
df = pd.read_html(StringIO(table_str))[0]

In [7]:
# Preview the first rows of the freshly parsed table.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Drop all the rows whose Borough is **'Not assigned'** and then reset the index. Judging from the Wikipedia page, neighborhoods that share a postal code already appear on the same row, so no further cleansing was needed to satisfy the requirements.

In [8]:
# Keep only the rows whose Borough is a real value, then renumber them from 0.
df_dropna = (
    df.loc[df['Borough'].ne('Not assigned')]
      .reset_index(drop=True)
)

In [9]:
# Remove the space from the postal-code column name.
df_dropna = df_dropna.rename(columns={'Postal Code': 'PostalCode'})

Check to see the shape of the dataframe

In [10]:
# (rows, columns) of the filtered frame.
print(df_dropna.shape)

(103, 3)


#### Preprocess data so that Boroughs are grouped based on their name

In [11]:
# Rebind df to the filtered frame; this is a plain assignment, so both names refer to the same object.
df = df_dropna

In [12]:
# Preview the first rows after filtering and renaming.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [13]:
# One row per (Borough, PostalCode) pair; the remaining text column of each group
# is concatenated with commas.
df_grouped = (
    df.groupby(['Borough', 'PostalCode'], as_index=False)
      .agg(','.join)
)
df_grouped.head(20)

Unnamed: 0,Borough,PostalCode,Neighborhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,North Toronto West
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"
5,Central Toronto,M4V,"Summerhill West, Rathnelly, South Hill, Forest..."
6,Central Toronto,M5N,Roselawn
7,Central Toronto,M5P,Forest Hill North & West
8,Central Toronto,M5R,"The Annex, North Midtown, Yorkville"
9,Downtown Toronto,M4W,Rosedale


Check to see if we have any 'Not assigned' Neighborhoods

In [14]:
# Show any row that still carries the 'Not assigned' placeholder. Both the Borough and the
# Neighborhood columns are inspected: Borough alone was already filtered earlier, so checking
# only that column could never reveal a leftover placeholder. An empty result means the data is clean.
df_grouped.loc[
    df_grouped[['Borough', 'Neighborhood']].isin(["Not assigned"]).any(axis=1)
]

Unnamed: 0,Borough,PostalCode,Neighborhood


All the requirements are now met and the data is cleansed; the only remaining step is to add the latitude and longitude of each location.


In [15]:
# reset_index returns a new frame rather than modifying df_grouped, so the result
# has to be assigned for the reset to take effect; the frame is then displayed.
df_grouped = df_grouped.reset_index(drop=True)
df_grouped

Unnamed: 0,Borough,PostalCode,Neighborhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,North Toronto West
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"
...,...,...,...
98,York,M6C,Humewood-Cedarvale
99,York,M6E,Caledonia-Fairbanks
100,York,M6M,"Del Ray, Mount Dennis, Keelsdale and Silverthorn"
101,York,M6N,"Runnymede, The Junction North"


In [16]:
# Current column order, as a plain Python list.
df_grouped.columns.tolist()

['Borough', 'PostalCode', 'Neighborhood']

In [17]:
# Select all rows with the columns in the desired order (PostalCode first).
df_grouped = df_grouped.loc[:, ['PostalCode', 'Borough', 'Neighborhood']]

In [18]:
# Column order after the rearrangement, as a plain Python list.
df_grouped.columns.tolist()

['PostalCode', 'Borough', 'Neighborhood']

Rearranged the Columns so the **PostalCode** appears first

In [19]:
# Preview the first rows of the grouped, reordered frame.
df_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M4N,Central Toronto,Lawrence Park
1,M4P,Central Toronto,Davisville North
2,M4R,Central Toronto,North Toronto West
3,M4S,Central Toronto,Davisville
4,M4T,Central Toronto,"Moore Park, Summerhill East"


Now we are going to create a new table with the Latitudes and Longitudes corresponding to the different PostalCodes

In [20]:
# CSV holding the coordinates for each postal code.
geo_url = "https://cocl.us/Geospatial_data"

# Load it and strip the space from the postal-code column name in a single expression.
geo_df = pd.read_csv(geo_url).rename(columns={'Postal Code': 'PostalCode'})

In [21]:
# Preview the first rows of the coordinates table.
geo_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Next we will merge the data. Since we are happy with the temporary data frame we have been working with, we first make it the main data frame, and then we merge it with the coordinates on the postal code.

In [22]:
# Rebind df to the grouped frame; this is a plain assignment, so both names refer to the same object.
df = df_grouped

In [23]:
# Join the coordinates onto the neighborhood table by postal code (pd.merge defaults to an
# inner join). The left input is df_grouped rather than df itself so the cell can be re-run
# safely: merging an already-merged df again would produce duplicated Latitude_x/Latitude_y
# and Longitude_x/Longitude_y columns.
df = pd.merge(df_grouped, geo_df, on='PostalCode')

In [24]:
# Preview the first rows of the merged frame, now including Latitude and Longitude.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
1,M4P,Central Toronto,Davisville North,43.712751,-79.390197
2,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
3,M4S,Central Toronto,Davisville,43.704324,-79.38879
4,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316


# This notebook is finished. Please check the final notebook for the full version.