# Segmenting and Clustering Neighborhoods in Toronto
__This notebook is part of the IBM Capstone Project, based on the Week 2 assignment: Segmenting and Clustering Neighborhoods in Toronto__

The objective of this project is to explore, segment, and cluster the neighborhoods in the city of Toronto. The data is available on Wikipedia, so it must first be scraped, wrangled, and cleaned, and then read into a _pandas_ dataframe.
Once the data is in a structured format, the analysis that was done on the New York City dataset can be replicated to explore and cluster the neighborhoods in the city of Toronto.

In [5]:
import random
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
# NOTE(review): `py` is a second alias for numpy; kept in case other cells use it.
import numpy as py
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from IPython.core.display import HTML
from IPython.display import Image
from IPython.display import display_html
from sklearn.cluster import KMeans

# Scraping the Wikipedia page for the table of postal codes of Canada

In [9]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() fails fast on an HTTP error instead of letting an
# error page be parsed as if it were the article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source, 'lxml')
print(soup.title)

# soup.table is the first <table> element on the page; its raw HTML is kept
# in `tab` so a later cell can parse it with pandas.
# display_html is already imported in the notebook's imports cell.
tab = str(soup.table)
display_html(tab, raw=True)

<title>List of postal codes of Canada: M - Wikipedia</title>


Postal code,Borough,Neighborhood
M1A,Not assigned,
M2A,Not assigned,
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Regent Park / Harbourfront
M6A,North York,Lawrence Manor / Lawrence Heights
M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
M8A,Not assigned,
M9A,Etobicoke,Islington Avenue
M1B,Scarborough,Malvern / Rouge


# Converting the HTML table to a pandas DataFrame for cleaning and preprocessing

In [10]:
# Parse the scraped table HTML into DataFrames (read_html returns a list,
# one frame per <table> found). The string is wrapped in StringIO because
# passing literal HTML directly to read_html is deprecated in pandas >= 2.1.
dfs = pd.read_html(StringIO(tab))

# `tab` holds a single table, so the first frame is the postal-code table.
df = dfs[0]
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


# Data preprocessing and cleaning

In [13]:
# Keep only the postal codes that have a borough assigned.
has_borough = df['Borough'] != 'Not assigned'
df1 = df[has_borough]

# Collapse to one row per (postal code, borough), joining the remaining
# string columns with ', ' and preserving the original row order.
df2 = (
    df1
    .groupby(['Postal code', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# A neighborhood recorded as 'Not assigned' falls back to its borough name.
neighborhood_missing = df2['Neighborhood'] == 'Not assigned'
df2['Neighborhood'] = np.where(neighborhood_missing, df2['Borough'], df2['Neighborhood'])
df2

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,The Kingsway / Montgomery Road / Old Mill North
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,Old Mill South / King's Mill Park / Sunnylea /...


# Importing the CSV file with the latitude and longitude of each postal code

In [15]:
# Load the postal code -> (latitude, longitude) lookup table from the course CSV.
geospatial_csv_url = 'https://cocl.us/Geospatial_data'
l_l = pd.read_csv(geospatial_csv_url)
l_l.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Merge the Latitude and Longitude with neighborhoods

In [19]:
# Align the key column name with the neighborhoods frame ('Postal code'),
# then join the coordinates onto each postal code (default inner join).
l_l = l_l.rename(columns={'Postal Code': 'Postal code'})
df3 = df2.merge(l_l, on='Postal code')
df3.head()

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Manor / Lawrence Heights,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494


# Getting all rows of the data frame whose Borough contains "Toronto"

In [21]:
# Literal (non-regex) substring match: keeps boroughs such as
# 'Downtown Toronto', 'East Toronto' and 'West Toronto'.
is_toronto_borough = df3['Borough'].str.contains('Toronto', regex=False)
df4 = df3.loc[is_toronto_borough]
df4

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,Richmond / Adelaide / King,43.650571,-79.384568
31,M6H,West Toronto,Dufferin / Dovercourt Village,43.669005,-79.442259


# Visualize all the neighborhoods using Folium

In [26]:
# Base map centred on the hard-coded Toronto coordinates.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# Add one small circle marker per row of df4; the popup shows
# "<neighborhood>,<borough>".
marker_rows = zip(df4['Latitude'], df4['Longitude'], df4['Borough'], df4['Neighborhood'])
for lat, lng, borough, neighborhoods in marker_rows:
    popup_text = f'{neighborhoods},{borough}'
    label = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='red',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False,
    )
    marker.add_to(map_toronto)
map_toronto

# Using k-means clustering to cluster the neighborhoods

In [33]:
# Number of clusters to fit.
k = 5

# Cluster on the coordinates only. The feature columns are selected by name
# rather than by dropping the others with a positional axis argument:
# DataFrame.drop(labels, 1) is rejected by pandas >= 2.0, and selecting by
# name stays correct when this cell is re-run after the label column exists.
clustering = df4[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=k, random_state=4).fit(clustering)

# Attach the fitted label of each row as the first column of df4.
# Dropping any previous 'Cluster Labels' column first makes the cell
# idempotent, and drop() returns a new frame, so the insert does not write
# into a slice of df3.
df4 = df4.drop(columns='Cluster Labels', errors='ignore')
df4.insert(0, 'Cluster Labels', kmeans.labels_)
df4

Unnamed: 0,Cluster Labels,Postal code,Borough,Neighborhood,Latitude,Longitude
2,2,M5A,Downtown Toronto,Regent Park / Harbourfront,43.65426,-79.360636
4,2,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,43.662301,-79.389494
9,2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,3,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,2,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,2,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,1,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,2,M5H,Downtown Toronto,Richmond / Adelaide / King,43.650571,-79.384568
31,4,M6H,West Toronto,Dufferin / Dovercourt Village,43.669005,-79.442259


In [36]:
# Fresh map for the clustering result, so the cluster colours are not drawn
# on top of the markers already added to map_toronto.
map_cluster = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# One evenly spaced colour from the rainbow colormap per cluster (k in total).
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# One marker per row of df4, coloured by its cluster label. KMeans labels are
# 0-based, so they index `rainbow` directly (cluster - 1 would wrap cluster 0
# around to the last colour).
for lat, lng, neighborhoods, cluster in zip(df4['Latitude'], df4['Longitude'], df4['Neighborhood'], df4['Cluster Labels']):
    label = '{},{}'.format(neighborhoods, 'Cluster ' + str(cluster))
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.5,
    ).add_to(map_cluster)
map_cluster