## Segmenting and Clustering Neighborhoods in Toronto

## Task #1: Web Scraping From Wikipedia

Import libraries

In [79]:
from bs4 import BeautifulSoup 
import requests
import pandas as pd

Fetch the Wikipedia page that lists the postal codes of Toronto,
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M,
and create a BeautifulSoup object from the response content.

In [6]:
# Download the Wikipedia page listing the "M" (Toronto) postal codes.
# The timeout stops the cell from hanging indefinitely on a stalled connection.
req = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of parsing an error page in later cells.
req.raise_for_status()
# Name the parser explicitly: otherwise bs4 picks whichever parser happens to be
# installed (and warns), so the parsed tree can differ between machines.
soup = BeautifulSoup(req.content, "html.parser")

Retrieve table contents

In [197]:
# Build one record (PostalCode, Borough, Neighborhood) per assigned postal code
# from the cells of the first <table> on the page.
table_contents = []
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

for row in table.find_all('td'):
    # A cell needs a <p> (postal code) and a <span> (borough / neighborhoods);
    # skip anything else instead of failing with AttributeError.
    if row.p is None or row.span is None:
        continue

    span_text = row.span.text
    if span_text == 'Not assigned':
        continue

    # Text before the first "(" is the borough; the text between the first and
    # second "(" holds the neighborhoods, separated by " /".
    parts = span_text.split('(')
    borough = parts[0]
    if len(parts) > 1:
        neighborhood = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    else:
        # No parenthesised neighborhood list: reuse the borough name rather
        # than raising IndexError.
        neighborhood = borough

    table_contents.append({
        'PostalCode': row.p.text[:3],  # first three characters of the <p> text
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

In [198]:
 # Turn the list of per-postal-code dicts into a DataFrame (one row per dict).
 df = pd.DataFrame(table_contents)

In [199]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [188]:
# (rows, columns) of the scraped table.
df.shape

(103, 3)

Check the value counts of the `Borough` column.

In [200]:
# Count rows per distinct Borough value; unexpected values stand out here.
df['Borough'].value_counts()

North York                                                      24
Downtown Toronto                                                17
Scarborough                                                     17
Etobicoke                                                       11
Central Toronto                                                  9
West Toronto                                                     6
York                                                             5
East Toronto                                                     4
East York                                                        4
MississaugaCanada Post Gateway Processing Centre                 1
Queen's Park                                                     1
Downtown TorontoStn A PO Boxes25 The Esplanade                   1
EtobicokeNorthwest                                               1
East TorontoBusiness reply mail Processing Centre969 Eastern     1
East YorkEast Toronto                                            1

Some borough values are malformed: the borough name is fused with extra text, e.g. `EtobicokeNorthwest`.

In [201]:
# Inspect the row(s) carrying one of the malformed borough values.
df.loc[df['Borough'].eq('East YorkEast Toronto')]

Unnamed: 0,PostalCode,Borough,Neighborhood
35,M4J,East YorkEast Toronto,The Danforth East


In [204]:
# Exact-match mapping from each malformed borough value (borough name fused
# with trailing text) to the clean borough name.
borough_fixes = {
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto',
    'EtobicokeNorthwest': 'Etobicoke',
    'East YorkEast Toronto': 'East York',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto',
}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['Borough'] is chained assignment: pandas warns about it and, with
# Copy-on-Write enabled, it leaves df unchanged.
df['Borough'] = df['Borough'].replace(borough_fixes)
  

In [205]:
# Sanity check: after the clean-up this filter should match no rows.
df.loc[df['Borough'].eq('East YorkEast Toronto')]


Unnamed: 0,PostalCode,Borough,Neighborhood


In [206]:
# Re-count rows per Borough to confirm the malformed values were merged into the clean names.
df['Borough'].value_counts()

North York          24
Downtown Toronto    18
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East Toronto         5
York                 5
East York            5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

## Task #2: Add longitude and latitude columns

Import the Geospatial_Coordinates CSV file, which provides the latitude and longitude columns.

In [139]:
# Load the postal-code coordinates; the path is relative to the notebook's working directory.
geo_coordinates = pd.read_csv('Geospatial_Coordinates.csv')

In [141]:
# Preview the first rows of the coordinates table.
geo_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge geo_coordinates and df dataframes on postal code


In [231]:
# Attach coordinates to each postal code. The key columns are named differently
# in the two frames, so both are named explicitly; with the default inner join
# only postal codes present in both frames are kept.
df_final = df.merge(
    geo_coordinates,
    left_on='PostalCode',
    right_on='Postal Code',
)

In [208]:
# Display the merged frame.
df_final

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",M5A,43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",M6A,43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,M7A,43.662301,-79.389494
...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",M8X,43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,M4Y,43.665860,-79.383160
100,M7Y,East Toronto,Enclave of M4L,M7Y,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",M8Y,43.636258,-79.498509


Drop the duplicate `Postal Code` column.

In [232]:
# Remove the duplicate key column left over from the merge.
# Rebinding the name with errors='ignore' keeps the cell idempotent: running it
# a second time is a no-op instead of a KeyError, as the inplace drop was.
df_final = df_final.drop(columns='Postal Code', errors='ignore')

In [210]:
# Preview the merged frame after dropping the duplicate key column.
df_final.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


## Task #3: Cluster Analysis

Import necessary libraries

In [157]:
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans 

We can only work with numerical data, so we cluster on a DataFrame containing just the latitude and longitude columns.

In [263]:
num_clusters = 5  # arbitrarily chosen number of clusters

# Cluster the postal codes on their coordinates only.
# random_state pins the centroid initialisation so the labels (and therefore
# the map colors below) are the same on every run.
k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12, random_state=0)
k_means.fit(df_final.loc[:, ['Latitude','Longitude']])
# One cluster label per row, in the same row order as df_final.
df_final["Labels"] = k_means.labels_


In [264]:
# Display the frame with the cluster label added.
df_final

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,labels,Labels
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,1
1,M4A,North York,Victoria Village,43.725882,-79.315572,2,1
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636,4,0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,3,4
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494,4,0
...,...,...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,1,2
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160,4,0
100,M7Y,East Toronto,Enclave of M4L,43.662744,-79.321558,2,1
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509,1,2


The `Labels` column gives the cluster assigned to each location. Let's visualize the clusters on a map of Toronto.

In [None]:
! pip install folium==0.5.0
import folium # plotting library

In [271]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [248]:
# Map centre: the average latitude and longitude over all rows of df_final.
latitude = df_final.Latitude.mean()
longitude = df_final.Longitude.mean()

In [273]:
# Create the map, centred on the mean coordinate computed earlier.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# One evenly spaced rainbow color per cluster, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, num_clusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per row, colored by the row's cluster label.
for lat, lon, cluster in zip(df_final['Latitude'], df_final['Longitude'], df_final['Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # Labels start at 0, so they index the palette directly. The previous
    # rainbow[cluster-1] only worked because index -1 wraps to the last color.
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

# Last expression of the cell: renders the interactive map.
map_clusters
