# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto


## PART 1
### Scraping the website

In [1]:
# for getting webpage html
from bs4 import BeautifulSoup 
import requests

In [13]:
import numpy as np
import pandas as pd

In [2]:
# Download the Wikipedia page that lists the "M" (Toronto) postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# requests applies no timeout unless one is given, so a stalled connection
# would hang this cell indefinitely.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of passing an error page to the parser.
response.raise_for_status()
data = response.text

In [52]:
# Parse the downloaded web page HTML into a searchable tree (html5lib parser).
soup = BeautifulSoup(data, features="html5lib")

In [4]:
# HTML table in the web page: find() returns the first <table> element of the
# parsed document, or None when the page contains no table.
table = soup.find('table') 

In [7]:
# Build a list of dictionaries (one per assigned postal code) from the table cells.
table_contents = []

for td in table.find_all('td'):
    # Skip cells that lack the expected <p>/<span> children rather than
    # failing with AttributeError on None.
    if td.p is None or td.span is None:
        continue

    span_text = td.span.text
    if span_text == 'Not assigned':
        continue

    # The span text appears to have the form "Borough(Neighborhood / Neighborhood)":
    # everything before the first '(' is the borough, the next segment holds
    # the neighborhoods.
    parts = span_text.split('(')
    # A cell without a parenthesised part yields an empty neighborhood
    # instead of raising IndexError.
    raw_neighborhood = parts[1] if len(parts) > 1 else ''

    table_contents.append({
        # The first three characters of the <p> text are the postal code.
        'PostalCode': td.p.text[:3],
        'Borough': parts[0],
        # Drop the closing parenthesis and turn " /" separators into commas.
        'Neighborhood': raw_neighborhood.strip(')').replace(' /', ',').replace(')', ' ').strip(' '),
    })

In [11]:
# Show the first parsed record as a quick sanity check.
print(table_contents[0])

{'PostalCode': 'M3A', 'Borough': 'North York', 'Neighborhood': 'Parkwoods'}


In [16]:
# Build the DataFrame from the list of dictionaries: each dictionary becomes
# one row and each key becomes a column. The last line displays the frame.
df=pd.DataFrame(table_contents)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East TorontoBusiness reply mail Processing Cen...,Enclave of M4L
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [17]:
# Shorten the Borough names: each long scraped label listed here is replaced
# by a compact borough name; labels not listed are left unchanged.
borough_short_names = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_short_names)
df["Borough"]

0                 North York
1                 North York
2           Downtown Toronto
3                 North York
4               Queen's Park
               ...          
98                 Etobicoke
99          Downtown Toronto
100    East Toronto Business
101                Etobicoke
102                Etobicoke
Name: Borough, Length: 103, dtype: object

In [24]:
# Report how many rows (postal codes) the DataFrame holds.
print("The DataFrame df has {} rows".format(len(df)))

The DataFrame df has 103 rows


## PART 2
### Extracting geospatial data for each postal code

In [25]:
# Preview the first rows of the scraped postal-code table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [27]:
# geocoder import is disabled: the coordinates are read from a CSV file in the cells below
#import geocoder 

In [29]:
## Disabled geocoder lookup, kept for reference only (every line is commented out).
## NOTE(review): as written, format(df["PostalCode"]) would put the whole
## PostalCode column into the query string instead of a single postal code,
## and the while loop never terminates if g.latlng keeps coming back as None.

## initialize your variable to None
#lat_lng_coords = None

## loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format(df["PostalCode"]))
#  lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]


In [30]:
# Load the postal-code coordinates from the Geospatial_Coordinates.csv file
# (path is relative to the notebook's working directory) and preview the first rows.
geos_cord = pd.read_csv("Geospatial_Coordinates.csv")
geos_cord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [41]:
# Rename "Postal Code" to "PostalCode" so both frames share the same key
# column for the merge below.
postal_code_rename = {"Postal Code": "PostalCode"}
geos_cord = geos_cord.rename(columns=postal_code_rename)

In [48]:
# Confirm the key column is now named PostalCode.
geos_cord.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [49]:
# Join the scraped table with the coordinates on the shared PostalCode column
# (inner join) to get the frame used for clustering.
result_df = df.merge(geos_cord, how="inner", on="PostalCode")
result_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [51]:
# (rows, columns) of the merged frame.
result_df.shape

(103, 5)

## PART 3
### Cluster Neighborhoods and Visualization
**The neighbourhoods are clustered with k-means on their coordinates, using the number of distinct Boroughs as the number of clusters.**

In [62]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# map rendering library
import folium 

In [65]:
# Number of distinct Borough values (missing values, if any, count as one value)
result_df["Borough"].nunique(dropna=False)

15

In [67]:
# Preview the merged frame.
result_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [66]:
# Number of rows per Borough in the dataframe, most frequent first
result_df["Borough"].value_counts()

North York                24
Downtown Toronto          17
Scarborough               17
Etobicoke                 11
Central Toronto            9
West Toronto               6
York                       5
East York                  4
East Toronto               4
East York/East Toronto     1
East Toronto Business      1
Mississauga                1
Etobicoke Northwest        1
Queen's Park               1
Downtown Toronto Stn A     1
Name: Borough, dtype: int64

In [68]:
# Prepare the feature frame for clustering: only the coordinates are used.
# The columns are selected by name instead of drop([...], 1) because
#   * pandas 2.x no longer accepts `axis` as a positional argument of drop(), and
#   * an explicit selection keeps 'Cluster Labels' out of the features if this
#     cell is re-run after the labels have been added to result_df.
clustering_df = result_df[["Latitude", "Longitude"]]

In [69]:
# Preview the features passed to k-means.
clustering_df.head()

Unnamed: 0,Latitude,Longitude
0,43.753259,-79.329656
1,43.725882,-79.315572
2,43.65426,-79.360636
3,43.718518,-79.464763
4,43.662301,-79.389494


In [82]:
# set number of clusters = number of distinct Boroughs
kclusters = len(result_df["Borough"].unique())

# run k-means clustering on the coordinate columns
# n_init is pinned to 10 (the default before scikit-learn 1.4, which switched
# to 'auto') so the labels do not depend on the installed version;
# random_state fixes the centroid initialisation.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(clustering_df)

In [83]:
# check the cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10] 

array([ 7,  2,  4,  1,  4,  9, 14,  7,  2,  4])

In [77]:
# add cluster labels to our data frame as the first column
if 'Cluster Labels' in result_df.columns:
    # Re-running this cell (e.g. after re-fitting k-means) refreshes the
    # labels; a second insert() would raise ValueError because the column
    # already exists.
    result_df['Cluster Labels'] = kmeans.labels_
else:
    result_df.insert(0, 'Cluster Labels', kmeans.labels_)

In [78]:
# Preview the frame with the Cluster Labels column in first position.
result_df.head()

Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,7,M3A,North York,Parkwoods,43.753259,-79.329656
1,2,M4A,North York,Victoria Village,43.725882,-79.315572
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,1,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


**Visualization of Clusters**

In [79]:
# library for geographical coordinates
from geopy.geocoders import Nominatim

In [80]:
# Getting the latitude and longitude of Toronto (used to centre the map)
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [81]:
# create map centred on the Toronto coordinates found above
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one circle marker per row, colored by its cluster label
for lat, lon, poi, cluster in zip(result_df['Latitude'], result_df['Longitude'], result_df['Neighborhood'], result_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels run from 0 to kclusters - 1, so they index the palette
    # directly (no reliance on negative-index wraparound for cluster 0).
    marker_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

# last expression of the cell: renders the map in the notebook
map_clusters