
# Segmenting and Clustering Neighborhoods in Toronto

### Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [1]:
#import modules
from bs4 import BeautifulSoup
import requests
import pandas as pd


In [2]:
# Download the raw HTML of the Wikipedia page listing the postal codes that start with "M".
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page flow into the HTML parsing step.
response = requests.get(url, timeout=30)
response.raise_for_status()
url_text = response.text

### Use BeautifulSoup to parse the webpage and extract the table from it
The first row of the table contains no `td` cells (it is the header row), so it is recorded with the placeholder value 'Not assigned' in every column; it is removed in the next step.

In [3]:
# Parse the downloaded HTML with BeautifulSoup.
city_data = BeautifulSoup(url_text, 'html.parser')

# Column layout of the resulting dataframe.
column_names = ['PostalCode','Borough','Neighborhood']

# The postal-code table is the first table inside the article body.
content = city_data.find('div', class_='mw-parser-output')
table = content.table.tbody

# Collect one record per table row and build the dataframe once at the end.
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.0.
records = []
for tr in table.find_all('tr'):
    # Only the first three <td> cells of a row are used.
    cells = [td.text.strip('\n') for td in tr.find_all('td')][:len(column_names)]
    # Pad missing cells with the placeholder so a short row never inherits
    # values from the previous row. A row without any <td> cells is therefore
    # recorded as 'Not assigned' in every column.
    cells += ['Not assigned'] * (len(column_names) - len(cells))
    records.append(dict(zip(column_names, cells)))

toronto = pd.DataFrame(records, columns=column_names)

print('The shape of the original table is ',toronto.shape)
toronto.head()

The shape of the original table is  (181, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,Not assigned,Not assigned,Not assigned
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


### Remove all rows with 'Not assigned'.
### List the head of the dataframe and print out the shape of the dataframe

In [4]:
# Keep only the rows whose borough is assigned, then renumber the rows from 0.
toronto = toronto.loc[toronto['Borough'] != 'Not assigned'].reset_index(drop=True)

print('The shape of the cleaned table is ',toronto.shape)
toronto.head()

The shape of the cleaned table is  (103, 3)


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Use the csv file to create the following dataframe

In [5]:
!wget -O geospatial_data.csv https://cocl.us/Geospatial_data
geocsv_data = pd.read_csv('geospatial_data.csv')
geocsv_data.rename(columns={'Postal Code':'PostalCode'}, inplace=True)
geocsv_data.head()

--2020-08-01 09:39:20--  https://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 158.85.108.86, 158.85.108.83, 169.48.113.194
Connecting to cocl.us (cocl.us)|158.85.108.86|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-08-01 09:39:27--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.29.197
Connecting to ibm.box.com (ibm.box.com)|107.152.29.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2020-08-01 09:39:28--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Reusing existing connection to ibm.box.com:443.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.ent.box.com/public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the dataframes `toronto` and `geocsv_data` to create a new dataframe

In [6]:
# Join the borough/neighborhood rows with their coordinates on the shared
# 'PostalCode' column (default inner join).
df = toronto.merge(geocsv_data, on='PostalCode')
print('The shape of the cleaned table is ',df.shape)
df.head()

The shape of the cleaned table is  (103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Visualizing all the Neighbourhoods of the above data frame using Folium


### Get all the rows with 'Toronto' in their Borough.

In [7]:
# Boolean mask: rows whose borough name contains the literal text 'Toronto'.
in_toronto = df['Borough'].str.contains('Toronto',regex=False)
df2 = df.loc[in_toronto]
print('The shape of the cleaned table with Toronot is ',df2.shape)
df2.head()

The shape of the cleaned table with Toronot is  (39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Display the map of Toronto

In [8]:
#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
# transforming a JSON file into a pandas dataframe
from pandas.io.json import json_normalize

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np

address = 'Toronto'

# Resolve the address to coordinates; they are used as the centre of the map.
geolocator = Nominatim(user_agent="Toronto_explore")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches, so fail here with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

map_toronto = folium.Map(location=[latitude,longitude],zoom_start=12)

# Add one circle marker per row of df2, with a "<neighbourhood>, <borough>" popup.
for lat,lng,borough,neighbourhood in zip(df2['Latitude'],df2['Longitude'],df2['Borough'],df2['Neighborhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is passed to the Popup only; the marker no longer receives it.
    folium.CircleMarker(
        [lat,lng],
        radius=8,
        popup=label,
        color='black',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)
map_toronto

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Using K-Means clustering to cluster the neighborhoods
With k=4, the neighbourhoods are separated into 4 clusters based on their latitude and longitude.

In [9]:
k=4

# Cluster on the coordinates only. Selecting the two feature columns by name
# (rather than dropping the others) keeps 'Cluster Labels' out of the features
# when this cell is re-run, and avoids drop()'s positional axis argument,
# which pandas 2.0 no longer accepts.
toronto_clustering = df2[['Latitude','Longitude']]
# n_init is pinned to 10 because newer scikit-learn releases changed the
# default to 'auto'; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters = k, n_init=10, random_state=0).fit(toronto_clustering)
print(kmeans.labels_)

# Remove labels left over from a previous run so insert() does not raise
# because the column already exists. drop() returns a new frame, so the
# insert below modifies that new frame.
df2 = df2.drop(columns='Cluster Labels', errors='ignore')
df2.insert(0, 'Cluster Labels', kmeans.labels_)

print('The shape of the cleaned table with Toronot is ',df2.shape)
df2.head()

[1 1 1 1 3 1 1 2 1 2 1 2 3 1 2 3 1 3 0 0 0 0 2 0 1 2 0 1 2 0 1 0 1 1 1 1 1
 1 3]
The shape of the cleaned table with Toronot is  (39, 6)


Unnamed: 0,Cluster Labels,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,1,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,1,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,3,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Display the map of Toronto with the 4 clusters

In [10]:
# create map
map_clusters = folium.Map(location=[latitude,longitude],zoom_start=12)

# set color scheme for the clusters: k evenly spaced rainbow colours as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per row of df2, coloured by its cluster label
for lat, lon, cluster in zip(df2['Latitude'], df2['Longitude'], df2['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=8,
        popup=label,
        color='black',
        fill=True,
        # KMeans labels run from 0 to k-1, so they index the colour list
        # directly; `cluster-1` only worked because index -1 wraps around to
        # the last colour.
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters