# Segmenting and Clustering Neighborhoods in Toronto

In [1]:
# Initialize libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as bs

The next section will open the HTML file with BeautifulSoup and confirm successful input. The HTML file was downloaded into a local folder.

In [2]:
# Open the locally saved Wikipedia page and parse it with BeautifulSoup.
# The encoding is passed explicitly so the read does not depend on the
# platform default (assumes the saved page is UTF-8 -- confirm for your copy).
# The parser is named explicitly so BeautifulSoup does not pick whichever
# parser happens to be installed, which can vary between environments.
with open('List of postal codes of Canada_ M - Wikipedia.htm', encoding='utf-8') as fp:
    soup = bs(fp, 'html.parser')

# A parsed document reports the tree name '[document]'
soup.name

'[document]'

Then we parse the first table on the page and use its header cells as the dataframe's column names. After that, we add each table row to the dataframe, skipping rows whose Borough is "Not assigned" and setting the Neighbourhood to the Borough when the Neighbourhood is "Not assigned".

In [3]:
# Build a dataframe from the first table on the page.

# Column names are the visible text of the header cells. Reading the text
# through BeautifulSoup avoids stripping characters off the tag's string
# form, which would also eat leading/trailing 't' or 'h' from a header name.
labels = [th.get_text(' ', strip=True) for th in soup.table.find_all('th')]
len_col = len(labels)
if len_col == 0:
    raise ValueError('No header cells found in the first table of the page')

# Text of every data cell, in document order. get_text() copes with cells
# that contain plain text, a link, or a mix of both, and trims the trailing
# newline that the raw cell strings carry.
cells = [td.get_text(' ', strip=True) for td in soup.table.find_all('td')]

# Regroup the flat cell list into rows of len_col cells. Every complete row
# is visited, including the last one; a trailing incomplete row is ignored.
records = []
for start in range(0, len(cells) - len_col + 1, len_col):
    row = dict(zip(labels, cells[start:start + len_col]))
    # Rows without an assigned Borough are left out entirely
    if row['Borough'] == 'Not assigned':
        continue
    # A missing Neighbourhood falls back to the Borough name
    if row['Neighbourhood'] == 'Not assigned':
        row['Neighbourhood'] = row['Borough']
    records.append(row)

# Build the dataframe once from the collected rows instead of growing it
# row by row (DataFrame.append is quadratic and was removed in pandas 2.0).
t_df = pd.DataFrame(records, columns=labels)

t_df[0:10]


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


Next we consolidate rows that share a Postcode by combining their Neighbourhood values into a single comma-separated entry.

In [4]:
# Consolidate rows that share a Postcode: the first row of each Postcode
# keeps its Borough and receives all of that Postcode's Neighbourhood names
# as one comma-separated string; the remaining rows are dropped.

def _join_unique(names):
    """Join names with ', ', keeping first-seen order and skipping exact repeats."""
    return ', '.join(dict.fromkeys(name.strip('\n') for name in names))

# transform() broadcasts each group's joined string back onto every row of
# the group, so the original row labels are preserved. sort=False keeps the
# Postcodes in table order.
merged_names = t_df.groupby('Postcode', sort=False)['Neighbourhood'].transform(_join_unique)

# assign() writes the column on a new frame rather than through a chained
# .iloc[...][...] lookup, which is not guaranteed to modify the dataframe.
t_df = (
    t_df
    .assign(Neighbourhood=merged_names)
    .drop_duplicates(subset=['Postcode'], keep='first')
)
t_df[0:20]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
4,M6A,North York,"Lawrence Heights, Lawrence Manor"
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,"Rouge, Malvern"
10,M3B,North York,Don Mills North
11,M4B,East York,"Woodbine Gardens, Parkview Hill"
13,M5B,Downtown Toronto,"Ryerson, Garden District"


Finally we confirm the shape of the dataframe.

In [5]:
# Display the (rows, columns) shape of the consolidated dataframe
t_df.shape

(103, 3)

# Getting Geolocation data from CSV

In [6]:
# The geocoder lookup was not working, so the coordinates are read from a
# local CSV instead. The key column is renamed to 'Postcode' as it is loaded.
coord = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'Postcode'})
)
coord.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
# Attach Latitude/Longitude to each row by matching the Postcode column
# against the coordinate table's Postcode index (left join), then make
# Postcode the index of the combined dataframe.
coord_by_postcode = coord.set_index('Postcode')
t_df = t_df.join(coord_by_postcode, on='Postcode').set_index('Postcode')

t_df[0:10]

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
M7A,Queen's Park,Queen's Park,43.662301,-79.389494
M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
M3B,North York,Don Mills North,43.745906,-79.352188
M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [8]:
# Show the dataframe with Postcode as a regular column so the layout matches
# the assignment instructions. reset_index() returns a new dataframe and the
# result is not assigned here, so this cell only displays that layout;
# t_df itself still has Postcode as its index.
t_df.reset_index()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


# Visualizing the data

In [9]:
# Import visualization libraries
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [10]:
# set number of clusters
kclusters = 5

# Cluster on the coordinates only. Selecting the two columns by name (rather
# than dropping the text columns) keeps a previously added 'Cluster Labels'
# column out of the features when this cell is run again.
t_cluster = t_df[['Latitude', 'Longitude']]

# run k-means clustering
# n_init is set explicitly because its default differs between
# scikit-learn releases; random_state fixes the initialisation.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(t_cluster)

# add clustering labels as the first column; any labels left over from an
# earlier run are removed first so that insert() does not raise on re-run
t_df = t_df.drop(columns='Cluster Labels', errors='ignore')
t_df.insert(0, 'Cluster Labels', kmeans.labels_)

t_df[0:10]

Unnamed: 0_level_0,Cluster Labels,Borough,Neighbourhood,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
M3A,4,North York,Parkwoods,43.753259,-79.329656
M4A,4,North York,Victoria Village,43.725882,-79.315572
M5A,2,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
M6A,3,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
M7A,2,Queen's Park,Queen's Park,43.662301,-79.389494
M9A,1,Etobicoke,Islington Avenue,43.667856,-79.532242
M1B,0,Scarborough,"Rouge, Malvern",43.806686,-79.194353
M3B,4,North York,Don Mills North,43.745906,-79.352188
M4B,4,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
M5B,2,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [13]:
# create map, hardcoded with Toronto's lat/long
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster label,
# evenly spaced along the rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per postcode, coloured by its cluster label
for lat, lon, poi, cluster in zip(t_df['Latitude'], t_df['Longitude'], t_df['Neighbourhood'], t_df['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the colour list
    # directly; no offset is needed (an offset of -1 would make cluster 0
    # wrap around to the last colour).
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters