### Load all the dependencies

In [43]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt # plotting library

# backend for rendering plots within the browser
%matplotlib inline 

from sklearn.cluster import KMeans 
# make_blobs is part of the public sklearn.datasets namespace; the private
# sklearn.datasets.samples_generator module was deprecated in scikit-learn 0.22
# and removed in 0.24, so importing from it fails on current releases.
from sklearn.datasets import make_blobs

print('Libraries imported.')

Libraries imported.


### Load the Wikipedia table into a pandas dataframe. The dataframe will consist of three columns: Postal code, Borough, and Neighborhood


In [2]:
# Parse the HTML tables on the Wikipedia page and keep the first one, which is
# the postal code / borough / neighborhood listing previewed below.
df = pd.read_html(
    io='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
)[0]
df.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [3]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
df = (
    df.loc[df['Borough'].ne('Not assigned')]
    .reset_index(drop=True)
)
df.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

Check that each postal code appears only once

In [4]:
# True when no postal code is counted more than once.
not df['Postal code'].value_counts().gt(1).any()

True

Replace slash with comma in Neighborhood column

In [5]:
# Neighborhoods that share a postal code are separated by " / " in the scraped
# table; turn that separator into ", ".
# regex=False pins a literal substring match: the default for `regex` differs
# between pandas versions (and older versions warn when it is left implicit).
df['Neighborhood'] = df['Neighborhood'].str.replace(' /', ',', regex=False)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.


In [6]:
# Fall back to the borough name wherever the neighborhood is missing.
# A missing neighborhood is treated as either the literal 'Not assigned' or an
# empty (NaN) cell, since the scraped table shows empty neighborhood cells.
missing_neighborhood = df['Neighborhood'].isna() | df['Neighborhood'].eq('Not assigned')

# Use a single .loc assignment: the chained form df[col][mask] = ... writes to a
# temporary object under pandas Copy-on-Write and leaves df unchanged (and
# raises SettingWithCopy / ChainedAssignment warnings on older versions).
df.loc[missing_neighborhood, 'Neighborhood'] = df.loc[missing_neighborhood, 'Borough']
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Use the `.shape` attribute to print the number of rows (and columns) of the dataframe.

In [7]:
# Dimensions of the cleaned frame as (rows, columns).
(len(df.index), len(df.columns))

(103, 3)

### Load the geographical coordinates of each postal code

In [8]:
# Download the CSV that lists one latitude/longitude pair per postal code.
latlong_df = pd.read_csv(
    filepath_or_buffer='http://cocl.us/Geospatial_data',
)
latlong_df.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Rename the postal code column so that both dataframes share the same key column and can be merged

In [9]:
# Align the key column name with df ('Postal code') so the two frames can be merged.
# Rebinding the result instead of using inplace=True keeps the cell free of
# hidden in-place mutation; re-running it is still a no-op once renamed.
latlong_df = latlong_df.rename(columns={'Postal Code': 'Postal code'})
latlong_df.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the two dataframes to add the latitude and longitude columns to the original dataframe

In [10]:
# Attach coordinates to each postal code.
# The join key and join type are spelled out rather than left to the defaults
# (which join on *every* shared column name), and validate='one_to_one' makes
# pandas raise a MergeError if either frame has duplicate postal codes, which
# would otherwise silently duplicate rows (and map markers).
geocoded_df = pd.merge(
    df,
    latlong_df,
    on='Postal code',
    how='inner',
    validate='one_to_one',
)
geocoded_df.head()


Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Generate maps to visualize your neighborhoods and how they cluster together.

Color the markers by borough: each borough has a different color.

In [44]:
import folium

# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[43.654260,-79.360636], zoom_start=10)

# Distinct borough names in a stable (sorted) order.
# A plain list(set(...)) has an iteration order that changes between interpreter
# sessions (string hashing is randomized), which would give each borough a
# different colour on every Restart & Run All. key=str keeps the sort well
# defined even if a non-string value (e.g. NaN) slipped into the column.
unique_borough = sorted(set(geocoded_df['Borough']), key=str)

# set color scheme for the clusters: one evenly spaced rainbow colour per borough
colors_array = cm.rainbow(np.linspace(0, 1, len(unique_borough)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


In [45]:
# add markers to map
# Look up each borough's colour once instead of searching the list per marker.
borough_color = dict(zip(unique_borough, rainbow))

# One plain tuple (latitude, longitude, borough, neighborhood) per postal code.
marker_rows = geocoded_df[
    ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
].itertuples(index=False, name=None)

for latitude, longitude, borough_name, neighborhood_name in marker_rows:
    popup_text = '{0}, {1}'.format(neighborhood_name, borough_name)
    marker_color = borough_color[borough_name]

    marker = folium.CircleMarker(
        location=[latitude, longitude],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

In [59]:
# Cluster the postal codes on their raw (latitude, longitude) pairs, using as
# many clusters as there are boroughs so the result can be compared with the
# borough map above.
# random_state is fixed so that the k-means++ initialisation, and therefore the
# cluster labels and marker colours, are the same on every run.
k_means = KMeans(init="k-means++", n_clusters=len(unique_borough), n_init=12, random_state=0)
X = np.array(geocoded_df[['Latitude', 'Longitude']])
k_means.fit(X)
k_means_labels = k_means.labels_  # cluster index assigned to each row of X
k_means_cluster_centers = k_means.cluster_centers_  # one (latitude, longitude) centre per cluster


In [60]:
# create map of Toronto using latitude and longitude values
# Second map of Toronto, centred and zoomed like the first one.
map_toronto_2 = folium.Map(location=[43.654260, -79.360636], zoom_start=10)

# Pair every postal-code row with the k-means label fitted for it.
cluster_rows = zip(
    geocoded_df[
        ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
    ].itertuples(index=False, name=None),
    k_means_labels,
)

# Same markers as on the borough map, but coloured by cluster label.
for (latitude, longitude, borough_name, neighborhood_name), cluster_id in cluster_rows:
    popup_text = '{0}, {1}'.format(neighborhood_name, borough_name)
    cluster_color = rainbow[cluster_id]

    marker = folium.CircleMarker(
        location=[latitude, longitude],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto_2)

map_toronto_2

Note that when we use k-means to cluster the postal codes by latitude and longitude (i.e., by their distance from each other), the resulting clusters are not the same as the grouping of postal codes into boroughs. Hence, we can conclude that postal codes are not assigned to boroughs purely by measuring the distance between neighborhoods.
