### Part 1 - Web scraping

Import packages and install BeautifulSoup if necessary

In [113]:
# libraries for web scraping
import requests
import pandas as pd
from bs4 import BeautifulSoup
import folium
# !pip install bs4  # uncomment if BeautifulSoup is not installed

# clustering and visualization
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

Load content of Wiki page into soup

In [114]:
# Download the Wikipedia page "List of postal codes of Canada: M" and parse it.
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(URL, timeout=30)
# Fail loudly on a 4xx/5xx response instead of parsing an error page as data.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

Parse table from soup into meaningful dataframe ('Hints for scraping Notebook')

In [115]:
# Build one record per assigned postal code from the <td> cells of the first
# <table> found in the parsed page.
table_contents = []
table = soup.find('table')
if table is None:
    # Without this guard the failure would surface as an opaque
    # AttributeError on None in the loop below.
    raise ValueError('No <table> element found in the scraped page')

for td in table.find_all('td'):  # find_all is the current name of findAll
    # The code reads the postal code from the cell's <p> and the
    # "Borough(Neighborhoods)" text from its <span>; cells lacking either tag
    # are skipped instead of crashing on attribute access.
    if td.p is None or td.span is None:
        continue
    span_text = td.span.text
    if span_text == 'Not assigned':
        continue

    # Text before the first "(" is the borough, text after it lists the
    # neighborhoods. A cell without parentheses gets an empty neighborhood
    # rather than raising IndexError.
    parts = span_text.split('(')
    raw_neighborhood = parts[1] if len(parts) > 1 else ''
    table_contents.append({
        'PostalCode': td.p.text[:3],
        'Borough': parts[0],
        'Neighborhood': (raw_neighborhood
                         .strip(')')
                         .replace(' /', ',')
                         .replace(')', ' ')
                         .strip(' ')),
    })

df = pd.DataFrame(table_contents)
# Some scraped borough strings have extra text fused onto the borough name;
# map those known strings to readable borough names.
df['Borough'] = df['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade':
        'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern':
        'East Toronto Business',
    'EtobicokeNorthwest':
        'Etobicoke Northwest',
    'East YorkEast Toronto':
        'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre':
        'Mississauga'})

### Part 2 - Geocoder

Get the coordinates from the given CSV file

In [116]:
# Location of the course-provided CSV with the geospatial coordinates.
coordinates_url = (
    'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/'
    'IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/'
    'Geospatial_Coordinates.csv'
)
data = pd.read_csv(coordinates_url)

Rename the `Postal Code` column to match the `PostalCode` column of the web-scraped DataFrame

In [117]:
# Give the key column the same name as in the scraped frame so the two can be
# merged on it. Rebinding the result (instead of inplace=True) avoids mutating
# a frame that an earlier cell produced.
data = data.rename(columns={'Postal Code': 'PostalCode'})

Merge the coordinates with the scraped DataFrame on `PostalCode`

In [118]:
# Join the scraped boroughs with the coordinates, keyed by postal code
# (default inner join: only postal codes present in both frames are kept).
df1 = df.merge(data, on='PostalCode')
df1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


### Part 3 - Clustering (Toronto boroughs only)

In [119]:
# set up Toronto location (used as the centre of the maps below)
address = 'Toronto, TOR'
geolocator = Nominatim(user_agent="tor_explorer")
# geopy's default timeout is 1 second; allow the public service more time.
location = geolocator.geocode(address, timeout=10)
if location is None:
    # geocode() returns None when the query has no match; fail with a clear
    # message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

# select only boroughs whose name contains 'Toronto'
# na=False treats a missing borough as "no match" instead of breaking the mask
tor_boroughs = df1[df1['Borough'].str.contains('Toronto', na=False)]
tor_boroughs.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [120]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per row of tor_boroughs, labelled with its neighborhoods
for lat, lng, neighborhood in zip(tor_boroughs['Latitude'],
                                  tor_boroughs['Longitude'],
                                  tor_boroughs['Neighborhood']):
    # parse_html=True matches how the cluster map builds its popups and keeps
    # special characters in neighborhood names from being treated as markup.
    popup = folium.Popup(neighborhood, parse_html=True)
    folium.CircleMarker([lat, lng], radius=5, popup=popup).add_to(map_toronto)

map_toronto

In [121]:
kclusters = 5  # number of clusters
# Independent copy of the coordinates, so that adding a column to it later
# never touches (or warns about) tor_boroughs.
tor_boroughs_clust = tor_boroughs[['Latitude', 'Longitude']].copy()

# run k-means clustering
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it out gives version-dependent labels (and a
# FutureWarning on some releases).
kmeans = KMeans(n_clusters=kclusters, n_init=10,
                random_state=0).fit(tor_boroughs_clust)
kmeans.labels_  # check labels

array([4, 4, 4, 1, 4, 4, 0, 4, 3, 1, 4, 0, 1, 4, 0, 1, 4, 1, 2, 2, 2, 2,
       3, 2, 0, 3, 2, 0, 3, 2, 0, 2, 0, 4, 4, 4, 4, 4, 1], dtype=int32)

In [122]:
# append labels to the tor_boroughs_clust
# Drop any 'Cluster' column left by a previous run first: DataFrame.insert
# raises ValueError when the column already exists, which made this cell fail
# on re-execution.
tor_boroughs_clust = tor_boroughs_clust.drop(columns='Cluster', errors='ignore')
tor_boroughs_clust.insert(0, 'Cluster', kmeans.labels_)

In [123]:
# attach the cluster label to every row of tor_boroughs
# tor_boroughs_clust was sliced from tor_boroughs and keeps its index, so an
# index join is one-to-one. Merging on the float Latitude/Longitude pair
# would duplicate rows whenever two postal codes share identical coordinates.
tor_boroughs_clustered = (
    tor_boroughs
    .join(tor_boroughs_clust['Cluster'])
    .reset_index(drop=True)  # fresh 0..n-1 index, as the merge produced
)
tor_boroughs_clustered.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,4
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,4
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,4
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,1
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,4


In [125]:
# final visualization
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(tor_boroughs_clustered['Latitude'],
                                  tor_boroughs_clustered['Longitude'],
                                  tor_boroughs_clustered['Neighborhood'],
                                  tor_boroughs_clustered['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels are 0-based, so they index the palette directly
    # (cluster - 1 only worked because -1 wraps around to the last colour).
    folium.CircleMarker([lat, lon], radius=5, popup=label,
                        color=rainbow[cluster]).add_to(map_clusters)
map_clusters