<a href="https://www.bigdatauniversity.com"><img src = "https://ibm.box.com/shared/static/cw2c7r3o20w9zn8gkecaeyjhgw3xdgbj.png" width="400" align="center"></a>

<h1><center>Assignment II - Clustering Toronto</center></h1>

# ---------------------------------------------- Part I ---------------------------------------------

### Import and download library

In [37]:
pip install BeautifulSoup4

Note: you may need to restart the kernel to use updated packages.


In [38]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [39]:
pip install html5lib

Note: you may need to restart the kernel to use updated packages.


In [40]:
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
print('Everything is downloaded and imported')

Everything is downloaded and imported


### Get a local copy in the repository of the Wikipedia article

In [41]:
#get a local copy of the Wikipedia article
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
req = urllib.request.urlopen(url)
article = req.read().decode()
with open('List_of_postal_codes_of_Canada:_M.html', 'w') as fo:
    fo.write(article)

### Load data from repository

In [42]:
tables = pd.read_html('List_of_postal_codes_of_Canada:_M.html', header = 0, keep_default_na = False, )
tables[0].head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Extract table from the list tables

In [43]:
headings = ['Postcode', 'Borough', 'Neighbourhood']
for table in tables:
    current_headings  = table.columns.values[:3]
    if len(current_headings) != len(headings):
        continue
    if all(current_headings == headings):
        break

### Save tables into csv file into the repository 

In [44]:
table[headings].to_csv('test.txt', sep=';', header=False, index=False)

### Import table csv file as dataframe and change column name

In [45]:
df = pd.read_csv('test.txt', sep = ';', header = None)
df.columns = ['PostalCode', 'Borough', 'Neighbourhood']

### Delete rows where Borough are not assigned / rearrange index

In [46]:
df = df[df.Borough != 'Not assigned']
df.reset_index(inplace = True)
df = df.drop(['index'], axis = 1)

### Merge rows with same PostalCode and keep different Neighbourhood name separated with ','

In [47]:
df = df[df.Borough != 'Not assigned']
df.reset_index(inplace = True)
df = df.drop(['index'], axis = 1)

### Replace not assigned Neighbourhood with Borough corresponding

In [48]:
df.loc[df.Neighbourhood == 'Not assigned', 'Neighbourhood'] = df.Borough

In [49]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [50]:
df.shape

(103, 3)

# --------------------------------------------- Part II ---------------------------------------------

### Download the data

In [51]:
!wget -O Geospatial_data.csv http://cocl.us/Geospatial_data
print('Data downloaded!')

--2019-08-24 07:50:13--  http://cocl.us/Geospatial_data
Resolving cocl.us (cocl.us)... 169.48.113.201
Connecting to cocl.us (cocl.us)|169.48.113.201|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cocl.us/Geospatial_data [following]
--2019-08-24 07:50:13--  https://cocl.us/Geospatial_data
Connecting to cocl.us (cocl.us)|169.48.113.201|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-08-24 07:50:16--  https://ibm.box.com/shared/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv
Resolving ibm.box.com (ibm.box.com)... 107.152.27.197
Connecting to ibm.box.com (ibm.box.com)|107.152.27.197|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /public/static/9afzr83pps4pwf2smjjcf1y5mvgb18rr.csv [following]
--2019-08-24 07:50:16--  https://ibm.box.com/public/static/9afzr83pps4pwf2smjjc

### Transform the data into dataframe

In [52]:
geo = pd.read_csv("Geospatial_data.csv")

### Rename columns of dataframe just created

In [53]:
geo.columns = ['PostalCode', 'Latitude', 'Longitude']

### Merge the two dataframes

In [54]:
df_geo = pd.merge(df, geo, on='PostalCode', how='left')
df_geo.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# --------------------------------------------- Part III ---------------------------------------------

### Import and download library

In [55]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries downloaded and imported.')

Libraries downloaded and imported.


### Only select the subset of data where the word "Toronto" features

In [56]:
toronto = df_geo[df_geo['Borough'].str.contains('Toronto')]

### Get the coordinates of Toronto

In [57]:
address = 'Toronto, TO'
geolocator = Nominatim(user_agent="toronto-map_data")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6523873, -79.3835641.


### Loading Foursquare API profile

In [58]:
CLIENT_ID = '3JHWEITDFXAALR1TXRJND12HEVZF2V11BENGPFLLDZHSG2YA' # your Foursquare ID
CLIENT_SECRET = 'OSMXBJNX3MX3NY3YMZCXPJ5BLPZMRIRUKMUO3DW4OY2SYXGO' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 3JHWEITDFXAALR1TXRJND12HEVZF2V11BENGPFLLDZHSG2YA
CLIENT_SECRET:OSMXBJNX3MX3NY3YMZCXPJ5BLPZMRIRUKMUO3DW4OY2SYXGO


### Limit of number of venues returned by Foursquare API and define radius

In [59]:
LIMIT = 100 # are API
radius = 500 # define radius

### Create the function getNearbyVenues to get names, categories and coordinates of nearby places

In [60]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  'PostalCode Latitude', 
                  'PostalCode Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Run the above function on each neighborhood and create a new dataframe called toronto_venues

In [None]:
toronto_venues = getNearbyVenues(names=df_geo['PostalCode'], latitudes=df_geo['Latitude'], longitudes=df_geo['Longitude'])
print("function runned on df_geo and new dataframe toronto_venues created")

In [None]:
toronto_venues.head()

### One hot encoding according to each category of nearby venues

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add PostalCode column back to dataframe
toronto_onehot['PostalCode'] = toronto_venues['PostalCode'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

### Merge rows by PostalCode, agregate all the onehot

In [None]:
toronto_grouped = toronto_onehot.groupby('PostalCode').mean().reset_index()
toronto_grouped.head()

### Function to find the most common venues

In [442]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Create loop for columns according to number of top venues

In [443]:
num_top_venues = 5
indicators = ['st', 'nd', 'rd']
columns = ['PostalCode']

for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

### Create a new dataframe

In [444]:
PostalCode_venues_sorted = pd.DataFrame(columns=columns) #Create the columns based on the loop above
PostalCode_venues_sorted['PostalCode'] = toronto_grouped['PostalCode'] #Add the postal code from the data frame toronto_grouped

for ind in np.arange(toronto_grouped.shape[0]):
    PostalCode_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

PostalCode_venues_sorted.head()

Unnamed: 0,PostalCode,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Fast Food Restaurant,Yoga Studio,Electronics Store,Dog Run,Doner Restaurant
1,M1C,Bar,Yoga Studio,Electronics Store,Doner Restaurant,Donut Shop
2,M1E,Breakfast Spot,Rental Car Location,Intersection,Pizza Place,Electronics Store
3,M1G,Coffee Shop,Korean Restaurant,Electronics Store,Dog Run,Doner Restaurant
4,M1H,Hakka Restaurant,Thai Restaurant,Fried Chicken Joint,Bank,Bakery


### Perform K-Means Clustering

In [445]:
toronto_clustering = toronto_grouped.drop('PostalCode', axis = 1)

In [446]:
k_clusters = 5
kmeans = KMeans(n_clusters = k_clusters, random_state = 0).fit(toronto_clustering)
kmeans.labels_

array([1, 4, 3, 3, 1, 3, 3, 1, 3, 1, 3, 3, 3, 1, 0, 3, 1, 3, 1, 3, 0, 3,
       0, 3, 0, 1, 3, 3, 3, 0, 1, 3, 1, 3, 1, 1, 3, 3, 3, 0, 3, 3, 3, 0,
       3, 3, 3, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 0, 3, 3,
       3, 3, 3, 3, 3, 1, 3, 0, 1, 1, 3, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3, 3,
       3, 0, 3, 3, 1, 3, 3, 3, 0, 3, 0, 3, 1], dtype=int32)

### Add clustering labels to dataframe PostalCode_venues_sorted

In [447]:
PostalCode_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [448]:
toronto_merged = toronto

### Merge toronto_merged with PostalCode_venues_sorted to add latitude/longitude for each PostalCode

In [449]:
toronto_merged = toronto_merged.join(PostalCode_venues_sorted.set_index('PostalCode'), on='PostalCode')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Trail,Health Food Store,Pub,Coffee Shop,Neighborhood
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,3,Greek Restaurant,Coffee Shop,Italian Restaurant,Furniture / Home Store,Ice Cream Shop
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,3,Pizza Place,Park,Pub,Brewery,Liquor Store
43,M4M,East Toronto,Studio District,43.659526,-79.340923,3,Café,Coffee Shop,Bakery,Italian Restaurant,American Restaurant
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Park,Dim Sum Restaurant,Swim School,Bus Line,Dive Bar


### Visualize clusters on map

In [450]:
# create map of Toronto using latitude and longitude values
toronto_clustermap = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
rainbow = ['red','blue','green','yellow','black','pink', 'darkgreen','cyan','forestgreen']

# Plot the selected PostalCode to the map of Toronto and add marker
markers_colors = []
for lat, lon, post, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(post) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-3],
        fill_opacity=0.7).add_to(toronto_clustermap)
       
toronto_clustermap

### List of the venues type most recuring within each cluster

In [451]:
test = toronto_merged.loc[toronto_merged['Cluster Labels'] == 0].groupby('1st Most Common Venue').count()
test = test.drop(test.columns.to_series()["Borough":"5th Most Common Venue"], axis=1).sort_values(columns[0], ascending = False)
test.columns = ['Recurrence']
test.head()

Unnamed: 0_level_0,Recurrence
1st Most Common Venue,Unnamed: 1_level_1
Park,4


In [452]:
test = toronto_merged.loc[toronto_merged['Cluster Labels'] == 1].groupby('1st Most Common Venue').count()
test = test.drop(test.columns.to_series()["Borough":"5th Most Common Venue"], axis=1).sort_values(columns[0], ascending = False)
test.columns = ['Recurrence']
test.head()

Unnamed: 0_level_0,Recurrence
1st Most Common Venue,Unnamed: 1_level_1
Bakery,1
Grocery Store,1
Mexican Restaurant,1


In [453]:
test = toronto_merged.loc[toronto_merged['Cluster Labels'] == 2].groupby('1st Most Common Venue').count()
test = test.drop(test.columns.to_series()["Borough":"5th Most Common Venue"], axis=1).sort_values(columns[0], ascending = False)
test.columns = ['Recurrence']
test.head()

Unnamed: 0_level_0,Recurrence
1st Most Common Venue,Unnamed: 1_level_1
Garden,1


In [454]:
test = toronto_merged.loc[toronto_merged['Cluster Labels'] == 3].groupby('1st Most Common Venue').count()
test = test.drop(test.columns.to_series()["Borough":"5th Most Common Venue"], axis=1).sort_values(columns[0], ascending = False)
test.columns = ['Recurrence']
test.head()

Unnamed: 0_level_0,Recurrence
1st Most Common Venue,Unnamed: 1_level_1
Coffee Shop,12
Café,6
Pizza Place,2
Airport Terminal,1
Bar,1


In [455]:
test = toronto_merged.loc[toronto_merged['Cluster Labels'] == 4].groupby('1st Most Common Venue').count()
test = test.drop(test.columns.to_series()["Borough":"5th Most Common Venue"], axis=1).sort_values(columns[0], ascending = False)
test.columns = ['Recurrence']
test.head()

Unnamed: 0_level_0,Recurrence
1st Most Common Venue,Unnamed: 1_level_1


In [456]:
test = toronto_merged.loc[toronto_merged['Cluster Labels'] == 5].groupby('1st Most Common Venue').count()
test = test.drop(test.columns.to_series()["Borough":"5th Most Common Venue"], axis=1).sort_values(columns[0], ascending = False)
test.columns = ['Recurrence']
test.head()

Unnamed: 0_level_0,Recurrence
1st Most Common Venue,Unnamed: 1_level_1
