## Import necessary Libraries

In [109]:
import urllib.request

#get_ipython().system(u' pip install beautifulsoup4')
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np # library to handle data in a vectorized manner

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

#!conda install -c conda-forge geocoder
import geocoder # import geocoder

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

## Webscrape table from wikipedia

In [110]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "lxml")

table = soup.find('table', class_='wikitable sortable')
#table

In [111]:
PostalCode = [] # List to store postal codes
Borough = [] # List to store borough names
Neighborhood = [] # List to store neighborhood names

for row in table.findAll('tr'):
    cells = row.findAll('td')
    if len(cells)==3:
        br_name = cells[1].findAll('a') 
        if not br_name: # check if borough name exists - if not, skip row
            continue
        PostalCode.append(cells[0].find(text=True)) # check if borough name exists - if yes, save postal code
        Borough.append(br_name[0].find(text=True)) # check if borough name exists - if yes, save borough name
        
        neigh_name_temp = cells[2].findAll('a') 
        if not neigh_name_temp: # check if neighborhood name exists - if not, copy borough name over
            neigh_name = br_name[0].find(text=True)
        else:
            neigh_name = cells[2].find(text=True) # check if neighborhood name exists - if yes, save neighborhood name
        
        Neighborhood.append(neigh_name)

df = pd.DataFrame({'PostalCode':PostalCode,'Borough':Borough,'Neighborhood':Neighborhood})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


### Drop duplicate zips for further analysis 
#### Lat Long are determined by zip codes and not by neighborhood names

In [112]:
print(df.shape)
df = df.drop_duplicates('PostalCode')
print(df.shape)

(207, 3)
(100, 3)


##  Get latitude and longitude coordinates of each zip code

In [114]:
# get lat long from the csv file
!wget -q -O 'Geospatial_Coordinates.csv' https://cocl.us/Geospatial_data
lat_long_df = pd.read_csv('Geospatial_Coordinates.csv')
lat_long_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [115]:
# assemble lat long in the main dataframe
latitude = []
longitude = []

for ind in df.index:
    test = df['PostalCode'][ind]
    
    ind_new = lat_long_df[lat_long_df['Postal Code']==test].index.tolist()
    latitude.append(lat_long_df['Latitude'][ind_new].tolist()[0])
    longitude.append(lat_long_df['Longitude'][ind_new].tolist()[0])

df['Latitude'] = latitude
df['Longitude'] = longitude

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494


## Neighborhood Segmentation

### Plotting Toronto neighborhoods on the map

In [116]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="on_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [117]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### Define Foursquare Credentials and Version

In [118]:
CLIENT_ID = 'JHZEFTFI5EMA5DTB5SWOVUNWGBATRBAB15HG2SE1TBGVI1CY' # your Foursquare ID
CLIENT_SECRET = 'HRHTUXSDB2GFQMM1B40FYQCD3YSD1QNWBNAWQQ2GG4SD4YMY' # your Foursquare Secret
VERSION = '20180604' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JHZEFTFI5EMA5DTB5SWOVUNWGBATRBAB15HG2SE1TBGVI1CY
CLIENT_SECRET:HRHTUXSDB2GFQMM1B40FYQCD3YSD1QNWBNAWQQ2GG4SD4YMY


### Get info on neighborhoods from foursquare

In [120]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Create a new dataframe with info from Foursquare

In [121]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude'])
print(toronto_venues.shape)
toronto_venues.head()

Parkwoods
Victoria Village
Harbourfront
Lawrence Heights
Queen's Park
Queen's Park
Rouge
North York
Woodbine Gardens
Downtown Toronto
North York
Etobicoke
Highland Creek
Flemingdon Park
Woodbine Heights
St. James Town
Etobicoke
Scarborough
The Beaches
Berczy Park
Woburn
Leaside
Downtown Toronto
Downtown Toronto
Scarborough
Hillcrest Village
Bathurst Manor
Thorncliffe Park
Downtown Toronto
Dovercourt Village
Scarborough Village
North York
Northwood Park
East Toronto
Downtown Toronto
Little Portugal
Scarborough
Bayview Village
CFB Toronto
East Toronto
Design Exchange
West Toronto
Clairlea
North York
Downsview West
East Toronto
Commerce Court
Downsview
Humber Summit
Cliffcrest
Newtonbrook
North York
East Toronto
Bedford Park
York
Emery
Birch Cliff
North York
North York
Lawrence Park
Central Toronto
York
Weston
Dorset Park
North York
Central Toronto
Forest Hill North
High Park
Etobicoke
Maryvale
Willowdale West
Central Toronto
The Annex
Parkdale
Kingsview Village
Agincourt
Central Toronto


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.753259,-79.329656,TTC stop - 44 Valley Woods,43.755402,-79.333741,Bus Stop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [122]:
toronto_venues.groupby('Neighborhood').count()


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
Agincourt North,2,2,2,2,2,2
Alderwood,9,9,9,9,9,9
Bathurst Manor,22,22,22,22,22,22
Bayview Village,4,4,4,4,4,4
Bedford Park,26,26,26,26,26,26
Berczy Park,56,56,56,56,56,56
Birch Cliff,4,4,4,4,4,4
CFB Toronto,4,4,4,4,4,4
CN Tower,17,17,17,17,17,17


In [123]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 270 uniques categories.


In [124]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [125]:
toronto_onehot.shape

(2201, 270)

### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [126]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Agincourt,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,Agincourt North,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,Alderwood,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,Bathurst Manor,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.045455,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,Bayview Village,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,Bedford Park,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.038462
6,Berczy Park,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.017857,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
7,Birch Cliff,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,CFB Toronto,0.000000,0.000000,0.000000,0.250000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,CN Tower,0.000000,0.000000,0.000000,0.058824,0.058824,0.058824,0.117647,0.176471,0.117647,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [127]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [128]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
1,Agincourt North,Park,Playground,Women's Store,Dog Run,Deli / Bodega,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store
2,Alderwood,Pizza Place,Coffee Shop,Pool,Gym,Skating Rink,Pharmacy,Pub,Sandwich Place,Dance Studio,Deli / Bodega
3,Bathurst Manor,Coffee Shop,Shopping Mall,Pizza Place,Bank,Supermarket,Middle Eastern Restaurant,Sushi Restaurant,Restaurant,Fast Food Restaurant,Deli / Bodega
4,Bayview Village,Japanese Restaurant,Café,Bank,Chinese Restaurant,Department Store,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Women's Store


### Run *k*-means to cluster the neighborhood into 5 clusters.

In [129]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 0, 0, 0, 0, 0, 0, 3, 0], dtype=int32)

In [96]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,3.0,Bus Stop,Park,Food & Drink Shop,Dog Run,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Women's Store
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Portuguese Restaurant,Pizza Place,Hockey Arena,French Restaurant,Coffee Shop,Eastern European Restaurant,Dumpling Restaurant,Drugstore,Donut Shop,Doner Restaurant
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,0.0,Coffee Shop,Bakery,Park,Pub,Mexican Restaurant,Breakfast Spot,Café,Performing Arts Venue,Beer Store,Hotel
3,M6A,North York,Lawrence Heights,43.718518,-79.464763,0.0,Clothing Store,Women's Store,Miscellaneous Shop,Furniture / Home Store,Event Space,Athletics & Sports,Arts & Crafts Store,Boutique,Vietnamese Restaurant,Accessories Store
5,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,0.0,Coffee Shop,Gym,Park,Yoga Studio,Diner,Fast Food Restaurant,Beer Bar,Italian Restaurant,Sculpture Garden,Sandwich Place


In [130]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    if np.isnan(cluster):
        continue
        
    cluster=int(cluster)
    
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters