In [1]:
from bs4 import BeautifulSoup
import urllib
import pandas as pd
import requests

import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded

pd.set_option('display.width', 1000)

# get the Wikipedia page containing postal codes for Toronto
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# the context manager closes the HTTP response as soon as the body has been read
with urllib.request.urlopen(link) as response:
    html_doc = response.read()

# parse the HTML of the page using Beautiful Soup
soup = BeautifulSoup(html_doc, 'html.parser')

# get the first Tag called <tbody> (expected to contain the table of postal codes)
tbody = soup.tbody

# Collect one [postal code, borough, neighborhoods] record per usable table cell and
# build the dataframe once at the end, instead of enlarging it one row at a time.
neighborhood_records = []

# drill down through the table, starting at the row level, and then examining each table cell (in each row)
for row in tbody.find_all("tr"):
    for cell in row.find_all("td"):
        anchors = cell.find_all("a")
        # a cell without any anchor carries no borough, so it contributes no row
        if not anchors:
            continue

        # the postal code is the text of the cell's first <b> tag;
        # the borough is the text of the cell's first anchor
        postal_code = str(cell.b.string)
        borough = str(anchors[0].string)

        if len(anchors) == 1:
            # only one anchor means that no neighborhood names are listed,
            # so the neighborhood string takes the same value as the borough string
            neighborhoods = borough
        else:
            # every anchor after the first one names a neighborhood; get_text() is used
            # because .string is None for an anchor that contains nested markup, which
            # would make plain string concatenation fail
            neighborhoods = ", ".join(anchor.get_text() for anchor in anchors[1:])

        neighborhood_records.append([postal_code, borough, neighborhoods])

# dataframe holding the information scraped from the page
toronto_neighborhoods = pd.DataFrame(neighborhood_records,
                                     columns=['PostalCode', 'Borough', 'Neighborhood'])

print(toronto_neighborhoods)
print(toronto_neighborhoods.shape)

    PostalCode              Borough                                       Neighborhood
0          M3A           North York                                          Parkwoods
1          M4A           North York                                   Victoria Village
2          M5A     Downtown Toronto                          Regent Park, Harbourfront
3          M6A           North York                   Lawrence Manor, Lawrence Heights
4          M7A         Queen's Park                                       Queen's Park
5          M9A            Etobicoke                                   Islington Avenue
6          M1B          Scarborough                                     Malvern, Rouge
7          M3B           North York                                          Don Mills
8          M4B            East York                    Parkview Hill, Woodbine Gardens
9          M5B     Downtown Toronto                           Garden District, Ryerson
10         M6B           North York        

In [2]:
# The code was removed by DSX for sharing.

In [None]:
# Geocode every postal code with the Google Geocoding API and attach the coordinates
# to the main dataframe as 'Latitude' and 'Longitude' columns.

# collect the coordinates in plain lists; the Series objects are built once after the loop
latitude_values = []
longitude_values = []

geocode_endpoint = 'https://maps.googleapis.com/maps/api/geocode/json'

for postal_code in toronto_neighborhoods['PostalCode']:

    # add "Toronto" to the postal code so that the Google API can correctly locate it based
    # on both city and postal code
    extended_postal_code = "Toronto, " + postal_code

    # let requests build and URL-encode the query string (the address contains a comma and a space)
    response = requests.get(
        geocode_endpoint,
        params={'key': API_key, 'address': extended_postal_code},
    ).json() # get response

    # fail with a descriptive message instead of an opaque IndexError/KeyError
    # when the API answers without any match
    results = response.get('results')
    if not results:
        raise RuntimeError(
            "Geocoding returned no results for '{}' (status: {})".format(
                extended_postal_code, response.get('status')))

    geographical_data = results[0]['geometry']['location'] # get geographical coordinates
    # `latitude` and `longitude` stay top-level names on purpose: other cells may read
    # the values left behind by the final iteration
    latitude = geographical_data['lat']
    longitude = geographical_data['lng']
    latitude_values.append(latitude)
    longitude_values.append(longitude)

# one coordinate per dataframe row, in row order
latitudes = pd.Series(latitude_values, index=toronto_neighborhoods.index, dtype=float)
longitudes = pd.Series(longitude_values, index=toronto_neighborhoods.index, dtype=float)

# add the "latitudes" and "longitudes" columns to the main dataframe
toronto_neighborhoods['Latitude'] = latitudes
toronto_neighborhoods['Longitude'] = longitudes

print(toronto_neighborhoods)

In [None]:
import numpy as np # library to handle data in a vectorized manner

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
!pip install folium
import folium # map rendering library

print('Libraries imported.')

In [None]:
# keep only the "Downtown Toronto" rows, leaving out postal codes M6G and M5W
# NOTE(review): the reason for excluding these two postal codes is not visible here — confirm
is_downtown = toronto_neighborhoods['Borough'] == "Downtown Toronto"
is_kept_postal_code = (toronto_neighborhoods['PostalCode'] != "M6G") & (toronto_neighborhoods['PostalCode'] != "M5W")
downtown_toronto_neighborhoods = toronto_neighborhoods[is_downtown & is_kept_postal_code].reset_index(drop=True)

# Center the map on the mean coordinate of the neighborhoods being plotted. The previous
# center reused the `latitude`/`longitude` variables left over from the geocoding loop,
# i.e. whichever postal code happened to be geocoded last.
map_center = [downtown_toronto_neighborhoods['Latitude'].mean(),
              downtown_toronto_neighborhoods['Longitude'].mean()]
map_downtown_toronto = folium.Map(location=map_center, zoom_start=10)

# add one circle marker per neighborhood, with a "<neighborhood>, <borough>" popup
for lat, lng, borough, neighborhood in zip(downtown_toronto_neighborhoods['Latitude'],
                                           downtown_toronto_neighborhoods['Longitude'],
                                           downtown_toronto_neighborhoods['Borough'],
                                           downtown_toronto_neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown_toronto)

map_downtown_toronto

In [None]:
# The code was removed by DSX for sharing.

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping-like
        Record indexed by key; the category list is read from ``row['categories']``
        and, when that key is missing, from ``row['venue.categories']``.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the category
    list is empty or ``None``.

    Raises
    ------
    KeyError
        If neither ``'categories'`` nor ``'venue.categories'`` is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; any other error is a real problem
        # and is allowed to propagate instead of being silently swallowed
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    

# maximum number of venues requested per Foursquare API call
# (interpolated into the `limit` query parameter by getNearbyVenues)
LIMIT = 100

# search radius value
# NOTE(review): getNearbyVenues declares its own `radius=500` default and the visible call
# does not pass this variable, so this module-level name looks unused — confirm
radius = 500

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; one request is made per (name, latitude, longitude) triple.
    radius : number, default 500
        Value sent as the ``radius`` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty, but still has
        these columns, when no venue is returned at all.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT values and
    prints each name as it is processed.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may come back without any category; record None instead of failing
                categories[0]['name'] if categories else None))

    # naming the columns in the constructor keeps the header even when there are no rows
    # (assigning .columns onto an empty frame raises a length-mismatch ValueError)
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return(nearby_venues)

# fetch the venues around every downtown neighborhood (names, latitudes, longitudes in that order)
downtown_toronto_venues = getNearbyVenues(
    downtown_toronto_neighborhoods['Neighborhood'],
    downtown_toronto_neighborhoods['Latitude'],
    downtown_toronto_neighborhoods['Longitude'],
)



In [None]:
# report the (rows, columns) size of the collected venues table, then preview its first rows
print(downtown_toronto_venues.shape)
downtown_toronto_venues.head()

In [None]:
# per neighborhood, count the non-null entries of every column, i.e. how many venues were returned for it
downtown_toronto_venues.groupby('Neighborhood').count()

In [None]:
# one hot encoding: one indicator column per distinct venue category
category_dummies = pd.get_dummies(downtown_toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# If a venue category is literally named "Neighborhood", its indicator column would be
# overwritten by the neighborhood names added below and the column reordering would move
# the wrong column to the front. Renaming it first keeps both pieces of information
# (the rename is a no-op when no such category exists).
category_dummies = category_dummies.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add neighborhood column back to dataframe
downtown_toronto_onehot = category_dummies.copy()
downtown_toronto_onehot['Neighborhood'] = downtown_toronto_venues['Neighborhood']

# move neighborhood column to the first column (by name, not by position)
fixed_columns = ['Neighborhood'] + list(category_dummies.columns)
downtown_toronto_onehot = downtown_toronto_onehot[fixed_columns]

downtown_toronto_onehot.head()

In [None]:
# (rows, columns) of the one-hot table: one row per venue, one column per venue category plus 'Neighborhood'
downtown_toronto_onehot.shape

In [None]:
# average the one-hot indicators per neighborhood, which gives each category's share of
# that neighborhood's venues; reset_index() turns 'Neighborhood' back into a regular column
downtown_toronto_grouped = downtown_toronto_onehot.groupby('Neighborhood').mean().reset_index()
downtown_toronto_grouped

In [None]:
# (rows, columns) of the grouped table: one row per neighborhood present in the venues data
downtown_toronto_grouped.shape

In [None]:
# how many of the most frequent venue categories to print per neighborhood
num_top_venues = 5

for hood in downtown_toronto_grouped['Neighborhood']:
    print("----"+hood+"----")

    # transpose this neighborhood's row into a two-column (venue, freq) table
    is_current_hood = downtown_toronto_grouped['Neighborhood'] == hood
    frequency_table = downtown_toronto_grouped[is_current_hood].T.reset_index()
    frequency_table.columns = ['venue','freq']

    # the first transposed row is the 'Neighborhood' label itself, not a category
    frequency_table = frequency_table.iloc[1:]
    frequency_table['freq'] = frequency_table['freq'].astype(float)

    # round, rank from most to least frequent, and show the leading entries
    ranked_table = (
        frequency_table
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(ranked_table.head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row, best first.

    The first element of ``row`` is skipped (in this notebook it holds the
    neighborhood name); the remaining values are ranked in descending order
    and the index labels of the first ``num_top_venues`` of them are returned
    as an array.
    """
    # drop the leading label entry, then rank what is left from largest to smallest
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
# number of most common venue categories to list per neighborhood
num_top_venues = 10

# suffixes for ordinals ending in 1, 2 and 3; everything else uses "th"
indicators = ['st', 'nd', 'rd']


def _ordinal_label(position):
    """Return `position` followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    last_digit = position % 10
    # 11, 12 and 13 (and 111, 112, ...) take "th" even though they end in 1, 2, 3
    if 1 <= last_digit <= 3 and not 11 <= position % 100 <= 13:
        return '{}{}'.format(position, indicators[last_digit - 1])
    return '{}th'.format(position)


# create columns according to number of top venues
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    columns.append('{} Most Common Venue'.format(_ordinal_label(rank)))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = downtown_toronto_grouped['Neighborhood']

# fill every row with that neighborhood's venue categories, most frequent first
for ind in np.arange(downtown_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(downtown_toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

In [None]:
# set number of clusters
kclusters = 5

# feature matrix for clustering: the per-neighborhood category frequencies without the name column
# (`columns=` replaces the positional axis argument, which newer pandas versions no longer accept)
downtown_toronto_grouped_clustering = downtown_toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

In [None]:
# kmeans.labels_ follows the row order of downtown_toronto_grouped (the frame the model was
# fitted on), which is not the row order of downtown_toronto_neighborhoods. Pair every label
# with its neighborhood name and look the label up by name instead of assigning by position.
cluster_by_neighborhood = dict(zip(downtown_toronto_grouped['Neighborhood'], kmeans.labels_))
cluster_labels = downtown_toronto_neighborhoods['Neighborhood'].map(cluster_by_neighborhood)

# neighborhoods that were not part of the clustering input get no label and are left out;
# .copy() keeps downtown_toronto_neighborhoods itself unmodified
has_cluster = cluster_labels.notna()
downtown_toronto_merged = downtown_toronto_neighborhoods[has_cluster].copy()

# add clustering labels
downtown_toronto_merged['Cluster Labels'] = cluster_labels[has_cluster].astype(int)

# join the ranked venue categories of each neighborhood onto its postal code, coordinates and cluster label
downtown_toronto_merged = downtown_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

downtown_toronto_merged.head() # check the last columns!

In [None]:
# create map, centered on the mean coordinate of the clustered neighborhoods
# (the previous center reused `latitude`/`longitude` values left over from the geocoding loop)
map_center = [downtown_toronto_merged['Latitude'].mean(),
              downtown_toronto_merged['Longitude'].mean()]
map_clusters = folium.Map(location=map_center, zoom_start=11)

# set color scheme for the clusters: kclusters evenly spaced colors from the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per neighborhood to the map, colored by its cluster
for lat, lon, poi, cluster in zip(downtown_toronto_merged['Latitude'],
                                  downtown_toronto_merged['Longitude'],
                                  downtown_toronto_merged['Neighborhood'],
                                  downtown_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster 0 maps to index -1, i.e. the last color of the list
    cluster_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
# rows labelled cluster 0, showing column 2 together with every column from position 5 onward
cluster_mask = downtown_toronto_merged['Cluster Labels'] == 0
cluster_column_positions = [2] + list(range(5, downtown_toronto_merged.shape[1]))
downtown_toronto_merged.loc[cluster_mask, downtown_toronto_merged.columns[cluster_column_positions]]

In [None]:
# rows labelled cluster 1, showing column 2 together with every column from position 5 onward
cluster_mask = downtown_toronto_merged['Cluster Labels'] == 1
cluster_column_positions = [2] + list(range(5, downtown_toronto_merged.shape[1]))
downtown_toronto_merged.loc[cluster_mask, downtown_toronto_merged.columns[cluster_column_positions]]

In [None]:
# rows labelled cluster 2, showing column 2 together with every column from position 5 onward
cluster_mask = downtown_toronto_merged['Cluster Labels'] == 2
cluster_column_positions = [2] + list(range(5, downtown_toronto_merged.shape[1]))
downtown_toronto_merged.loc[cluster_mask, downtown_toronto_merged.columns[cluster_column_positions]]

In [None]:
# rows labelled cluster 3, showing column 2 together with every column from position 5 onward
cluster_mask = downtown_toronto_merged['Cluster Labels'] == 3
cluster_column_positions = [2] + list(range(5, downtown_toronto_merged.shape[1]))
downtown_toronto_merged.loc[cluster_mask, downtown_toronto_merged.columns[cluster_column_positions]]

In [None]:
# rows labelled cluster 4, showing column 2 together with every column from position 5 onward
cluster_mask = downtown_toronto_merged['Cluster Labels'] == 4
cluster_column_positions = [2] + list(range(5, downtown_toronto_merged.shape[1]))
downtown_toronto_merged.loc[cluster_mask, downtown_toronto_merged.columns[cluster_column_positions]]

In [None]:
# Observations:  

# Cluster 0 is differentiated by the fact that it has "Clothing Store" and "Cosmetics Shop" in its top 5; these venues don't show up in the top 10 of any other neighborhood
# in any other cluster

# Cluster 1 is differentiated by the fact that there are a lot of "Coffee Shops" and "Cafe" in the top 5 for each neighborhood, but only a few venues that
# are actual restaurants

# Cluster 2 is differentiated by the fact that its venues are almost all airport-related 

# Cluster 3 is differentiated by the fact that it has more "Hotel" and "...Restaurant" venues in its top 5 than other clusters

# Cluster 4 is differentiated by the fact that it has more "Bar" and "Cocktail Bar" venues in its top 5 than other clusters