In [1]:
import re, csv

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import folium
from sklearn.cluster import KMeans
import matplotlib.colors as colors
import matplotlib.cm as cm

# 0. Scrape Data from Wikipedia

In [2]:
source = requests.get(' https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [3]:
soup = BeautifulSoup(source.text, 'lxml')

In [4]:
content = soup.find('tbody')

## Write the data into a csv file (Delete the nonsigned Postal code)

In [5]:
with open('toronto_data.csv', 'w') as csv_file:
    csv_writer = csv.writer(csv_file)
    for tr in content.find_all('tr'): # each row
        row = []
        for td in tr.find_all('td'):
            row.append(td.text.replace('\n', ''))
        # for the first row (titles)
        if not row: row = re.split(r'\n+', tr.text)[1:-1]
        # skip the 'Not assgined' rows
        if row[1] == 'Not assigned': continue
        csv_writer.writerow(row)

# 1. Transform the data into a dataframe

In [6]:
df = pd.read_csv('toronto_data.csv')
df.rename(columns={'Postal code': 'PostCode'}, inplace=True)
df.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [7]:
groups = df.groupby(by=df.columns[0])

In [8]:
for key, group in groups:
    if len(group) >1: 
        print(key, group.shape)
        break
else:
    print("No duplicate values in the column 'Postal code'!")

No duplicate values in the column 'Postal code'!


## seperate neighborhoods belonging to the same borough by comma

In [9]:
df.iloc[:,-1] = df.iloc[:, -1].apply(lambda x: ', '.join(re.split(r'\ +/\ +', x.strip())))

In [10]:
df.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Shape of Dataframe

In [11]:
df.shape

(103, 3)

# 2. Get the latitude and longitude coordinates

In [12]:
"""
import geocoder
lat_lng = np.zeros((len(df), 2))
for i, postal_code in enumerate(df.iloc[:,0]):
    lat_lng_one = None
    address = '{}, Toronto, Ontario'.format(postal_code)
    print(address)
    while True:
        g = geocoder.google(address)
        if g.ok: break
    lat_lng_one = g.latlng
    lat_lng[i] = lat_lng_one[0:2]
"""
print("")




Since the above code does not work, I use the data downloaded from the link: https://cocl.us/Geospatial_data
as the file "Geospatial_Coordinates.cvs".

In [17]:
file_lat_lng = 'Geospatial_Coordinates.csv'
df_lat_lng = pd.read_csv(file_lat_lng)
df_lat_lng.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [18]:
df_merged = df.merge(df_lat_lng, left_on=df.columns[0], right_on=df_lat_lng.columns[0], how='left').drop('Postal Code', axis=1)

In [19]:
df_merged.isnull().any()

PostCode        False
Borough         False
Neighborhood    False
Latitude        False
Longitude       False
dtype: bool

In [20]:
df_merged.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [21]:
df_merged.shape

(103, 5)

# 3. Explore the neighborhoods in Toronto

Get the boroughs with name including Toronto

In [22]:
df_toronto = df_merged[df_merged['Borough'].str.contains('Toronto')]

In [23]:
df_toronto.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [24]:
df_toronto.shape

(39, 5)

## 3.1 Create a map using the latitude and longitude

In [25]:
map_toronto = folium.Map(location=df_toronto.iloc[0, -2:], width='80%', height='80%', zoom_start=11)
# add marker to map
for lat, lng, neighbor, borough, postcode in zip(df_toronto.Latitude, 
                                                 df_toronto.Longitude, 
                                                 df_toronto.Neighborhood, 
                                                 df_toronto.Borough,
                                                 df_toronto.PostCode):
  label = 'Post {}, {}, {}'.format(postcode, neighbor,borough)
  label = folium.Popup(label, parse_html=True, max_width=300)
  folium.CircleMarker(
      [lat, lng],
      radius=5,
      popup=label,
      color='blue',
      fill=True,
      fill_color='#3186cc',
      fill_opacity=0.7,
      parse_html=False).add_to(map_toronto)  
map_toronto

# 3.2 Get top 100 venues for the neighborhoods in df_toronto from Foursquare.

Define Foursquare Credential and version

In [42]:
CLIENT_ID = 'Foursquare ID' # your Foursquare ID
CLIENT_SECRET = 'Foursquare Secret' # your Foursquare Secret
VERSION = '20200401' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: Foursquare ID
CLIENT_SECRET:Foursquare Secret


Define function to get venues for each borough

In [27]:
LIMIT = 100
def getNearbyVenues(postcodes, latitudes, longitudes, radius=500):
    venues_list=[]
    for postcode, lat, lng in zip(postcodes, latitudes, longitudes):
        print(postcode)
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']      
        # return only relevant information for each nearby venue
        venues_list.append([(
            postcode, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostCode', 
                  'PostcodeLatitude', 
                  'PostcodeLongitude', 
                  'Venue', 
                  'VenueLatitude', 
                  'VenueLongitude', 
                  'VenueCategory'] 
    return(nearby_venues)

In [28]:
toronto_venues = getNearbyVenues(df_toronto.PostCode, df_toronto.Latitude, df_toronto.Longitude)

M5A
M7A
M5B
M5C
M4E
M5E
M5G
M6G
M5H
M6H
M5J
M6J
M4K
M5K
M6K
M4L
M5L
M4M
M4N
M5N
M4P
M5P
M6P
M4R
M5R
M6R
M4S
M5S
M6S
M4T
M5T
M4V
M5V
M4W
M5W
M4X
M5X
M4Y
M7Y


In [29]:
print(toronto_venues.shape)
toronto_venues.head()

(1602, 7)


Unnamed: 0,PostCode,PostcodeLatitude,PostcodeLongitude,Venue,VenueLatitude,VenueLongitude,VenueCategory
0,M5A,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,M5A,43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,M5A,43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,M5A,43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,M5A,43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot


In [30]:
print(len(toronto_venues.PostCode.unique()))
print(df_toronto.shape)
print("Nunber of the unique venue categories: ", len(toronto_venues.VenueCategory.unique()))

39
(39, 5)
Nunber of the unique venue categories:  228


# 4. Cluster the neighborhoods by using the venue categories
## 4.1 One hot vectorize the venue categories

In [31]:
venues_onehot = pd.get_dummies(toronto_venues.iloc[:,-1])
venues_onehot.insert(0, toronto_venues.columns[0], toronto_venues.iloc[:,0])
print("shape: ",venues_onehot.shape)
venues_onehot.head()

shape:  (1602, 229)


Unnamed: 0,PostCode,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M5A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Gruop rows by PostCode

In [32]:
toronto_grouped = venues_onehot.groupby(by='PostCode').mean().reset_index()
toronto_grouped

Unnamed: 0,PostCode,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,M4E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,0.0,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,...,0.0,0.0,0.023256,0.0,0.0,0.0,0.0,0.0,0.0,0.023256
2,M4L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.025
4,M4N,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,M4P,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,M4R,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
7,M4S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.029412,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,M4T,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,M4V,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0


In [33]:
len(df_toronto.Borough.unique())

4

## 4.2 Get the top 10 venues for each neighborhoodE 

In [34]:
# define the function to return the top venues
def return_most_common_venues(row, num_top_venues):
    row_sorted = row.sort_values(ascending=False)
    
    return row_sorted.index.values[0:num_top_venues]


n_top = 10
indicator = ['st', 'nd', 'rd']
# make the columns
cols = ['PostCode']
for idx in range(n_top):
  tmp_col = '{}{}_Most_Common_Venue'
  try: new_col = tmp_col.format(idx+1, indicator[idx])
  except: new_col = tmp_col.format(idx+1, 'th')
  cols.append(new_col)

toronto_top_venue = pd.DataFrame(columns=cols)
toronto_top_venue.iloc[:, 0] = toronto_grouped.iloc[:, 0]
# add top venues for each row
for idx in range(len(toronto_grouped)):
  toronto_top_venue.iloc[idx, 1:] = return_most_common_venues(
                                          toronto_grouped.iloc[idx, 1:], n_top)
toronto_top_venue.head()

Unnamed: 0,PostCode,1st_Most_Common_Venue,2nd_Most_Common_Venue,3rd_Most_Common_Venue,4th_Most_Common_Venue,5th_Most_Common_Venue,6th_Most_Common_Venue,7th_Most_Common_Venue,8th_Most_Common_Venue,9th_Most_Common_Venue,10th_Most_Common_Venue
0,M4E,Neighborhood,Pub,Coffee Shop,Health Food Store,Trail,Asian Restaurant,Yoga Studio,Discount Store,Distribution Center,Dog Run
1,M4K,Greek Restaurant,Coffee Shop,Italian Restaurant,Bookstore,Frozen Yogurt Shop,Ice Cream Shop,Furniture / Home Store,Yoga Studio,Bubble Tea Shop,Spa
2,M4L,Fast Food Restaurant,Gym,Pub,Liquor Store,Sandwich Place,Burrito Place,Italian Restaurant,Intersection,Restaurant,Ice Cream Shop
3,M4M,Café,Coffee Shop,Brewery,Gastropub,Bakery,American Restaurant,Neighborhood,Sandwich Place,Cheese Shop,Clothing Store
4,M4N,Park,Bus Line,Swim School,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant


## 4.3 Cluster the neighborhoods

Run k-means to cluster the neighborhoods into 4 clusters

In [35]:
# set the number of cluster
k = 4
toronto_cluster = toronto_grouped.drop('PostCode', axis=1)
kmeans = KMeans(n_clusters=k, random_state=42).fit(toronto_cluster)
kmeans.labels_[:10]

array([0, 0, 0, 0, 3, 0, 0, 0, 2, 0], dtype=int32)

In [36]:
df_toronto_final = df_toronto.merge(toronto_top_venue, on='PostCode')
df_toronto_final.insert(loc=5, column='ClusterLabel', value=kmeans.labels_)
df_toronto_final.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude,ClusterLabel,1st_Most_Common_Venue,2nd_Most_Common_Venue,3rd_Most_Common_Venue,4th_Most_Common_Venue,5th_Most_Common_Venue,6th_Most_Common_Venue,7th_Most_Common_Venue,8th_Most_Common_Venue,9th_Most_Common_Venue,10th_Most_Common_Venue
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Restaurant,Café,Theater,Mexican Restaurant,Shoe Store
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Diner,Sushi Restaurant,Yoga Studio,Park,Mexican Restaurant,Juice Bar,Italian Restaurant,Hobby Shop,Fried Chicken Joint
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0,Coffee Shop,Clothing Store,Café,Bubble Tea Shop,Japanese Restaurant,Cosmetics Shop,Middle Eastern Restaurant,Restaurant,Tea Room,Diner
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0,Coffee Shop,Café,Cocktail Bar,Gastropub,Italian Restaurant,American Restaurant,Seafood Restaurant,Farmers Market,Hotel,Department Store
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,Neighborhood,Pub,Coffee Shop,Health Food Store,Trail,Asian Restaurant,Yoga Studio,Discount Store,Distribution Center,Dog Run


In [37]:
df_toronto_final.groupby('ClusterLabel').count().iloc[:,0]

ClusterLabel
0    35
1     1
2     1
3     2
Name: PostCode, dtype: int64

## 4.4 Visualize the clusters on the map

In [41]:
map_clusters = folium.Map(location=df_toronto_final.loc[0, ['Latitude', 'Longitude']], 
                          width='80%', height='80%', zoom_start=11)
# set color scheme for the clusters
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add marker to map
for lat, lng, neighbor, borough, postcode, cluster in zip(df_toronto_final.Latitude, 
                                                        df_toronto_final.Longitude, 
                                                        df_toronto_final.Neighborhood, 
                                                        df_toronto_final.Borough,
                                                        df_toronto_final.PostCode,
                                                        df_toronto_final.ClusterLabel):
  label = 'Post {}, {}, {}, Cluster {}'.format(postcode, neighbor, borough, cluster)
  label = folium.Popup(label, parse_html=True, max_width=300)
  folium.CircleMarker(
      [lat, lng],
      radius=5,
      popup=label,
      color=rainbow[cluster],
      fill=True,
      fill_color=rainbow[cluster],
      fill_opacity=0.7,
      parse_html=False).add_to(map_toronto)  
map_toronto