## import libraries

In [1]:
import json
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas import json_normalize
from sklearn.cluster import KMeans

## make a request 

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# parsing failure in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

## parse the HTML

In [3]:
# Parse the downloaded HTML with the lxml parser so the table can be queried.
soup = BeautifulSoup(html_content, 'lxml')


## extract the required data

In [4]:
# Locate the postal-code table on the page.
can_table = soup.find("table", attrs={"class": "wikitable sortable"})
if can_table is None:
    raise ValueError("Could not find the 'wikitable sortable' table in the downloaded page")
# Fall back to the table element itself when the markup has no <tbody>.
can_table_data = (can_table.tbody or can_table).find_all("tr")

# Collect the <td> cells of every row; rows without any <td> are skipped.
rows = []
for tr in can_table_data:
    cells = tr.find_all("td")
    if cells:
        rows.append(cells)

# Build cleaned [postal code, borough, neighborhood] records.
table_data = []
for row in rows:
    # Rows with fewer than three cells cannot be unpacked into a record.
    if len(row) < 3:
        continue
    postalcode, borough, neighborhood = (cell.text.strip() for cell in row[:3])
    # Postal codes without a borough are dropped entirely.
    if borough == 'Not assigned':
        continue
    # A missing neighborhood falls back to the borough name.
    if neighborhood == 'Not assigned':
        neighborhood = borough
    table_data.append([postalcode, borough, neighborhood])

## convert the data into dataframe

In [5]:
# Column names for the records collected in `table_data`.
col = ['PostalCode', 'Borough', 'Neighborhood']
# One row per scraped [postal code, borough, neighborhood] record.
df = pd.DataFrame(table_data, columns=col)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## Let's check the dimensions of the dataframe

In [6]:
# Shape of the scraped postal-code table as (rows, columns).
df.shape

(103, 3)

In [7]:
# Load the per-postal-code coordinates from the hosted CSV.
dfgeo = pd.read_csv("https://cocl.us/Geospatial_data")
dfgeo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Number of non-null postal codes in the coordinates table.
# NOTE: relies on the original 'Postal Code' column name, so it must run
# before the cell that renames that column.
dfgeo['Postal Code'].count()

103

## Add coordinates 

In [11]:
# Align the key column name with `df` so both frames can be merged on it.
dfgeo.rename(columns={'Postal Code' : 'PostalCode'}, inplace=True)
# A left merge keeps every row of `df` and adds Latitude/Longitude where the postal code matches.
df_new = pd.merge(df, dfgeo, on='PostalCode', how='left')
# NOTE: `df2` is a second name for the same DataFrame object, not a copy.
df2 = df_new
df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [13]:
# Count the non-null values of each column per borough.
df2.groupby('Borough').count()

Unnamed: 0_level_0,PostalCode,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Scarborough,17,17,17,17
West Toronto,6,6,6,6
York,5,5,5,5


In [18]:
# Candidate marker colours. NOTE: in the visible notebook this list is
# reassigned in the map cell before anything reads it.
color_list = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'white', 'pink', 'gray', 'black']

## Function to get the categories of a venue

In [20]:
def get_category(row):
    """Return the name of the first category of a venue record.

    Parameters
    ----------
    row : mapping-like (e.g. dict or pandas Series)
        Must expose the category list under the key ``'categories'`` or,
        failing that, ``'venue.categories'``. Each category is expected to be
        a mapping with a ``'name'`` entry.

    Returns
    -------
    The ``'name'`` of the first category, or ``None`` when the category list
    is empty or ``None``.

    Raises
    ------
    KeyError
        If neither ``'categories'`` nor ``'venue.categories'`` is present.
    """
    try:
        category_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors
        # (including KeyboardInterrupt) are no longer swallowed.
        category_list = row['venue.categories']

    if category_list is None or len(category_list) == 0:
        return None
    return category_list[0]['name']

## Function to get the venues

In [72]:
def get_data(postalcode, names, latitudes, longitudes, radius=1000):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Parameters
    ----------
    postalcode, names, latitudes, longitudes : iterables
        Parallel sequences describing the locations to query; they are
        consumed together with ``zip``, so iteration stops at the shortest.
    radius : int, default 1000
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``'Postal code'``,
        ``'Name'``, ``'Latitude'``, ``'Longitude'``, ``'Venue Name'``,
        ``'Venue Latitude'``, ``'Venue Longitude'``, ``'Venue Category'``.
        The frame is empty, with the same columns, when there are no
        locations or no venues.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``limit`` are read from
    module-level globals and must be defined before this function is called.
    Each postal code is printed as a progress indicator.
    """
    columns = ['Postal code', 'Name', 'Latitude', 'Longitude',
               'Venue Name', 'Venue Latitude', 'Venue Longitude', 'Venue Category']

    records = []
    for pc, name, lat, lng in zip(postalcode, names, latitudes, longitudes):
        print(pc)

        # Let requests build and escape the query string instead of formatting it by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': limit,
            },
            timeout=30,
        )
        # Fail loudly on HTTP errors rather than with a KeyError while indexing the payload.
        response.raise_for_status()
        items = response.json()['response']['groups'][0]['items']

        for item in items:
            venue = item['venue']
            categories = venue['categories']
            records.append((
                pc,
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without categories yields None instead of an IndexError.
                categories[0]['name'] if categories else None,
            ))

    # Build the frame once, after the loop, instead of on every iteration.
    return pd.DataFrame(records, columns=columns)

## Function to get the top venues of an area

In [22]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted
    in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [73]:
# Foursquare credentials are read from the environment so they are never
# committed with the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises KeyError here.
# NOTE(review): the credentials previously hard-coded in this cell should be
# treated as leaked and rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']  # Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']  # Foursquare Secret
VERSION = '20180604'  # sent as the `v` query parameter
limit = 100  # sent as the `limit` query parameter
radius = 1000  # search radius in metres

## Fetch the venues within 1000 m of every postal code

In [74]:
# Pass the configured `radius` explicitly so the configuration cell controls
# the search distance instead of get_data's default.
Toronto_venues = get_data(
    postalcode=df2['PostalCode'],
    names=df2['Neighborhood'],
    latitudes=df2['Latitude'],
    longitudes=df2['Longitude'],
    radius=radius,
)

M3A
M4A
M5A
M6A
M7A
M9A
M1B
M3B
M4B
M5B
M6B
M9B
M1C
M3C
M4C
M5C
M6C
M9C
M1E
M4E
M5E
M6E
M1G
M4G
M5G
M6G
M1H
M2H
M3H
M4H
M5H
M6H
M1J
M2J
M3J
M4J
M5J
M6J
M1K
M2K
M3K
M4K
M5K
M6K
M1L
M2L
M3L
M4L
M5L
M6L
M9L
M1M
M2M
M3M
M4M
M5M
M6M
M9M
M1N
M2N
M3N
M4N
M5N
M6N
M9N
M1P
M2P
M4P
M5P
M6P
M9P
M1R
M2R
M4R
M5R
M6R
M7R
M9R
M1S
M4S
M5S
M6S
M1T
M4T
M5T
M1V
M4V
M5V
M8V
M9V
M1W
M4W
M5W
M8W
M9W
M1X
M4X
M5X
M8X
M4Y
M7Y
M8Y
M8Z


In [75]:
# Preview the first rows of the venue table.
Toronto_venues.head()

Unnamed: 0,Postal code,Name,Latitude,Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Category
0,M3A,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,M3A,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,M3A,Parkwoods,43.753259,-79.329656,A&W,43.760643,-79.326865,Fast Food Restaurant
3,M3A,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
4,M3A,Parkwoods,43.753259,-79.329656,Bruno's valu-mart,43.746143,-79.32463,Grocery Store


In [76]:
Toronto_venues.shape  # 4901 rows, one per venue returned for a postal code (not necessarily distinct venues)

(4901, 8)

In [77]:
# Count the distinct values in the 'Venue Category' column.
print('Number of unique venue categories: ',len(Toronto_venues['Venue Category'].unique()))

Number of unique venue categories:  339


In [78]:
# Count the non-null values of each column per postal code, i.e. how many venue rows each one has.
Toronto_venues.groupby('Postal code').count()

Unnamed: 0_level_0,Name,Latitude,Longitude,Venue Name,Venue Latitude,Venue Longitude,Venue Category
Postal code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
M1B,17,17,17,17,17,17,17
M1C,5,5,5,5,5,5,5
M1E,23,23,23,23,23,23,23
M1G,9,9,9,9,9,9,9
M1H,28,28,28,28,28,28,28
...,...,...,...,...,...,...,...
M9N,17,17,17,17,17,17,17
M9P,17,17,17,17,17,17,17
M9R,15,15,15,15,15,15,15
M9V,16,16,16,16,16,16,16


## Let's do one-hot encoding

In [79]:
# One-hot encode the venue category: one indicator column per category, one row per venue row.
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")
# Attach the postal code of each venue row (added as the last column).
Toronto_onehot['Postal code'] = Toronto_venues['Postal code']
# Reorder so that the newly added last column ('Postal code') comes first.
venue_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[venue_columns]
Toronto_onehot.head(25)

Unnamed: 0,Postal code,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# Shape of the one-hot table as (rows, columns).
Toronto_onehot.shape

(4901, 340)

In [81]:
# Average the indicator columns per postal code: each value becomes the share
# of that category among the postal code's venue rows.
Toronto_grouped = Toronto_onehot.groupby('Postal code').mean().reset_index()
Toronto_grouped

Unnamed: 0,Postal code,Accessories Store,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0.0,0.0,0.058824,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
1,M1C,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
2,M1E,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
3,M1G,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
4,M1H,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.035714,0.0,0.035714,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,M9N,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
98,M9P,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
99,M9R,0.0,0.0,0.000000,0.0,0.0,0.066667,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0
100,M9V,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0


## This function will return the most common venues of an area

In [82]:
def most_common_venues(row, req_venues):
    """Return the labels of the ``req_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped; the remaining values are sorted
    in descending order and the index labels of the leading ``req_venues``
    entries are returned as an array.
    """
    row_categories = row.iloc[1:]
    # The previous body referenced the undefined name `raw_categories` and the
    # non-existent `.sort.values`, so every call failed.
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:req_venues]

## Let's get the top 10 venues of every area

In [83]:
req_venues = 10
indicators = ['st', 'nd', 'rd']

# Column headers: '1st Most Common Venue', '2nd ...', '3rd ...', then 'Nth ...'.
columns = ['Postal code']
for ind in np.arange(req_venues):
    # Explicit bounds check instead of a bare `except:` around the list lookup.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

top_venues = pd.DataFrame(columns=columns)
top_venues['Postal code'] = Toronto_grouped['Postal code']

# Fill every row with the category names ranked by their share for that postal code.
for ind in np.arange(Toronto_grouped.shape[0]):
    top_venues.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], req_venues)

top_venues.head()

Unnamed: 0,Postal code,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Coffee Shop,Trail,Fast Food Restaurant,Gym,Martial Arts School,Supermarket,Restaurant,Hardware Store,Paper / Office Supplies Store,Spa
1,M1C,Italian Restaurant,Breakfast Spot,Park,Burger Joint,Playground,Zoo,Electronics Store,Escape Room,Ethiopian Restaurant,Event Space
2,M1E,Pizza Place,Bank,Coffee Shop,Fast Food Restaurant,Food & Drink Shop,Breakfast Spot,Smoothie Shop,Supermarket,Fried Chicken Joint,Liquor Store
3,M1G,Park,Coffee Shop,Chinese Restaurant,Mobile Phone Shop,Indian Restaurant,Pizza Place,Fast Food Restaurant,Falafel Restaurant,Dumpling Restaurant,Eastern European Restaurant
4,M1H,Bakery,Coffee Shop,Gas Station,Bank,Indian Restaurant,Sporting Goods Shop,Athletics & Sports,Bus Line,Fast Food Restaurant,Music Store


### Now let's run KMeans to group the postal codes into 5 clusters

In [84]:
k = 5
# Cluster on the category-share columns only. The positional axis argument
# (`drop('Postal code', 1)`) is no longer accepted by pandas 2.x, so the
# column is dropped by keyword.
Toronto_grouped_clustering = Toronto_grouped.drop(columns='Postal code')
# n_init is pinned to 10 (the historical default) so results do not change
# with scikit-learn versions whose default is 'auto'.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(Toronto_grouped_clustering)
kmeans.labels_[:10]

array([4, 2, 4, 2, 4, 4, 4, 4, 4, 0])

In [87]:
# Put the cluster label in the first column. Re-running the cell overwrites
# the existing column instead of raising "already exists" from insert().
if 'Cluster Labels' in top_venues.columns:
    top_venues['Cluster Labels'] = kmeans.labels_
else:
    top_venues.insert(0, 'Cluster Labels', kmeans.labels_)

In [88]:
# Attach the cluster label and top venues to every postal code in `df2`.
# join() is a left join by default, so a postal code with no row in
# `top_venues` keeps NaN in the added columns.
venues_by_postal_code = top_venues.set_index('Postal code')
Toronto_merged = df2.join(venues_by_postal_code, on='PostalCode')
Toronto_merged.head(10)

Unnamed: 0,cluster_labels,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,1,M3A,North York,Parkwoods,43.753259,-79.329656,2.0,Park,Shopping Mall,Convenience Store,Pharmacy,Bus Stop,Fish & Chips Shop,Supermarket,Food & Drink Shop,Fast Food Restaurant,Café
1,1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Coffee Shop,Sporting Goods Shop,Men's Store,Gym / Fitness Center,Café,Golf Course,Boxing Gym,Pizza Place,Playground,Park
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Coffee Shop,Café,Park,Pub,Diner,Theater,Bakery,Restaurant,Breakfast Spot,Performing Arts Venue
3,0,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,4.0,Clothing Store,Furniture / Home Store,Restaurant,Coffee Shop,Fast Food Restaurant,Fried Chicken Joint,Dessert Shop,Sushi Restaurant,Vietnamese Restaurant,Event Space
4,5,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0.0,Coffee Shop,Japanese Restaurant,Park,Italian Restaurant,Sushi Restaurant,Thai Restaurant,Pizza Place,Ramen Restaurant,Dance Studio,Gastropub
5,9,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242,2.0,Pharmacy,Café,Bank,Shopping Mall,Golf Course,Park,Skating Rink,Grocery Store,Bakery,Playground
6,3,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,4.0,Coffee Shop,Trail,Fast Food Restaurant,Gym,Martial Arts School,Supermarket,Restaurant,Hardware Store,Paper / Office Supplies Store,Spa
7,4,M3B,North York,Don Mills,43.745906,-79.352188,0.0,Coffee Shop,Japanese Restaurant,Pizza Place,Burger Joint,Bank,Athletics & Sports,Cafeteria,Café,Breakfast Spot,Caribbean Restaurant
8,1,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937,4.0,Brewery,Pizza Place,Gym / Fitness Center,Construction & Landscaping,Intersection,Soccer Stadium,Breakfast Spot,Fast Food Restaurant,Gastropub,Bakery
9,5,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,0.0,Coffee Shop,Gastropub,Japanese Restaurant,Hotel,Diner,Clothing Store,Middle Eastern Restaurant,New American Restaurant,Italian Restaurant,Department Store


In [89]:
# `cluster_labels` is not created by any visible cell; it appears only when
# `df2` carries it over from earlier kernel state. errors='ignore' makes this
# a no-op on a fresh Restart & Run All instead of raising KeyError.
Toronto_merged = Toronto_merged.drop(columns=['cluster_labels'], errors='ignore')
Toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2.0,Park,Shopping Mall,Convenience Store,Pharmacy,Bus Stop,Fish & Chips Shop,Supermarket,Food & Drink Shop,Fast Food Restaurant,Café
1,M4A,North York,Victoria Village,43.725882,-79.315572,0.0,Coffee Shop,Sporting Goods Shop,Men's Store,Gym / Fitness Center,Café,Golf Course,Boxing Gym,Pizza Place,Playground,Park
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0.0,Coffee Shop,Café,Park,Pub,Diner,Theater,Bakery,Restaurant,Breakfast Spot,Performing Arts Venue
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,4.0,Clothing Store,Furniture / Home Store,Restaurant,Coffee Shop,Fast Food Restaurant,Fried Chicken Joint,Dessert Shop,Sushi Restaurant,Vietnamese Restaurant,Event Space
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0.0,Coffee Shop,Japanese Restaurant,Park,Italian Restaurant,Sushi Restaurant,Thai Restaurant,Pizza Place,Ramen Restaurant,Dance Studio,Gastropub


In [95]:
# Number of rows that received no cluster label from the join.
Toronto_merged['Cluster Labels'].isnull().sum()

1

In [96]:
# Remove the rows without a cluster label (modifies Toronto_merged in place).
Toronto_merged.dropna(subset=['Cluster Labels'], inplace=True)
# Confirm that no unlabelled rows remain.
Toronto_merged['Cluster Labels'].isnull().sum()

0

In [102]:
# Convert the labels to integers so they can be used as list indices when
# picking marker colours.
Toronto_merged['Cluster Labels'] = Toronto_merged['Cluster Labels'].astype(int)
Toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,2,Park,Shopping Mall,Convenience Store,Pharmacy,Bus Stop,Fish & Chips Shop,Supermarket,Food & Drink Shop,Fast Food Restaurant,Café
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,Coffee Shop,Sporting Goods Shop,Men's Store,Gym / Fitness Center,Café,Golf Course,Boxing Gym,Pizza Place,Playground,Park
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,0,Coffee Shop,Café,Park,Pub,Diner,Theater,Bakery,Restaurant,Breakfast Spot,Performing Arts Venue
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763,4,Clothing Store,Furniture / Home Store,Restaurant,Coffee Shop,Fast Food Restaurant,Fried Chicken Joint,Dessert Shop,Sushi Restaurant,Vietnamese Restaurant,Event Space
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,0,Coffee Shop,Japanese Restaurant,Park,Italian Restaurant,Sushi Restaurant,Thai Restaurant,Pizza Place,Ramen Restaurant,Dance Studio,Gastropub


In [103]:
# Base map centred on the coordinates used for the Toronto area.
map_clusters = folium.Map(location=[43.65, -79.38], zoom_start=11)

# One colour per cluster label; the label is used as an index into this list.
color_list = ['red', 'blue', 'green', 'gray', 'black']
markers_colors = []

# Draw one circle marker per postal code, coloured by its cluster.
marker_fields = zip(
    Toronto_merged['PostalCode'],
    Toronto_merged['Latitude'],
    Toronto_merged['Longitude'],
    Toronto_merged['Neighborhood'],
    Toronto_merged['Cluster Labels'],
)
for pc, lat, lon, poi, cluster in marker_fields:
    popup_text = '{} Cluster {} Postal code {}'.format(poi, cluster, pc)
    label = folium.Popup(popup_text, parse_html=True)
    marker_color = color_list[cluster]
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7,
    )
    marker.add_to(map_clusters)

map_clusters