Applied Data Science Capstone
================================
### Assignment, Week 3


  ***

# **Part 1**

Imports

In [1]:
# !pip install bs4
# !pip install geocoder
import urllib.request as req
from bs4 import BeautifulSoup as BS
from time import sleep
import pandas as pd
from os import getenv
import json

  
Scraping data from Wikipedia

In [2]:
# Download the Wikipedia page listing Canadian postal codes starting with "M"
# and turn its main table into a DataFrame (one row per table row).
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Context manager so the HTTP response is closed once the page is parsed.
with req.urlopen(url) as wiki_page:
    wiki_soup = BS(wiki_page, 'html.parser')
data_table = wiki_soup.find('table', class_='wikitable sortable')
if data_table is None:
    # Fail with a clear message instead of an AttributeError further down.
    raise ValueError('No "wikitable sortable" table found at {}'.format(url))

# get_text() never returns None (unlike find(text=True)), so empty cells
# become '' instead of crashing on `.rstrip()`.
headers = [th.get_text().strip() for th in data_table.find_all('th')]

# Collect the rows first and build the frame once: DataFrame.append was
# removed in pandas 2.0 and re-copied the whole frame on every iteration.
table_rows = []
for tr in data_table.find_all('tr'):
    cells = [td.get_text().strip() for td in tr.find_all('td')]
    if cells:  # header rows contain only <th> cells and are skipped
        table_rows.append(cells)

df = pd.DataFrame(table_rows, columns=headers)

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Clean up dataframe

In [3]:
# Drop postal codes that have no borough assigned.
df_clean = df[df['Borough'] != 'Not assigned']

# Collapse to one row per postal code. Values are de-duplicated (keeping their
# first-seen order) before joining, so a postal code that spans several rows
# of the same borough yields "Scarborough" rather than "Scarborough, Scarborough".
df_clean = (
    df_clean
    .groupby(['Postal Code'])
    .aggregate(lambda values: ', '.join(dict.fromkeys(values)))
    .reset_index()
)

print('Cleaned DataFrame shape: ', df_clean.shape)

Cleaned DataFrame shape:  (103, 3)


In [4]:
# Preview the first rows of the cleaned table.
df_clean.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


***
# **Part 2**

In [5]:
# Attach latitude/longitude to every cleaned postal code.
df_coords = pd.read_csv('Geospatial_Coordinates.csv')
# Join on the single shared key (the original listed it twice). The join is an
# explicit inner join; `validate` raises if either side repeats a postal code,
# which would otherwise silently duplicate rows.
df_geo = pd.merge(
    df_clean,
    df_coords,
    on='Postal Code',
    how='inner',
    validate='one_to_one',
)

df_geo.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


***
# **Part 3**

In [6]:
from geopy.geocoders import Nominatim
import folium

# Geocode the city centre and draw one marker per Toronto postal code.
address = "Toronto, ON"
loc = Nominatim(user_agent="toronto_explorer").geocode(address)
if loc is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `loc.latitude`.
    raise RuntimeError('Could not geocode address: {}'.format(address))

print('Geographic coordinates of Toronto, ON, Canada are {}, {}'.format(loc.latitude, loc.longitude))

map_toronto = folium.Map(location=[loc.latitude, loc.longitude], zoom_start=11)

# look at boroughs that have 'Toronto' in their name
# (na=False treats a missing borough as "no match" instead of propagating NaN)
df_geo = df_geo[df_geo['Borough'].str.contains('Toronto', na=False)].reset_index(drop=True)

marker_fields = zip(df_geo['Latitude'], df_geo['Longitude'], df_geo['Neighborhood'], df_geo['Borough'])
for lat, lng, neighborhood, borough in marker_fields:
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html is a Popup option; it is no longer also passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

Geographic coordinates of Toronto, ON, Canada are 43.6534817, -79.3839347


**Set up FourSquare params.**

In [7]:
from pandas import json_normalize
from os import path, mkdir

# --- Foursquare credentials (taken from the environment, never hard-coded) ---
CLIENT_ID, CLIENT_SECRET = getenv('FOURSQR_ID'), getenv('FOURSQR_SECRET')
VERSION = '20180605'

# --- Search parameters passed to the venue search requests ---
LIMIT = 500
RADIUS = 500
CATEGORY_ID = '4d4b7105d754a06374d81259' # Food

# --- Local cache directory ---
# Delete the 'venue_data' directory to force a fresh download from Foursquare.
DATA_PATH = 'venue_data'
cache_dir_missing = not path.exists(DATA_PATH)
if cache_dir_missing:
    mkdir(DATA_PATH)
    


**Retrieve top-level venue categories.**

In [8]:
# Load the top-level Foursquare venue categories, preferring a local JSON
# cache and falling back to the API (then writing the cache) when it is absent.
cats_local = 'categories.json'
cats = None

if path.isfile(cats_local):
    print('Found local categories json. Loading...')
    with open(cats_local, 'r') as jsonfile:
        cats = json.load(jsonfile)
    print('Done!')
else:
    print('Local categories json not found. Loading from FourSquare...')
    if not CLIENT_ID or not CLIENT_SECRET:
        # Without this check the request is sent with "client_id=None" and
        # fails with an unhelpful HTTP error.
        raise RuntimeError('FOURSQR_ID and FOURSQR_SECRET must be set to load categories from FourSquare')
    cats_url = 'https://api.foursquare.com/v2/venues/categories?client_id={}&client_secret={}&v={}'.format(
        CLIENT_ID, CLIENT_SECRET, VERSION)
    # Close the HTTP response as soon as the payload has been parsed.
    with req.urlopen(cats_url) as response:
        results = json.load(response)
    cats = results['response']['categories']
    print('Writing loaded categories to local json...')
    with open(cats_local, 'w') as jsonfile:
        json.dump(cats, jsonfile)
    print('Done!')

# Ids of every top-level category, in the order they were loaded.
cats_id_list = [cat['id'] for cat in cats]

Found local categories json. Loading...
Done!


**Retrieving venues within 500m of postal code geolocations.**

In [9]:
# Build one venues table covering every (postal code, top-level category) pair.
# Each pair is served from a gzip-pickled cache file under DATA_PATH when one
# exists; otherwise it is fetched from Foursquare and cached.
venue_frames = []  # per-pair frames, concatenated once after the loops

for i, row in df_geo.iterrows():
    for cat in cats:
        cache_key = '-'.join([row['Postal Code'], cat['shortName']])
        filepath = path.join(DATA_PATH, cache_key)
        # `df_cat`, not `df`: the original reused `df` and clobbered the
        # scraped postal-code table from the first section.
        if path.isfile(filepath):
            # locally stored data found
            print('Found local data for {}'.format(cache_key))
            # NOTE: unpickling can execute arbitrary code; only load cache
            # files that this notebook wrote itself.
            df_cat = pd.read_pickle(filepath, compression='gzip')
        else:
            # retrieve data from foursquare, then store locally
            print('Retrieving foursquare data for {}'.format(cache_key))
            url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}&categoryId={}'.format(
                CLIENT_ID, CLIENT_SECRET, VERSION, row['Latitude'], row['Longitude'], RADIUS, LIMIT, cat['id'])

            # Close the HTTP response once the payload is parsed.
            with req.urlopen(url) as response:
                results = json.load(response)
            venues = results['response']['venues']

            df_cat = json_normalize(venues)

            print('Storing retrieved data locally to {}...'.format(filepath))
            df_cat.to_pickle(filepath, compression='gzip')

        if not df_cat.empty:
            # .copy() so the column edits below act on an independent frame
            # rather than on a slice of the cached one.
            df_cat = df_cat[['name', 'location.lat', 'location.lng', 'categories']].copy()
            df_cat.columns = ['name', 'lat', 'lng', 'categories']
            df_cat.insert(0, 'main category', cat['shortName'])
            df_cat.insert(0, 'postcode', row['Postal Code'])
            df_cat.insert(0, 'neighborhoods', row['Neighborhood'])
            print('Postal Code [{}] category [{}] venues dataframe shape: {}'.format(row['Postal Code'], cat['shortName'], df_cat.shape))
            venue_frames.append(df_cat)

# One concat instead of DataFrame.append per iteration (removed in pandas 2.0
# and quadratic). ignore_index=True gives the same 0..n-1 index that
# reset_index(drop=True) produced.
if venue_frames:
    df_venues = pd.concat(venue_frames, ignore_index=True)
else:
    # No venues at all: keep the expected columns so later cells still work.
    df_venues = pd.DataFrame(
        columns=['neighborhoods', 'postcode', 'main category', 'name', 'lat', 'lng', 'categories'])
print('Done loading venues!')
print(df_venues.shape)
df_venues.head()

Found local data for M4E-Arts & Entertainment
Postal Code [M4E] category [Arts & Entertainment] venues dataframe shape: (5, 7)
Found local data for M4E-College & Education
Postal Code [M4E] category [College & Education] venues dataframe shape: (4, 7)
Found local data for M4E-Event
Found local data for M4E-Food
Postal Code [M4E] category [Food] venues dataframe shape: (38, 7)
Found local data for M4E-Nightlife
Postal Code [M4E] category [Nightlife] venues dataframe shape: (7, 7)
Found local data for M4E-Outdoors & Recreation
Postal Code [M4E] category [Outdoors & Recreation] venues dataframe shape: (19, 7)
Found local data for M4E-Professional
Postal Code [M4E] category [Professional] venues dataframe shape: (27, 7)
Found local data for M4E-Residence
Postal Code [M4E] category [Residence] venues dataframe shape: (5, 7)
Found local data for M4E-Shops
Postal Code [M4E] category [Shops] venues dataframe shape: (39, 7)
Found local data for M4E-Travel
Postal Code [M4E] category [Travel] ven

Unnamed: 0,neighborhoods,postcode,main category,name,lat,lng,categories
0,The Beaches,M4E,Arts & Entertainment,Studio 888,43.672264,-79.28891,"[{'id': '4bf58dd8d48988d1e2931735', 'name': 'A..."
1,The Beaches,M4E,Arts & Entertainment,Toronto Theatre Dance School,43.680833,-79.291376,"[{'id': '4bf58dd8d48988d134941735', 'name': 'D..."
2,The Beaches,M4E,Arts & Entertainment,Lens Factory,43.671987,-79.290421,"[{'id': '4bf58dd8d48988d1e2931735', 'name': 'A..."
3,The Beaches,M4E,Arts & Entertainment,Beaches Dance & Music Studio,43.680595,-79.2913,"[{'id': '4bf58dd8d48988d134941735', 'name': 'D..."
4,The Beaches,M4E,Arts & Entertainment,St-Denis Studios Inc.,43.675031,-79.288022,"[{'id': '4bf58dd8d48988d1e5931735', 'name': 'M..."


**Simplifying venue categories.**

In [10]:
# `numpy.NaN` was removed in NumPy 2.0; `numpy.nan` exists in every version.
from numpy import nan as NaN


def _primary_category_name(value):
    """Reduce a raw Foursquare `categories` entry to the first category's name.

    A non-empty list yields the 'name' of its first element, an empty list
    yields NaN, and any other value (for example a string from an earlier run
    of this cell) is returned unchanged.
    """
    if isinstance(value, list):
        return value[0]['name'] if len(value) > 0 else NaN
    return value


# Vectorised replacement for the row-by-row .loc writes. It also no longer
# rebinds the global `cats` (the category list the venue-loading cell iterates
# over), so that cell can be re-run afterwards.
if 'categories' in df_venues.columns:
    df_venues['categories'] = df_venues['categories'].map(_primary_category_name)

df_venues.head()

Unnamed: 0,neighborhoods,postcode,main category,name,lat,lng,categories
0,The Beaches,M4E,Arts & Entertainment,Studio 888,43.672264,-79.28891,Art Gallery
1,The Beaches,M4E,Arts & Entertainment,Toronto Theatre Dance School,43.680833,-79.291376,Dance Studio
2,The Beaches,M4E,Arts & Entertainment,Lens Factory,43.671987,-79.290421,Art Gallery
3,The Beaches,M4E,Arts & Entertainment,Beaches Dance & Music Studio,43.680595,-79.2913,Dance Studio
4,The Beaches,M4E,Arts & Entertainment,St-Denis Studios Inc.,43.675031,-79.288022,Music Venue


**One-hot encode venue categories, and group by postal code.**

In [11]:
# Indicator columns, one per top-level category, with the postal code appended
# as the last column so the rows can be grouped.
df_onehot = pd.get_dummies(
    df_venues[['main category']],
    prefix='',
    prefix_sep='',
).assign(postcode=df_venues['postcode'])

# The mean of each indicator column is that category's share of the postal
# code's venues.
per_postcode = df_onehot.groupby('postcode')
df_grouped = per_postcode.mean().reset_index()

df_grouped.head()

Unnamed: 0,postcode,Arts & Entertainment,College & Education,Event,Food,Nightlife,Outdoors & Recreation,Professional,Residence,Shops,Travel
0,M4E,0.033784,0.027027,0.0,0.256757,0.047297,0.128378,0.182432,0.033784,0.263514,0.027027
1,M4K,0.077551,0.012245,0.0,0.2,0.126531,0.17551,0.163265,0.028571,0.171429,0.044898
2,M4L,0.090426,0.010638,0.0,0.244681,0.058511,0.111702,0.159574,0.015957,0.244681,0.06383
3,M4M,0.120755,0.003774,0.003774,0.184906,0.09434,0.158491,0.181132,0.022642,0.154717,0.075472
4,M4N,0.115385,0.153846,0.0,0.038462,0.0,0.153846,0.307692,0.038462,0.153846,0.038462


**Clustering neighborhoods**

In [12]:
from sklearn.cluster import KMeans

# Cluster postal codes by their venue-category shares.
kclusters = 5

# Keyword form of drop(): the positional `axis` argument was removed in pandas 2.0.
df_clustering = df_grouped.drop(columns='postcode')
# n_init is pinned to 10 (the long-standing scikit-learn default) so results do
# not depend on the installed version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(df_clustering)

# kmeans.labels_ follows the row order of df_grouped, not df_geo. Attach the
# labels by postal code so they cannot be misaligned; the original assigned
# them by position, which fails when a postal code has no venues.
df_labels = pd.DataFrame({
    'Postal Code': df_grouped['postcode'],
    'Cluster Labels': kmeans.labels_,
})
df_clustered = df_geo.merge(df_labels, on='Postal Code', how='inner')
# Move the label column to the front, as in the original layout.
df_clustered.insert(0, 'Cluster Labels', df_clustered.pop('Cluster Labels'))
df_clustered.head()

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,0,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,4,M4M,East Toronto,Studio District,43.659526,-79.340923
4,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


**Drawing a map to illustrate clustering result.**

In [13]:
import numpy as np
from matplotlib import cm
from matplotlib import colors

# Draw every postal code on a map of Toronto, coloured by its cluster label.
map_clusters = folium.Map(location=[loc.latitude, loc.longitude], zoom_start=11)

# One hex colour per cluster, evenly spaced along matplotlib's rainbow colormap.
# (The unused `x` / `ys` arrays from the original were dead code and are gone.)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

marker_fields = zip(
    df_clustered['Postal Code'],
    df_clustered['Latitude'],
    df_clustered['Longitude'],
    df_clustered['Neighborhood'],
    df_clustered['Cluster Labels'],
)
for postcode, lat, lng, poi, cluster in marker_fields:
    label = folium.Popup('{} - {} - Cluster {}'.format(postcode, poi, cluster), parse_html=True)
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)

map_clusters

**Observations.**

In [14]:
# How many of the most common venue categories to list per postal code.
num_top_values = 5

# Ordinal suffixes for ranks 1-3; every later rank reuses the last entry ('th').
indicators = ['st', 'nd', 'rd', 'th']
last_indicator = len(indicators) - 1

# Two identifying columns followed by one column per rank.
columns = ['Postal Code', 'Neighborhoods']
columns.extend(
    '{}{} Most Common Venue'.format(rank + 1, indicators[min(last_indicator, rank)])
    for rank in range(num_top_values)
)

def get_most_common(postcode, grouped=None, top_n=None):
    """Return the most common venue categories for one postal code.

    Parameters
    ----------
    postcode : str
        Value to look up in the 'postcode' column.
    grouped : pandas.DataFrame, optional
        Table with a 'postcode' column plus one numeric column per category.
        Defaults to the notebook-level `df_grouped`.
    top_n : int, optional
        Number of category names to return. Defaults to the notebook-level
        `num_top_values`.

    Returns
    -------
    numpy.ndarray
        Category names ordered from highest to lowest value, at most `top_n`.

    Raises
    ------
    KeyError
        If `postcode` is not present in `grouped`.
    """
    if grouped is None:
        grouped = df_grouped
    if top_n is None:
        top_n = num_top_values

    matches = grouped.loc[grouped['postcode'] == postcode]
    if matches.empty:
        # The original failed here too, but with an opaque `KeyError: 0`.
        raise KeyError('postcode {!r} not found'.format(postcode))

    # Drop the key column by name rather than assuming it is the first column.
    shares = matches.drop(columns=['postcode']).iloc[0]
    # A stable sort keeps tied categories in column order.
    ranked = shares.sort_values(ascending=False, kind='stable')
    return ranked.index.values[0:top_n]

**Cluster 0**

As can be seen below, Cluster 0 areas consist primarily of shops and eateries.

In [15]:
# Table for cluster 0: one row per postal code with its top venue categories.
cluster_members = df_clustered[df_clustered['Cluster Labels'] == 0]

# Each record is [postal code, neighbourhoods, 1st..Nth most common category].
cluster_records = [
    [code, hoods, *get_most_common(code)[:num_top_values]]
    for code, hoods in zip(cluster_members['Postal Code'], cluster_members['Neighborhood'])
]
df_0 = pd.DataFrame(cluster_records, columns=columns)

df_0

Unnamed: 0,Postal Code,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4E,The Beaches,Shops,Food,Professional,Outdoors & Recreation,Nightlife
1,M4L,"India Bazaar, The Beaches West",Food,Shops,Professional,Outdoors & Recreation,Arts & Entertainment
2,M4R,"North Toronto West, Lawrence Park",Shops,Food,Professional,Outdoors & Recreation,Arts & Entertainment
3,M6G,Christie,Shops,Professional,Food,Outdoors & Recreation,Nightlife
4,M6P,"High Park, The Junction South",Food,Professional,Shops,Nightlife,Arts & Entertainment
5,M6R,"Parkdale, Roncesvalles",Food,Shops,Professional,Arts & Entertainment,Nightlife
6,M6S,"Runnymede, Swansea",Food,Shops,Professional,Outdoors & Recreation,Nightlife
7,M7Y,"Business reply mail Processing Centre, South C...",Shops,Food,Professional,Outdoors & Recreation,Travel


**Cluster 1**

Areas in Cluster 1 appear to form an entertainment district, with primarily food-related and Arts & Entertainment venues.

In [16]:
# Table for cluster 1: one row per postal code with its top venue categories.
cluster_members = df_clustered[df_clustered['Cluster Labels'] == 1]

# Each record is [postal code, neighbourhoods, 1st..Nth most common category].
cluster_records = [
    [code, hoods, *get_most_common(code)[:num_top_values]]
    for code, hoods in zip(cluster_members['Postal Code'], cluster_members['Neighborhood'])
]
df_1 = pd.DataFrame(cluster_records, columns=columns)

df_1

Unnamed: 0,Postal Code,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4Y,Church and Wellesley,Food,Shops,Nightlife,Residence,College & Education
1,M5A,"Regent Park, Harbourfront",Arts & Entertainment,Professional,Food,Outdoors & Recreation,Shops
2,M5B,"Garden District, Ryerson",Food,Nightlife,Shops,College & Education,Outdoors & Recreation
3,M5C,St. James Town,Food,College & Education,Professional,Nightlife,Shops
4,M5E,Berczy Park,Food,Nightlife,Arts & Entertainment,Residence,Outdoors & Recreation
5,M5G,Central Bay Street,Food,College & Education,Residence,Nightlife,Professional
6,M5H,"Richmond, Adelaide, King",Food,Professional,Nightlife,Shops,College & Education
7,M5J,"Harbourfront East, Union Station, Toronto Islands",Food,Arts & Entertainment,Nightlife,Outdoors & Recreation,Residence
8,M5K,"Toronto Dominion Centre, Design Exchange",Food,Professional,Outdoors & Recreation,Arts & Entertainment,Nightlife
9,M5L,"Commerce Court, Victoria Hotel",Food,Professional,Nightlife,Outdoors & Recreation,Shops


**Cluster 2**

There is a good chance areas in this cluster are peaceful, with offices, parks and institutes dominating the venues list.

In [17]:
# Table for cluster 2: one row per postal code with its top venue categories.
cluster_members = df_clustered[df_clustered['Cluster Labels'] == 2]

# Each record is [postal code, neighbourhoods, 1st..Nth most common category].
cluster_records = [
    [code, hoods, *get_most_common(code)[:num_top_values]]
    for code, hoods in zip(cluster_members['Postal Code'], cluster_members['Neighborhood'])
]
df_2 = pd.DataFrame(cluster_records, columns=columns)

df_2


Unnamed: 0,Postal Code,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4N,Lawrence Park,Professional,College & Education,Outdoors & Recreation,Shops,Arts & Entertainment
1,M4T,"Moore Park, Summerhill East",Outdoors & Recreation,Professional,Food,Shops,Arts & Entertainment
2,M4W,Rosedale,Outdoors & Recreation,Professional,Residence,Shops,Travel
3,M5N,Roselawn,Professional,Nightlife,Outdoors & Recreation,Shops,Travel
4,M5P,"Forest Hill North & West, Forest Hill Road Park",Professional,Outdoors & Recreation,Food,Shops,Nightlife


**Cluster 3**

This cluster appears to be an outlier, containing only one Postal Code. From the map, we can tell it accommodates the only airport in the wider area, which explains its uniqueness among the surrounding Postal Codes.

In [18]:
# Table for cluster 3: one row per postal code with its top venue categories.
cluster_members = df_clustered[df_clustered['Cluster Labels'] == 3]

# Each record is [postal code, neighbourhoods, 1st..Nth most common category].
cluster_records = [
    [code, hoods, *get_most_common(code)[:num_top_values]]
    for code, hoods in zip(cluster_members['Postal Code'], cluster_members['Neighborhood'])
]
df_3 = pd.DataFrame(cluster_records, columns=columns)

df_3


Unnamed: 0,Postal Code,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M5V,"CN Tower, King and Spadina, Railway Lands, Har...",Travel,Outdoors & Recreation,Food,Professional,Arts & Entertainment


**Cluster 4**

Areas in Cluster 4 appear to be geared towards corporate offices, with Professional and Food-related venues topping the list.

In [19]:
# Table for cluster 4: one row per postal code with its top venue categories.
cluster_members = df_clustered[df_clustered['Cluster Labels'] == 4]

# Each record is [postal code, neighbourhoods, 1st..Nth most common category].
cluster_records = [
    [code, hoods, *get_most_common(code)[:num_top_values]]
    for code, hoods in zip(cluster_members['Postal Code'], cluster_members['Neighborhood'])
]
df_4 = pd.DataFrame(cluster_records, columns=columns)

df_4


Unnamed: 0,Postal Code,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4K,"The Danforth West, Riverdale",Food,Outdoors & Recreation,Shops,Professional,Nightlife
1,M4M,Studio District,Food,Professional,Outdoors & Recreation,Shops,Arts & Entertainment
2,M4P,Davisville North,Residence,Professional,Shops,Outdoors & Recreation,Food
3,M4S,Davisville,Food,Professional,Shops,Outdoors & Recreation,Residence
4,M4V,"Summerhill West, Rathnelly, South Hill, Forest...",Food,Professional,Residence,Shops,Outdoors & Recreation
5,M4X,"St. James Town, Cabbagetown",Food,Shops,Professional,Residence,Outdoors & Recreation
6,M6H,"Dufferin, Dovercourt Village",Shops,Professional,Outdoors & Recreation,Food,Arts & Entertainment
7,M6K,"Brockton, Parkdale Village, Exhibition Place",Food,Professional,Nightlife,Residence,Shops
