# Part 1: Creating cleaned dataset of Toronto neighborhoods

In [1]:
##from the week 3 lab
import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


In [2]:
#installing beautifulsoup for web scraping
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [3]:
#reading the wikipedia html into my notebook
d = pd.read_html('http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [4]:
len(d)

3

In [5]:
#selected the table on the page I was interested in
d[0]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [6]:
#setting a variable for this dataframe
df = d[0]

In [7]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
8,M8A,Not assigned,Not assigned
9,M9A,Etobicoke,Islington Avenue


In [8]:
#I dropped all of the rows with no borough assigned

drop = df[(df['Borough']== 'Not assigned')].index

df.drop(drop, inplace=True)

In [9]:
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [10]:
#If the neighborhood wasn't assigned, I changed not assigned to the borough

df['Neighbourhood'] = np.where((df.Neighbourhood == 'Not assigned'),'Borough',df.Neighbourhood)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [11]:
#I combined the neighborhoods together if the postcodes (and thus boroughs) were the same

df2 = df.groupby(['Postcode','Borough'], sort=False)['Neighbourhood'].apply(', '.join).reset_index()
df2.columns = ['Postcode', 'Borough', 'Neighbourhood']
df2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [12]:
#checking the shape of my finalized dataframe
df2.shape

(103, 3)

# Part 2: Creating a dataframe w/ coordinates

In [13]:
#importing the coordinates csv
import os
os.chdir('/Users/tdinman/desktop')
geodata = pd.read_csv("Geospatial_Coordinates.csv")
geodata.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
#checking coordinates csv info
geodata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postal Code    103 non-null object
Latitude       103 non-null float64
Longitude      103 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [15]:
#creating a dataframe with the postcodes that match in our dataframe from part 1
geo_df = geodata.loc[geodata["Postal Code"].isin(df2.Postcode.values), :]
geo_df.shape

(103, 3)

In [16]:
#merging the 2 dataframes together
geo_tbl = df2.merge(geo_df, left_on="Postcode", right_on="Postal Code")
geo_tbl.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M3A,North York,Parkwoods,M3A,43.753259,-79.329656
1,M4A,North York,Victoria Village,M4A,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,M5A,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",M6A,43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,M7A,43.662301,-79.389494


In [17]:
geo_final = geo_tbl.iloc[:,[3,1,2,4,5]] 
# excluding colunm 0 with the first list of postcodes since we have 2 columns with this data
# also re-arranging the column names to be in desired order
geo_final

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.654260,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


# Part 3: Exploring Toronto

In [19]:
import geopy
import geopandas

In [36]:
#code from the week 3 lab
#getting the coordinates of Toronto
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [41]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(geo_final['Latitude'], geo_final['Longitude'], geo_final['Borough'], geo_final['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [42]:
#Defining Foursquare

CLIENT_ID = 'WD1VZ1O5BSLPMSQT2L3X5P35BGCQU3QMAX5RFVXI1MSOYSWJ' # your Foursquare ID
CLIENT_SECRET = 'P5KJSMUCU3FKWQDIPHOWDMIRCPUCANZLLKP1CMYOIOM5CR42' # your Foursquare Secret
VERSION = '20161225' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: WD1VZ1O5BSLPMSQT2L3X5P35BGCQU3QMAX5RFVXI1MSOYSWJ
CLIENT_SECRET:P5KJSMUCU3FKWQDIPHOWDMIRCPUCANZLLKP1CMYOIOM5CR42


In [50]:
#the first neighborhood in the data set

geo_final.loc[0, 'Neighbourhood']

'Parkwoods'

In [57]:
#looking specifically at the Parkwoods neighborhood

Parkwoods_latitude = geo_final.loc[0, 'Latitude'] # neighborhood latitude value
Parkwoods_longitude = geo_final.loc[0, 'Longitude'] # neighborhood longitude value

Parkwoods_name = geo_final.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(Parkwoods_name , 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.7532586, -79.3296565.


In [58]:
# Looking at the top 10 venues within 500 meters of Parkwoods

LIMIT = 10 # limit of number of venues returned by Foursquare API
radius = 500 # define radius 500 meters

url='https://api.forsquare.com/v2/venues/explore?&client_id=[]&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    Parkwoods_latitude, 
    Parkwoods_longitude, 
    radius, 
    LIMIT)
url


'https://api.forsquare.com/v2/venues/explore?&client_id=[]&client_secret=WD1VZ1O5BSLPMSQT2L3X5P35BGCQU3QMAX5RFVXI1MSOYSWJ&v=P5KJSMUCU3FKWQDIPHOWDMIRCPUCANZLLKP1CMYOIOM5CR42&ll=20161225,43.7532586&radius=-79.3296565&limit=500'

In [70]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5e66dd99211536001b122f7f'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Islington - City Centre West',
  'headerFullLocation': 'Islington - City Centre West, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 14,
  'suggestedBounds': {'ne': {'lat': 43.6333408045, 'lng': -79.51479402615583},
   'sw': {'lat': 43.6243407955, 'lng': -79.52720477384419}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bc9f9b6b6c49c7469688f91',
       'name': 'South St. Burger',
       'contact': {},
       'location': {'address': '1020 Islington Ave',
        'lat': 43.63131374571208,
        'lng': -79.51840794740386,
        'labeledLa

In [71]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [72]:
#Cleaned the json file and made it into a dataframe
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,South St. Burger,Burger Joint,43.631314,-79.518408
1,Wingporium,Wings Joint,43.630275,-79.518169
2,Dollarama,Discount Store,43.629883,-79.518627
3,Healthy Planet,Supplement Shop,43.630214,-79.518495
4,Artisano Bakery Café,Bakery,43.631006,-79.518172


In [73]:
#showing the # of venues returned by Foursquare
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

5 venues were returned by Foursquare.


In [82]:
#exploring the top 5 venues in each zip code

LIMIT = 5
radius = 500

location_list = [] # initiate a list to store data from Foursquare API requests

for neighbourhood, latitude, longitude in zip(geo_final.Neighbourhood, geo_final.Latitude, geo_final.Longitude):
    
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
    
    data = requests.get(url).json()
    
    # use len() to check if any data within "items" (len = 0 if nothing)
    length = len(data['response']['groups'][0]['items'])
    if length == 0:
        continue # skip the row if nothing is found
    else:
        venue = data['response']['groups'][0]['items'][0]['venue']
        
        # extract info within 'venue'
        name = venue['name']
        lat = venue['location']['lat']
        lon = venue['location']['lng']
        cat = venue['categories'][0]['name']
        
        location_list.append([(neighbourhood, latitude, longitude, name, lat, lon, cat)])

In [87]:
# create data frame based on 'Location_list'

temp = pd.DataFrame(x for row in location_list for x in row)
temp.columns = ['Neighbourhood','N_Latitude','N_Longitude','Venue','V_Latitude','V_Longitude','Category']
temp.head()

Unnamed: 0,Neighbourhood,N_Latitude,N_Longitude,Venue,V_Latitude,V_Longitude,Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
2,Harbourfront,43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
3,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,Roots,43.718221,-79.466776,Boutique
4,Queen's Park,43.662301,-79.389494,Queen's Park,43.663946,-79.39218,Park


In [88]:
print("{} nearby locations downloaded for {} neighbourhood.".format(len(temp.Venue), len(geo_final.Neighbourhood)))

100 nearby locations downloaded for 103 neighbourhood.


In [90]:
#making a table with the neighborhood and venue categories

cat = pd.get_dummies(temp.Category) # one hot encoding with get_dummies()
df3 = pd.concat([temp[['Neighbourhood']], cat], axis=1)
df3.head()

Unnamed: 0,Neighbourhood,Airport,Arts & Crafts Store,Bakery,Bank,Bar,Baseball Field,Boutique,Breakfast Spot,Brewery,...,Sandwich Place,Skating Rink,Sports Bar,Supermarket,Theme Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Warehouse Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Harbourfront,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lawrence Heights, Lawrence Manor",0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Queen's Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
#import k-means and make_blobs like we did in the lab

from sklearn.cluster import KMeans
from sklearn.datasets.samples_generator import make_blobs

In [119]:
df4 = df3.drop('Neighbourhood', axis=1)

#group neighborhoods into 5 clusters
n_group = 8 

#run k-means clustering
kmeans = KMeans(n_clusters=n_group, random_state=0).fit(df4)

#check that cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 2, 0, 1, 0, 0, 0, 0, 2], dtype=int32)

In [116]:
# add clustering labels
df3.insert(1, 'label', kmeans.labels_)

ValueError: cannot insert label, already exists

In [97]:
df3.head()

Unnamed: 0,Neighbourhood,label,Airport,Arts & Crafts Store,Bakery,Bank,Bar,Baseball Field,Boutique,Breakfast Spot,...,Sandwich Place,Skating Rink,Sports Bar,Supermarket,Theme Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Warehouse Store,Yoga Studio
0,Parkwoods,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Harbourfront,4,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Lawrence Heights, Lawrence Manor",0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Queen's Park,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
#mergeing dataframe with coordinates & Foursquare dataframe
#dataframe will have coordinates and venues for each neighbourhood

merged = pd.merge(geo_final, df3, on='Neighbourhood', how='right')
merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,label,Airport,Arts & Crafts Store,Bakery,Bank,...,Sandwich Place,Skating Rink,Sports Bar,Supermarket,Theme Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Warehouse Store,Yoga Studio
0,M3A,North York,Parkwoods,43.753259,-79.329656,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M4A,North York,Victoria Village,43.725882,-79.315572,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M7A,Downtown Toronto,Queen's Park,43.662301,-79.389494,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
#importing to customize on map
import matplotlib.cm as cm
import matplotlib.colors as colors

In [120]:
# create the map
cluster_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(n_group)
yc = [i + x + (i*x)**2 for i in range(n_group)]
colors_array = cm.rainbow(np.linspace(0, 1, len(yc)))
color1 = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(merged['Latitude'], merged['Longitude'], merged['Neighborhood'], merged['label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=color1[cluster-1],
        fill=True,
        fill_color=color1[cluster-1],
        fill_opacity=0.7).add_to(cluster_map)
       
cluster_map