# Toronto Web Scraping
_Explores Toronto Boroughs and Neighborhoods using the Foursqaure API and clustering techniques. Help from Alex Aklson and Polong Lin_

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

### Read Wikipedia Wikipedia Page with Pandas

In [2]:
wiki = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
table = pd.read_html(wiki, header = 0)

### Grab the First Table in Wikipedia Page

In [3]:
toronto = table[0]

In [4]:
toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Filter Out Bourough Containg 'Not Assigned'

In [5]:
toronto = toronto[toronto['Borough'] != 'Not assigned']

### Aggreagte Neighbourhoods with Same Boroughs
Groups by each Borough then joins the Neighbourhoods with a comma

In [6]:
df = toronto.groupby('Postcode').agg(lambda t: ', '.join(set(t))).reset_index()

### Replace 'Not Assigned' Neighbourhoods With Borough

In [7]:
df['Neighbourhood'].replace('Not Assigned', df['Borough'], inplace = True)

### Get Number of Rows

In [8]:
df.shape[0]

103

## Adding Longitude and Latitude

In [9]:
geo = pd.read_csv('Geospatial_Coordinates.csv')
geo.rename(columns = {'Postal Code':'Postcode'}, inplace = True)

In [10]:
df1 = pd.merge(df, geo, how = 'inner', on = 'Postcode')
df1

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Richview Gardens, Martin Gr...",43.688905,-79.554724
101,M9V,Etobicoke,"Jamestown, South Steeles, Thistletown, Humberg...",43.739416,-79.588437


In [11]:
df1['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke', "Queen's Park"], dtype=object)

In [12]:
#!conda install -c conda-forge geopy --yes

In [13]:
#!conda install -c conda-forge folium=0.5.0 --yes

In [14]:
from geopy.geocoders import Nominatim 
import requests
import folium

### Get Longitude and Latitude Function

In [15]:
def get_lat(borough):
    lat = df1[df1['Borough']==borough]['Latitude'].iloc[0]
    return lat

def get_lot(borough):
    long = df1[df1['Borough']==borough]['Longitude'].iloc[0]
    return long

### York Latitude and Longitude

In [16]:
boro = 'York'
latitude = get_lat(boro)
longitude = get_lot(boro)

In [17]:
york_df = df1[df1['Borough'] == 'York'].reset_index(drop=True)

### Create York Map

In [18]:
york_map = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, label in zip(york_df['Latitude'], york_df['Longitude'], york_df['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(york_map)  
    
york_map

### Get Foursqaure API Parameters

In [19]:
CLIENT_ID = 'GXEXHOWAMC5YBFNFSRPF0BLCKY4UIZSHBCW3QFPYG0TFHKV5' # your Foursquare ID
CLIENT_SECRET = 'TUL3YMP3DNJUBCVLEU1U14NHVSQD3SA3124SCARWKBF43SRX' # your Foursquare Secret
VERSION = '20200206' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: GXEXHOWAMC5YBFNFSRPF0BLCKY4UIZSHBCW3QFPYG0TFHKV5
CLIENT_SECRET:TUL3YMP3DNJUBCVLEU1U14NHVSQD3SA3124SCARWKBF43SRX


### Get First Neighborhood in York

In [20]:
york_df.loc[0, 'Neighbourhood']

'Humewood-Cedarvale'

### Get Latitudes and Longitudes of Humewood-Cedarvale

In [21]:
neighbourhood_latitude = york_df.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = york_df.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = york_df.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Humewood-Cedarvale are 43.6937813, -79.42819140000002.


### Use Foursqaure API

In [22]:
LIMIT = 100 
radius = 500 
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=GXEXHOWAMC5YBFNFSRPF0BLCKY4UIZSHBCW3QFPYG0TFHKV5&client_secret=TUL3YMP3DNJUBCVLEU1U14NHVSQD3SA3124SCARWKBF43SRX&v=20200206&ll=43.6937813,-79.42819140000002&radius=500&limit=100'

### Get Results from API Call

In [23]:
results = requests.get(url).json()

### Get Categories of each Venue

In [24]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Structure json File

In [25]:
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

venues = results['response']['groups'][0]['items']  
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,Cedarvale Park,Field,43.692535,-79.428705
1,Cedarvale Tennis Courts,Tennis Court,43.692744,-79.432244
2,Phil White Arena,Hockey Arena,43.691303,-79.431761
3,Cedarvale Ravine,Trail,43.690188,-79.426106


### Explore Venues Near York

In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### List Venues Near York

In [27]:
york_venues = getNearbyVenues(names=york_df['Neighbourhood'],
                                   latitudes=york_df['Latitude'],
                                   longitudes=york_df['Longitude'])

Humewood-Cedarvale
Caledonia-Fairbanks
Silverthorn, Del Ray, Keelesdale, Mount Dennis
The Junction North, Runnymede
Weston


In [28]:
york_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Park,43.692535,-79.428705,Field
1,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Tennis Courts,43.692744,-79.432244,Tennis Court
2,Humewood-Cedarvale,43.693781,-79.428191,Phil White Arena,43.691303,-79.431761,Hockey Arena
3,Humewood-Cedarvale,43.693781,-79.428191,Cedarvale Ravine,43.690188,-79.426106,Trail
4,Caledonia-Fairbanks,43.689026,-79.453512,KFC,43.690647,-79.456326,Fast Food Restaurant


### Analyze Venues

In [29]:
# one hot encoding
york_onehot = pd.get_dummies(york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
york_onehot['Neighborhood'] = york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [york_onehot.columns[-1]] + list(york_onehot.columns[:-1])
york_onehot = york_onehot[fixed_columns]

york_onehot.head()

Unnamed: 0,Neighborhood,Bar,Breakfast Spot,Bus Line,Convenience Store,Fast Food Restaurant,Field,Grocery Store,Hockey Arena,Market,Park,Restaurant,Sandwich Place,Skating Rink,Tennis Court,Trail,Turkish Restaurant,Women's Store
0,Humewood-Cedarvale,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
2,Humewood-Cedarvale,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,Humewood-Cedarvale,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Caledonia-Fairbanks,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


### See Most Poplar Venues

In [30]:
york_grouped = york_onehot.groupby('Neighborhood').mean().reset_index()
york_grouped

Unnamed: 0,Neighborhood,Bar,Breakfast Spot,Bus Line,Convenience Store,Fast Food Restaurant,Field,Grocery Store,Hockey Arena,Market,Park,Restaurant,Sandwich Place,Skating Rink,Tennis Court,Trail,Turkish Restaurant,Women's Store
0,Caledonia-Fairbanks,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.2,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.2
1,Humewood-Cedarvale,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.0
2,"Silverthorn, Del Ray, Keelesdale, Mount Dennis",0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.2,0.0,0.0,0.2,0.0
3,"The Junction North, Runnymede",0.0,0.25,0.25,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Get Top 5 Venues

In [31]:
num_top_venues = 5

for hood in york_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = york_grouped[york_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Caledonia-Fairbanks----
                  venue  freq
0                  Park   0.4
1                Market   0.2
2  Fast Food Restaurant   0.2
3         Women's Store   0.2
4     Convenience Store   0.0


----Humewood-Cedarvale----
          venue  freq
0         Trail  0.25
1  Tennis Court  0.25
2         Field  0.25
3  Hockey Arena  0.25
4           Bar  0.00


----Silverthorn, Del Ray, Keelesdale, Mount Dennis----
                venue  freq
0                 Bar   0.2
1  Turkish Restaurant   0.2
2        Skating Rink   0.2
3      Sandwich Place   0.2
4          Restaurant   0.2


----The Junction North, Runnymede----
               venue  freq
0           Bus Line  0.25
1  Convenience Store  0.25
2      Grocery Store  0.25
3     Breakfast Spot  0.25
4                Bar  0.00




### Display Top Venues for Each Neighborhood

In [32]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [33]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = york_grouped['Neighborhood']

for ind in np.arange(york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Caledonia-Fairbanks,Park,Women's Store,Fast Food Restaurant,Market,Skating Rink,Sandwich Place,Restaurant,Tennis Court,Turkish Restaurant,Hockey Arena
1,Humewood-Cedarvale,Trail,Tennis Court,Field,Hockey Arena,Women's Store,Grocery Store,Breakfast Spot,Bus Line,Convenience Store,Fast Food Restaurant
2,"Silverthorn, Del Ray, Keelesdale, Mount Dennis",Bar,Skating Rink,Sandwich Place,Restaurant,Turkish Restaurant,Field,Breakfast Spot,Bus Line,Convenience Store,Fast Food Restaurant
3,"The Junction North, Runnymede",Breakfast Spot,Bus Line,Convenience Store,Grocery Store,Women's Store,Hockey Arena,Fast Food Restaurant,Field,Market,Turkish Restaurant


## Cluster Neighborhoods

In [34]:
# set number of clusters
from sklearn.cluster import KMeans
kclusters = 3

york_grouped_clustering = york_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 0, 1, 1], dtype=int32)

In [35]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

york_merged = york_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
york_merged = york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighbourhood')
york_merged['Cluster Labels'] = york_merged['Cluster Labels'].fillna(value = 0)
york_merged['Cluster Labels'] = york_merged['Cluster Labels'].astype(int)
york_merged.head() # check the last columns!

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M6C,York,Humewood-Cedarvale,43.693781,-79.428191,0,Trail,Tennis Court,Field,Hockey Arena,Women's Store,Grocery Store,Breakfast Spot,Bus Line,Convenience Store,Fast Food Restaurant
1,M6E,York,Caledonia-Fairbanks,43.689026,-79.453512,2,Park,Women's Store,Fast Food Restaurant,Market,Skating Rink,Sandwich Place,Restaurant,Tennis Court,Turkish Restaurant,Hockey Arena
2,M6M,York,"Silverthorn, Del Ray, Keelesdale, Mount Dennis",43.691116,-79.476013,1,Bar,Skating Rink,Sandwich Place,Restaurant,Turkish Restaurant,Field,Breakfast Spot,Bus Line,Convenience Store,Fast Food Restaurant
3,M6N,York,"The Junction North, Runnymede",43.673185,-79.487262,1,Breakfast Spot,Bus Line,Convenience Store,Grocery Store,Women's Store,Hockey Arena,Fast Food Restaurant,Field,Market,Turkish Restaurant
4,M9N,York,Weston,43.706876,-79.518188,0,,,,,,,,,,


In [36]:
### Plotting the Cluster Map

In [37]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(york_merged['Latitude'], york_merged['Longitude'], york_merged['Neighbourhood'], york_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examine Each Cluster

In [38]:
york_merged.loc[york_merged['Cluster Labels'] == 0, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,York,0,Trail,Tennis Court,Field,Hockey Arena,Women's Store,Grocery Store,Breakfast Spot,Bus Line,Convenience Store,Fast Food Restaurant
4,York,0,,,,,,,,,,


In [39]:
york_merged.loc[york_merged['Cluster Labels'] == 1, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,York,1,Bar,Skating Rink,Sandwich Place,Restaurant,Turkish Restaurant,Field,Breakfast Spot,Bus Line,Convenience Store,Fast Food Restaurant
3,York,1,Breakfast Spot,Bus Line,Convenience Store,Grocery Store,Women's Store,Hockey Arena,Fast Food Restaurant,Field,Market,Turkish Restaurant


In [40]:
york_merged.loc[york_merged['Cluster Labels'] == 2, york_merged.columns[[1] + list(range(5, york_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,York,2,Park,Women's Store,Fast Food Restaurant,Market,Skating Rink,Sandwich Place,Restaurant,Tennis Court,Turkish Restaurant,Hockey Arena
