## Analysis of Toronto Neighborhood Venues

In [71]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

### Part One: Webscraping the Wikipedia Page

In [2]:
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
page

<Response [200]>

In [3]:
soup = BeautifulSoup(page.content, 'html5')

Use Beautiful Soup to scrape the Wikipedia Page

In [4]:
tbody = soup.find('tbody')

Format the strings from the Wikipedia Page to fit criteria for DataFrame

In [5]:
p = tbody.find_all('p')
temp =[]
for line in p:
    full_text = line.get_text()
    par = full_text.find("(")
    pc = full_text[0:3]
    bor = full_text[3:par]
    city = full_text[par:].replace("(", "").replace(")", "").replace(" / ", ",").replace("\n", "")
    #print("string: " + pc + " " + bor + " " + city)
    if bor != "Not assigned":
        temp.append([pc,bor,city])

In [6]:
col_titles = ["PostalCode", "Borough", "Neighborhood"]
to_neigh = pd.DataFrame(temp, columns=col_titles)
to_neigh.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [7]:
to_neigh.shape

(103, 3)

### Part Two: Getting Latitude and Longitude Information for the Postal Codes

In [None]:
## Could Not get API Key instead using provided csv
import geocoder # import geocoder

temp_ll = []

for pc in to_neigh["PostalCode"]:
    print(pc)
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(pc))
      lat_lng_coords = g.latlng
    
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    print(str(latitude) + " " + str(longitude))
    temp_ll.append([latitude, longitude])
print(temp_ll)

In [8]:
to_data = pd.read_csv("Geospatial_Coordinates.csv")
to_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [9]:
to_data = to_data.rename(columns={"Postal Code":"PostalCode"})
to_merged=to_data.merge(to_neigh, on='PostalCode')
to_merged.head()

Unnamed: 0,PostalCode,Latitude,Longitude,Borough,Neighborhood
0,M1B,43.806686,-79.194353,Scarborough,"Malvern,Rouge"
1,M1C,43.784535,-79.160497,Scarborough,"Rouge Hill,Port Union,Highland Creek"
2,M1E,43.763573,-79.188711,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,43.770992,-79.216917,Scarborough,Woburn
4,M1H,43.773136,-79.239476,Scarborough,Cedarbrae


In [10]:
to_merged = to_merged[['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']]
to_merged.head(4)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917


In [17]:
to_merged.shape

(103, 5)

### Part Three: Clustering and Analysis

I will be running the analysis on data in the Scarborough Borough

In [31]:
scarb = to_merged[to_merged["Borough"]=="Scarborough"]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park,Ionview,East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile,Clairlea,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside,Cliffcrest,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [33]:
from sklearn.cluster import KMeans 

### Map of Scarborough using Latitude and Longitude Values

In [80]:
from geopy.geocoders import Nominatim
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium
import json

In [48]:
address = 'Scarborough, ON'

geolocator = Nominatim(user_agent="Scarborough_explore")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Scarborough are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Scarborough are 43.7729744, -79.2576479.


In [54]:
# create map of New York using latitude and longitude values
map_scarb = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(scarb['Latitude'], scarb['Longitude'], scarb['Borough'], scarb['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='red',
        fill=False,
        fill_color='#3186cc',
        fill_opacity=0.8,
        parse_html=False).add_to(map_scarb)  
    
map_scarb

### Define Foursquare Credentials

In [56]:
CLIENT_ID = '-' # your Foursquare ID
CLIENT_SECRET = '-' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: FB1FG25U2R32CW5WZIDI31NFHENKIY41K55JGTX2ANOGVXQN
CLIENT_SECRET:XYXO4C0PIPUAJ0EQX04B5EYUY5XKZ52JY4QKAPTIHAMBOSJU


#### Get the top 100 venues that are in each Scarborough Neighborhood within a radius of 500 meters.

In [61]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [62]:
scarb_ven = getNearbyVenues(scarb['Neighborhood'], scarb["Latitude"], scarb["Longitude"], radius=500)
scarb_ven.head()

Malvern,Rouge
Rouge Hill,Port Union,Highland Creek
Guildwood,Morningside,West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park,Ionview,East Birchmount Park
Golden Mile,Clairlea,Oakridge
Cliffside,Cliffcrest,Scarborough Village West
Birch Cliff,Cliffside West
Dorset Park,Wexford Heights,Scarborough Town Centre
Wexford,Maryvale
Agincourt 
Clarks Corners,Tam O'Shanter,Sullivan
Milliken,Agincourt North,Steeles East,L'Amoreaux East
Steeles West,L'Amoreaux West
Upper Rouge


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern,Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497,Great Shine Window Cleaning,43.783145,-79.157431,Home Service
2,"Rouge Hill,Port Union,Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Guildwood,Morningside,West Hill",43.763573,-79.188711,RBC Royal Bank,43.76679,-79.191151,Bank
4,"Guildwood,Morningside,West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [64]:
scarb_ven.shape


(94, 7)

In [65]:
scarb_ven.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,4,4,4,4,4,4
"Birch Cliff,Cliffside West",4,4,4,4,4,4
Cedarbrae,7,7,7,7,7,7
"Clarks Corners,Tam O'Shanter,Sullivan",12,12,12,12,12,12
"Cliffside,Cliffcrest,Scarborough Village West",3,3,3,3,3,3
"Dorset Park,Wexford Heights,Scarborough Town Centre",6,6,6,6,6,6
"Golden Mile,Clairlea,Oakridge",10,10,10,10,10,10
"Guildwood,Morningside,West Hill",9,9,9,9,9,9
"Kennedy Park,Ionview,East Birchmount Park",6,6,6,6,6,6
"Malvern,Rouge",1,1,1,1,1,1


In [66]:
print('There are {} uniques categories.'.format(len(scarb_ven['Venue Category'].unique())))

There are 58 uniques categories.


### One Hot Encoding 

In [67]:
# one hot encoding
scarb_onehot = pd.get_dummies(scarb_ven[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
scarb_onehot['Neighborhood'] = scarb_ven['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [scarb_onehot.columns[-1]] + list(scarb_onehot.columns[:-1])
scarb_onehot = scarb_onehot[fixed_columns]

scarb_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Arts & Crafts Store,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bus Line,...,Rental Car Location,Restaurant,Sandwich Place,Skating Rink,Soccer Field,Supermarket,Thai Restaurant,Train Station,Vietnamese Restaurant,Women's Store
0,"Malvern,Rouge",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Rouge Hill,Port Union,Highland Creek",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Rouge Hill,Port Union,Highland Creek",0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Guildwood,Morningside,West Hill",0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood,Morningside,West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Grouping the Scarborough Data by Neighborhood

In [68]:
scarb_grouped = scarb_onehot.groupby('Neighborhood').mean().reset_index()
scarb_grouped

Unnamed: 0,Neighborhood,American Restaurant,Arts & Crafts Store,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Bus Line,...,Rental Car Location,Restaurant,Sandwich Place,Skating Rink,Soccer Field,Supermarket,Thai Restaurant,Train Station,Vietnamese Restaurant,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Birch Cliff,Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
2,Cedarbrae,0.0,0.0,0.142857,0.0,0.142857,0.142857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0
3,"Clarks Corners,Tam O'Shanter,Sullivan",0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0
4,"Cliffside,Cliffcrest,Scarborough Village West",0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Dorset Park,Wexford Heights,Scarborough Town C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0
6,"Golden Mile,Clairlea,Oakridge",0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0
7,"Guildwood,Morningside,West Hill",0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.111111,0.0,...,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,"Kennedy Park,Ionview,East Birchmount Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0
9,"Malvern,Rouge",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Analysis from New York Neighborhood Analysis adapted to Scarborough for getting top 12 venues in each neighborhood

In [69]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [99]:
num_top_venues = 12

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = scarb_grouped['Neighborhood']

for ind in np.arange(scarb_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(scarb_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue
0,Agincourt,Latin American Restaurant,Breakfast Spot,Lounge,Clothing Store,Women's Store,College Stadium,Gym,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
1,"Birch Cliff,Cliffside West",General Entertainment,Skating Rink,Café,College Stadium,Women's Store,Coffee Shop,Gym,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Donut Shop
2,Cedarbrae,Hakka Restaurant,Bakery,Caribbean Restaurant,Bank,Gas Station,Athletics & Sports,Thai Restaurant,General Entertainment,College Stadium,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
3,"Clarks Corners,Tam O'Shanter,Sullivan",Pizza Place,Chinese Restaurant,Bank,Convenience Store,Fast Food Restaurant,Coffee Shop,Italian Restaurant,Intersection,Fried Chicken Joint,Gas Station,Thai Restaurant,Electronics Store
4,"Cliffside,Cliffcrest,Scarborough Village West",American Restaurant,Intersection,Motel,Hobby Shop,Gym,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Donut Shop,Discount Store


### K Means Clustering - Breaking the data into 4 Groups by similarity of Common Venues

In [103]:
# set number of clusters
kclusters = 4

scarb_grouped_clust = scarb_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(scarb_grouped_clust)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5] 

array([0, 0, 0, 0, 0])

In [105]:
# add clustering labels
#neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

scarb_merged = scarb

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighborhood
scarb_merged = scarb_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

scarb_merged.dropna(axis=0, inplace=True)# check the last columns!
scarb_merged.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue
11,M1R,Scarborough,"Wexford,Maryvale",43.750072,-79.295849,1.0,Middle Eastern Restaurant,Vietnamese Restaurant,Auto Garage,Bakery,Sandwich Place,Gym,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Donut Shop
12,M1S,Scarborough,Agincourt,43.7942,-79.262029,1.0,Latin American Restaurant,Breakfast Spot,Lounge,Clothing Store,Women's Store,College Stadium,Gym,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store
13,M1T,Scarborough,"Clarks Corners,Tam O'Shanter,Sullivan",43.781638,-79.304302,1.0,Pizza Place,Chinese Restaurant,Bank,Convenience Store,Fast Food Restaurant,Coffee Shop,Italian Restaurant,Intersection,Fried Chicken Joint,Gas Station,Thai Restaurant,Electronics Store
14,M1V,Scarborough,"Milliken,Agincourt North,Steeles East,L'Amorea...",43.815252,-79.284577,1.0,Arts & Crafts Store,Intersection,Playground,Park,Women's Store,Coffee Shop,General Entertainment,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Donut Shop
15,M1W,Scarborough,"Steeles West,L'Amoreaux West",43.799525,-79.318389,1.0,Fast Food Restaurant,Nail Salon,Bank,Gym,Pharmacy,Pizza Place,Breakfast Spot,Coffee Shop,Chinese Restaurant,Sandwich Place,Electronics Store,Supermarket


### Create Map of Clusters

In [107]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(scarb_merged['Latitude'], scarb_merged['Longitude'], scarb_merged['Neighborhood'], scarb_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=False,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

We can see that the data has been clustered into 3 distinct groups where one group is predominant