In [0]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [0]:
# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [0]:
#I'm using Beautifyl Soup to scrape Wikipedia
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')
for items in soup.find('table', class_='wikitable sortable').find_all('tr')[1::1]:
    data = items.find_all(['th','td'])
    try:
        PostalCode = data[0].text
        Borough = data[1].text
        Neighborhood = data[2].text.split("\n")[0]       
    except: pass
    neighborhoods = neighborhoods.append({'Borough': Borough,
                                          'Neighborhood': Neighborhood,
                                          'PostalCode': PostalCode}, ignore_index=True)

In [122]:
#Here is the input pandas data frame from Wikipedia
neighborhoods.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [0]:
# Remove unassigned neighborhoods
neighborhoods = neighborhoods[neighborhoods.Borough != 'Not assigned']

In [115]:
neighborhoods.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [0]:
# Concatonate neighborhoods in same postal code
grouped = neighborhoods.groupby('PostalCode').agg(lambda x: ','.join(set(x)))

In [118]:
grouped.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill"
M1E,Scarborough,"West Hill,Morningside,Guildwood"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [0]:
# This finds all still unassigned neighborhoods and gives them the borough name
mask = grouped.Neighborhood == 'Not assigned'

grouped.loc[mask,'Neighborhood'] = grouped.loc[mask,'Borough']

In [125]:
grouped.loc[mask,:]

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M7A,Queen's Park,Queen's Park


In [126]:
#Here is the shape of my output dataframe
grouped.shape

(103, 2)

In [133]:
# Import CSV file with lat long data

from google.colab import files
uploaded = files.upload()
import io


Saving Geospatial_Coordinates.csv to Geospatial_Coordinates (1).csv


In [0]:
#Put lat long data into dataframe
gc = pd.read_csv(io.BytesIO(uploaded['Geospatial_Coordinates.csv']))

In [135]:
gc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [0]:
# Merge with grouped dataframe
grpll = grouped.merge(gc, left_on='PostalCode',right_on='Postal Code', how = 'left')

In [139]:
grpll.head()

Unnamed: 0,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,Scarborough,"Malvern,Rouge",M1B,43.806686,-79.194353
1,Scarborough,"Highland Creek,Port Union,Rouge Hill",M1C,43.784535,-79.160497
2,Scarborough,"West Hill,Morningside,Guildwood",M1E,43.763573,-79.188711
3,Scarborough,Woburn,M1G,43.770992,-79.216917
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [0]:
import folium

In [0]:
latitude = grpll.Latitude.mean()
longitude = grpll.Longitude.mean()

In [0]:
neighborhoods = grpll

In [144]:
# create map of Toronto using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [145]:
CLIENT_ID = 'LCI1HINAS0GITQJRCF2OUU0NQDQFXEYUWQYWIDXJS1H3CCLX' # your Foursquare ID
CLIENT_SECRET = 'J4DNTHVLTC1H3FZ31MSH3IEBFSCDUEYSGOSQSMVDLHOADJZ0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: LCI1HINAS0GITQJRCF2OUU0NQDQFXEYUWQYWIDXJS1H3CCLX
CLIENT_SECRET:J4DNTHVLTC1H3FZ31MSH3IEBFSCDUEYSGOSQSMVDLHOADJZ0


In [146]:
neighborhood_latitude = neighborhoods.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighborhoods.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = neighborhoods.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Malvern,Rouge are 43.806686299999996, -79.19435340000001.


In [0]:
radius=1000
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)

In [153]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5cc8a9c01ed2196c42216f5c'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-4d669cba83865481c948fa53-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/shops/spa_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d1ed941735',
         'name': 'Spa',
         'pluralName': 'Spas',
         'primary': True,
         'shortName': 'Spa'}],
       'id': '4d669cba83865481c948fa53',
       'location': {'address': '8130 Sheppard Ave E',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Morningside Ave',
        'distance': 595,
        'formattedAddress': ['8130 Sheppard Ave E (Morningside Ave)',
         'Toronto ON M1B 3W3',
         'Canada'],
        'labeledLatLngs': [{'label': 'd

In [0]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [155]:
#Here we'll create a data frame of venues in each neighborhood
toronto_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )


Malvern,Rouge
Highland Creek,Port Union,Rouge Hill
West Hill,Morningside,Guildwood
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park,Ionview,Kennedy Park
Clairlea,Golden Mile,Oakridge
Cliffside,Cliffcrest,Scarborough Village West
Cliffside West,Birch Cliff
Scarborough Town Centre,Dorset Park,Wexford Heights
Maryvale,Wexford
Agincourt
Tam O'Shanter,Sullivan,Clarks Corners
L'Amoreaux East,Milliken,Agincourt North,Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Henry Farm,Oriole,Fairview
Bayview Village
Silver Hills,York Mills
Willowdale,Newtonbrook
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park,Don Mills South
Bathurst Manor,Wilson Heights,Downsview North
York University,Northwood Park
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Parkview Hill,Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West,Riverdale
India Bazaar,The Beac

In [156]:
print(toronto_venues.shape)
toronto_venues.head()

(4859, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern,Rouge",43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa
1,"Malvern,Rouge",43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant
2,"Malvern,Rouge",43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store
3,"Malvern,Rouge",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
4,"Malvern,Rouge",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant


In [157]:
# Let's see how many venues there are for each neighborhood
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Agincourt,43,43,43,43,43,43
"Bathurst Manor,Wilson Heights,Downsview North",28,28,28,28,28,28
Bayview Village,14,14,14,14,14,14
Berczy Park,100,100,100,100,100,100
Business Reply Mail Processing Centre 969 Eastern,49,49,49,49,49,49
"CFB Toronto,Downsview East",21,21,21,21,21,21
Caledonia-Fairbanks,23,23,23,23,23,23
Canada Post Gateway Processing Centre,49,49,49,49,49,49
Cedarbrae,29,29,29,29,29,29


In [158]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 326 uniques categories.


In [167]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
#fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
#toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [0]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
#toronto_grouped

Let's print each neighborhood along with the top 5 most common venues

In [171]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
                 venue  freq
0                 Café  0.06
1          Coffee Shop  0.05
2                Hotel  0.04
3              Theater  0.04
4  American Restaurant  0.04


----Agincourt----
                  venue  freq
0    Chinese Restaurant  0.12
1                Bakery  0.05
2  Caribbean Restaurant  0.05
3         Shopping Mall  0.05
4           Pizza Place  0.05


----Bathurst Manor,Wilson Heights,Downsview North----
               venue  freq
0        Coffee Shop  0.07
1  Convenience Store  0.04
2      Shopping Mall  0.04
3        Men's Store  0.04
4   Sushi Restaurant  0.04


----Bayview Village----
                  venue  freq
0         Grocery Store  0.14
1   Japanese Restaurant  0.14
2                  Bank  0.14
3  Fast Food Restaurant  0.07
4     Convenience Store  0.07


----Berczy Park----
         venue  freq
0  Coffee Shop  0.07
1        Hotel  0.06
2         Café  0.05
3       Bakery  0.04
4     Beer Bar  0.04


----Business Reply Ma

In [182]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Café,Coffee Shop,Hotel,Theater,American Restaurant,Concert Hall,Steakhouse,Sushi Restaurant,Pizza Place,Gym
1,Agincourt,Chinese Restaurant,Pizza Place,Shopping Mall,Coffee Shop,Caribbean Restaurant,Bakery,Malay Restaurant,Japanese Restaurant,Sushi Restaurant,Discount Store
2,"Bathurst Manor,Wilson Heights,Downsview North",Coffee Shop,Middle Eastern Restaurant,Ski Chalet,Supermarket,Dog Run,Sushi Restaurant,Sandwich Place,Fried Chicken Joint,Bank,Frozen Yogurt Shop
3,Bayview Village,Bank,Japanese Restaurant,Grocery Store,Skate Park,Chinese Restaurant,Café,Shopping Mall,Intersection,Skating Rink,Fast Food Restaurant
4,Berczy Park,Coffee Shop,Hotel,Café,Bakery,Restaurant,Beer Bar,Park,Japanese Restaurant,Cocktail Bar,Gastropub


In [0]:
## Now let's cluster the neighborhoods
from sklearn.cluster import KMeans

In [177]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:50] 

array([1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       4, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1], dtype=int32)

In [185]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = neighborhoods

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Postal Code,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Scarborough,"Malvern,Rouge",M1B,43.806686,-79.194353,0.0,Fast Food Restaurant,Coffee Shop,Gym,Sandwich Place,Park,Martial Arts Dojo,Caribbean Restaurant,Fruit & Vegetable Store,Chinese Restaurant,Construction & Landscaping
1,Scarborough,"Highland Creek,Port Union,Rouge Hill",M1C,43.784535,-79.160497,2.0,Breakfast Spot,Playground,Burger Joint,Park,Italian Restaurant,Zoo,Falafel Restaurant,Eastern European Restaurant,Electronics Store,Empanada Restaurant
2,Scarborough,"West Hill,Morningside,Guildwood",M1E,43.763573,-79.188711,0.0,Pizza Place,Fast Food Restaurant,Coffee Shop,Discount Store,Smoothie Shop,Bus Station,Food & Drink Shop,Sports Bar,Beer Store,Supermarket
3,Scarborough,Woburn,M1G,43.770992,-79.216917,0.0,Coffee Shop,Pharmacy,Chinese Restaurant,Park,Electronics Store,Fast Food Restaurant,Indian Restaurant,Zoo,Dumpling Restaurant,Eastern European Restaurant
4,Scarborough,Cedarbrae,M1H,43.773136,-79.239476,0.0,Coffee Shop,Bakery,Pharmacy,Indian Restaurant,Pizza Place,Grocery Store,Fried Chicken Joint,Caribbean Restaurant,Bank,Burger Joint


In [0]:
toronto_merged=toronto_merged.dropna()

In [197]:
## Lets map out the clusters. The colors show which luster the neighborhood belongs to

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters