# First Assignment - Explore and cluster the neighborhoods in Toronto

## Scraping data from Wiki

In [1]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import numpy as np
!{sys.executable} -m pip install folium
import folium # map rendering library
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
page = urlopen(url)
html = page.read().decode("utf-8")
toronto_soup = BeautifulSoup(html, "html.parser")

/usr/bin/sh: {sys.executable}: command not found


## Creating DataFrame

In [2]:
table = toronto_soup.find('table', attrs={'class':'wikitable sortable'}) #getting the postal code tables
table_rows = table.find_all('tr')
headers = [h.get_text().rstrip() for h in table.find_all('th')] #getting the headers
l = []
for tr in table_rows: #retrieving the values
    td = tr.find_all('td')
    if td:
        row = [tr.text.rstrip() for tr in td]
        l.append(row)
df = pd.DataFrame(l, columns=headers)
df.dropna()
print("DataFrame original shape: ", df.shape)
df.head()

DataFrame original shape:  (180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


## Dropping 'Not Assigned' borough cells

In [3]:
###################################
# Dropping 'Not Assigned' cells ###
###################################
df_filtered = df[df['Borough'] != 'Not assigned'] 
print("DataFrame filtered shape: ", df_filtered.shape)
df_filtered.head()

DataFrame filtered shape:  (103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Checking for duplicates Postal Codes

In [4]:
#############################
# Checking for duplicates ###
#############################
duplicate = df_filtered.duplicated(subset=['Postal Code'])
duplicate_values = 0
if duplicate.any():
    duplicate_values += 1
    print(df_filtered['Postal Code'][~duplicate], end='\n\n')
if duplicate_values == 0:
    print("No duplicate values any longer") #there are no duplicates
    

No duplicate values any longer


## Looking for 'Not Assigned' Neighbourhood

In [5]:
print(df_filtered.loc[df_filtered['Neighbourhood'] == 'Not Assigned'])

Empty DataFrame
Columns: [Postal Code, Borough, Neighbourhood]
Index: []


## Dataframe shape

In [6]:
print("Final data frame shape:",  df_filtered.shape)

Final data frame shape: (103, 3)


# Second Assignment - Getting Longitude and Latitude of each postal code

In [7]:
#Installing geocoder

import sys
#!conda install --yes --prefix {sys.prefix} geocoder
!{sys.executable} -m pip install geocoder



In [8]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None
coordinates = []
#######################################
#### NOT WORKING: PERMISSION DENIED ###
#######################################
#for postal_code in df_filtered:
    # loop until you get the coordinates
    #while(lat_lng_coords is None):
    #    g = geocoder.google('Toronto, Ontario', sensor=False)
    #    lat_lng_coords = g.latlng
    #coordinates = [c for c in lat_lng_coords]


In [9]:
#Loading data from csv file
df_coordinates = pd.read_csv('https://raw.githubusercontent.com/vscoca/Coursera_Capstone/main/Geospatial_Coordinates.csv')  
df_coordinates.head()
df_final = df_filtered.join(df_coordinates.set_index('Postal Code'), on='Postal Code')
df_final.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Third Assignment - Explore and cluster the neighborhoods in Toronto

In [10]:
# selectin only Toronto Borough
df_toronto = df_final[df_final['Borough'].str.contains('Toronto')]
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
13,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
22,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
30,M4E,East Toronto,The Beaches,43.676357,-79.293031


## Getting Neighbourhood venues

In [11]:
#config
import requests # library to handle requests

CLIENT_ID = 'FRZ2MVLR1O5VO5W2Q041BYRIYSAUJF3TE31AATZJDI1CCI5W' # your Foursquare ID
CLIENT_SECRET = 'BGT2NKHBCJARC02PD0VRXCPKFJIAPLREZUKHZEDNDQ0JZTRC' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
radius = 500

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        lat,
        lng,
        radius, 
        LIMIT)
        # make the GET request
        #print(url)
        results = requests.get(url).json()['response']['venues']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],  
            v['categories'][0]['name'] if v['categories'] else None ) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
df_torono_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )
print(df_torono_venues.shape)
df_torono_venues.head()

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West,  Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Oldtown Bodega,43.653966,-79.360752,Café
1,"Regent Park, Harbourfront",43.65426,-79.360636,Sackville Playground,43.654656,-79.359871,Park
2,"Regent Park, Harbourfront",43.65426,-79.360636,Terroni Sud Forno Produzione e Spaccio,43.653903,-79.360018,Gourmet Shop
3,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
4,"Regent Park, Harbourfront",43.65426,-79.360636,TTC Streetcar #503 Kingston Rd,43.663514,-79.337697,Moving Target


## Getting venue one hot encoding

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(df_torono_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighbourhood'] = df_torono_venues['Neighbourhood']

fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot = toronto_onehot.groupby(['Neighbourhood']).mean().reset_index()

## Getting Neighbourhood coordinates

In [46]:
df_torono_venues_coordinates = df_torono_venues[['Neighbourhood','Neighbourhood Latitude','Neighbourhood Longitude']]
df_torono_venues_coordinates.head()
df_neigh_coordinates = df_torono_venues_coordinates.drop_duplicates()
df_neigh_coordinates.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude
0,"Regent Park, Harbourfront",43.65426,-79.360636
30,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
60,"Garden District, Ryerson",43.657162,-79.378937
90,St. James Town,43.651494,-79.375418
120,The Beaches,43.676357,-79.293031


## A bit of analysis: Top 5 venues for each Neighbourhood

In [16]:
num_top_venues = 5
for borough in toronto_onehot['Neighbourhood']:
    print("----"+borough+"----")
    temp = toronto_onehot[toronto_onehot['Neighbourhood'] == borough].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
                                      venue  freq
0                                    Office  0.11
1  Residential Building (Apartment / Condo)  0.07
2                                   Parking  0.07
3                                     Hotel  0.07
4                                  Building  0.07


----Brockton, Parkdale Village, Exhibition Place----
                                      venue  freq
0                                    Office  0.24
1  Residential Building (Apartment / Condo)  0.21
2                              Tech Startup  0.10
3                          Dentist's Office  0.07
4                                       Spa  0.03


----Business reply mail Processing Centre, South Central Letter Processing Plant Toronto----
                  venue  freq
0   Government Building  0.07
1           Yoga Studio  0.03
2  Fast Food Restaurant  0.03
3    Light Rail Station  0.03
4             Surf Spot  0.03


----CN Tower, King and Spadina, Railway Lands, H

              venue  freq
0              Park  0.11
1  Asian Restaurant  0.06
2         Pet Store  0.06
3       Pizza Place  0.06
4        Playground  0.06


----The Danforth West, Riverdale----
                  venue  freq
0    Salon / Barbershop  0.07
1                   Spa  0.07
2     Health Food Store  0.07
3  Gym / Fitness Center  0.07
4      Greek Restaurant  0.07


----Toronto Dominion Centre, Design Exchange----
                venue  freq
0              Office  0.20
1                Park  0.10
2             Parking  0.07
3  Italian Restaurant  0.07
4         Event Space  0.07


----University of Toronto, Harbord----
                       venue  freq
0     College Residence Hall  0.17
1  College Academic Building  0.10
2            College Library  0.10
3                College Gym  0.07
4             Medical School  0.03




## Clustering similar neighbourhoods in five groups

In [17]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

toronto_onehot_clustering = toronto_onehot.drop('Neighbourhood', 1)
toronto_onehot_clustering

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_onehot_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 3, 4, 2, 3, 2, 1, 3, 0], dtype=int32)

## Preparing data for the map visualizatoin: adding coordinates and cluster labels

In [47]:
latitude = '43.651070'
longitude = '-79.347015'

df_toronto_onehot_coordinates = toronto_onehot.join(df_neigh_coordinates.set_index('Neighbourhood'), on='Neighbourhood')

Unnamed: 0,Neighbourhood,ATM,Adult Boutique,Advertising Agency,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Veterinarian,Vietnamese Restaurant,Voting Booth,Warehouse,Wine Shop,Winery,Women's Store,Yoga Studio,Neighbourhood Latitude,Neighbourhood Longitude
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.644771,-79.373306
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.636847,-79.428191
2,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,43.662744,-79.321558
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.033333,0.033333,0.3,0.033333,0.066667,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.628947,-79.39442
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.657952,-79.387383


In [51]:
df_toronto_onehot_coordinates.insert(0, 'Cluster Labels', kmeans.labels_)

df_toronto_onehot_coordinates.head()

Unnamed: 0,Cluster Labels,Neighbourhood,ATM,Adult Boutique,Advertising Agency,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,...,Veterinarian,Vietnamese Restaurant,Voting Booth,Warehouse,Wine Shop,Winery,Women's Store,Yoga Studio,Neighbourhood Latitude,Neighbourhood Longitude
0,1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.644771,-79.373306
1,0,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.636847,-79.428191
2,3,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,43.662744,-79.321558
3,4,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.033333,0.033333,0.3,0.033333,0.066667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.628947,-79.39442
4,2,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.657952,-79.387383


## Final visualization of clustered neighbourhoods on the map

In [52]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_toronto_onehot_coordinates['Neighbourhood Latitude'], df_toronto_onehot_coordinates['Neighbourhood Longitude'], df_toronto_onehot_coordinates['Neighbourhood'], df_toronto_onehot_coordinates['Cluster Labels']):
    print(lat,lon)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

43.644770799999996 -79.3733064
43.6368472 -79.42819140000002
43.6627439 -79.321558
43.6289467 -79.3944199
43.6579524 -79.3873826
43.669542 -79.4225637
43.6658599 -79.38315990000001
43.6481985 -79.37981690000001
43.7043244 -79.3887901
43.7127511 -79.3901975
43.66900510000001 -79.4422593
43.6484292 -79.3822802
43.6969476 -79.41130720000001
43.6571618 -79.37893709999999
43.6408157 -79.38175229999999
43.6616083 -79.46476329999999
43.6689985 -79.31557159999998
43.6532057 -79.4000493
43.7280205 -79.3887901
43.647926700000006 -79.4197497
43.6895743 -79.38315990000001
43.7153834 -79.40567840000001
43.6489597 -79.456325
43.6623015 -79.3894938
43.6542599 -79.3606359
43.65057120000001 -79.3845675
43.6795626 -79.37752940000001
43.7116948 -79.41693559999999
43.6515706 -79.4844499
43.6514939 -79.3754179
43.667967 -79.3676753
43.6464352 -79.37484599999999
43.6595255 -79.340923
43.68641229999999 -79.4000493
43.6727097 -79.40567840000001
43.67635739999999 -79.2930312
43.6795571 -79.352188
43.6471768 -7