# Neighborhoods in Toronto

This notebook explores neighborhoods in Toronto for clustering

## 1. Preparing Neighborhood Data

In [1]:
#!conda install -c conda-forge beautifulsoup4
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

The data source of Toronto neighborhoods is a Wikipedia page. You can explore the data from <a href="http://www.wikizero.biz/index.php?q=aHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvTGlzdF9vZl9wb3N0YWxfY29kZXNfb2ZfQ2FuYWRhOl9N" target=_blank>here</a>

In [2]:
url="http://www.wikizero.biz/index.php?q=aHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvTGlzdF9vZl9wb3N0YWxfY29kZXNfb2ZfQ2FuYWRhOl9N" 
#resource for the Toronto Neighborhoods
html_doc=requests.get(url).text
soup=BeautifulSoup(html_doc)

Toronto neigbhborhood data is in a table. "th" tags are column heads, "tr" is the row and "td" tags are for cells.

In [3]:
#lets find the column names which are inside <th> tags
column_names=[]
column_heads=soup.table.find_all('th')
for ch in column_heads:
    column_names.append(ch.get_text())
column_names


['Postcode', 'Borough', 'Neighbourhood\n']

In [4]:
#select all the rows and initialize np array with column headers
rows=soup.table.find_all("tr")
a=np.array([column_names])
i=0
for r in rows:
    j=0
    cells=r.find_all('td') #select all cells in the row i.e. postcode, borough and neighbourhood
    row=[]
    for c in cells:
        row.append(c.get_text().replace("\n","")) #get the row and remove "\n" at the ende
        j=j+1
    if(i>0): #first row doesn't contain data 
        if(row[1]!='Not assigned'): #if borough is not assigned ignore
            a=np.append(a,[row],axis=0)
    i=i+1

#create data frame and assing first row as headers
df=pd.DataFrame(a)
df.columns = df.iloc[0]
df.drop(0,inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park
5,M6A,North York,Lawrence Heights


In [5]:
df.columns

Index(['Postcode', 'Borough', 'Neighbourhood\n'], dtype='object', name=0)

In [6]:
df.rename(columns={'Neighbourhood\n':'Neighborhood'},inplace=True) #change the column name
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park
5,M6A,North York,Lawrence Heights


In [7]:
df.shape

(211, 3)

The postcode column should be unique, if there are more than one neighborhood under the same postcode, the neighborhoods should be combined using "," as seperator.

In [8]:
test_postcode=df["Postcode"].is_unique
print(test_postcode)

False


In [9]:
#since the postcode column is not unique the dataframe should be sorted and neighborhoods will be combined
df.sort_values(by=['Postcode','Borough','Neighborhood'],inplace=True)
for i in range(df.shape[0]-1,0,-1):
    if df.iloc[i,0]==df.iloc[i-1,0]:
        if df.iloc[i,1]==df.iloc[i-1,1]:
            df.iloc[i-1,2]=df.iloc[i-1,2]+", "+df.iloc[i,2]
            df.drop(df.index[i],inplace=True)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
10,M1B,Scarborough,"Malvern, Rouge"
22,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill"
34,M1E,Scarborough,"Guildwood, Morningside, West Hill"
40,M1G,Scarborough,Woburn
44,M1H,Scarborough,Cedarbrae
55,M1J,Scarborough,Scarborough Village
67,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
80,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
94,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
109,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [10]:
test_postcode=df["Postcode"].is_unique
print(test_postcode)

True


The neighborhoods are combined and we have unique postcode list. <br>
Finally we can check the shape of the dataframe.

In [11]:
df.shape

(103, 3)

## 2. Adding Coordinates to the Table

We will get coordinates of the neighborhoods in Toronto by postal code and add to our neighborhood table.

In [12]:
df_coor=pd.read_csv("https://cocl.us/Geospatial_data")
df_coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
df_coor.shape

(103, 3)

In [14]:
df_toronto=pd.merge(df,df_coor,left_on="Postcode",right_on="Postal Code")
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [15]:
df_toronto.drop("Postal Code", axis=1, inplace=True)
df_toronto.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Port Union, Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## 3. Clustering Neighborhoods

In [16]:
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


First of all let's find the coordinates of Toronto

In [17]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


Then we will look to the neighborhoods of Toronto on a map.

In [18]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, postcode, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Postcode'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, postcode)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto



In [19]:
CLIENT_ID = "xxx" # your Foursquare ID
CLIENT_SECRET = "xxx" # your Foursquare Secret
VERSION = '20180605' # Foursquare API version




The function below finds the venues around the coordinate of a neighborhood(s) having the same postal code. 

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=600):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
LIMIT=50
toronto_venues=getNearbyVenues(names=df_toronto["Neighborhood"], latitudes=df_toronto["Latitude"],longitudes=df_toronto["Longitude"])

Malvern, Rouge
Highland Creek, Port Union, Rouge Hill
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Don Mills South, Flemingdon Park
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
Riverdale, The Danf

All the places in the neighborhoods including category data will be written to toronto_venues dataframe.

In [22]:
print(toronto_venues.shape)


(2140, 7)


Lets count the venues to see the popular neighborhoods.

In [23]:
toronto_venues[["Neighborhood","Venue"]].groupby("Neighborhood").count().sort_values(by="Venue", ascending=False).head()

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
"Adelaide, King, Richmond",50
"Harbourfront, Regent Park",50
"Commerce Court, Victoria Hotel",50
"Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West",50
"Design Exchange, Toronto Dominion Centre",50


Lets find popular places by top 10 category in Toronto

In [24]:
toronto_venues[["Venue Category", "Venue"]].groupby("Venue Category").count().sort_values(by="Venue", ascending=False).head(10)

Unnamed: 0_level_0,Venue
Venue Category,Unnamed: 1_level_1
Coffee Shop,147
Café,101
Pizza Place,67
Park,64
Restaurant,55
Bakery,49
Italian Restaurant,48
Sandwich Place,45
Fast Food Restaurant,37
Bar,36


Wow, Toronto eats Italian (pizza is italian too) and drinks lots of coffee.

In [25]:
toronto_onehot=pd.get_dummies(toronto_venues[["Venue Category"]], prefix="", prefix_sep="")
toronto_onehot["Neighborhood"]=toronto_venues["Neighborhood"]
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]


In [26]:
toronto_onehot.shape

(2140, 282)

In [27]:
toronto_grouped=toronto_onehot.groupby("Neighborhood").mean().reset_index()


In [28]:
toronto_grouped.shape

(101, 282)

Top 10 Venues for each neighborhood.

In [29]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",American Restaurant,Coffee Shop,Café,Steakhouse,Gastropub
1,Agincourt,Lounge,Sporting Goods Shop,Sandwich Place,Breakfast Spot,Eastern European Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Fast Food Restaurant,BBQ Joint,Park,Chinese Restaurant,Women's Store
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Hardware Store,Coffee Shop,Beer Store,Fast Food Restaurant
4,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Pool,Skating Rink,Dance Studio
5,"Bathurst Manor, Downsview North, Wilson Heights",Coffee Shop,Frozen Yogurt Shop,Supermarket,Middle Eastern Restaurant,Shopping Mall
6,"Bathurst Quay, CN Tower, Harbourfront West, Is...",Airport Service,Coffee Shop,Boat or Ferry,Airport Terminal,Airport Lounge
7,Bayview Village,Chinese Restaurant,Bank,Café,Japanese Restaurant,Dessert Shop
8,"Bedford Park, Lawrence Manor East",Coffee Shop,Italian Restaurant,Fast Food Restaurant,Grocery Store,Bakery
9,Berczy Park,Coffee Shop,Café,Hotel,Steakhouse,Cocktail Bar


### Clustering Neighborhoods

Lets use K means for clustering the neighborhoods into 4 clusters.

In [40]:
# set number of clusters
kclusters =4

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=4).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1,
       0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0,
       2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 3,
       0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2], dtype=int32)

In [44]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


(101, 7)

In [45]:
toronto_merged=pd.merge(neighborhoods_venues_sorted, df_toronto, on="Neighborhood",how="left")
print(toronto_merged.shape)
toronto_merged.head()

(101, 11)


Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Postcode,Borough,Latitude,Longitude
0,0,"Adelaide, King, Richmond",American Restaurant,Coffee Shop,Café,Steakhouse,Gastropub,M5H,Downtown Toronto,43.650571,-79.384568
1,0,Agincourt,Lounge,Sporting Goods Shop,Sandwich Place,Breakfast Spot,Eastern European Restaurant,M1S,Scarborough,43.7942,-79.262029
2,2,"Agincourt North, L'Amoreaux East, Milliken, St...",Fast Food Restaurant,BBQ Joint,Park,Chinese Restaurant,Women's Store,M1V,Scarborough,43.815252,-79.284577
3,0,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Hardware Store,Coffee Shop,Beer Store,Fast Food Restaurant,M9V,Etobicoke,43.739416,-79.588437
4,0,"Alderwood, Long Branch",Pizza Place,Coffee Shop,Pool,Skating Rink,Dance Studio,M8W,Etobicoke,43.602414,-79.543484


Lets label the points on the map

In [46]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters