# Segmenting and Clustering Neighborhoods in Toronto

## 1. Load the data into a pandas DataFrame using the BeautifulSoup library

In [41]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [42]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M"

response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not hang the notebook indefinitely if the server stalls
)
response.raise_for_status()  # stop on an HTTP error instead of parsing an error page
source = response.text

In [43]:
# Build the parse tree from the downloaded HTML using the lxml parser
soup = BeautifulSoup(markup=source, features='lxml')

In [44]:
# Convert the postal-code table on the page into a pandas DataFrame

table = soup.find('table', class_='wikitable sortable')
if table is None:
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")
# Use the <tbody> when the parser exposes one, otherwise search the table itself
data = table.tbody or table

columns = ['PostCode', 'Borough', 'Neighborhood']

# Collect one record per table row and build the frame once at the end.
# Appending a one-row DataFrame per iteration copies the whole frame each time,
# and DataFrame.append is no longer available in pandas 2.x.
records = []
for row in data.find_all('tr'):
    row_data = [col.text for col in row.find_all('td')]
    # Rows without enough <td> cells (e.g. the <th>-only header row) are skipped
    if len(row_data) >= len(columns):
        records.append(dict(zip(columns, row_data)))

toronto_data = pd.DataFrame(records, columns=columns)
toronto_data.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned\n
1,M2A,Not assigned,Not assigned\n
2,M3A,North York,Parkwoods\n
3,M4A,North York,Victoria Village\n
4,M5A,Downtown Toronto,Harbourfront\n


In [45]:
# Cleaning the data

# Keep only the text before the first newline in each scraped Neighborhood cell
toronto_data['Neighborhood'] = toronto_data['Neighborhood'].str.split('\n').str[0]

# Drop rows where BOTH Borough and Neighborhood are 'Not assigned'.
# .copy() makes the filtered result an independent frame, so the .loc assignment
# below does not raise SettingWithCopyWarning.
has_assignment = (toronto_data['Neighborhood'] != 'Not assigned') | (toronto_data['Borough'] != 'Not assigned')
toronto_data = toronto_data[has_assignment].copy()

# Where only the Neighborhood is 'Not assigned', fall back to the Borough name
missing_neighborhood = toronto_data['Neighborhood'] == 'Not assigned'
toronto_data.loc[missing_neighborhood, 'Neighborhood'] = toronto_data.loc[missing_neighborhood, 'Borough']

In [46]:
# Merge rows that share a PostCode: keep the first Borough seen for the code and
# join the Neighborhood names with commas.
# Aggregating Borough with 'first' avoids joining it with commas and splitting it
# back apart, which would truncate any borough name that itself contains a comma.
toronto_data = toronto_data.groupby('PostCode', as_index=False).agg(
    {'Borough': 'first', 'Neighborhood': ','.join}
)


In [8]:
# Preview the first ten rows after merging rows that share a PostCode
toronto_data.head(10)

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


## 2. Get the latitude and longitude coordinates of each neighborhood from its postal code

In [47]:
# Load the postal-code coordinates and rename the key column to match toronto_data
postal_data = (
    pd.read_csv('Geospatial_Coordinates.csv')
    .rename(columns={'Postal Code': 'PostCode'})
)
postal_data.head()


Unnamed: 0,PostCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [48]:
# Attach Latitude/Longitude to every postal code; the left join keeps all rows of toronto_data
toronto_data = toronto_data.merge(postal_data, how='left', on='PostCode')


In [49]:
# Preview the frame with the Latitude and Longitude columns merged in
toronto_data.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [50]:
# Shape (rows, columns) of the dataframe after applying all preprocessing steps
toronto_data.shape

(103, 5)

## 3. Explore and cluster the neighborhoods in Toronto

In [51]:
from geopy.geocoders import Nominatim
import folium
import numpy as np
from sklearn.cluster import KMeans

import matplotlib.cm as cm
import matplotlib.colors as colors

In [52]:
# Look up the coordinates of Toronto; they are used as the centre of the maps below
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tl-toronto-neigh")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear message
# instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


In [53]:
# Using Folium to create a map of Toronto with one marker per postal-code area

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, long, post, borough, neigh in zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['PostCode'],
    toronto_data['Borough'],
    toronto_data['Neighborhood'],
):
    label = "{} ({}): {}".format(borough, post, neigh)
    popup = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option (already set above); the stray parse_html=False
    # keyword previously passed to CircleMarker has been dropped.
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

**Reduce the number of boroughs to explore**

To reduce the number of calls to the Foursquare API, we will only explore boroughs that have "Toronto" in their names.

In [54]:
# Restrict the analysis to the four boroughs whose names contain "Toronto"
toronto_boroughs = ['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']
in_selected_borough = toronto_data['Borough'].isin(toronto_boroughs)
toronto_central_df = toronto_data.loc[in_selected_borough].reset_index(drop=True)
print(toronto_central_df.shape)
toronto_central_df.head()

(38, 5)


Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [55]:
# Redraw the map, slightly more zoomed in, with markers for the selected boroughs only
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

for lat, long, post, borough, neigh in zip(
    toronto_central_df['Latitude'],
    toronto_central_df['Longitude'],
    toronto_central_df['PostCode'],
    toronto_central_df['Borough'],
    toronto_central_df['Neighborhood'],
):
    label = "{} ({}): {}".format(borough, post, neigh)
    popup = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option (already set above); the stray parse_html=False
    # keyword previously passed to CircleMarker has been dropped.
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

**Using FourSquare API to explore the Boroughs**

In [57]:
# Get recommended places inside or near each postal-code area of the selected boroughs

radius = 500  # search radius passed to the API's `radius` parameter
LIMIT = 100   # maximum number of venues requested per area

# Read the API credentials from the environment so real keys never have to be
# saved in the notebook; the placeholders are used when the variables are unset.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'client-id')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'client-secret')
VERSION = '20180605'

url = "https://api.foursquare.com/v2/venues/explore"

venues = []

for lat, long, post, borough, neighborhood in zip(
    toronto_central_df['Latitude'],
    toronto_central_df['Longitude'],
    toronto_central_df['PostCode'],
    toronto_central_df['Borough'],
    toronto_central_df['Neighborhood'],
):
    # Let requests build and URL-encode the query string
    response = requests.get(
        url,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, long),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    response.raise_for_status()  # surface HTTP errors (bad credentials, quota) directly

    # An area with no recommendations may come back without any group
    groups = response.json()["response"].get('groups') or []
    results = groups[0]['items'] if groups else []

    for venue in results:
        categories = venue['venue']['categories']
        if not categories:
            # The analysis below is driven by category, so skip uncategorised venues
            continue
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            long,
            venue['venue']['name'],
            venue['venue']['location']['lat'],
            venue['venue']['location']['lng'],
            categories[0]['name']))

In [58]:
# Turn the collected venue tuples into a DataFrame.
# Passing the names to the constructor (rather than assigning .columns afterwards)
# also works when `venues` is empty, producing an empty frame with the right columns.
venue_columns = ['PostCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']
venues_df = pd.DataFrame(venues, columns=venue_columns)
print(venues_df.shape)
venues_df.head()

(1724, 9)


Unnamed: 0,PostCode,Borough,Neighborhood,BoroughLatitude,BoroughLongitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood


In [59]:
# Count the non-null VenueName entries for each (PostCode, Borough, Neighborhood) group
venues_df.groupby(['PostCode', 'Borough', 'Neighborhood'])['VenueName'].count()

PostCode  Borough           Neighborhood                                                                                        
M4E       East Toronto      The Beaches                                                                                               5
M4K       East Toronto      The Danforth West,Riverdale                                                                              44
M4L       East Toronto      The Beaches West,India Bazaar                                                                            25
M4M       East Toronto      Studio District                                                                                          38
M4N       Central Toronto   Lawrence Park                                                                                             3
M4P       Central Toronto   Davisville North                                                                                          8
M4R       Central Toronto   North Toronto West         

In [60]:
# Number of distinct venue categories (dropna=False counts a missing value as one, like unique())
venues_df['VenueCategory'].nunique(dropna=False)

240

**Analyze venues in each area**

In [63]:
# One indicator column per venue category, then the three area key columns appended.
# The area name is stored as 'Neighborhoods' rather than 'Neighborhood'.
toronto_central_onehot = pd.get_dummies(
    venues_df[['VenueCategory']], prefix="", prefix_sep=""
).assign(
    PostCode=venues_df['PostCode'],
    Borough=venues_df['Borough'],
    Neighborhoods=venues_df['Neighborhood'],
)

# Rotate the last three columns (the area keys just appended) to the front
all_columns = list(toronto_central_onehot.columns)
fixed_columns = all_columns[-3:] + all_columns[:-3]
toronto_central_onehot = toronto_central_onehot[fixed_columns]

print(toronto_central_onehot.shape)
toronto_central_onehot.head()

(1724, 243)


Unnamed: 0,PostCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M4E,East Toronto,The Beaches,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**Get the frequency of occurrence of each category in an area**

In [64]:
# Mean of each indicator column per area = share of the area's venues in that category
area_keys = ['PostCode', 'Borough', 'Neighborhoods']
toronto_central_venues_freq = toronto_central_onehot.groupby(area_keys, as_index=False).mean()
print(toronto_central_venues_freq.shape)
toronto_central_venues_freq.head()

(38, 243)


Unnamed: 0,PostCode,Borough,Neighborhoods,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Theme Restaurant,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,M4E,East Toronto,The Beaches,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M4K,East Toronto,"The Danforth West,Riverdale",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.022727
2,M4L,East Toronto,"The Beaches West,India Bazaar",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M4M,East Toronto,Studio District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026316
4,M4N,Central Toronto,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Get the 10 most common venue types in each area**

In [66]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
areaColumns = ['PostCode', 'Borough', 'Neighborhoods']
freqColumns = []
for ind in range(num_top_venues):
    # Ranks 1-3 take 'st'/'nd'/'rd'; every later rank falls back to 'th'.
    # An explicit bounds check replaces the bare `except:` that used to hide any error.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    freqColumns.append('{}{} Most Common Venue'.format(ind + 1, suffix))
columns = areaColumns + freqColumns

# For each area, order the category columns by descending frequency and keep the
# names of the first `num_top_venues` categories.
top_venue_rows = []
for ind in range(toronto_central_venues_freq.shape[0]):
    row_categories = toronto_central_venues_freq.iloc[ind, :].iloc[3:]  # skip the 3 area key columns
    row_categories_sorted = row_categories.sort_values(ascending=False)
    top_venue_rows.append(row_categories_sorted.index.values[0:num_top_venues])

# Build the result once: area keys on the left, ranked category names on the right
top_venues = pd.DataFrame(
    top_venue_rows,
    columns=freqColumns,
    index=toronto_central_venues_freq.index,
)
neighborhoods_venues_sorted = pd.concat(
    [toronto_central_venues_freq[areaColumns], top_venues], axis=1
)

# Reassign instead of sorting in place so re-running the cell gives the same result
neighborhoods_venues_sorted = neighborhoods_venues_sorted.sort_values(freqColumns)
neighborhoods_venues_sorted.head()

Unnamed: 0,PostCode,Borough,Neighborhoods,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
27,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",Airport Lounge,Airport Service,Airport Terminal,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina
31,M6H,West Toronto,"Dovercourt Village,Dufferin",Bakery,Supermarket,Pharmacy,Park,Bank,Bar,Furniture / Home Store,Music Venue,Middle Eastern Restaurant,Café
32,M6J,West Toronto,"Little Portugal,Trinity",Bar,Asian Restaurant,Coffee Shop,Café,Men's Store,Cocktail Bar,Pizza Place,Boutique,Restaurant,French Restaurant
35,M6R,West Toronto,"Parkdale,Roncesvalles",Breakfast Spot,Gift Shop,Restaurant,Dog Run,Movie Theater,Italian Restaurant,Bookstore,Bar,Dessert Shop,Eastern European Restaurant
25,M5S,Downtown Toronto,"Harbord,University of Toronto",Café,Bar,Japanese Restaurant,Bookstore,Sandwich Place,Restaurant,Bakery,Chinese Restaurant,Beer Bar,Beer Store


**Clustering areas**  
Use the KMeans algorithm to group the selected Toronto areas into 3 clusters.

In [68]:
kclusters = 3

# Feature matrix: category frequencies only, without the area key columns.
# `columns=` is used because the positional axis argument of drop() is rejected by pandas 2.x.
toronto_central_venues_freq_clustering = toronto_central_venues_freq.drop(
    columns=['PostCode', 'Borough', 'Neighborhoods']
)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_central_venues_freq_clustering)

# kmeans.labels_ follows the row order of toronto_central_venues_freq, so pair each
# label with that frame's PostCode and attach it by key rather than by position.
cluster_labels = pd.DataFrame({
    'PostCode': toronto_central_venues_freq['PostCode'].values,
    'Cluster': kmeans.labels_,
})

top_venues_by_postcode = (
    neighborhoods_venues_sorted
    .drop(columns=['Borough', 'Neighborhoods'])
    .set_index('PostCode')
)

# Build a new frame instead of aliasing toronto_central_df, so the input frame is not
# mutated and the cell can be re-run. The inner merge keeps only postal codes that
# received a cluster label (i.e. that have at least one venue).
toronto_central_clustered_df = (
    toronto_central_df
    .merge(cluster_labels, on='PostCode', how='inner')
    .join(top_venues_by_postcode, on='PostCode')
    .sort_values(['Cluster'] + freqColumns)
)
toronto_central_clustered_df.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude,Cluster,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
27,M5V,Downtown Toronto,"CN Tower,Bathurst Quay,Island airport,Harbourf...",43.628947,-79.39442,0,Airport Lounge,Airport Service,Airport Terminal,Plane,Bar,Coffee Shop,Sculpture Garden,Boutique,Boat or Ferry,Harbor / Marina
31,M6H,West Toronto,"Dovercourt Village,Dufferin",43.669005,-79.442259,0,Bakery,Supermarket,Pharmacy,Park,Bank,Bar,Furniture / Home Store,Music Venue,Middle Eastern Restaurant,Café
32,M6J,West Toronto,"Little Portugal,Trinity",43.647927,-79.41975,0,Bar,Asian Restaurant,Coffee Shop,Café,Men's Store,Cocktail Bar,Pizza Place,Boutique,Restaurant,French Restaurant
35,M6R,West Toronto,"Parkdale,Roncesvalles",43.64896,-79.456325,0,Breakfast Spot,Gift Shop,Restaurant,Dog Run,Movie Theater,Italian Restaurant,Bookstore,Bar,Dessert Shop,Eastern European Restaurant
25,M5S,Downtown Toronto,"Harbord,University of Toronto",43.662696,-79.400049,0,Café,Bar,Japanese Restaurant,Bookstore,Sandwich Place,Restaurant,Bakery,Chinese Restaurant,Beer Bar,Beer Store


**Show those clusters onto a map**

In [69]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, post, bor, poi, cluster in zip(
    toronto_central_clustered_df['Latitude'],
    toronto_central_clustered_df['Longitude'],
    toronto_central_clustered_df['PostCode'],
    toronto_central_clustered_df['Borough'],
    toronto_central_clustered_df['Neighborhood'],
    toronto_central_clustered_df['Cluster'],
):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    # cluster - 1 is kept so existing colours do not change; for cluster 0 the index
    # is -1, which selects the last colour in the list.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Upon observing the result, we can name the clusters as follows:

Cluster 0: Living area (with mostly park, trail, school; and some small businesses)

Cluster 1: Roselawn - Central Toronto (nothing here except a garden)

Cluster 2: Business area (with lots of business venues)