# Segmenting and Clustering Neighborhoods in Toronto

## Scraping Wikipedia page

In [1]:
import requests as rq
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

COLUMNS = ['PostalCode', 'Borough', 'Neighborhood']
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [2]:
def bs_row_to_series(bs_row):
    return pd.Series({COLUMNS[0]: bs_row[0].text, COLUMNS[1]: bs_row[1].text, COLUMNS[2]: bs_row[2].text.rstrip()})

In [3]:
res = rq.get(URL)
dom = BeautifulSoup(res.text)
trs = dom.table.find_all('tr')
bs_rows = [tr.find_all('td') for tr in trs]
series = list(map(bs_row_to_series, bs_rows[1:]))
raw_df = pd.DataFrame(columns=COLUMNS, data=series)

In [4]:
raw_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Processing dataframe

Dropping rows not with all data. Aggregating neighborhoods by postal codes.

In [5]:
raw_df.replace('Not assigned', value=np.nan, inplace=True)
raw_df.dropna(inplace=True)
raw_df['Neighborhood'].fillna(raw_df['Borough'], inplace=True)
raw_df = raw_df.groupby(by=['PostalCode'], as_index=False)
df = raw_df.aggregate(lambda nbh: ", ".join(set(nbh)))

In [6]:
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek"
2,M1E,Scarborough,"Morningside, West Hill, Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Oakridge, Golden Mile, Clairlea"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Cliffside West, Birch Cliff"


## Adding coordinates

In [7]:
import types
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_05c97552501045e8a9d929ffe9065e9b = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='',
    ibm_auth_endpoint="https://iam.ng.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_05c97552501045e8a9d929ffe9065e9b.get_object(Bucket='courseradap-donotdelete-pr-hv0z4bomsyjnis',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# If you are reading an Excel file into a pandas DataFrame, replace `read_csv` by `read_excel` in the next statement.
geocoords = pd.read_csv(body)
geocoords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
df = pd.merge(df, geocoords, left_on=['PostalCode'], right_on=['Postal Code'], how='left')
del df['Postal Code']
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, West Hill, Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Getting venue data

In [9]:
# @hidden_cell
CLIENT_ID = '' # your Foursquare ID
CLIENT_SECRET = '' # your Foursquare Secret
VERSION = '20180604'
radius = 500
LIMIT = 100

In [10]:
import requests

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude'])

Rouge, Malvern
Port Union, Rouge Hill, Highland Creek
Morningside, West Hill, Guildwood
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Oakridge, Golden Mile, Clairlea
Cliffcrest, Cliffside, Scarborough Village West
Cliffside West, Birch Cliff
Wexford Heights, Scarborough Town Centre, Dorset Park
Wexford, Maryvale
Agincourt
Sullivan, Tam O'Shanter, Clarks Corners
Steeles East, L'Amoreaux East, Agincourt North, Milliken
L'Amoreaux West
Upper Rouge
Hillcrest Village
Oriole, Henry Farm, Fairview
Bayview Village
York Mills, Silver Hills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Don Mills South, Flemingdon Park
Wilson Heights, Downsview North, Bathurst Manor
York University, Northwood Park
Downsview East, CFB Toronto
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

In [12]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"Morningside, West Hill, Guildwood",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Morningside, West Hill, Guildwood",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


## Data preprocessing

Not all neighborhoods have venues around, so we have to drop neighborhoods without venues

In [13]:
print(len(set(df['Neighborhood'])), len(set(toronto_venues['Neighborhood'])))
neigh_no_venues = set(df['Neighborhood']) - set(toronto_venues['Neighborhood'])
df = df.loc[~df['Neighborhood'].isin(neigh_no_venues)]
df.shape

102 100


(100, 5)

In [14]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [16]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Lounge,Clothing Store,Skating Rink,Breakfast Spot,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant
1,Bayview Village,Café,Bank,Japanese Restaurant,Chinese Restaurant,Women's Store,Donut Shop,Diner,Discount Store,Dog Run,Doner Restaurant
2,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Cheese Shop,Bakery,Steakhouse,Seafood Restaurant,Café,Farmers Market,Irish Pub
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Pizza Place,Recording Studio,Auto Workshop,Burrito Place,Fast Food Restaurant,Farmers Market,Garden Center,Garden,Comic Shop
4,"Cabbagetown, St. James Town",Coffee Shop,Pub,Pizza Place,Italian Restaurant,Restaurant,Café,Market,Bakery,Playground,Caribbean Restaurant


## Clusterisation

We need to find the best number of clusters. The silhouette metric is suitable for choosing the best number.

In [17]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

best_score = {'n_clusters': 0, 'score': 0}
n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10]
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

for n in n_clusters:
    kmeans = KMeans(n_clusters=n, random_state=0)
    labels = kmeans.fit(toronto_grouped_clustering)
    silhouette_avg = silhouette_score(toronto_grouped_clustering, labels.labels_)
    if silhouette_avg > best_score['score']: best_score = {'n_clusters': n, 'score': silhouette_avg}
    print('n: {}, score: {}'.format(n, silhouette_avg))
print('the best clusters number: {}'.format(best_score['n_clusters']))

n: 2, score: 0.2121247246005695
n: 3, score: 0.21862978000389194
n: 4, score: 0.22487295584427355
n: 5, score: 0.23161191050240337
n: 6, score: 0.23837598987964637
n: 7, score: 0.15876172364969626
n: 8, score: 0.2417203947557077
n: 9, score: 0.24702445368352577
n: 10, score: 0.22304970758353618
the best clusters number: 9


In [18]:
kmeans = KMeans(n_clusters=best_score['n_clusters'], random_state=0)
labels = kmeans.fit(toronto_grouped_clustering)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', labels.labels_)
toronto_merged = df
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,7,Fast Food Restaurant,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore,Farmers Market
1,M1C,Scarborough,"Port Union, Rouge Hill, Highland Creek",43.784535,-79.160497,1,History Museum,Bar,Women's Store,Donut Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
2,M1E,Scarborough,"Morningside, West Hill, Guildwood",43.763573,-79.188711,1,Medical Center,Electronics Store,Mexican Restaurant,Pizza Place,Rental Car Location,Breakfast Spot,Intersection,Women's Store,Dim Sum Restaurant,Diner
3,M1G,Scarborough,Woburn,43.770992,-79.216917,8,Coffee Shop,Korean Restaurant,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Dumpling Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1,Hakka Restaurant,Fried Chicken Joint,Athletics & Sports,Bakery,Bank,Thai Restaurant,Caribbean Restaurant,Diner,Discount Store,Dog Run


## Visualisation

In [20]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[43.6529, -79.3849], zoom_start=11)
x = np.arange(best_score['n_clusters'])
ys = [i + x + (i*x)**2 for i in range(best_score['n_clusters'])]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

## Groups

In [21]:
for group_num in set(toronto_merged['Cluster Labels']):
    print("group {}: {}".format(group_num, ', '.join(toronto_merged[toronto_merged['Cluster Labels'] == group_num]['Neighborhood'])))

group 0: Hillcrest Village, Roselawn, Martin Grove, Cloverdale, Islington, West Deane Park, Princess Gardens
group 1: Port Union, Rouge Hill, Highland Creek, Morningside, West Hill, Guildwood, Cedarbrae, Kennedy Park, Ionview, East Birchmount Park, Oakridge, Golden Mile, Clairlea, Cliffcrest, Cliffside, Scarborough Village West, Cliffside West, Birch Cliff, Wexford Heights, Scarborough Town Centre, Dorset Park, Wexford, Maryvale, Agincourt, Sullivan, Tam O'Shanter, Clarks Corners, L'Amoreaux West, Oriole, Henry Farm, Fairview, Bayview Village, Willowdale South, Willowdale West, Don Mills North, Don Mills South, Flemingdon Park, Wilson Heights, Downsview North, Bathurst Manor, York University, Northwood Park, Downsview Central, Downsview Northwest, Victoria Village, Parkview Hill, Woodbine Gardens, Woodbine Heights, The Beaches, Leaside, Thorncliffe Park, The Danforth West, Riverdale, The Beaches West, India Bazaar, Studio District, Davisville North, North Toronto West, Davisville, Summ