# Segmenting and Clustering Neighborhoods in Toronto 

## Part 1: Collecting and cleaning neighborhoods data
First, we have to download the Wikipedia page and select the table that lists the neighborhoods.

In [70]:
import pandas as pd
# Lift pandas' display limits so dataframes are shown with every column and row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# `match` restricts the parsed tables to those containing the text
# "Neighbourhood"; the first such table is the one used from here on.
postal_codes_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
matching_tables = pd.read_html(postal_codes_url, match="Neighbourhood")
toronto = matching_tables[0]

print(toronto.shape)
toronto.head()

(180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Then, let's clean up the dataframe by applying the following steps:
- drop all rows that are not assigned to a `Borough`;
- give every `Not assigned` neighbourhood the same value as its `Borough`;
- and make sure all postal codes are unique.

In [2]:
def fix_nb(entry):
    """Replace an unassigned neighbourhood with the row's borough.

    Parameters
    ----------
    entry : mapping-like row (e.g. a ``pandas.Series``)
        Must provide the ``"Neighbourhood"`` and ``"Borough"`` keys.

    Returns
    -------
    The same ``entry`` object, updated in place when its neighbourhood held
    the "not assigned" placeholder.
    """
    neighbourhood = entry["Neighbourhood"]
    # The table spells the placeholder "Not assigned" (lowercase "a"), so the
    # previous exact comparison with "Not Assigned" never matched. Compare
    # case-insensitively so that either spelling is recognised.
    if isinstance(neighbourhood, str) and neighbourhood.strip().casefold() == "not assigned":
        entry["Neighbourhood"] = entry["Borough"]
    return entry
        
# Keep only the rows that have a borough, then run fix_nb on each remaining row.
has_borough = toronto['Borough'] != "Not assigned"
toronto = toronto[has_borough].apply(fix_nb, axis=1)
toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [3]:
# As many distinct postal codes as rows means no postal code is repeated.
num_postal_codes = toronto['Postal Code'].nunique(dropna=False)
total_rows = len(toronto)
message = (
    'All postal codes in the dataframe contains a single neighbourhood'
    if num_postal_codes == total_rows
    else 'There are more than one neighbourhood for some postal codes'
)
print(message)

All postal codes in the dataframe contains a single neighbourhood


In [4]:
# Report the (rows, columns) shape of the dataframe after the clean-up.
print(toronto.shape)

(103, 3)


## Part 2: Adding up Geolocation data

Let's download the geospatial data and merge it into our dataframe

In [5]:
# Read the CSV served at this URL into a dataframe; it is joined to `toronto`
# on the 'Postal Code' column in the next cell.
geospatial_data = pd.read_csv("https://cocl.us/Geospatial_data")

In [6]:
# Join the coordinates onto the neighbourhood rows via the shared
# 'Postal Code' column (merge's default inner join).
toronto = toronto.merge(geospatial_data, on='Postal Code')
toronto.head(15)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Part 3: Getting venues and clustering the Neighbourhoods

First, let's plot Toronto's neighbourhoods on a map

In [54]:
import folium
# Centre the map on the mean latitude/longitude of all rows.
lat_avg = toronto['Latitude'].mean()
lng_avg = toronto['Longitude'].mean()
map_toronto = folium.Map(
    location=[lat_avg, lng_avg],
    zoom_start=10,
)

def add_markers_to_map(entry, city_map):
    """Add one circle marker for a neighbourhood row to a folium map.

    Parameters
    ----------
    entry : mapping-like row (e.g. a ``pandas.Series``)
        Must provide the ``"Neighbourhood"``, ``"Borough"``, ``"Latitude"``
        and ``"Longitude"`` keys.
    city_map : folium map object
        The object the marker is added to.

    Returns
    -------
    None. The marker is attached to ``city_map`` as a side effect.
    """
    # `parse_html` is a Popup option, so it is set here only; the stray
    # `parse_html=False` previously passed to CircleMarker is not a marker
    # style option and has been dropped.
    label = folium.Popup(
        f"{entry['Neighbourhood']}, {entry['Borough']}",
        parse_html=True,
    )
    folium.CircleMarker(
        location=[entry["Latitude"], entry["Longitude"]],
        radius=5,
        popup=label,
        color='#7a1c1c',
        fill=True,
        fill_color='yellow',
        fill_opacity=0.6,
    ).add_to(city_map)

    
# Called only for its side effect of adding one marker per row to the map;
# the result of apply is discarded.
toronto.apply(lambda row: add_markers_to_map(row, map_toronto), axis=1)
map_toronto

Now, we're going to retrieve venues data for each neighbourhood

In [48]:
import os
import requests

# Foursquare credentials are read from the environment rather than hard-coded;
# each is None when its variable is not set.
CLIENT_ID = os.getenv("FOURSQUARE_CLIENT_ID")
CLIENT_SECRET = os.getenv("FOURSQUARE_CLIENT_SECRET")
# Sent as the `v` query parameter of every request URL.
VERSION = "20180605"

def make_search_url(lat, lng, radius=500, limit=100):
    """Build the Foursquare ``venues/explore`` request URL for one location.

    Parameters
    ----------
    lat, lng : latitude and longitude placed in the ``ll`` query parameter.
    radius : value of the ``radius`` query parameter (default 500).
    limit : value of the ``limit`` query parameter (default 100).

    Returns
    -------
    str
        The URL, including the module-level CLIENT_ID, CLIENT_SECRET and
        VERSION values.
    """
    endpoint = 'https://api.foursquare.com/v2/venues/explore?'
    credentials = f'client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&v={VERSION}'
    search_area = f'&ll={lat},{lng}&radius={radius}&limit={limit}'
    return endpoint + credentials + search_area

def get_venues_from_neighbourhood(entry, timeout=10):
    """Fetch the venues returned by the explore endpoint for one neighbourhood.

    Parameters
    ----------
    entry : mapping-like row (e.g. a ``pandas.Series``)
        Must provide the ``"Neighbourhood"``, ``"Latitude"`` and
        ``"Longitude"`` keys.
    timeout : int or float, optional
        Seconds to wait for the HTTP response (default 10), so one stalled
        request cannot hang the whole run.

    Returns
    -------
    list of tuple
        One ``(neighbourhood, neighbourhood latitude, neighbourhood longitude,
        venue name, venue latitude, venue longitude, venue category)`` tuple
        per returned venue. The category is ``None`` when the venue carries no
        categories. The list is empty when the response has no groups.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. bad credentials or an
        exhausted quota), instead of failing later with an opaque KeyError.
    """
    url = make_search_url(entry["Latitude"], entry["Longitude"])
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    groups = response.json()["response"].get("groups") or []
    items = groups[0]["items"] if groups else []

    venues = []
    for item in items:
        venue = item['venue']
        # A venue may come back with an empty category list; indexing [0]
        # unconditionally would raise IndexError and abort every neighbourhood.
        categories = venue.get('categories') or []
        category = categories[0]['name'] if categories else None
        venues.append((
            entry["Neighbourhood"],
            entry["Latitude"],
            entry["Longitude"],
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            category,
        ))
    return venues

# Calls get_venues_from_neighbourhood once per row (one HTTP request each);
# every element of `venues_lists` is that row's list of venue tuples.
venues_lists = toronto.apply(get_venues_from_neighbourhood, axis=1)

In [72]:
# Flatten the per-neighbourhood venue lists into one row per venue.
venue_rows = [venue for sublist in venues_lists for venue in sublist]

# The column names are passed at construction time: assigning `venues.columns`
# afterwards raises a length-mismatch error when no venues were returned,
# whereas this yields a correctly labelled, empty dataframe.
venues = pd.DataFrame(
    venue_rows,
    columns=[
        'Neighbourhood',
        'Neighbourhood Latitude',
        'Neighbourhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ],
)
print(venues.shape)
venues.head()

(2141, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
3,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant


Let's check how many venues were returned for each neighbourhood

In [78]:
# Count the venue rows of each neighbourhood and show them as a one-column dataframe.
venues.groupby("Neighbourhood")["Neighbourhood"].count().to_frame()

Unnamed: 0_level_0,Neighbourhood
Neighbourhood,Unnamed: 1_level_1
Agincourt,5
"Alderwood, Long Branch",7
"Bathurst Manor, Wilson Heights, Downsview North",21
Bayview Village,4
"Bedford Park, Lawrence Manor East",22
Berczy Park,55
"Birch Cliff, Cliffside West",4
"Brockton, Parkdale Village, Exhibition Place",23
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16


In [80]:
# TODO:
# - one-hot encode (get dummies for) the venue categories
# - cluster the neighbourhoods by their venue categories