## Toronto neighborhoods data science project

### - Segmenting and Clustering Neighborhoods in Toronto

### We are planning a vacation in Toronto and we love sampling a diverse range of food, so we want to answer the question: "Which neighborhoods have the most diverse set of restaurants?"


### Fetching and cleaning data

In [3]:
import pandas as pd
import numpy as np

# Load the Toronto postal codes.
# The table originates from Wikipedia; to keep development stable the scraped
# result was saved to a CSV file, and that file is what gets read here.
# To scrape the page again instead, use:
#
#   page_content_list = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
#   # The page contains several tables; the first one holds the postal codes
#   df = page_content_list[0]

df = pd.read_csv("postal_codes.csv")

# Quick look at what was loaded
print("df shape: {}".format(df.shape))
df.head()


df shape: (180, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [4]:
# Drop the rows whose Borough is "Not assigned".
# .copy() makes the result an independent dataframe, so the assignments that
# follow (here and in later cells) do not write into a slice of the raw table
# and do not trigger pandas' SettingWithCopyWarning.
df = df[df['Borough'] != 'Not assigned'].copy()

# Any remaining "Not assigned" neighbourhood takes the name of its borough.
# The mask is computed once and reused for both sides of the assignment.
neighbourhood_missing = df['Neighbourhood'] == 'Not assigned'
df.loc[neighbourhood_missing, 'Neighbourhood'] = df.loc[neighbourhood_missing, 'Borough'].values

print("df shape: " + str(df.shape))
df.head()

df shape: (103, 3)


Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Adding location data


### define a function to get coordinates for a postal code

In [5]:

# Geospatial tables that have already been read, keyed by file path, so the
# CSV is parsed once instead of once per postal code.
_COORDS_TABLE_CACHE = {}


def _loadCoordsTable(path="Geospatial_Coordinates.csv"):
    """Return the geospatial lookup table, reading the CSV only on first use."""
    if path not in _COORDS_TABLE_CACHE:
        _COORDS_TABLE_CACHE[path] = pd.read_csv(path)
    return _COORDS_TABLE_CACHE[path]


def getCoords(postal_code: str):
    """Look up the coordinates of a postal code.

    The preloaded ``Geospatial_Coordinates.csv`` file is used because the
    geocoder was so unreliable. The file is read on the first call only;
    re-run this cell to force a fresh read after changing the file.

    Parameters
    ----------
    postal_code : str
        Value to match against the ``Postal Code`` column.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the first matching row.

    Raises
    ------
    IndexError
        If no row matches ``postal_code`` (same exception type as before,
        now with a message naming the missing code).
    """
    coords = _loadCoordsTable()

    # Filter once and reuse the match for both columns
    matches = coords.loc[coords['Postal Code'] == postal_code, ['Latitude', 'Longitude']]
    if matches.empty:
        raise IndexError("No coordinates found for postal code {!r}".format(postal_code))

    return (matches['Latitude'].values[0], matches['Longitude'].values[0])

### Append coordinates to our dataframe

In [6]:
# Resolve every postal code to its (latitude, longitude) pair, in row order
postal_code_list = df['Postal Code'].to_list()
coord_pairs = [getCoords(code) for code in postal_code_list]

# Split the pairs into the two parallel coordinate lists
lat_coords = [pair[0] for pair in coord_pairs]
long_coords = [pair[1] for pair in coord_pairs]

# Attach the coordinates and renumber the rows from 0
df['Latitude'] = lat_coords
df['Longitude'] = long_coords
df.reset_index(drop=True, inplace=True)

# Show the first rows of the enriched dataframe
df.head(12)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


## Including FourSquare data about food places



In [7]:

import config as cfg

# Foursquare API settings are read from the local `config` module
_foursquare_settings = cfg.foursquare
CLIENT_ID = _foursquare_settings['CLIENT_ID']
CLIENT_SECRET = _foursquare_settings['CLIENT_SECRET']
VERSION = _foursquare_settings['VERSION']

# Maximum number of venues requested per neighbourhood
LIMIT = 10

# Foursquare category ids
FOOD_CATEGORY = "4d4b7105d754a06374d81259"
CHINESE_RESTAURANT = "4bf58dd8d48988d145941735"

# Category filter used when building the API request
CATEGORIES = FOOD_CATEGORY


In [8]:

import requests
import json

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues recorded for each neighbourhood into one dataframe.

    For every (name, latitude, longitude) triple the saved Foursquare
    response is read from ``venues_<name>.txt`` (commas in the name replaced
    by underscores) and one row is produced per venue in the first result
    group.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel; one entry per neighbourhood.
    radius : int, default 500
        Value for the ``radius`` query parameter of the live API request.
        It is not used while the saved files are read.

    Returns
    -------
    pandas.DataFrame
        Columns: Neighborhood, Neighborhood Latitude, Neighborhood Longitude,
        Venue, Venue Latitude, Venue Longitude, Venue Category. When no venue
        is found the frame has zero rows but still all seven columns.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # During development the resulting JSON was saved into one file per
        # neighbourhood to make things more reliable. To fetch the data from
        # Foursquare instead, replace the `with` block below by:
        #
        #   url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&limit={}'.format(
        #       CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, CATEGORIES, LIMIT)
        #   json_res = requests.get(url).json()
        #
        # `with` guarantees the file is closed even if the JSON is malformed.
        with open("venues_" + name.replace(",", "_") + ".txt", "r") as f:
            json_res = json.load(f)

        results = json_res["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        venue_rows.extend(
            (name,
             lat,
             lng,
             v['venue']['name'],
             v['venue']['location']['lat'],
             v['venue']['location']['lng'],
             v['venue']['categories'][0]['name'])
            for v in results)

    # Passing `columns` to the constructor keeps the schema intact even when
    # there are no rows (assigning `.columns` afterwards fails on an empty frame).
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return nearby_venues



In [9]:
# Pull the three parallel input lists (name, latitude, longitude) out of df
neighborhood_list, latitude_list, longitude_list = (
    df[column].to_list() for column in ('Neighbourhood', 'Latitude', 'Longitude')
)

# One row per venue found around each neighbourhood
nearby_venues_df = getNearbyVenues(neighborhood_list, latitude_list, longitude_list)


In [10]:

# How many venue rows were collected in total
print("nearby_venues_df.shape: {}".format(nearby_venues_df.shape))

# Spot-check the venues of a single neighbourhood
in_agincourt = nearby_venues_df['Neighborhood'] == 'Agincourt'
nearby_venues_df[in_agincourt]



nearby_venues_df.shape: (554, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
408,Agincourt,43.7942,-79.262029,Panagio's Breakfast & Lunch,43.79237,-79.260203,Breakfast Spot
409,Agincourt,43.7942,-79.262029,El Pulgarcito,43.792648,-79.259208,Latin American Restaurant
410,Agincourt,43.7942,-79.262029,Royal Chinese Seafood Restaurant,43.798496,-79.262196,Chinese Restaurant


### Let's see how many unique food place categories there are

In [11]:
# Number of distinct venue categories across all neighbourhoods
category_count = len(nearby_venues_df['Venue Category'].unique())
print('There are {} uniques food place categories.'.format(category_count))

There are 67 uniques food place categories.


### Let's show first the neighborhoods on a map

In [12]:
import folium

# Map centre (presumably central Toronto); also reused by the cluster map
latitude = 43.70891788636827
longitude = -79.41435612720262

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one marker per neighbourhood
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    # parse_html belongs to Popup (it escapes the label text); it is not a
    # CircleMarker option, so it is passed here only.
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

### Here is the same map as an image, because GitHub does not seem to render Folium maps correctly
![Map of neighborhoods](map1.PNG)

### Prepare data for clustering and grouping

### At first let's do one hot encoding of the venue categories

In [13]:

# One indicator column per venue category (column name = category name)
category_dummies = pd.get_dummies(nearby_venues_df[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighbourhood of each venue as an extra column
toronto_onehot = category_dummies.assign(Neighborhood=nearby_venues_df['Neighborhood'])

# Reorder so that the last column comes first, followed by all the others
last_column = toronto_onehot.columns[-1:].tolist()
other_columns = toronto_onehot.columns[:-1].tolist()
fixed_columns = last_column + other_columns
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()


Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Belgian Restaurant,Bistro,Brazilian Restaurant,Breakfast Spot,...,Seafood Restaurant,Snack Place,Steakhouse,Sushi Restaurant,Tapas Restaurant,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wings Joint
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### And let's group by neighborhood and calculate the mean occurrence of each restaurant category

In [14]:

# Share of each category among a neighbourhood's venues (mean of the 0/1 flags)
category_share = toronto_onehot.groupby('Neighborhood').mean()

# Turn the Neighborhood index back into a regular column
toronto_onehot_grouped = category_share.reset_index()
toronto_onehot_grouped.head()

Unnamed: 0,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Belgian Restaurant,Bistro,Brazilian Restaurant,Breakfast Spot,...,Seafood Restaurant,Snack Place,Steakhouse,Sushi Restaurant,Tapas Restaurant,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wings Joint
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.0,0.0


### Now we are almost ready to feed this data to a clustering algorithm

In [15]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Keep only the numeric category columns as features for KMeans.
# `columns=` replaces the positional axis argument (`drop('Neighborhood', 1)`),
# which pandas 2.x no longer accepts.
# A 'Cluster Labels' column is also removed if one is present (for example
# left over from an earlier run), so labels never leak into the features.
toronto_grouped_clustering = (
    toronto_onehot_grouped
    .drop(columns='Neighborhood')
    .drop(columns='Cluster Labels', errors='ignore')
)

# Train k-means clustering model.
# n_init is set explicitly because its default differs between scikit-learn
# versions; 10 is the historical default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# Check cluster labels generated for the first rows in the dataframe
kmeans.labels_[0:20]

array([1, 2, 1, 1, 1, 1, 1, 1, 4, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [16]:

# Let's create a new merged dataframe which will include cluster labels and lat&long information as well.
# .copy() is essential: without it `toronto_merged` is just another name for
# `toronto_onehot_grouped`, so the insert below would add 'Cluster Labels' to
# the grouped frame too and fail with "already exists" when the cell is re-run.
toronto_merged = toronto_onehot_grouped.copy()

# Let's add cluster labels as the first column
toronto_merged.insert(0, 'Cluster Labels', kmeans.labels_)

# Let's merge dataframes to add the remaining df columns (postal code, borough,
# latitude/longitude) for each neighborhood
toronto_merged = toronto_merged.join(df.set_index('Neighbourhood'), on='Neighborhood')
toronto_merged.head(10)


Unnamed: 0,Cluster Labels,Neighborhood,American Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,Belgian Restaurant,Bistro,Brazilian Restaurant,...,Tapas Restaurant,Thai Restaurant,Theme Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wings Joint,Postal Code,Borough,Latitude,Longitude
0,1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M1S,Scarborough,43.7942,-79.262029
1,2,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.0,0.0,M8W,Etobicoke,43.602414,-79.543484
2,1,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M3H,North York,43.754328,-79.442259
3,1,Bayview Village,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M2K,North York,43.786947,-79.385975
4,1,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.1,0.0,0.0,0.0,0.0,M5M,North York,43.733283,-79.41975
5,1,Berczy Park,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.0,...,0.0,0.1,0.0,0.1,0.0,0.0,M5E,Downtown Toronto,43.644771,-79.373306
6,1,"Birch Cliff, Cliffside West",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M1N,Scarborough,43.692657,-79.264848
7,1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M6K,West Toronto,43.636847,-79.428191
8,4,"Business reply mail Processing Centre, South C...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,M7Y,East Toronto,43.662744,-79.321558
9,1,"CN Tower, King and Spadina, Railway Lands, Har...",0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,M5V,Downtown Toronto,43.628947,-79.39442


In [17]:
import matplotlib.cm as cm
import matplotlib.colors as colors

cluster_map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to map
for lat, lng, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 makes label 0 wrap around to the last colour (index -1);
    # every label still gets its own distinct colour.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(cluster_map_toronto)

cluster_map_toronto

### Here is the same map as an image, because GitHub does not seem to render Folium maps correctly
![Clustered map](map2.PNG)


In [18]:

# Neighborhood name plus every restaurant-type column: skip the leading
# 'Cluster Labels' column and the four trailing columns joined in from df
restaurant_types_columns = toronto_merged.columns[1:-4]

# Rows that were assigned to cluster 0
in_cluster_0 = toronto_merged['Cluster Labels'] == 0
cluster_0 = toronto_merged.loc[in_cluster_0, restaurant_types_columns]

# Leave out the Neighborhood column, put restaurant types on the rows and
# keep only the types whose shares add up to more than 2 across the cluster
temp = cluster_0.iloc[:, 1:].transpose()
temp = temp[temp.sum(axis=1) > 2]
temp

# So this group is just bakeries

Unnamed: 0,10,26,33,52,81
Bakery,1.0,1.0,0.5,0.5,0.333333


In [19]:
# Let's show the content for cluster 1
cluster_1 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, restaurant_types_columns]

# Skip the leading Neighborhood column. The slice is taken from this cluster's
# own frame (not from cluster_0), so the cell does not rely on the cluster 0
# frame still being around.
temp = cluster_1.iloc[:, 1:]
# Restaurant types on the rows, one column per neighbourhood
temp = temp.transpose()
# Keep only the types whose shares add up to more than 2 across the cluster
temp = temp[temp.sum(axis=1) > 2]
temp

# Lots of different types of restaurants
#
# So looking back at the map we see that many neighborhoods match our criteria, this is good!
#

Unnamed: 0,0,2,3,4,5,6,7,9,11,12,...,73,74,76,77,78,79,83,84,86,87
American Restaurant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.142857,0.0,...,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bakery,0.0,0.0,0.0,0.0,0.1,0.0,0.1,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.1,0.2,0.2,0.0,0.0,0.0
Breakfast Spot,0.333333,0.0,0.0,0.0,0.1,0.0,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Café,0.0,0.0,0.25,0.1,0.0,0.25,0.2,0.0,0.0,0.0,...,0.0,0.3,0.0,0.0,0.3,0.1,0.0,0.1,0.0,0.2
Fried Chicken Joint,0.0,0.111111,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.2,...,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Italian Restaurant,0.0,0.0,0.0,0.2,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.2,0.0,0.0,0.1,0.0,0.0,0.0,0.0
Japanese Restaurant,0.0,0.0,0.25,0.0,0.0,0.0,0.1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.1,0.0,0.0
Pizza Place,0.0,0.111111,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,...,0.142857,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0,0.0
Restaurant,0.0,0.111111,0.0,0.1,0.1,0.0,0.1,0.0,0.0,0.0,...,0.142857,0.0,0.1,0.1,0.1,0.1,0.0,0.1,0.0,0.2
Sandwich Place,0.0,0.111111,0.0,0.1,0.0,0.0,0.1,0.0,0.142857,0.0,...,0.0,0.2,0.0,0.2,0.1,0.0,0.2,0.1,0.0,0.0


In [20]:
# Let's show the content for cluster 2
cluster_2 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, restaurant_types_columns]

# Skip the leading Neighborhood column. The slice is taken from this cluster's
# own frame (not from cluster_0), so the cell does not rely on the cluster 0
# frame still being around.
temp = cluster_2.iloc[:, 1:]
# Restaurant types on the rows, one column per neighbourhood
temp = temp.transpose()
# Keep only the types whose shares add up to more than 2 across the cluster
temp = temp[temp.sum(axis=1) > 2]
temp

# Mainly pizza places


Unnamed: 0,1,20,27,32,38,40,43,54,57,65,66,67,75,80,82,85
Pizza Place,0.4,0.25,0.333333,0.333333,0.5,1.0,0.5,0.25,0.4,0.333333,1.0,0.4,0.333333,0.333333,0.4,1.0


In [21]:
# Let's show the content for cluster 3
cluster_3 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, restaurant_types_columns]

# Skip the leading Neighborhood column. The slice is taken from this cluster's
# own frame (not from cluster_0), so the cell does not rely on the cluster 0
# frame still being around.
temp = cluster_3.iloc[:, 1:]
# Restaurant types on the rows, one column per neighbourhood
temp = temp.transpose()
# Keep every type whose shares add up to more than 0.1 across the cluster
temp = temp[temp.sum(axis=1) > 0.1]
temp

# Just a cafeteria

Unnamed: 0,88
Cafeteria,1.0


In [22]:
# Let's show the content for cluster 4
cluster_4 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, restaurant_types_columns]

# Skip the leading Neighborhood column. The slice is taken from this cluster's
# own frame (not from cluster_0), so the cell does not rely on the cluster 0
# frame still being around.
temp = cluster_4.iloc[:, 1:]
# Restaurant types on the rows, one column per neighbourhood
temp = temp.transpose()
# Keep only the types whose shares add up to more than 1 across the cluster
temp = temp[temp.sum(axis=1) > 1]
temp

# Mainly fast food

Unnamed: 0,8,21,37,48,58,63
Fast Food Restaurant,0.4,0.4,0.5,0.5,0.5,1.0


### To get back to the original question: which neighborhoods contain the most diverse set of restaurants?

### Based on the cluster statistics, we can say that any neighborhood belonging to cluster 1 matches our criteria!