# Toronto Salons 

### Data acquisition and cleaning

First, import the necessary packages.

In [713]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import folium
import folium # map rendering library


I started with postal code data from the previous assignment as references for the longitude and latitude data of Toronto.  

In [13]:
#read wikipedia page into pandas df
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
tables = pd.read_html(url)

# first table on the wikipedia page 
neighbs = tables[0] 

#rename and remove unused postal codes from table
neighbs.replace('Not assigned', np.nan, inplace = True) 
neighbs.dropna(subset=["Borough"], axis=0, inplace=True) 

#rename unnamed neighborhoods with the broader borough's name
neighbs["Neighbourhood"].replace(np.nan, neighbs["Borough"], inplace=True)
neighbs

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [714]:
#read geospatial coordinates into pandas df
coords = pd.read_csv(r'Desktop/Geospatial_Coordinates.csv')

#merge the neighborhoods and geospatial data according to postal codes 
df = neighbs.merge(coords, how='inner', left_on=["Postal Code"], right_on=["Postal Code"])
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


##### Visualizing the neighborhoods by postal code

The high density of postal codes in downtown Toronto is one of several indicators that we'll see that reflect the concentration of residents.

In [715]:
# create map of Toronto using latitude and longitude values
latitude = 43.71
longitude = -79.39

map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, borough, neighbourhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

### Venue data acquisition

In [717]:
# Establish foursquare credentials
CLIENT_ID = 'JOOTJPHIS1WAAI23Y0MEGCOZEQX504L1O1SPHIRW2GGNVSMH' # your Foursquare ID
CLIENT_SECRET = 'GI3MS31UFZSUJVTEJSTMQ13EE1MKF1ELIDRFMTC1WN4DRCM1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: JOOTJPHIS1WAAI23Y0MEGCOZEQX504L1O1SPHIRW2GGNVSMH
CLIENT_SECRET:GI3MS31UFZSUJVTEJSTMQ13EE1MKF1ELIDRFMTC1WN4DRCM1


In [718]:
# Establish foursquare request parameters
LIMIT = 150 # limit of number of venues returned by Foursquare API

radius = 500 # define radius


In [844]:
# Define a function to obtain relevant information for LIMIT venues within RADIUS meters

def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

# Run function for the postal codes established above 
Toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [874]:
# Assess the shape and format of the data obtained (after removing one case of PO Boxes)
Toronto_venues = Toronto_venues[Toronto_venues.Neighborhood != 'Stn A PO Boxes']
Toronto_venues.drop_duplicates(inplace = True, ignore_index = True, subset = ['Venue', 'Venue Latitude', 'Venue Longitude'])
Toronto_venues.reset_index(drop = True, inplace = True)
print(Toronto_venues.shape)
Toronto_venues.head()

(3643, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.75984,-79.324719,Caribbean Restaurant
1,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
2,Parkwoods,43.753259,-79.329656,Tim Hortons,43.760668,-79.326368,Café
3,Parkwoods,43.753259,-79.329656,A&W,43.760643,-79.326865,Fast Food Restaurant
4,Parkwoods,43.753259,-79.329656,Bruno's valu-mart,43.746143,-79.32463,Grocery Store


### Venue data cleaning

There is an abundance of data that won't be useful given the question. However, after viewing the _Toronto_venues_ dataframe, it's clear that not all the data is coherently coded, so I will have to manually recode some of the venue data. Specifically, while I'm looking for **salons** and **restaurants**, some categories are more specific (e.g., Afghan Restaurant). However, not all categories contain the word _restaurant_ (e.g., BBQ Joint). For 'salons', there is a more limited list of categories that fall within this supercategory of interest.

In [875]:
category_search = Toronto_venues.groupby('Venue Category').count()
category_search.reset_index().head(20)

Unnamed: 0,Venue Category,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
0,Accessories Store,3,3,3,3,3,3
1,Afghan Restaurant,2,2,2,2,2,2
2,African Restaurant,1,1,1,1,1,1
3,Airport,2,2,2,2,2,2
4,American Restaurant,26,26,26,26,26,26
5,Amphitheater,2,2,2,2,2,2
6,Animal Shelter,1,1,1,1,1,1
7,Antique Shop,3,3,3,3,3,3
8,Aquarium,2,2,2,2,2,2
9,Art Gallery,14,14,14,14,14,14


Based on a deep dive into the data, I created a list of alternative venue categories that were related to salons. Some of these aren't entirely synonymous but are relevant for business intelligence. I created a subset with only the venues whose categories matched one of the established synonyms.

Upon examination of the subset, some salons are categorized as "Cosmetics Shops" (e.g. Florio Hair Design, 251). Others are exclusively retail shops (e.g., Sephora, 386), while others still are a combination of both (e.g., Aveda Institute Toronto, 583, where you can both buy products and get a haircut). Therefore, I maintained the original list of related categories, but removed cases of exclusively retail shops. I established these retail shops by searching online for the most repeated "Cosmetics Shops" in the list.

I've left nail salons and spas in the dataframe in an abundance of caution, as some have associated hair studios.

In [1206]:
salonsynonyms = ['Salon', 'Cosmetics Shop', 'Health & Beauty Service', 'Massage Studio', 'Nail Salon', 'Salon / Barbershop', 'Spa', 'Tanning Salon']
Toronto_salon_venues = Toronto_venues[Toronto_venues['Venue Category'].isin(salonsynonyms)]
Toronto_salon_venues.tail(12)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
3325,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,Waxon Waxbar,43.682824,-79.391971,Spa
3330,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,Manipedi Spa,43.682626,-79.391879,Spa
3409,"Steeles West, L'Amoreaux West",43.799525,-79.318389,Nails for You,43.79875,-79.318768,Cosmetics Shop
3543,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,Dove Spa,43.65979,-79.51343,Spa
3625,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Koala Tan Tanning Salon & Sunless Spa,43.63137,-79.519006,Tanning Salon


In [877]:
retail_cosmetics = ['LUSH', 'SEPHORA', 'MAC Cosmetics', 'The Abnormal Beauty Company', 'MenEssentials']
Toronto_salons = Toronto_salon_venues[~Toronto_salon_venues['Venue'].isin(retail_cosmetics)]
Toronto_salons.tail()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
3325,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,Waxon Waxbar,43.682824,-79.391971,Spa
3330,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,Manipedi Spa,43.682626,-79.391879,Spa
3409,"Steeles West, L'Amoreaux West",43.799525,-79.318389,Nails for You,43.79875,-79.318768,Cosmetics Shop
3543,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,Dove Spa,43.65979,-79.51343,Spa
3625,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Koala Tan Tanning Salon & Sunless Spa,43.63137,-79.519006,Tanning Salon


As for restaurants, I created a list populated by key words that appeared in the category name for the first 300 venues to indicate a restaurant or food service. I then used embedded for-loops to break apart the venue categories for each venue, assess if any word within the category matched a word in my list, and generated a list of all of the venues that did have a matching word, which I used to subset the original list of venues.

In [878]:
# Create an object with the entirety of venue categories
categories = Toronto_venues['Venue Category']

# Create a list of key words that were observed in venue categories to indicate food service        
rest_names = ['Restaurant', 'Food', 'Bar', 'Diner', 'Joint', 'Breakfast', 'Lunch']

# for-loop to create a subset of all the venue categories that contain a key word
subset_rests = []                       #create an empty list to append to
for names in categories:                #for each category name,
    for i in names.split():             #split the category name into a list of words
        if i in rest_names:             #for each word, if the word is among the key words
            subset_rests.append(names)  #append the full name of the category to the subset
        else:
            continue

# Subset the original dataframe using the items whose names match a word in the list        
Toronto_rest_venues = Toronto_venues[Toronto_venues['Venue Category'].isin(subset_rests)]
Toronto_rest_venues

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.759840,-79.324719,Caribbean Restaurant
3,Parkwoods,43.753259,-79.329656,A&W,43.760643,-79.326865,Fast Food Restaurant
9,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
18,Parkwoods,43.753259,-79.329656,Spicy Chicken House,43.760639,-79.325671,Chinese Restaurant
29,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant
...,...,...,...,...,...,...,...
3623,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Rocco's Plum Tomato,43.634898,-79.519951,Italian Restaurant
3624,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Kelseys Original Roadhouse,43.622934,-79.516427,Restaurant
3630,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Buon Giorno Cafe,43.622801,-79.519322,Italian Restaurant
3635,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Appalachia BBQ,43.624034,-79.514079,BBQ Joint


I want to join the two subsets back together, but don't want to lose their identity in the new dataframe, so I'm introducing two dummy variables (for salon identity and restaurant identity), which will also serve to identify the venues in the mapping process. 

In [1104]:
Toronto_rest_venues['Restaurant'] = 1   #dummy variable for later
Toronto_rest_venues['Salon'] = 0        #dummy variable for later
Toronto_salons['Salon'] = 1             #dummy variable for later
Toronto_salons['Restaurant'] = 0        #dummy variable for later
Toronto_data = pd.concat([Toronto_rest_venues, Toronto_salons])      #join the two subset dfs
Toronto_data

(41, 9)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Restaurant,Salon,Cluster,LongitudeNormed,LatitudeNormed
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.759840,-79.324719,Caribbean Restaurant,1,0,0.0,-57.170968,31.538624
3,Parkwoods,43.753259,-79.329656,A&W,43.760643,-79.326865,Fast Food Restaurant,1,0,0.0,-57.172515,31.539202
9,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop,1,0,0.0,-57.177019,31.532955
18,Parkwoods,43.753259,-79.329656,Spicy Chicken House,43.760639,-79.325671,Chinese Restaurant,1,0,0.0,-57.171654,31.539200
29,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant,1,0,0.0,-57.162368,31.514104
...,...,...,...,...,...,...,...,...,...,...,...,...
3325,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,Waxon Waxbar,43.682824,-79.391971,Spa,0,1,,,
3330,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049,Manipedi Spa,43.682626,-79.391879,Spa,0,1,,,
3409,"Steeles West, L'Amoreaux West",43.799525,-79.318389,Nails for You,43.798750,-79.318768,Cosmetics Shop,0,1,,,
3543,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944,Dove Spa,43.659790,-79.513430,Spa,0,1,,,


### Visualizing the restaurant and salon data together

To get an idea of the concentration of restaurants and salons within Toronto, I mapped both together. In the map, red signals another salon while blue signals a restaurant venue.

In [1109]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = 2
ys = 2
colors_array = cm.rainbow(np.linspace(0, 1, ys))
rainbow = ['blue', 'red']

# add markers to map
markers_colors = []
for lat, lng, marker, name in zip(Toronto_data['Venue Latitude'], Toronto_data['Venue Longitude'], Toronto_data['Salon'], Toronto_data['Venue']):
    label = folium.Popup(str((marker, name)), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color=rainbow[int(marker)],
        fill=True,
        fill_color=rainbow[int(marker)],
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  

map_Toronto




One potential method for determining where to put the new salon would look at the postal codes with the greatest number of restaurants and the fewest salons. For example, by this method, **Little Portugal, Trinity** would be a good neighborhood within which to place a salon, identified by the orange marker in the map below.

In [1403]:
Toronto_sumRests = Toronto_data.groupby(['Neighborhood']).sum()
TorontosumRests_noSalons = Toronto_sumRests[Toronto_sumRests['Salon'] == 0]
max_rests = TorontosumRests_noSalons['Restaurant'].max()
TorontosumRests_noSalons[TorontosumRests_noSalons['Restaurant']==(max_rests)]
max_rests = TorontosumRests_noSalons.sort_values(by = 'Restaurant', ascending = False)
a = max_rests.iloc[:, [4,5]].head(11)

In [1404]:
a[a.Restaurant != 27]

Unnamed: 0_level_0,Restaurant,Salon
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
"Little Portugal, Trinity",50,0
"Willowdale, Willowdale East",43,0
Studio District,41,0
"The Annex, North Midtown, Yorkville",30,0
"University of Toronto, Harbord",28,0
Agincourt,26,0
Davisville,24,0
"Dufferin, Dovercourt Village",20,0
"Moore Park, Summerhill East",20,0
"Garden District, Ryerson",17,0


In [1405]:
df[df.Neighbourhood == 'Little Portugal, Trinity']
FutureSalon = dict([('Neighborhood', 'Little Portugal, Trinity'), ('Neighborhood Latitude', 43.647927), ('Neighborhood Longitude', -79.41975), ('Venue', 'FUTURE SALON'), ('Venue Latitude', 43.647927), ('Venue Longitude', -79.41975), ('Venue Category', 'Salon / Barbershop'), ('Restaurant', 0), ('Salon', 2)])
recommended = Toronto_data.append(FutureSalon, ignore_index = True)
# create map of Toronto using latitude and longitude values
recommended_site_map = folium.Map(location=[latitude-.05, longitude], zoom_start=11.5)

# set color scheme for the clusters

rainbow = ['blue', 'red', 'orange']
rad = [2,2,5]
opac = [.7,.7, 1]

# add markers to map
markers_colors = []
for lat, lng, marker in zip(recommended['Venue Latitude'], recommended['Venue Longitude'], recommended['Salon']):
    label = folium.Popup(str(marker), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=rad[int(marker)],
        popup=label,
        color=rainbow[int(marker)],
        fill=True,
        fill_color=rainbow[int(marker)],
        fill_opacity=opac[marker],
        parse_html=False).add_to(recommended_site_map)  

recommended_site_map

However, while postal codes are indicators of population density, their borders are often arbitrary in city centers, where residents can easily cross through multiple postal codes in a typical walking distance for their services.

Therefore, I'll complement this process-of-elimination analysis with a DBSCAN (density-based spatial clustering of applications with noise) cluster analysis. This unsupervised machine-learning method identifies areas of high density while ignoring outliers. The data scientist establishes the radius and the number of items that should fall within an area to be considered a cluster. 

I will employ DBSCAN to identify areas of high restaurant density (established arbitrarily, but editable for the client's needs). Because we're looking for areas of high restaurant density and low salon density, only the restaurant  data will be relevant for the DBSCAN. I will use geospatial data to find the clusters of restaurants. (Later, I will use the centroids of these clusters to determine how many salons are within an established distance.)

In [348]:
from sklearn.cluster import DBSCAN  
from sklearn.neighbors.nearest_centroid import NearestCentroid
import matplotlib.pyplot as plt 
from sklearn import metrics
%matplotlib inline



At this latitude, moving 1 degree longitudinally is equal to moving 80 km. 
At this longitude,  moving 1 degree latitudinally is equal to moving 111 km.

[https://www.nhc.noaa.gov/gccalc.shtml]

This information is important in order to normalize the data. The longitudinal changes would otherwise be overemphasized (they're real value is .72 times the changes in latitude, but they're getting considered as if they were equal). 



In [1262]:
# represent points consistently as (lat, lon)
Toronto_rest_venues['LongitudeNormed'] = Toronto_rest_venues['Venue Longitude']*(80/111)

coordinates_rest = Toronto_rest_venues[['Venue Latitude', 'LongitudeNormed']]

# define the number of kilometers in one radian
kms_per_radian = 6371.0088

# define epsilon and convert to radians for use by haversine
epsilon = .408 / kms_per_radian #(the average distance in km of 7.5 city blocks, arbitrary but editable for client's needs )


minimumSamples = 20   # arbitrary number but editable for the client's needs
db_rest = DBSCAN(eps=epsilon, min_samples=minimumSamples, algorithm='ball_tree', metric='haversine').fit(np.radians(coordinates_rest))
cluster_labels_rest = db_rest.labels_
print(cluster_labels_rest)



# get the number of clusters
num_clusters_rest = len(set(cluster_labels_rest))

# all done, print the outcome
message = 'Clustered {:,} points down to {:,} clusters, for {:.1f}% compression'
print(message.format(len(coordinates_rest), num_clusters_rest, 100*(1 - float(num_clusters_rest) / len(coordinates_rest))))
print('Silhouette coefficient: {:0.03f}'.format(metrics.silhouette_score(coordinates_rest, cluster_labels_rest)))


[-1 -1 -1 ... -1 -1 -1]
Clustered 1,172 points down to 12 clusters, for 99.0% compression
Silhouette coefficient: -0.188


In [1263]:
# turn the clusters in to a pandas series, where each element is a cluster of points
# clusters_rest = pd.Series([coordinates_rest[cluster_labels_rest==n] for n in range(num_clusters_rest)])

# I added in the cluster label to the dataframe 
Toronto_rest_venues.drop(columns = ['Cluster'], inplace = True) # first drop the column if it exists from a prior analysis
Toronto_rest_venues['Cluster'] = cluster_labels_rest + 1         

"""Note that I added one to the cluster labels, because dealing with the -1 for the outliers causes probelms for mapping, 
assinging indexes, etc. It's just important to remember this down the road"""

Toronto_rest_venues                                 



Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Restaurant,Salon,LongitudeNormed,Cluster
0,Parkwoods,43.753259,-79.329656,Allwyn's Bakery,43.759840,-79.324719,Caribbean Restaurant,1,0,-57.170969,0
3,Parkwoods,43.753259,-79.329656,A&W,43.760643,-79.326865,Fast Food Restaurant,1,0,-57.172515,0
9,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop,1,0,-57.177019,0
18,Parkwoods,43.753259,-79.329656,Spicy Chicken House,43.760639,-79.325671,Chinese Restaurant,1,0,-57.171654,0
29,Victoria Village,43.725882,-79.315572,Portugril,43.725819,-79.312785,Portuguese Restaurant,1,0,-57.162368,0
...,...,...,...,...,...,...,...,...,...,...,...
3623,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Rocco's Plum Tomato,43.634898,-79.519951,Italian Restaurant,1,0,-57.311677,0
3624,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Kelseys Original Roadhouse,43.622934,-79.516427,Restaurant,1,0,-57.309137,0
3630,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Buon Giorno Cafe,43.622801,-79.519322,Italian Restaurant,1,0,-57.311223,0
3635,"Mimico NW, The Queensway West, South of Bloor,...",43.628841,-79.520999,Appalachia BBQ,43.624034,-79.514079,BBQ Joint,1,0,-57.307444,0


By grouping the data, we can see that there are many outliers (indexed as 0), but that's good! The goal was to have a very selective DBSCAN: high density in small areas. Reducing a city to 11 clusters is effective for the purposes of finding an area to search for available salon real estate.

We can get a better picture for the clusters by grouping the data according to cluster, and especially by mapping the data while distinguishing between different clusters.  

In [1409]:
Toronto_rest_venues.groupby(by= 'Cluster').sum()


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude,Restaurant,Salon,LongitudeNormed
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,34351.110545,-62396.183288,34350.69433,-62396.221303,786,0,-44970.249588
1,2400.880807,-4365.624563,2400.73123,-4365.774096,55,0,-3146.503853
2,1571.853925,-2857.96266,1571.980242,-2857.81683,36,0,-2059.687805
3,2139.81236,-3891.502996,2139.546419,-3891.361651,49,0,-2804.584974
4,1747.269133,-3173.876297,1747.1262,-3173.962126,40,0,-2287.540271
5,2356.755372,-4288.843759,2356.798725,-4288.82925,54,0,-3091.048108
6,919.172518,-1667.578349,919.059433,-1667.644798,21,0,-1201.90616
7,1486.158747,-2699.277973,1486.055269,-2699.50181,34,0,-1945.58689
8,916.893774,-1668.760029,916.970908,-1668.822628,21,0,-1202.755047
9,872.739312,-1592.31638,872.751783,-1592.471287,20,0,-1147.727053


In [1410]:
# DBSCAN CLUSTERING


# create map of Toronto using latitude and longitude values
rest_site_map = folium.Map(location=[latitude-.04, longitude-.05], zoom_start=11.5)

# set color scheme for the clusters
x = np.arange(Toronto_rest_venues['Cluster'].max())
ys = [i + x + (i*x)**2 for i in range(Toronto_rest_venues['Cluster'].max())]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
#rainbow = [colors.rgb2hex(i) for i in colors_array]
rainbow = ['white', 'red', 'blue', 'green', 'gold', 'yellow', 'purple', 'white', 'orange', 'cyan','magenta', 'brown']
rainbow2 = ['black', 'red', 'blue', 'green', 'gold', 'yellow', 'purple', 'white', 'orange', 'cyan','magenta', 'brown']
opac = [.2, .7,.7,.7,.7,.7,.7,.7, .7, .7, .7, .7]
filling = [False, True,True,True,True,True,True,True,True,True,True, True]
size = [1, 2, 2,2,2,2,2,2,2,2,2,2,2,2] 
# add markers to map
markers_colors = []
for lat, lng, cluster in zip(Toronto_rest_venues['Venue Latitude'], Toronto_rest_venues['Venue Longitude'], Toronto_rest_venues['Cluster']):
    label = folium.Popup(str((cluster, lat, lng)), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=size[int(cluster)],
        popup=label,
        color=rainbow2[int(cluster)],
        fill=filling[int(cluster)],
        fill_color=rainbow[int(cluster)],
        fill_opacity=opac[int(cluster)],
        parse_html=False).add_to(rest_site_map)  

rest_site_map

Now, we'll take the centroids of the clusters of interest, ignoring the outliers, and calculate which clusters have the fewest salons nearby. This will require a series of for-loops performed on arrays of two different dataframes (the centroids and the venues). Here, it's important to go back to the pre-normalization longitudinal coordinate, but I will still account for the difference in longitude and latitude when I calculate euclidean distance.

Once I obtained the centroids, I created a new data frame to present to the client. This dataframe will contain the recommendations according to the DBSCAN. I will supplement the geospatial data with the number of restaurants and salons within the vicinity, as well as prices of the salon nearest to the centroid.

In [1278]:
X = Toronto_rest_venues[['Venue Latitude', 'Venue Longitude']]
y = Toronto_rest_venues['Cluster'] 
clf = NearestCentroid()
clf.fit(X, y)

clf.centroids_            # we aren't interested in the 0-index, which is the outliers.

array([[ 43.70317345, -79.38450547],
       [ 43.64965872, -79.37771083],
       [ 43.66611784, -79.38380082],
       [ 43.66421263, -79.4155439 ],
       [ 43.67815499, -79.34905315],
       [ 43.64442083, -79.42276389],
       [ 43.76473491, -79.41165703],
       [ 43.70750791, -79.39711205],
       [ 43.66528133, -79.46774417],
       [ 43.63758914, -79.62356433],
       [ 43.6545056 , -79.40240858],
       [ 43.68697568, -79.39367598]])

In [1279]:
# Here I convert the venue data frame to an array to facilitate the for-looping to come.

Toronto_dataIN = Toronto_data.reset_index()
array_Toronto = Toronto_dataIN[['index','Venue Latitude', 'Venue Longitude', 'Salon', 'Venue']].to_numpy(dtype=None)
array_Toronto

array([[0, 43.75984035203157, -79.32471879917513, 0, "Allwyn's Bakery"],
       [3, 43.760643076161315, -79.3268649067081, 0, 'A&W'],
       [9, 43.75197441585782, -79.33311418516017, 0, 'Variety Store'],
       ...,
       [3409, 43.79875014252455, -79.31876781034187, 1, 'Nails for You'],
       [3543, 43.65979, -79.51343, 1, 'Dove Spa'],
       [3625, 43.63137, -79.519006, 1,
        'Koala Tan Tanning Salon & Sunless Spa']], dtype=object)

In [1411]:
for i, j in clf.centroids_:                                  # i = lat, j = long
    total_salons = []                                        # create an empty list for each coordinate pair
    for k, l, m, n, o in array_Toronto:                      # k = index, l = lat, m = long, o = Salon dummy var
        if n == 1:                                           # only consider salons
            dis = (((np.radians(i-l))**2) + (((np.radians(j-m))**2)*.72072))**.5
                                         
            if dis > (1 / kms_per_radian):                   # skip venues > 12 city blocks in radians away
                continue
            else: 
                total_salons.append([k, l, m, n, o])            
    print(total_salons)

[]
[[538, 43.65009640955562, -79.37362952211767, 1, 'Aveda Institute Toronto'], [864, 43.654734082347616, -79.38024826065258, 1, 'Solei Tanning Salon']]
[[220, 43.66563031656884, -79.38135926650858, 1, "Ho's Team Barber Shop"], [250, 43.670169709332704, -79.39140019059467, 1, 'Fiorio Hair Design'], [274, 43.66940593538455, -79.38674752555433, 1, 'NC Salon +']]
[[896, 43.66483011797359, -79.41272601896088, 1, 'Aroma Wellness Clinic and Spa'], [956, 43.66431438008676, -79.41517061947587, 1, 'Spring Nails']]
[[1272, 43.67982572188027, -79.34030209156394, 1, 'Tips Nail Bar'], [1299, 43.67930961860643, -79.34290082858878, 1, 'Allure Body Bar'], [1588, 43.676667789021906, -79.35660190332753, 1, 'Urban Nails']]
[[1684, 43.64279609182282, -79.42559768832729, 1, 'Float Toronto']]
[]
[[2424, 43.71511093497453, -79.40030409940266, 1, 'Civello Salon']]
[[2551, 43.66546666817928, -79.4714906407299, 1, 'Lucy Nails'], [2568, 43.665543850492575, -79.47450997275811, 1, 'High Park Nails and Spa']]
[]
[[

In [1412]:
cluster_centroids['TotalRestaurants'] = [55, 36,49,40,54,21,34,21,20,29,27]
cluster_centroids['TotalSalons'] = [2,3,2,3,1,0,1,2,0,2,2]
cluster_centroids['WomensHair'] = [1, 2,0,0,0,0,1,0,0,0,0]
cluster_centroids['WomensCutPrice'] = ['\$22-\$60+', '\$30-\$75', 'N/A','N/A', 'N/A', 'N/A', '\$50+','N/A', 'N/A', 'N/A', 'N/A']
cluster_centroids
                                               
                                               

Unnamed: 0,Latitude,Longitude,TotalRestaurants,TotalSalons,WomensHair,WomensCutPrice
1,43.649659,-79.377711,55,2,1,\$22-\$60+
2,43.666118,-79.383801,36,3,2,\$30-\$75
3,43.664213,-79.415544,49,2,0,
4,43.678155,-79.349053,40,3,0,
5,43.644421,-79.422764,54,1,0,
6,43.764735,-79.411657,21,0,0,
7,43.707508,-79.397112,34,1,1,\$50+
8,43.665281,-79.467744,21,2,0,
9,43.637589,-79.623564,20,0,0,
10,43.654506,-79.402409,29,2,0,


In [1281]:
# create a data frame from the centroids, excluding the 0-index, which is group of the outliers

cluster_centroids = pd.DataFrame(clf.centroids_)
cluster_centroids.rename(columns = {0: 'Latitude', 1: 'Longitude'}, inplace = True)
cluster_centroids = cluster_centroids.tail(11)  # we aren't interested in the 0-index, which is the outliers.
cluster_centroids

Unnamed: 0,Latitude,Longitude
1,43.649659,-79.377711
2,43.666118,-79.383801
3,43.664213,-79.415544
4,43.678155,-79.349053
5,43.644421,-79.422764
6,43.764735,-79.411657
7,43.707508,-79.397112
8,43.665281,-79.467744
9,43.637589,-79.623564
10,43.654506,-79.402409


In [1333]:
#df_indexed = df.reset_index()
array_neighborhoods = df_indexed[['index','Latitude', 'Longitude', 'Neighbourhood', 'Postal Code', 'Borough']].to_numpy(dtype=None)
array_neighborhoods

array([[0, 43.7532586, -79.3296565, 'Parkwoods', 'M3A', 'North York'],
       [1, 43.725882299999995, -79.31557159999998, 'Victoria Village',
        'M4A', 'North York'],
       [2, 43.6542599, -79.3606359, 'Regent Park, Harbourfront', 'M5A',
        'Downtown Toronto'],
       [3, 43.718517999999996, -79.46476329999999,
        'Lawrence Manor, Lawrence Heights', 'M6A', 'North York'],
       [4, 43.6623015, -79.3894938,
        "Queen's Park, Ontario Provincial Government", 'M7A',
        'Downtown Toronto'],
       [5, 43.6678556, -79.53224240000002,
        'Islington Avenue, Humber Valley Village', 'M9A', 'Etobicoke'],
       [6, 43.806686299999996, -79.19435340000001, 'Malvern, Rouge',
        'M1B', 'Scarborough'],
       [7, 43.745905799999996, -79.352188, 'Don Mills', 'M3B',
        'North York'],
       [8, 43.7063972, -79.309937, 'Parkview Hill, Woodbine Gardens',
        'M4B', 'East York'],
       [9, 43.6571618, -79.37893709999999, 'Garden District, Ryerson',
        'M5B

In [1397]:
b=100
full_list = []
for i, j in clf.centroids_:                                 # i = lat, j = long
    c = []                                     # create an empty list for each coordinate pair
    for p, q, r, s, t, u in array_neighborhoods:            # p = indx, q = lat, r = long, s = neigh, t = post, u = brgh
        a = (((np.radians(i-q))**2) + (((np.radians(j-r))**2)*.72072))**.5
        if a < b:                                           # if a is less than the currently lowest distance, 
            b = a                                           # reset b as the lowest distance obtained
            c = [i, t,s]                                  # unite the pertinent information
        else:   
            continue                          
    b = 100                                                 # reset b for each new centroid
    full_list.append(c)                                     # append the pertinent information to the empty list

full_list_Df = pd.DataFrame.from_records(full_list, columns = ['Latitude','Postal Code', 'Neighborhood'])
full_list_Df = full_list_Df.tail(11)               # 0-index are outliers
full_list_Df

Unnamed: 0,Latitude,Postal Code,Neighborhood
1,43.649659,M5L,"Commerce Court, Victoria Hotel"
2,43.666118,M4Y,Church and Wellesley
3,43.664213,M6G,Christie
4,43.678155,M4K,"The Danforth West, Riverdale"
5,43.644421,M6J,"Little Portugal, Trinity"
6,43.764735,M2N,"Willowdale, Willowdale East"
7,43.707508,M4S,Davisville
8,43.665281,M6P,"High Park, The Junction South"
9,43.637589,M7R,Canada Post Gateway Processing Centre
10,43.654506,M5T,"Kensington Market, Chinatown, Grange Park"


In [1422]:
# Now join the centroids and the neighborhoods according to the Latitude of the centroid and its closest postal code

cluster_centroids_N = cluster_centroids.merge(full_list_Df, how = 'inner', on='Latitude')
#df = neighbs.merge(coords, how='inner', left_on=["Postal Code"], right_on=["Postal Code"])
cluster_centroids_Fin = cluster_centroids_N[['Neighborhood', 'Postal Code', 'TotalRestaurants', 'TotalSalons', 'WomensHair', 'WomensCutPrice', 'Latitude', 'Longitude']]
cluster_centroids_Fin.sort_values(by=['TotalRestaurants'], ascending = False).sort_values(by=['WomensHair'])

Unnamed: 0,Neighborhood,Postal Code,TotalRestaurants,TotalSalons,WomensHair,WomensCutPrice,Latitude,Longitude
4,"Little Portugal, Trinity",M6J,54,1,0,,43.644421,-79.422764
2,Christie,M6G,49,2,0,,43.664213,-79.415544
3,"The Danforth West, Riverdale",M4K,40,3,0,,43.678155,-79.349053
9,"Kensington Market, Chinatown, Grange Park",M5T,29,2,0,,43.654506,-79.402409
10,"Summerhill West, Rathnelly, South Hill, Forest...",M4V,27,2,0,,43.686976,-79.393676
5,"Willowdale, Willowdale East",M2N,21,0,0,,43.764735,-79.411657
7,"High Park, The Junction South",M6P,21,2,0,,43.665281,-79.467744
8,Canada Post Gateway Processing Centre,M7R,20,0,0,,43.637589,-79.623564
0,"Commerce Court, Victoria Hotel",M5L,55,2,1,\$22-\$60+,43.649659,-79.377711
6,Davisville,M4S,34,1,1,\$50+,43.707508,-79.397112


In [1309]:
#reprint map to visualize
rest_site_map

This data can be used by the client to consider local competition in each area of high restaurant density (and if present, the local salon competition's prices) when considering real estate options. This will help narrow down promising real estate options for the client that include a high amount of foot traffic and consumer spending, both important for new business owners in this service industry. 