# Recommend a place to open a coffee shop in Downtown of Toronto #

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

**Scrape table of Toronto's list of boroughs and neighborhoods from Wiki, get coordinates and convert to a data frame**

In [2]:
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
req = requests.get(WIKI_URL)
soup = BeautifulSoup(req.content,'lxml')
table = soup.find_all('table')[0]
df = pd.read_html(str(table))[0]
df.drop([0], inplace=True)
df.columns=['PostalCode','Borough','Neighborhood']

df=df[df['Borough']!="Not assigned"].reset_index(drop=True)
for i in range(0,len(df)):
    if df.loc[i,'Neighborhood']=='Not assigned':
        df.loc[i,'Neighborhood']=df.loc[i,'Borough']

grouped=df.groupby(['PostalCode','Borough'])
df_grouped=grouped['Neighborhood'].sum().reset_index()[['PostalCode','Borough']]

df_grouped['Neighborhood']=df_grouped['Borough']  # initialize a new column for neighborhoods
for i in range(0,len(df_grouped)):
    pb=tuple(df_grouped.loc[i,['PostalCode','Borough']])
    mylist=grouped.get_group(pb)['Neighborhood']
    mystr=', '.join(mylist)
    df_grouped.loc[i,'Neighborhood']=mystr
    
df_coord=pd.read_csv('https://cocl.us/Geospatial_data')
df_coord.rename(columns={'Postal Code': 'PostalCode'}, inplace=True)
df_full=df_grouped.join(df_coord.set_index('PostalCode'),on='PostalCode')
    
downtown_data = df_full[df_full['Borough'] == 'Downtown Toronto'].reset_index(drop=True)
downtown_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


**Data shape**

In [28]:
downtown_data.shape

(18, 6)

**Get all dependencies for exploring and clustering**

In [3]:
import json 
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim 
import requests 
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes 
import folium 
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.10.15         |        py36_1000         138 KB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

**Coordinates of Downtown Toronto**

In [4]:
address = 'Downtown Toronto'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.654027, -79.3802003.


**Map of Downtown Toronto**

In [6]:
map_downtown = folium.Map(location=[latitude, longitude], zoom_start=13)

for lat, lng, borough, neighborhood in zip(downtown_data['Latitude'], downtown_data['Longitude'], downtown_data['Borough'], downtown_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_downtown)  
    
map_downtown

**Define Foursquare Credentials and Version**

In [13]:
CLIENT_ID = 'U4XRSMTZPZ4UP1N1YTJMIAH4AV4QLYGIDASPSO1RC50QG0TM' # your Foursquare ID
CLIENT_SECRET = '2CIKRZ3UCNTLB0KAWRC4OPBRSASSH0JOTBCJGE3HPITSAHFX' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: U4XRSMTZPZ4UP1N1YTJMIAH4AV4QLYGIDASPSO1RC50QG0TM
CLIENT_SECRET:2CIKRZ3UCNTLB0KAWRC4OPBRSASSH0JOTBCJGE3HPITSAHFX


**A function to explore all venues withthin 1000m of each neighborhood**

In [9]:
LIMIT=100
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

**Run the above function on all neighborhoods of Downtown Toronto and store in a data frame**

In [44]:
downtown_venues = getNearbyVenues(names=downtown_data['Neighborhood'],
                                   latitudes=downtown_data['Latitude'],
                                   longitudes=downtown_data['Longitude']
                                  )

Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie


**Save dataframe to csv file for future use**

In [11]:
#downtown_venues.to_csv('downtown_venues.csv',index=False)
downtown_venues=pd.read_csv('downtown_venues.csv')
downtown_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Summerhill Market,43.686265,-79.375458,Grocery Store
1,Rosedale,43.679563,-79.377529,Black Camel,43.677016,-79.389367,BBQ Joint
2,Rosedale,43.679563,-79.377529,Toronto Lawn Tennis Club,43.680667,-79.388559,Athletics & Sports
3,Rosedale,43.679563,-79.377529,Tinuno,43.671281,-79.37492,Filipino Restaurant
4,Rosedale,43.679563,-79.377529,Craigleigh Gardens,43.678099,-79.371586,Park


**We extract data of different kinds of restaurants in the neighborhoods**

In [12]:
venue_category = list(downtown_venues['Venue Category'].unique())
restaurant_list=[name for name in venue_category if name.find('Restaurant')!=-1]
print(restaurant_list)

['Filipino Restaurant', 'Japanese Restaurant', 'Italian Restaurant', 'Indian Restaurant', 'Restaurant', 'Caribbean Restaurant', 'Taiwanese Restaurant', 'Thai Restaurant', 'Sushi Restaurant', 'American Restaurant', 'Ramen Restaurant', 'Mexican Restaurant', 'Ethiopian Restaurant', 'Vietnamese Restaurant', 'Afghan Restaurant', 'Persian Restaurant', 'Portuguese Restaurant', 'Seafood Restaurant', 'Chinese Restaurant', 'Mediterranean Restaurant', 'Middle Eastern Restaurant', 'Falafel Restaurant', 'Fast Food Restaurant', 'Vegetarian / Vegan Restaurant', 'French Restaurant', 'Greek Restaurant', 'German Restaurant', 'New American Restaurant', 'Latin American Restaurant', 'Comfort Food Restaurant', 'Belgian Restaurant', 'Tapas Restaurant', 'Asian Restaurant', 'Brazilian Restaurant', 'Dumpling Restaurant', 'Doner Restaurant', 'Korean Restaurant', 'Eastern European Restaurant', 'South American Restaurant', 'Jewish Restaurant']


**Get onehot dummies then group rows by neighborhood and by taking the total occurrence of each category**

In [18]:
downtown_onehot = pd.get_dummies(downtown_venues[['Venue Category']], prefix="", prefix_sep="")
downtown_onehot['Neighborhood'] = downtown_venues['Neighborhood'] 
downtown_onehot = downtown_onehot[['Neighborhood']+restaurant_list]
downtown_onehot_grouped = downtown_onehot.groupby('Neighborhood').sum().reset_index()
downtown_onehot_grouped.head()

Unnamed: 0,Neighborhood,Filipino Restaurant,Japanese Restaurant,Italian Restaurant,Indian Restaurant,Restaurant,Caribbean Restaurant,Taiwanese Restaurant,Thai Restaurant,Sushi Restaurant,...,Belgian Restaurant,Tapas Restaurant,Asian Restaurant,Brazilian Restaurant,Dumpling Restaurant,Doner Restaurant,Korean Restaurant,Eastern European Restaurant,South American Restaurant,Jewish Restaurant
0,"Adelaide, King, Richmond",0,3,1,1,2,0,0,2,3,...,0,0,1,1,0,0,0,0,0,0
1,Berczy Park,0,3,2,0,4,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Cabbagetown, St. James Town",1,2,1,2,3,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,Central Bay Street,0,3,2,0,1,0,0,2,1,...,0,1,0,0,0,0,0,0,0,0


**Cluster neighborhoods**

In [14]:
kclusters = 5

downtown_clustering = downtown_onehot_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(downtown_clustering)

kmeans.labels_[0:10] 

array([0, 0, 3, 3, 4, 2, 1, 4, 0, 0], dtype=int32)

**Label each neighborhood with the corresponding cluster**

In [15]:
downtown_cluster = downtown_onehot_grouped[['Neighborhood']]
downtown_cluster['Cluster']=kmeans.labels_
downtown_cluster=downtown_data.join(downtown_cluster.set_index('Neighborhood'),on='Neighborhood')
downtown_cluster.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,3
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,3
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,4
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,3
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,0


**Visualize clusters**

In [17]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(downtown_cluster['Latitude'], downtown_cluster['Longitude'], downtown_cluster['Neighborhood'], downtown_cluster['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

**Analyze the centers of the clusters, and sort the clusters by ascending order of the total number of restaurants**

In [48]:
centers = pd.DataFrame(kmeans.cluster_centers_)
centers.columns = downtown_onehot.columns[1:]
centers.index = ['Cluster 1','Cluster 2','Cluster 3','Cluster 4','Cluster 5']
centers['Sum'] = centers.sum(axis = 1)
centers.sort_values(axis = 0, by = ['Sum'], ascending=False)

Unnamed: 0,Filipino Restaurant,Japanese Restaurant,Italian Restaurant,Indian Restaurant,Restaurant,Caribbean Restaurant,Taiwanese Restaurant,Thai Restaurant,Sushi Restaurant,American Restaurant,...,Tapas Restaurant,Asian Restaurant,Brazilian Restaurant,Dumpling Restaurant,Doner Restaurant,Korean Restaurant,Eastern European Restaurant,South American Restaurant,Jewish Restaurant,Sum
Cluster 2,0.0,2.0,1.0,3.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,12.0,1.0,1.0,1.0,34.0
Cluster 3,1.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,30.0
Cluster 5,0.0,4.5,2.0,0.5,1.5,0.5,0.0,1.5,2.0,1.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28.0
Cluster 1,0.0,2.625,2.25,0.25,4.125,0.0,-1.387779e-17,1.375,0.875,2.625,...,-1.387779e-17,0.5,0.375,-5.5511150000000004e-17,-2.775558e-17,0.0,-2.775558e-17,-1.387779e-17,-1.387779e-17,21.375
Cluster 4,0.333333,1.166667,1.666667,0.833333,1.833333,0.333333,0.1666667,0.666667,1.0,0.166667,...,-6.938894e-18,-2.775558e-17,0.0,0.1666667,0.1666667,0.166667,0.1666667,-6.938894e-18,-6.938894e-18,13.0


**Cluster 2 appears to have the highest numbers of restaurants. Lets list all neighborhoods in Cluster 2**

In [62]:
downtown_cluster[downtown_cluster['Cluster']==2]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster
13,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",43.653206,-79.400049,2


**It is our recommendation that the coffee shop to be open in the areas of Chinatown, Grange Park, Kensington Market**

# Thank you for reading this notebook! #