SEGMENTING AND CLUSTERING - FIRST QUESTION (THIRD QUESTION FURTHER DOWN)
=========================

In [2]:
import pandas as pd
import numpy as np
import json 
from geopy.geocoders import Nominatim
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

# Build the dataframe from the first HTML table on the Wikipedia page.
df = pd.read_html("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")[0]

# Report the size before the 'Not assigned' rows are dropped.
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(len(df['Borough'].unique()), len(df)))

The dataframe has 12 boroughs and 287 neighborhoods.


__TRANSFORMING DATAFRAME__

In [3]:
# Keep only the rows whose borough is assigned, renumbering the index from 0.
df = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)

# Report the size after the filter.
print('The dataframe has {} boroughs and {} neighbourhoods.'.format(len(df['Borough'].unique()), len(df)))

The dataframe has 11 boroughs and 210 neighborhoods.


In [4]:
# Where the neighbourhood is 'Not assigned', fall back to the borough name.
df.loc[df['Neighbourhood'].eq('Not assigned'), 'Neighbourhood'] = df['Borough']

# One row per (Postcode, Borough); its neighbourhoods are joined with ', '.
df_grouped = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .apply(', '.join)
    .reset_index()
)
df_grouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


__PRINTING HOW MANY ROWS THE DATAFRAME HAS__

In [5]:
# Number of rows left after grouping.
print('The dataframe has {} rows.'.format(len(df_grouped)))

The dataframe has 103 rows.


SEGMENTING AND CLUSTERING - SECOND QUESTION
=========================

__Loading CSV-File__

In [6]:
# Download the CSV of coordinates and preview its first rows.
coordinates = pd.read_csv('https://cocl.us/Geospatial_data')
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


__Merging the dataframe with coordinates__

In [7]:
# Use the same key column name as df_grouped so the two frames can be merged.
coordinates = coordinates.rename(columns={"Postal Code": "Postcode"})

In [8]:
# Left-merge the coordinates onto the grouped postcodes, keeping every grouped row.
df = pd.merge(df_grouped, coordinates, on="Postcode", how="left")
df.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


SEGMENTING AND CLUSTERING - THIRD QUESTION
===========

In [9]:
import requests # library to handle requests
import pandas as pd # library for data analysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# library for transforming a JSON file into a pandas dataframe
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    geopy-1.21.0               |             py_0          58 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.21.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

Getting Geolocation of Toronto
------------------------

In [19]:
# Look up the coordinates used to centre the maps further down.
city = 'Toronto'
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(city)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode {!r}.'.format(city))

latitude = location.latitude
longitude = location.longitude

print('The Geolocation of Toronto is {},{}.'.format(latitude,longitude))

The Geolocation of Toronto is 43.653963,-79.387207.


__Creating a dataframe that includes only the boroughs with "Toronto" in their names.__

In [104]:
# Keep the rows whose borough name contains 'Toronto' and renumber them from 0.
df_toronto = df.loc[df['Borough'].str.contains('Toronto')].reset_index(drop=True)
df_toronto.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Central Toronto,Davisville North,43.712751,-79.390197
6,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Central Toronto,Davisville,43.704324,-79.38879
8,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
9,M4V,Central Toronto,"Deer Park, Forest Hill SE, Rathnelly, South Hi...",43.686412,-79.400049


__Create a Map of Toronto with the new dataframe as markers__

In [32]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One circle marker per row of df_toronto, with a "<neighbourhood>, <borough>" popup.
for lat, lng, borough, neighborhood in zip(
        df_toronto['Latitude'],
        df_toronto['Longitude'],
        df_toronto['Borough'],
        df_toronto['Neighbourhood']):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='lime',
        fill=True,
        fill_color='cyan',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

__Deeper exploration of one neighbourhood__

In [39]:
# Row label 10 of df_toronto is the neighbourhood singled out for a closer look.
print("The neighbourhood I've chosen to explore further is: {}".format(df_toronto.loc[10, 'Neighbourhood']))

The neighborhood I've chosen to explore further is: Rosedale


In [42]:
# Pull the name and coordinates of the chosen neighbourhood (row label 10).
neighbourhood_name = df_toronto.loc[10, 'Neighbourhood']
neighbourhood_latitude = df_toronto.loc[10, 'Latitude']
neighbourhood_longitude = df_toronto.loc[10, 'Longitude']

print(
    'Latitude and longitude values of {} are {}, {}.'.format(
        neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude
    )
)

Latitude and longitude values of Rosedale are 43.6795626, -79.37752940000001.


__Foursquare API Information__

In [123]:
import os

# Read the Foursquare credentials from the environment so they are never
# committed with the notebook; both fall back to an empty string when unset.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180604'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Show only the length of the secret so it does not leak into saved output.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

Your credentails:
CLIENT_ID: 
CLIENT_SECRET:


__Getting the top 5 venues within a 400-meter radius__

In [83]:
# Search parameters for the single-neighbourhood exploration.
radius = 400
LIMIT = 5

# Build the Foursquare "explore" request around the chosen neighbourhood.
url = ('https://api.foursquare.com/v2/venues/explore?client_id={}'
       '&client_secret={}&v={}&ll={},{}&radius={}&limit={}').format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighbourhood_latitude,
    neighbourhood_longitude,
    radius,
    LIMIT)

# A timeout keeps an unresponsive API from blocking the kernel indefinitely.
results = requests.get(url, timeout=30).json()
results

{'meta': {'code': 200, 'requestId': '5e42c389e826ac001b6726ee'},
  'headerLocation': 'Rosedale',
  'headerFullLocation': 'Rosedale, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 3,
  'suggestedBounds': {'ne': {'lat': 43.6831626036, 'lng': -79.37256090619498},
   'sw': {'lat': 43.67596259639999, 'lng': -79.38249789380505}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bae2150f964a520df873be3',
       'name': 'Mooredale House',
       'location': {'address': '146 Crescent Rd.',
        'crossStreet': 'btwn. Lamport Ave. and Mt. Pleasant Rd.',
        'lat': 43.678630645646535,
        'lng': -79.38009142511322,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.678630645646535,
          'lng': -79.38009142511322}],
       

__Creating the `get_category_type` function to get the name of a venue's category__

In [84]:
def get_category_type(row):
    """Return the name of the first category listed for a venue row.

    The categories are read from ``row['categories']`` and, when that key is
    absent, from ``row['venue.categories']``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by key (for example a dict or a pandas Series).

    Returns
    -------
    str or None
        The ``'name'`` of the first category, or ``None`` when the venue has
        no categories.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be silently swallowed.
        categories_list = row['venue.categories']

    # Covers both an empty list and a missing (None) value.
    if not categories_list:
        return None
    return categories_list[0]['name']
    

In [85]:
# `results` holds the JSON returned by the explore request. The original read
# the undefined name `result` here, which fails on a fresh kernel.
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues)

# Filter the columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# Replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis = 1)

# Clean all column names: keep only the part after the last dot
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Mooredale House,Building,43.678631,-79.380091
1,Rosedale Park,Playground,43.682328,-79.378934
2,Betline Trail at Roxborough dr.,Bike Trail,43.68053,-79.38149


In [86]:
# Report how many rows nearby_venues holds for the chosen neighbourhood (row label 10).
print('We could only find {} venues in our exploration of central {}!'.format(nearby_venues.shape[0], df_toronto.loc[10, 'Neighbourhood']))


We could only find 3 venues in our exploration of central Rosedale!


Getting all the Venues for all neighbourhoods in our df_toronto
-----

In [87]:
# Widen the search radius and raise the result limit to get more results.
radius, LIMIT = 750, 100

In [88]:


def getNearbyVenues(names, latitudes, longitudes, radius):
    """Collect Foursquare venues around each of the given locations.

    For every (name, latitude, longitude) triple, one request is sent to the
    Foursquare ``venues/explore`` endpoint, using the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` values. Each
    name is printed as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences describing the locations to explore.
    radius : number
        Value passed as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        is empty, but keeps these columns, when no venue is found.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)

        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue may come back without any category; record it as
                # missing instead of failing with an IndexError.
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema intact
    # even when no rows were collected.
    return pd.DataFrame(rows, columns=columns)

In [89]:
# Fetch the venues for every row of df_toronto using the current radius.
list_of_venues = getNearbyVenues(
    names=df_toronto['Neighbourhood'],
    latitudes=df_toronto['Latitude'],
    longitudes=df_toronto['Longitude'],
    radius=radius,
)

The Beaches
The Danforth West, Riverdale
The Beaches West, India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Moore Park, Summerhill East
Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Roselawn
Forest Hill North, Forest Hill West
The Annex, North Midtown, Yorkville
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
Dovercourt Village, Dufferin
Little Portugal, Trinity
Brockton, Exhibition Place, Parkdale Village
High Park, The Junction Sout

__Let's see how many unique categories of venues we found__

In [93]:
# Count the distinct values in the 'Venue Category' column.
print('We were able to find {} unique venue categories.'.format(len(list_of_venues['Venue Category'].unique())))


We were able to find 284 unique venue categories.


In [95]:
# Number of non-null entries per column for each neighbourhood.
list_of_venues.groupby('Neighbourhood').count()


Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,100,100,100,100,100,100
"Brockton, Exhibition Place, Parkdale Village",84,84,84,84,84,84
Business Reply Mail Processing Centre 969 Eastern,55,55,55,55,55,55
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",26,26,26,26,26,26
"Cabbagetown, St. James Town",67,67,67,67,67,67
Central Bay Street,100,100,100,100,100,100
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,31,31,31,31,31,31
Church and Wellesley,100,100,100,100,100,100


Neighbourhood Analysis
------

In [99]:
# One indicator column per venue category.
toronto_onehot = pd.get_dummies(list_of_venues[['Venue Category']], prefix = "", prefix_sep = "")

# Put the neighbourhood name in front of the indicator columns. A venue
# category that is literally called 'Neighborhood' is dropped when present;
# errors='ignore' keeps the cell from failing with a KeyError when the
# fetched data contains no such category.
toronto_onehot = pd.concat(
    [list_of_venues['Neighbourhood'],
     toronto_onehot.drop(['Neighborhood'], axis = 1, errors = 'ignore')],
    axis = 1)
toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# Examine the shape (rows, columns) of the one-hot encoded data frame.
toronto_onehot.shape


(2695, 284)

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [101]:
# Average every indicator column within each neighbourhood, then move the
# neighbourhood name back from the index into a regular column.
toronto_grouped = (
    toronto_onehot
    .groupby('Neighbourhood')
    .mean()
    .reset_index()
)
toronto_grouped

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Udon Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0,0.01
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.011905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011905,...,0.0,0.0,0.011905,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018182,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.038462,0.038462,0.038462,0.076923,0.115385,0.115385,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.01,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.01
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.06,0.0,0.0,0.03,0.01,0.0,0.0,0.01
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,...,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.0,0.01


In [105]:
# Confirm the new size (rows, columns) after grouping by neighbourhood.
toronto_grouped.shape

(39, 284)

#### Let's print each neighborhood along with the top 5 most common venues

In [103]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")

    # Transpose this neighbourhood's row into a two-column (venue, freq) table.
    hood_table = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    hood_table.columns = ['venue','freq']

    # Skip the first entry (the neighbourhood name itself), then convert,
    # round, rank and trim in one chain.
    top_venues = (
        hood_table.iloc[1:]
        .assign(freq=lambda table: table['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending = False)
        .reset_index(drop = True)
        .head(num_top_venues)
    )
    print(top_venues)
    print('\n')

----Adelaide, King, Richmond----
             venue  freq
0      Coffee Shop  0.08
1             Café  0.06
2       Steakhouse  0.04
3  Thai Restaurant  0.03
4          Theater  0.03


----Berczy Park----
         venue  freq
0  Coffee Shop  0.09
1         Café  0.05
2   Restaurant  0.04
3     Beer Bar  0.04
4        Hotel  0.04


----Brockton, Exhibition Place, Parkdale Village----
         venue  freq
0  Coffee Shop  0.06
1          Bar  0.05
2         Café  0.05
3   Restaurant  0.05
4    Gift Shop  0.04


----Business Reply Mail Processing Centre 969 Eastern----
                  venue  freq
0  Fast Food Restaurant  0.07
1    Light Rail Station  0.05
2                Bakery  0.04
3       Harbor / Marina  0.04
4        Clothing Store  0.04


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
                 venue  freq
0      Airport Service  0.12
1      Harbor / Marina  0.12
2     Airport Terminal  0.12
3  Rental Car L

#### Let's put that into a *pandas* dataframe with the function below.

In [107]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of ``row`` is skipped; the remaining entries are sorted
    in descending order of value and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

__Now let's create the new dataframe and display the top 10 venues for each neighborhood.__

In [114]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Create one column per rank: '1st Most Common Venue', '2nd ...', and so on.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    rank = ind + 1
    # Explicit ordinal rule instead of a bare `except:` used as control flow;
    # 11-13 take 'th', while 21, 22, 23, ... take 'st', 'nd', 'rd'.
    if rank % 10 in (1, 2, 3) and rank % 100 not in (11, 12, 13):
        suffix = indicators[rank % 10 - 1]
    else:
        suffix = 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Create a new dataframe with one row per neighbourhood of toronto_grouped.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighbourhood']

# Fill the rank columns of each row with its most frequent venue categories.
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,Restaurant,Sushi Restaurant,Hotel,Theater,Cosmetics Shop,Thai Restaurant,Gym
1,Berczy Park,Coffee Shop,Café,Beer Bar,Hotel,Park,Restaurant,Italian Restaurant,Breakfast Spot,Cocktail Bar,Japanese Restaurant
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Bar,Restaurant,Café,Bakery,Gift Shop,Music Venue,Plaza,Italian Restaurant,Arts & Crafts Store
3,Business Reply Mail Processing Centre 969 Eastern,Fast Food Restaurant,Light Rail Station,Brewery,Bakery,Italian Restaurant,Clothing Store,Harbor / Marina,Bar,Coffee Shop,Burrito Place
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Harbor / Marina,Airport Service,Airport Terminal,Rental Car Location,Sculpture Garden,Boat or Ferry,Coffee Shop,Airport Lounge,Airport Food Court,Bar


CLUSTERING OF NEIGHBOURHOODS
-----------------

In [115]:
# Features for clustering: every column except the neighbourhood name.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it
# positionally.
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', axis = 1)
NO_OF_CLUSTERS=5
# Run K-Means clustering. n_init is pinned to 10 so the result does not depend
# on the scikit-learn version's default for that parameter.
kMeans = KMeans(n_clusters = NO_OF_CLUSTERS, random_state = 0, n_init = 10).fit(toronto_grouped_clustering)

In [116]:
# Add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kMeans.labels_)

# Combine the data
toronto_merged = df_toronto.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on = 'Neighbourhood')
toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Pub,Bar,Breakfast Spot,Gastropub,Sandwich Place,Japanese Restaurant,Coffee Shop,Pet Store,Shoe Store,Café
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Pub,Grocery Store,Italian Restaurant,Fast Food Restaurant,Café,Spa,Yoga Studio,Bakery
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Indian Restaurant,Grocery Store,Brewery,Gym,Fast Food Restaurant,Park,Café,Sandwich Place,Coffee Shop,Snack Place
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Bar,Coffee Shop,Bakery,Diner,American Restaurant,Italian Restaurant,Sandwich Place,Sushi Restaurant,Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Business Service,Park,Coffee Shop,Bus Line,Swim School,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant


In [118]:
# create map
map_clusters = folium.Map(location = [latitude, longitude], zoom_start = 11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, NO_OF_CLUSTERS))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows without a cluster label cannot be coloured, so they are left off the map.
clustered = toronto_merged.dropna(subset = ['Cluster Labels'])

# add markers to the map
# The loop variables are named lat/lng so they do not overwrite the
# `latitude`/`longitude` globals used to centre the maps.
for neighborhood, cluster, lat, lng in zip(clustered['Neighbourhood'], clustered['Cluster Labels'], clustered['Latitude'], clustered['Longitude']):
    cluster = int(cluster)  # a missing label in the join turns the column into floats
    label = folium.Popup(str(neighborhood) + ' Cluster ' + str(cluster), parse_html = True)
    folium.CircleMarker(
        [lat, lng],
        radius = 5,
        popup = label,
        color = rainbow[cluster - 1],
        fill = True,
        fill_color = rainbow[cluster - 1],
        fill_opacity = 0.7).add_to(map_clusters)

map_clusters

In [138]:
# Cluster 0: keep column 1 plus every column from position 5 onwards.
result = toronto_merged.loc[
    toronto_merged['Cluster Labels'] == 0,
    toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]],
]
result

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Pub,Bar,Breakfast Spot,Gastropub,Sandwich Place,Japanese Restaurant,Coffee Shop,Pet Store,Shoe Store,Café
1,East Toronto,0,Greek Restaurant,Coffee Shop,Pub,Grocery Store,Italian Restaurant,Fast Food Restaurant,Café,Spa,Yoga Studio,Bakery
2,East Toronto,0,Indian Restaurant,Grocery Store,Brewery,Gym,Fast Food Restaurant,Park,Café,Sandwich Place,Coffee Shop,Snack Place
3,East Toronto,0,Café,Bar,Coffee Shop,Bakery,Diner,American Restaurant,Italian Restaurant,Sandwich Place,Sushi Restaurant,Restaurant
4,Central Toronto,0,Business Service,Park,Coffee Shop,Bus Line,Swim School,Diner,Discount Store,Dive Bar,Dog Run,Doner Restaurant
5,Central Toronto,0,Coffee Shop,Pizza Place,Park,Gym,Café,Supermarket,Bar,Jazz Club,Food & Drink Shop,Taco Place
6,Central Toronto,0,Coffee Shop,Café,Clothing Store,Sporting Goods Shop,Bakery,Italian Restaurant,Restaurant,Skating Rink,Diner,Grocery Store
7,Central Toronto,0,Coffee Shop,Italian Restaurant,Dessert Shop,Pizza Place,Gym,Sandwich Place,Café,Gastropub,Indian Restaurant,Fast Food Restaurant
9,Central Toronto,0,Coffee Shop,Sushi Restaurant,Italian Restaurant,Pharmacy,Café,Pizza Place,Restaurant,Thai Restaurant,Sandwich Place,Skating Rink
11,Downtown Toronto,0,Coffee Shop,Grocery Store,Pizza Place,Restaurant,Café,Pharmacy,Park,Bakery,Diner,Pub


In [134]:
# Cluster 1: keep column 1 plus every column from position 5 onwards.
result = toronto_merged.loc[
    toronto_merged['Cluster Labels'] == 1,
    toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]],
]
result

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
23,Central Toronto,1,Jewelry Store,Bus Line,Sushi Restaurant,Trail,Yoga Studio,Donut Shop,Discount Store,Dive Bar,Dog Run,Doner Restaurant


In [135]:

# Cluster 2: keep column 1 plus every column from position 5 onwards.
result = toronto_merged.loc[
    toronto_merged['Cluster Labels'] == 2,
    toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]],
]
result

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Downtown Toronto,2,Park,Trail,Playground,Candy Store,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dive Bar,Dog Run


In [136]:
# Cluster 3: keep column 1 plus every column from position 5 onwards.
result = toronto_merged.loc[
    toronto_merged['Cluster Labels'] == 3,
    toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]],
]
result

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
8,Central Toronto,3,Park,Grocery Store,Playground,Café,Tennis Court,Thai Restaurant,Japanese Restaurant,Candy Store,Sandwich Place,Gym / Fitness Center


In [137]:
# Cluster 4: keep column 1 plus every column from position 5 onwards.
result = toronto_merged.loc[
    toronto_merged['Cluster Labels'] == 4,
    toronto_merged.columns[[1, *range(5, toronto_merged.shape[1])]],
]
result

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
22,Central Toronto,4,Playground,Health & Beauty Service,IT Services,Pet Store,Garden,Comic Shop,Dessert Shop,Falafel Restaurant,Event Space,Ethiopian Restaurant
