# Applied Data Science Capstone - Week 3 Project

## Part 1: Data Extraction from https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

### Using pandas and BeautifulSoup

In [1]:
# Install BeautifulSoup and tabulate, if you have not already done so
# !pip install bs4
# !pip install tabulate

### Import the necessary libraries

In [1]:
import os
from io import StringIO

import numpy as np
import pandas as pd
import requests

# Import for web page scraping
from bs4 import BeautifulSoup
from tabulate import tabulate

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


### Accessing the wiki page

In [2]:
# Fetch the page. The timeout keeps the cell from hanging on a stalled connection and
# raise_for_status() turns an HTTP error into an exception instead of parsing an error page.
res = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
res.raise_for_status()

# Use BeautifulSoup to parse the page content with the lxml parser
soup = BeautifulSoup(res.content,'lxml')

# Use BeautifulSoup to find the first table in the page; fail with a clear message if there is none
tables = soup.find_all('table')
if not tables:
    raise ValueError('No <table> element was found in the downloaded page')
table = tables[0]

# Convert the table's HTML into a DataFrame. read_html returns a list of DataFrames, one per
# table it finds; the markup is wrapped in StringIO because newer pandas releases deprecate
# passing literal HTML strings.
neighborhood_list = pd.read_html(StringIO(str(table)))
df_can_neighborhood = neighborhood_list[0]
df_can_neighborhood.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Rename the Postcode column to PostalCode and the Neighbourhood column to Neighborhood

In [3]:
# Map the scraped column names onto the spellings used by the rest of the notebook.
column_renames = {"Postcode" : "PostalCode", "Neighbourhood" : "Neighborhood"}
df_can_neighborhood.rename(columns=column_renames, inplace=True)
df_can_neighborhood.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Drop the rows whose Borough is Not assigned

In [4]:
# Collect the index labels of rows whose Borough is the "Not assigned" placeholder,
# then remove those rows from the frame in place.
borough_unassigned = df_can_neighborhood['Borough'].eq("Not assigned")
indexNames = df_can_neighborhood.loc[borough_unassigned].index
df_can_neighborhood.drop(indexNames, inplace=True)
df_can_neighborhood.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### Sort the Dataset by PostalCode

In [5]:
# Order the rows by postal code (in place) so that rows sharing a code sit next to each other.
df_can_neighborhood.sort_values('PostalCode', inplace=True)
df_can_neighborhood.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
28,M1C,Scarborough,Port Union
27,M1C,Scarborough,Rouge Hill
26,M1C,Scarborough,Highland Creek


### If a Neighborhood is Not assigned, set its value to be the same as the Borough. There are NO neighborhoods that are Not assigned

In [6]:
# The scraped table spells the placeholder "Not assigned" (lower-case "a"), so comparing
# against 'Not Assigned' can never match. Compare case-insensitively instead.
neighborhood_unassigned = (
    df_can_neighborhood['Neighborhood'].str.strip().str.lower() == 'not assigned'
)

# Fall back to the Borough name wherever the Neighborhood was left unassigned.
df_can_neighborhood.loc[neighborhood_unassigned, 'Neighborhood'] = (
    df_can_neighborhood.loc[neighborhood_unassigned, 'Borough']
)

# Show any rows that are still unassigned after the fill; an empty frame is the expected result.
df_can_neighborhood[
    df_can_neighborhood['Neighborhood'].str.strip().str.lower() == 'not assigned'
]

Unnamed: 0,PostalCode,Borough,Neighborhood


### Postal code 'M5A' is NOT listed twice, contrary to what the assignment notes mention

In [7]:
# Pull every row for postal code M5A to see how many times it is listed.
df_can_neighborhood.loc[df_can_neighborhood['PostalCode'].eq('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M5A,Downtown Toronto,Harbourfront


### Combine the neighborhoods that share the same PostalCode

In [8]:
# Give every row the comma-joined list of all neighborhoods that share its PostalCode ...
neighborhoods_by_code = df_can_neighborhood.groupby('PostalCode')['Neighborhood']
df_can_neighborhood['Neighborhood'] = neighborhoods_by_code.transform(lambda names: ','.join(names))

# ... then keep only the first row of each PostalCode, which now carries the full list.
df_can_neighborhood.drop_duplicates(subset='PostalCode', inplace=True)
df_can_neighborhood.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
10,M1B,Scarborough,"Rouge,Malvern"
28,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
41,M1E,Scarborough,"Guildwood,Morningside,West Hill"
52,M1G,Scarborough,Woburn
61,M1H,Scarborough,Cedarbrae


In [9]:
# Display the frame as it stands after the neighborhoods were combined and the
# duplicate PostalCode rows were dropped.
df_can_neighborhood

Unnamed: 0,PostalCode,Borough,Neighborhood
10,M1B,Scarborough,"Rouge,Malvern"
28,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
41,M1E,Scarborough,"Guildwood,Morningside,West Hill"
52,M1G,Scarborough,Woburn
61,M1H,Scarborough,Cedarbrae
75,M1J,Scarborough,Scarborough Village
90,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
107,M1L,Scarborough,"Golden Mile,Oakridge,Clairlea"
122,M1M,Scarborough,"Cliffcrest,Scarborough Village West,Cliffside"
140,M1N,Scarborough,"Cliffside West,Birch Cliff"


### Print the number of rows in the DataSet

In [10]:
# Report how many rows remain in the cleaned frame.
print('The number of row in the Dataset = ', len(df_can_neighborhood.index))

The number of row in the Dataset =  103


## Part 2. Get Lat and Long based on Postal Code

### Import the necessary library

In [11]:
# Install the library, if you have already done so
!pip install geocoder



You are using pip version 19.0.3, however version 19.3.1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [12]:
import geocoder

In [13]:
# Create a PostalCode list
postal_list = df_can_neighborhood['PostalCode']
postal_list
type(postal_list)

pandas.core.series.Series

### Retrieve lat and long based on Postal Code

In [14]:
# Try to retrieve a single PostalCode
postal_code = 'M5A'

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code), key = 'xxxx')
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

print("Postal Code = {}, Lat = {}, Long = {}".format(postal_code, latitude, longitude))

Postal Code = M5A, Lat = 43.65029500000003, Long = -79.35916572299999


In [15]:
# Retrieve lat and long for the postal code list of Toronto
postal_latlng = []

for postal_code in postal_list.iteritems():
    # print('postal_code = ', postal_code)

    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code), key = 'xxxx')
        lat_lng_coords = g.latlng

        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]

        postal_latlng.append([postal_code[1], latitude, longitude])
        # print("Postal Code = {}, Lat = {}, Long = {}".format(postal_code, latitude, longitude))    

postal_latlng

[['M1B', 43.811525000000074, -79.19551746399998],
 ['M1C', 43.78566500000005, -79.15872457299997],
 ['M1E', 43.76581500000003, -79.17519294699997],
 ['M1G', 43.768369121000035, -79.21758999999997],
 ['M1H', 43.76968799900004, -79.23943999999995],
 ['M1J', 43.74312500000008, -79.23174973599998],
 ['M1K', 43.72627568400003, -79.26362499999993],
 ['M1L', 43.71305350000006, -79.28505499999994],
 ['M1M', 43.724234575000025, -79.22792499999997],
 ['M1N', 43.69677000000007, -79.25996735299998],
 ['M1P', 43.759975000000054, -79.26897418299995],
 ['M1R', 43.750710464000065, -79.30055999999996],
 ['M1S', 43.79394000000008, -79.26797613999997],
 ['M1T', 43.78472500000004, -79.29906603299997],
 ['M1V', 43.81768500000004, -79.28018721399997],
 ['M1W', 43.80088292800008, -79.32073999999994],
 ['M1X', 43.83421500000003, -79.21670085099998],
 ['M2H', 43.80284500000005, -79.35620744999994],
 ['M2J', 43.780970000000025, -79.34781328099996],
 ['M2K', 43.781015000000025, -79.38052867199997],
 ['M2L', 43.7

### Merge the Lat and Long to the existing Dataset based on Postal Code

In [16]:
# Convert postal_latlng to Dataframe
df_postal_latlng = pd.DataFrame(postal_latlng)
df_postal_latlng.columns = ['PostalCode', 'Latitude', 'Longitude']
df_postal_latlng.head()

df_canada = pd.merge(df_can_neighborhood, df_postal_latlng, on = 'PostalCode')
df_canada.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944


In [17]:
# Report how many rows the merged frame has.
print('The number of row in the Dataset = ', len(df_canada.index))

The number of row in the Dataset =  103


## Part 3: Cluster Data based on Neighborhood

In [18]:
# Limit Dataset to Borough that contains Toronto
toronto_data = df_canada[df_canada['Borough'].str.contains("Toronto") == True].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676531,-79.295425
1,M4K,East Toronto,"Riverdale,The Danforth West",43.683178,-79.355105
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.667965,-79.314667
3,M4M,East Toronto,Studio District,43.660629,-79.334855
4,M4N,Central Toronto,Lawrence Park,43.72842,-79.387133


### Define foursquare credentials

In [22]:
# Read the Foursquare credentials from environment variables so real keys never have to be
# typed into the notebook; the 'xxxx' placeholders remain the fallback when they are not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'xxxx') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'xxxx') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

# Printed output is saved with the notebook, so the secret is masked rather than echoed.
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

Your credentails:
CLIENT_ID: xxxx
CLIENT_SECRET:xxxx


### Define a function that will retrieve nearby venues from Foursquare

In [23]:
def getNearbyVenues(names, latitudes, longitudes, radius, limit=None):
    """Collect the venues Foursquare returns around each neighborhood.

    One request per (name, latitude, longitude) triple is sent to the Foursquare
    ``venues/explore`` endpoint, using the notebook-level ``CLIENT_ID``,
    ``CLIENT_SECRET`` and ``VERSION``. Each neighborhood name is printed as it is
    processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel iterables describing the neighborhoods; they are zipped together.
    radius : number
        Passed through as the ``radius`` query parameter of the request.
    limit : int, optional
        Passed through as the ``limit`` query parameter. When omitted, the
        notebook-level ``LIMIT`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame is empty (but has these columns) when nothing
        was collected. 'Venue Category' is None for a venue without categories.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers a request with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests build and escape the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            # LIMIT is only looked up when the caller did not pass a limit
            'limit': LIMIT if limit is None else limit,
        }

        # make the GET request; the timeout prevents a stalled call from hanging the cell
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # guard against a venue that comes back without any category
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works when no rows were collected,
    # where assigning .columns afterwards would raise a length-mismatch error.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

### Use getNearbyVenues to create a list of venues based on Toronto neighborhood

In [24]:
# Define Search Criteria
radius = 500
LIMIT = 100

toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                latitudes=toronto_data['Latitude'],
                                longitudes=toronto_data['Longitude'],
                                radius = radius)


The Beaches
Riverdale,The Danforth West
The Beaches West,India Bazaar
Studio District
Lawrence Park
Davisville North
North Toronto West
Davisville
Summerhill East,Moore Park
South Hill,Summerhill West,Rathnelly,Forest Hill SE,Deer Park
Rosedale
St. James Town,Cabbagetown
Church and Wellesley
Harbourfront
Garden District,Ryerson
St. James Town
Berczy Park
Central Bay Street
Adelaide,Richmond,King
Harbourfront East,Union Station,Toronto Islands
Design Exchange,Toronto Dominion Centre
Commerce Court,Victoria Hotel
Roselawn
Forest Hill North,Forest Hill West
Yorkville,The Annex,North Midtown
University of Toronto,Harbord
Kensington Market,Chinatown,Grange Park
King and Spadina,Railway Lands,South Niagara,CN Tower,Island airport,Harbourfront West,Bathurst Quay
Stn A PO Boxes 25 The Esplanade
Underground city,First Canadian Place
Christie
Dufferin,Dovercourt Village
Little Portugal,Trinity
Exhibition Place,Brockton,Parkdale Village
High Park,The Junction South
Roncesvalles,Parkdale
Runnymede

### Examine the Toronto Venue Dataset

In [25]:
# Print the (rows, columns) shape, then preview the first five venue rows.
print(toronto_venues.shape)
toronto_venues.head(5)

(1771, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676531,-79.295425,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676531,-79.295425,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676531,-79.295425,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676531,-79.295425,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
4,The Beaches,43.676531,-79.295425,Upper Beaches,43.680563,-79.292869,Neighborhood


### Check how many venues are returned per neighborhood

In [26]:
# Count the non-null entries of every column per neighborhood, with the neighborhood
# name kept as an ordinary column.
toronto_venues.groupby('Neighborhood').count().reset_index()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Adelaide,Richmond,King",100,100,100,100,100,100
1,Berczy Park,61,61,61,61,61,61
2,Business Reply Mail Processing Centre 969 Eastern,100,100,100,100,100,100
3,Central Bay Street,100,100,100,100,100,100
4,Christie,11,11,11,11,11,11
5,Church and Wellesley,84,84,84,84,84,84
6,"Commerce Court,Victoria Hotel",100,100,100,100,100,100
7,Davisville,26,26,26,26,26,26
8,Davisville North,7,7,7,7,7,7
9,"Design Exchange,Toronto Dominion Centre",100,100,100,100,100,100


### Check the number of unique venue categories

In [27]:
# Number of distinct values in the 'Venue Category' column (a missing value, if present, is counted too).
category_count = toronto_venues['Venue Category'].nunique(dropna=False)
print('There are {} uniques categories.'.format(category_count))

There are 221 uniques categories.


### Organize the venue categories by neighborhood (one-hot encoding)

In [28]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
# For some unknown reasons, I cannot assign Neighborhood in toronto_onehot
toronto_onehot['Neighborhood1'] = toronto_venues['Neighborhood']
# print('Number of rows toronto_onehot = {}, toronto_venues = {}'.format(toronto_onehot.shape[0], toronto_venues.shape[0]))
# print('Is Neighborhood and index? In toronto_onehot {} toronto_venues {}'.format(toronto_onehot.keys(), toronto_venues.keys()))

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighborhood1,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Train Station,Tram Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Group the data by Neighborhood and take the mean of the frequency of the categories

In [29]:
# Average each indicator column per neighborhood, i.e. the share of that neighborhood's
# venues falling into each category; the neighborhood name stays an ordinary column.
toronto_grouped = toronto_onehot.groupby('Neighborhood1', as_index=False).mean()
toronto_grouped.head()

Unnamed: 0,Neighborhood1,Afghan Restaurant,American Restaurant,Antique Shop,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Train Station,Tram Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide,Richmond,King",0.0,0.03,0.0,0.01,0.0,0.03,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.016393,0.0,0.0,0.0,0.016393,0.0,...,0.0,0.0,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business Reply Mail Processing Centre 969 Eastern,0.0,0.02,0.0,0.0,0.01,0.03,0.0,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,0.0
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.090909,0.0,0.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Print the Top 5 venues in each neighborhood

In [30]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood1']:
    print("----"+hood+"----")
    # Turn the neighborhood's single row into a two-column (venue, freq) table.
    temp = toronto_grouped.loc[toronto_grouped['Neighborhood1'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    # The first row holds the neighborhood name itself, so skip it before converting to float.
    temp = (
        temp.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
    )
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))

----Adelaide,Richmond,King----
         venue  freq
0  Coffee Shop  0.08
1         Café  0.06
2        Hotel  0.05
3          Bar  0.03
4          Gym  0.03
----Berczy Park----
            venue  freq
0     Coffee Shop  0.08
1    Cocktail Bar  0.05
2          Bakery  0.03
3            Café  0.03
4  Breakfast Spot  0.03
----Business Reply Mail Processing Centre 969 Eastern----
         venue  freq
0  Coffee Shop  0.10
1          Bar  0.04
2        Hotel  0.04
3          Gym  0.03
4   Steakhouse  0.03
----Central Bay Street----
            venue  freq
0     Coffee Shop  0.12
1  Clothing Store  0.07
2  Cosmetics Shop  0.04
3           Plaza  0.03
4          Bakery  0.03
----Christie----
           venue  freq
0           Café  0.27
1  Grocery Store  0.18
2    Coffee Shop  0.09
3     Playground  0.09
4    Candy Store  0.09
----Church and Wellesley----
                 venue  freq
0          Coffee Shop  0.11
1           Restaurant  0.05
2  Japanese Restaurant  0.05
3              Gay Bar  

### Create a function to store the top n venues of each neighborhood into a dataframe

In [31]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped (in this notebook it holds the
    neighborhood name); the remaining entries are sorted in descending order and
    the index labels of the first ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

### Run the function to create a dataframe of top 10 venues in each neighborhood in Toronto

In [32]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood1']
for ind in np.arange(num_top_venues):
    # Ranks 1-3 take 'st'/'nd'/'rd'; every later rank takes 'th'. An explicit bounds check
    # replaces the bare `except:`, which would also have hidden unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood1'] = toronto_grouped['Neighborhood1']

# fill the ranked venue-category names in row by row
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood1,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,Richmond,King",Coffee Shop,Café,Hotel,Japanese Restaurant,American Restaurant,Gastropub,Burger Joint,Restaurant,Asian Restaurant,Steakhouse
1,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Beer Bar,Cheese Shop,Bakery,Café,Seafood Restaurant,Restaurant,Hotel
2,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Hotel,Bar,Pub,Seafood Restaurant,Café,Asian Restaurant,Steakhouse,Gym,Sushi Restaurant
3,Central Bay Street,Coffee Shop,Clothing Store,Cosmetics Shop,Plaza,Bakery,Japanese Restaurant,Restaurant,Bubble Tea Shop,Tea Room,Juice Bar
4,Christie,Café,Grocery Store,Athletics & Sports,Italian Restaurant,Baby Store,Coffee Shop,Playground,Candy Store,Fish & Chips Shop,Fish Market


### Cluster the Neighborhood using Kmeans

In [35]:
# set number of clusters
kclusters = 5

toronto_clustering_data = toronto_grouped.drop('Neighborhood1', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_clustering_data)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 1, 1, 1, 1, 1, 1, 0, 1])

### Export the content of toronto_clustering_data to a csv file so that it can be used in future project

In [36]:
# Write the clustering features to a CSV file in the working directory, without the index column.
toronto_clustering_data.to_csv('toronto_clustering_data.csv', index=False)

In [37]:
# Preview the ranked venue table once more.
neighborhoods_venues_sorted.head(5)

Unnamed: 0,Neighborhood1,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,Richmond,King",Coffee Shop,Café,Hotel,Japanese Restaurant,American Restaurant,Gastropub,Burger Joint,Restaurant,Asian Restaurant,Steakhouse
1,Berczy Park,Coffee Shop,Cocktail Bar,Farmers Market,Beer Bar,Cheese Shop,Bakery,Café,Seafood Restaurant,Restaurant,Hotel
2,Business Reply Mail Processing Centre 969 Eastern,Coffee Shop,Hotel,Bar,Pub,Seafood Restaurant,Café,Asian Restaurant,Steakhouse,Gym,Sushi Restaurant
3,Central Bay Street,Coffee Shop,Clothing Store,Cosmetics Shop,Plaza,Bakery,Japanese Restaurant,Restaurant,Bubble Tea Shop,Tea Room,Juice Bar
4,Christie,Café,Grocery Store,Athletics & Sports,Italian Restaurant,Baby Store,Coffee Shop,Playground,Candy Store,Fish & Chips Shop,Fish Market


### Add the venues back to the cluster for further analysis

In [38]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

neighborhoods_venues_sorted.rename(columns = {"Neighborhood1" : "Neighborhood"}, inplace = True)
# neighborhoods_venues_sorted.head()

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676531,-79.295425,0.0,Health Food Store,Pub,Other Great Outdoors,Trail,Neighborhood,Dumpling Restaurant,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
1,M4K,East Toronto,"Riverdale,The Danforth West",43.683178,-79.355105,3.0,Bus Line,Park,Grocery Store,Discount Store,Electronics Store,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.667965,-79.314667,1.0,Sandwich Place,Burrito Place,Italian Restaurant,Pet Store,Pizza Place,Pub,Movie Theater,Fast Food Restaurant,Fish & Chips Shop,Burger Joint
3,M4M,East Toronto,Studio District,43.660629,-79.334855,1.0,Diner,Brewery,Italian Restaurant,Café,Sushi Restaurant,American Restaurant,Sandwich Place,Arts & Crafts Store,Coffee Shop,Pizza Place
4,M4N,Central Toronto,Lawrence Park,43.72842,-79.387133,0.0,Bus Line,Bus Stop,Swim School,Yoga Studio,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant


### Check for NaN cluster and drop it

In [39]:
# Keep only the rows that received a cluster label.
toronto_merged = toronto_merged.dropna(subset=['Cluster Labels'])
toronto_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676531,-79.295425,0.0,Health Food Store,Pub,Other Great Outdoors,Trail,Neighborhood,Dumpling Restaurant,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
1,M4K,East Toronto,"Riverdale,The Danforth West",43.683178,-79.355105,3.0,Bus Line,Park,Grocery Store,Discount Store,Electronics Store,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.667965,-79.314667,1.0,Sandwich Place,Burrito Place,Italian Restaurant,Pet Store,Pizza Place,Pub,Movie Theater,Fast Food Restaurant,Fish & Chips Shop,Burger Joint
3,M4M,East Toronto,Studio District,43.660629,-79.334855,1.0,Diner,Brewery,Italian Restaurant,Café,Sushi Restaurant,American Restaurant,Sandwich Place,Arts & Crafts Store,Coffee Shop,Pizza Place
4,M4N,Central Toronto,Lawrence Park,43.72842,-79.387133,0.0,Bus Line,Bus Stop,Swim School,Yoga Studio,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
5,M4P,Central Toronto,Davisville North,43.712755,-79.388514,0.0,Food & Drink Shop,Department Store,Bus Line,Breakfast Spot,Gym,Park,Hotel,Flea Market,Event Space,Fish Market
6,M4R,Central Toronto,North Toronto West,43.714523,-79.40696,3.0,Playground,Gym Pool,Park,Garden,Dumpling Restaurant,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
7,M4S,Central Toronto,Davisville,43.703395,-79.385964,1.0,Dessert Shop,Coffee Shop,Sandwich Place,Pizza Place,Café,Italian Restaurant,Thai Restaurant,Fast Food Restaurant,Costume Shop,Seafood Restaurant
8,M4T,Central Toronto,"Summerhill East,Moore Park",43.690685,-79.382946,0.0,Convenience Store,Gym,Tennis Court,Yoga Studio,Eastern European Restaurant,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop,Fast Food Restaurant
9,M4V,Central Toronto,"South Hill,Summerhill West,Rathnelly,Forest Hi...",43.686074,-79.402265,1.0,Light Rail Station,Coffee Shop,Supermarket,Liquor Store,Yoga Studio,Ethiopian Restaurant,Flower Shop,Flea Market,Fish Market,Fish & Chips Shop


### Visualize the cluster using folium map

In [40]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Store the content of toronto_merged in a CSV file so that it can be used for future assignments

In [41]:
# Write the merged, clustered frame to a CSV file in the working directory, without the index column.
toronto_merged.to_csv('toronto_merged.csv', index=False)