In [3]:
import pandas as pd
# Set high but not unlimited max rows and columns, to void overstressing my machine
pd.options.display.max_rows = 250
pd.options.display.max_columns = 100
import requests
!pip install BeautifulSoup4
from bs4 import BeautifulSoup

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/d1/41/e6495bd7d3781cee623ce23ea6ac73282a373088fcd0ddc809a047b18eae/beautifulsoup4-4.9.3-py3-none-any.whl (115kB)
[K     |███████████████████████████████▏| 112kB 5.8MB/s eta 0:00:01     |████████████████████████████████| 122kB 5.8MB/s 
[?25hCollecting soupsieve>1.2; python_version >= "3.0" (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/36/69/d82d04022f02733bf9a72bc3b96332d360c0c5307096d76f6bb7489f7e57/soupsieve-2.2.1-py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.3 soupsieve-2.2.1


In [4]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

html_data = requests.get(url).text

toronto_soup = BeautifulSoup(html_data,"html5lib")

In [5]:
toronto_tables = toronto_soup.find_all('table')
len(toronto_tables)

3

In [6]:
toronto_table = toronto_tables[0]

In [8]:
# I create the dataframe with the named columns, it's empty for now
toronto_df = pd.DataFrame(columns=['PostalCode', 'Borough', 'Neighborhood'])
toronto_df

# loop through all of the data cells in the table and populate the dataframe
for cell in toronto_table.find_all('td'):
    text = cell.text.strip()
    # Skip any cells that aren't assigned
    if 'Not assigned' not in text:
        # The postal codes are always the first 3 characters of the cell, this make it easy to split off using slicing 
        postalcode = text[0:3]
        
        # The remainder of the text has to be split along the opening parenthesis, and then the neighborhoods have to be reformatted
        other = text[3:].split('(')
        borough = other[0]
        neighborhood = (((other[1].strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        toronto_df = toronto_df.append({'PostalCode': postalcode,
                                    'Borough': borough,
                                    'Neighborhood': neighborhood}, ignore_index=True)


In [9]:
toronto_df['Borough'].value_counts()

North York                                                      24
Downtown Toronto                                                17
Scarborough                                                     17
Etobicoke                                                       11
Central Toronto                                                  9
West Toronto                                                     6
York                                                             5
East Toronto                                                     4
East York                                                        4
East YorkEast Toronto                                            1
East TorontoBusiness reply mail Processing Centre969 Eastern     1
EtobicokeNorthwest                                               1
Queen's Park                                                     1
Downtown TorontoStn A PO Boxes25 The Esplanade                   1
MississaugaCanada Post Gateway Processing Centre              

In [10]:
# There are a handful of Boroughs that didn't get proccessed properly, so let's fix them
toronto_df['Borough']=toronto_df['Borough'].replace({'MississaugaCanada Post Gateway Processing Centre':'Mississauga',
                                                 'EtobicokeNorthwest':'Etobicoke Northwest',
                                                 'East YorkEast Toronto':'East York/East Toronto',
                                                 'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                                 'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                                 })
toronto_df['Borough'].value_counts()

North York                24
Downtown Toronto          17
Scarborough               17
Etobicoke                 11
Central Toronto            9
West Toronto               6
York                       5
East Toronto               4
East York                  4
East Toronto Business      1
Mississauga                1
East York/East Toronto     1
Downtown Toronto Stn A     1
Etobicoke Northwest        1
Queen's Park               1
Name: Borough, dtype: int64

In [11]:
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [12]:
toronto_df.shape

(103, 3)

## Part 2: Geographical Cordinates

In [13]:
import io

In [14]:
url = 'http://cocl.us/Geospatial_data'

geo_csv = requests.get(url).content

geo_df = pd.read_csv(io.StringIO(geo_csv.decode('utf-8')))

In [15]:
# The Postal Code column in the geo_df is renamed to match that of the existing dataframe, for ease of merging
geo_df.rename(columns={'Postal Code':'PostalCode'}, inplace=True)

toronto_df = pd.merge(toronto_df, geo_df, how='left')

In [17]:
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [18]:
import numpy as np # library to handle data in a vectorized manner

import json # library to handle JSON files

!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# Import k-means from clustering stage
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
!pip install folium
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [19]:
# Create a geolocator agent
geolocator = Nominatim(user_agent="tor_explorer")

In [20]:
# Split the list of each neighborhood on the comma seperator
temp = [i.split(', ') for i in toronto_df['Neighborhood'].tolist()]
# Flaten the new list-of-lists
neigh_list = [item for sublist in temp for item in sublist]
neigh_list[0:10]

['Parkwoods',
 'Victoria Village',
 'Regent Park',
 'Harbourfront',
 'Lawrence Manor',
 'Lawrence Heights',
 'Ontario Provincial Government',
 'Islington Avenue',
 'Malvern',
 'Rouge']

In [21]:
# Create the dataframe to contain the neighborhood cordinate data
neigh_df = pd.DataFrame(columns=['Neighborhood', 'Latitude', 'Longitude'])

# Populate the dataframe
for neighborhood in neigh_list:
    address = '{}, Toronto, ON, Canada'.format(neighborhood)
    location = geolocator.geocode(address)
    try:
        latitude = location.latitude
        longitude = location.longitude
    except:
        latitude = np.nan
        longitude = np.nan
    neigh_df = neigh_df.append({'Neighborhood': neighborhood,
                                'Latitude': latitude,
                                'Longitude': longitude}, ignore_index=True)

In [22]:
neigh_df.loc[neigh_df['Latitude'].isna()]

Unnamed: 0,Neighborhood,Latitude,Longitude
6,Ontario Provincial Government,,
37,Caledonia-Fairbanks,,
102,Keelsdale and Silverthorn,,
128,North Midtown,,
132,Enclave of L4W,,
169,Humber Bay Shores,,
175,Beaumond Heights,,
202,Enclave of M4L,,


In [23]:
print(neigh_df.shape)
neigh_df.dropna(inplace=True)
print(neigh_df.shape)
neigh_df.drop_duplicates(inplace=True)
print(neigh_df.shape)

(216, 3)
(208, 3)
(206, 3)


In [24]:
# Create map of Toronto using latitude and longitude values
# Toronto is located at 43.6532° N, 79.3832° W according to a quick search
toronto_map = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# Add markers to map
# Code taken with slight alterations from lab 3-3-2. No need to reinvent the wheel
for lat, lng, neighborhood in zip(neigh_df['Latitude'], neigh_df['Longitude'], neigh_df['Neighborhood']):
    label = neighborhood
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
toronto_map

In [29]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            'NGL3VSTVUV4IEAIYM1AJAAOZWR0QKTXMXGD1L3MOX2FMTD1I', 
            'DTYFGI5DAY1ED2SSQ5P1MBHGTM5A5UCJQUK5AC1LMJI2BRM0', 
            '20180604', 
            lat, 
            lng, 
            radius, 
            30)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [30]:
toronto_venues = getNearbyVenues(names=neigh_df['Neighborhood'], latitudes=neigh_df['Latitude'], longitudes=neigh_df['Longitude'])

In [31]:
venue_counts = toronto_venues.groupby('Neighborhood').count()[['Venue']]
venue_counts.rename({'Venue': 'Venue Count'},inplace=True)
venue_counts.head(10)

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
Adelaide,30
Agincourt,11
Agincourt North,26
Albion Gardens,7
Alderwood,8
Bathurst Manor,4
Bathurst Quay,25
Bayview Village,12
Bedford Park,2
Berczy Park,30


In [32]:
print("Neighborhoods with more than 50 venues within 500 meters", venue_counts[venue_counts['Venue'] > 50].shape[0])
print("Neighborhoods with less than 50 venues within 500 meters", venue_counts[venue_counts['Venue'] < 50].shape[0])
print("Neighborhoods with less than 5 venues within 500 meters", venue_counts[venue_counts['Venue'] < 5].shape[0])

Neighborhoods with more than 50 venues within 500 meters 0
Neighborhoods with less than 50 venues within 500 meters 205
Neighborhoods with less than 5 venues within 500 meters 46


In [33]:
toronto_venues_1k = getNearbyVenues(names=neigh_df['Neighborhood'], latitudes=neigh_df['Latitude'], longitudes=neigh_df['Longitude'], radius=999)

In [34]:
venue_counts_1k = toronto_venues_1k.groupby('Neighborhood').count()[['Venue']]
venue_counts_1k.rename({'Venue':'Venue Count'},axis=1,inplace=True)
venue_counts_1k.head(10)

Unnamed: 0_level_0,Venue Count
Neighborhood,Unnamed: 1_level_1
Adelaide,30
Agincourt,30
Agincourt North,30
Albion Gardens,20
Alderwood,24
Bathurst Manor,19
Bathurst Quay,30
Bayview Village,30
Bedford Park,30
Berczy Park,30


In [35]:
print("Neighborhoods with 100 or more venues within 999 meters", venue_counts_1k[venue_counts_1k['Venue Count'] == 100].shape[0])
print("Neighborhoods with more than 50 venues within 999 meters", venue_counts_1k[venue_counts_1k['Venue Count'] > 50].shape[0])
print("Neighborhoods with less than 50 venues within 999 meters", venue_counts_1k[venue_counts_1k['Venue Count'] < 50].shape[0])
print("Neighborhoods with less than 20 venues within 999 meters", venue_counts_1k[venue_counts_1k['Venue Count'] < 20].shape[0])
print("Neighborhoods with less than 5 venues within 999 meters", venue_counts_1k[venue_counts_1k['Venue Count'] < 5].shape[0])

Neighborhoods with 100 or more venues within 999 meters 0
Neighborhoods with more than 50 venues within 999 meters 0
Neighborhoods with less than 50 venues within 999 meters 206
Neighborhoods with less than 20 venues within 999 meters 57
Neighborhoods with less than 5 venues within 999 meters 2


In [36]:
# one hot encoding
toronto_onehot = (pd.get_dummies(toronto_venues_1k[['Venue Category']], prefix="", prefix_sep=""))
# The Neighborhood column ended up somewhere in the middle of the pile, so droping it and inserting it again at the begining was the easiest way to clean up the table.
toronto_onehot.drop('Neighborhood', axis=1, inplace=True)
toronto_onehot.insert(0,'Neighborhood', toronto_venues_1k['Neighborhood'])
toronto_onehot.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Automotive Shop,BBQ Joint,Baby Store,Badminton Court,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Court,Basketball Stadium,Bath House,Beach,Beach Bar,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bike Shop,Bistro,Boat or Ferry,Bookstore,Botanical Garden,Boutique,Bowling Alley,Boxing Gym,Breakfast Spot,Brewery,...,Smoke Shop,Smoothie Shop,Snack Place,Soccer Field,Soccer Stadium,South American Restaurant,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Sports Club,Sri Lankan Restaurant,Steakhouse,Storage Facility,Supermarket,Supplement Shop,Sushi Restaurant,Swiss Restaurant,Syrian Restaurant,Taco Place,Tailor Shop,Taiwanese Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Tennis Court,Tennis Stadium,Thai Restaurant,Theater,Theme Park,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo Exhibit
0,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Parkwoods,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [37]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped = toronto_grouped.merge(venue_counts_1k, how='left', on='Neighborhood')
toronto_grouped.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Garage,Automotive Shop,BBQ Joint,Baby Store,Badminton Court,Bagel Shop,Bakery,Bank,Bar,Baseball Field,Baseball Stadium,Basketball Court,Basketball Stadium,Bath House,Beach,Beach Bar,Beer Bar,Beer Store,Belgian Restaurant,Big Box Store,Bike Shop,Bistro,Boat or Ferry,Bookstore,Botanical Garden,Boutique,Bowling Alley,Boxing Gym,Breakfast Spot,Brewery,...,Smoothie Shop,Snack Place,Soccer Field,Soccer Stadium,South American Restaurant,Spa,Spanish Restaurant,Speakeasy,Sporting Goods Shop,Sports Bar,Sports Club,Sri Lankan Restaurant,Steakhouse,Storage Facility,Supermarket,Supplement Shop,Sushi Restaurant,Swiss Restaurant,Syrian Restaurant,Taco Place,Tailor Shop,Taiwanese Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Tennis Court,Tennis Stadium,Thai Restaurant,Theater,Theme Park,Theme Restaurant,Thrift / Vintage Store,Tibetan Restaurant,Toy / Game Store,Track,Trail,Train Station,Turkish Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo Exhibit,Venue Count
0,Adelaide,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30
2,Agincourt North,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30
3,Albion Gardens,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20
4,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24


In [38]:
# I chose 7 clusters for my K value
kclusters = 7

# I drop the Neighborhood label and then apply a standard scalar, to account for the different scale of the total Venue Count
toronto_grouped_clustering = scale(toronto_grouped.drop('Neighborhood', 1))

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=4).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

  """


array([6, 0, 5, 6, 5, 5, 6, 6, 6, 6], dtype=int32)

In [39]:
# add clustering labels to the original neigh_df for mapping
map_df = neigh_df
map_df['Cluster Label'] = kmeans.labels_

map_df.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,Cluster Label
0,Parkwoods,43.7588,-79.320197,6
1,Victoria Village,43.732658,-79.311189,0
2,Regent Park,43.660706,-79.360457,5
3,Harbourfront,43.64008,-79.38015,6
4,Lawrence Manor,43.722079,-79.437507,5


In [None]:
# create map
clusters_map = folium.Map(location=[43.6532, -79.3832], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(map_df['Latitude'], map_df['Longitude'], map_df['Neighborhood'], map_df['Cluster Label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(clusters_map)

clusters_map

In [None]:
cluster_centers = map_df.groupby('Cluster Label')[['Latitude','Longitude']].mean().reset_index()
cluster_centers

In [None]:
# add geographical average of each cluster's latitude and logitude to the map, 
markers_colors = []
for lat, lon, cluster in zip(cluster_centers['Latitude'], cluster_centers['Longitude'], cluster_centers['Cluster Label']):
    label = folium.Popup(' Cluster Center ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=15,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(clusters_map)

clusters_map

In [None]:
# Creating a new dataframe that includes all the relevant information, as I hadn't done so earlier.
cluster_venues = toronto_grouped.copy()
cluster_venues.insert(1, 'Cluster Label', kmeans.labels_)
cluster_venues = cluster_venues.groupby('Cluster Label').mean().reset_index()
cluster_venues

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


# create columns according to number of top venues
columns = ['Cluster Label']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
top_cluster_venues = pd.DataFrame(columns=columns)
top_cluster_venues['Cluster Label'] = cluster_venues['Cluster Label']

temp = cluster_venues.drop('Venue Count', axis=1)

for ind in np.arange(temp.shape[0]):
    top_cluster_venues.iloc[ind, 1:] = return_most_common_venues(temp.iloc[ind, :], num_top_venues)

top_cluster_venues['Venue Count'] = cluster_venues['Venue Count']

In [None]:
top_cluster_venues

In [None]:
map_df['Cluster Label'].value_counts()