# US Cities Clustering based on Foursquare Venues
--------------------------------------------------

## by Tal Meyerstein

### 1. First - import all the necessary libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner
import os, wget, sys
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes
#from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

ModuleNotFoundError: No module named 'folium'

In [4]:
us_data_filename = 'stanford-bx729wr3020-geojson.json'

In [5]:
# Download the file containing the sample data

url = 'https://github.com/tal-meyerstein/Coursera_Capstone/raw/master/' + us_data_filename
if not os.path.isfile(us_data_filename): wget.download( url )

100% [........................................................................] 19444275 / 19444275

### 2. Load and Explore US Cities data from GEO JSON file

In [6]:
with open(us_data_filename) as json_data:
    us_data = json.load(json_data)

In [7]:
cities_data = us_data['features']

Lets take a look at the Chicago item in the list:

In [8]:
next(item for item in cities_data if item['properties']['name']=='Chicago')

{'type': 'Feature',
 'id': 'bx729wr3020.495',
 'geometry': {'type': 'Point', 'coordinates': [-87.6500523, 41.850033]},
 'geometry_name': 'geom',
 'properties': {'gnis_id': 423587,
  'ansicode': '428803',
  'feature': 'Civil',
  'feature2': 'County Seat',
  'name': 'Chicago',
  'pop_2010': 2695598,
  'county': 'Cook',
  'countyfips': '031',
  'state': 'IL',
  'state_fips': '17',
  'latitude': 41.850033,
  'longitude': -87.6500523,
  'poppllat': 41.850033,
  'poppllong': -87.6500523,
  'elev_in_m': 179,
  'elev_in_ft': 587,
  'bbox': [-87.6500523, 41.850033, -87.6500523, 41.850033]}}

In [9]:
# define the dataframe columns
column_names = ['City', 'State','Population_2010', 'Elevation_in_meters', 'Latitude', 'Longitude'] 

# instantiate the dataframe
df_cities = pd.DataFrame(columns=column_names)

In [10]:
# filling the dataframe with data from the data list, but copying only the cities (feature = 'Civil') with population above 10,000
for data in cities_data:
    if ((data['properties']['feature']=='Civil') and (int(data['properties']['pop_2010'])>=10000)):
        state = data['properties']['state'] 
        city_name = data['properties']['name']
        city_pop = data['properties']['pop_2010']
        city_elev = data['properties']['elev_in_m']
        
        city_latlon = data['geometry']['coordinates']
        city_lat = city_latlon[1]
        city_lon = city_latlon[0]
        
        df_cities = df_cities.append({'State': state,
                                          'City': city_name,
                                          'Population_2010': city_pop,
                                          'Elevation_in_meters': city_elev,
                                          'Latitude': city_lat,
                                          'Longitude': city_lon}, ignore_index=True)

In [11]:
print(df_cities.shape)
# casting the population and elevation columns from string to int
df_cities = df_cities.astype({'Population_2010':np.int32, 'Elevation_in_meters': np.int32})
df_cities.describe()

(2945, 6)


Unnamed: 0,Population_2010,Elevation_in_meters,Latitude,Longitude
count,2945.0,2945.0,2945.0,2945.0
mean,54909.09,249.177589,37.987161,-93.260781
std,199721.6,335.041807,5.097657,15.224561
min,10019.0,-34.0,24.555702,-149.900278
25%,14570.0,45.0,34.021122,-99.507542
50%,23861.0,186.0,38.95922,-88.327295
75%,48205.0,274.0,41.66121,-81.804579
max,8175133.0,2184.0,64.837778,-68.777814


In [12]:
print(df_cities['State'].unique())

['ND' 'ME' 'NY' 'WI' 'SD' 'MI' 'IA' 'MA' 'PA' 'IL' 'OH' 'NJ' 'NE' 'KS'
 'MD' 'VA' 'MO' 'CA' 'LA' 'NH' 'VT' 'RI' 'NV' 'MN' 'CT' 'WV' 'DE' 'DC'
 'IN' 'KY' 'TN' 'NC' 'AR' 'SC' 'AL' 'GA' 'MS' 'FL' 'WA' 'MT' 'ID' 'OR'
 'UT' 'WY' 'CO' 'OK' 'TX' 'AZ' 'NM' 'AK']


In [13]:
# work only with mainland US cities so drop all Alaska cities 
df_cities.drop(df_cities[df_cities['State']=='AK'].index, inplace=True)

In [14]:
# also drop cities with mising population figure (denoted by Population_2010 = -999)
df_cities.drop(df_cities[df_cities.Population_2010==-999].index, inplace=True)

In [15]:
df_cities.describe()

Unnamed: 0,Population_2010,Elevation_in_meters,Latitude,Longitude
count,2942.0,2942.0,2942.0,2942.0
mean,54844.54,249.369816,37.963233,-93.20903
std,199774.7,335.154375,5.044113,15.144171
min,10019.0,-34.0,24.555702,-124.21789
25%,14567.0,45.0,34.020498,-99.400067
50%,23844.0,186.0,38.9561,-88.323391
75%,48201.25,274.75,41.66016,-81.799788
max,8175133.0,2184.0,48.946504,-68.777814


From the above summary table I learn that:
+ The majority of cities have population less than 100,000.
+ The majority of cities are located at elevation below 300 meters

In [16]:
df_cities.head()

Unnamed: 0,City,State,Population_2010,Elevation_in_meters,Latitude,Longitude
0,Minot,ND,40888,491,48.232509,-101.296273
1,Grand Forks,ND,52838,253,47.925257,-97.032855
2,Jamestown,ND,15427,429,46.910544,-98.708436
3,Fargo,ND,105549,275,46.877186,-96.789803
4,Dickinson,ND,17787,735,46.879176,-102.789624


In [17]:
# after checking all 9 cities on the map, I realize that need to fix the Chicago coordinates
Chicago_lat, Chicago_long = 41.880442, -87.632529
df_cities.loc[df_cities.City == 'Chicago','Latitude'] = Chicago_lat
df_cities.loc[df_cities.City == 'Chicago','Longitude'] = Chicago_long

Show the cities on the map of US, but not showing cities with population under 100,000 becuase there are too many of them, so the map will not look good.

In [18]:
# create map of US using latitude and longitude values of center mainland US
latitude, longitude = 39.902546, -98.184233
map_us = folium.Map(location=[latitude, longitude], zoom_start=4)

# add markers to map
for lat, lng, city, pop, elev in zip(df_cities['Latitude'], df_cities['Longitude'], df_cities['City'], df_cities['Population_2010'], df_cities['Elevation_in_meters']):
    if int(pop)>=100000: # now showing cities with population under 100,000 becuase there are too many of them so the map will not look good
        label = '{}, Pop:{:,}, Elevation(m):{:,}'.format(city, pop, elev)
        label = folium.Popup(label, parse_html=True)
        radius_size = (3 if int(pop)<1000000 else 7)
        marker_color = ('green' if int(pop)<1000000 else 'blue')
        folium.CircleMarker(
            [lat, lng],
            radius=radius_size,
            popup=label,
            color=marker_color,
            fill=True,
            fill_color=marker_color,
            fill_opacity=0.7,
            parse_html=False).add_to(map_us)
map_us

### 3. Fetch venues data from Foursquare 
#### (skip this section if its not the first time that you run the notbook on this environment)

In [19]:
CLIENT_ID = 'MIU3ZKAVB0NPUPANUFW024CALVKDILZ4MR5SQYIU25HY4E0V' # my Foursquare ID
CLIENT_SECRET = 'CAQFF0BBSKGI55UVWNQJ2FQNF3DF3NRVOQM02Z4EFKKMNOGP' # my Foursquare Secret
VERSION = '20180605' # Foursquare API version

search_radius = 2000
LIMIT = 300

In [20]:
def getSearchVenues(names, latitudes, longitudes, radius=search_radius, categories=None):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&intent=browse&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        if categories is not None:
            url += '&categoryId=' + categories
               
        # make the GET request
        results = requests.get(url).json()["response"]['venues']        
        print(name)
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],  
            next(item for item in v['categories'] if item['primary'])['name']) for v in results if len(v['categories'])>0])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
def getExploreVenues(names, states, latitudes, longitudes, radius=search_radius, section=None):
    
    venues_list=[]
    for name, state, lat, lng in zip(names, states, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        if section is not None:
            url += '&section=' + section
        
        # make the GET request
        print(name)
        t = 1
        done = False
        while ((t < 4) & (not(done))):
            try:
                results = requests.get(url).json()["response"]['groups'][0]['items']
            except KeyError as err:
                print("Key error: {0}".format(err))
                print(url)
                print('try number:{}'.format(t))
                print(requests.get(url).json()["response"])                
                t+=1
            except:
                print("Unexpected error:", sys.exc_info()[0])
                raise
            else:
                done = True
                # return only relevant information for each nearby venue
                venues_list.append([(
                    name, 
                    state,
                    lat, 
                    lng, 
                    v['venue']['name'], 
                    v['venue']['location']['lat'], 
                    v['venue']['location']['lng'],  
                    next(item for item in v['venue']['categories'] if item["primary"])['name']) for v in results])
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 'State',
                              'City Latitude', 
                              'City Longitude', 
                              'Venue', 
                              'Venue Latitude', 
                              'Venue Longitude', 
                              'Venue Category']
    
    return(nearby_venues)

In [23]:
cities_venues = getExploreVenues(names=df_cities['City'],
                                   states=df_cities['State'],
                                   latitudes=df_cities['Latitude'],
                                   longitudes=df_cities['Longitude']
                                  )

Minot
Grand Forks
Jamestown
Fargo
Dickinson
Mandan
Bangor
Plattsburgh
Ogdensburg
Augusta
Waterville
Bellevue
Suamico
Pierre
Auburn
Lewiston
Portland
South Portland
Westbrook
Biddeford
Saco
Glens Falls
Fulton
Oswego
Rome
Auburn
Saratoga Springs
Lockport
Niagara Falls
North Tonawanda
Rochester
Wyoming
Gloversville
Syracuse
Mason City
Spencer
Oneida
Batavia
Buffalo
Lackawanna
Amsterdam
Canandaigua
Geneva
Schenectady
Storm Lake
Oneonta
Wixom
Lawrence
Lynn
Salem
Beverly
Gloucester
Newburyport
Peabody
Mount Pleasant
Albany
Cohoes
Cortland
North Adams
Everett
Lowell
Marlborough
Waltham
Cambridge
Malden
Medford
Melrose
Newton
Worcester
Fitchburg
Gardner
Leominster
Dubuque
Fort Dodge
Waterloo
Ithaca
Corning
Dunkirk
Jamestown
Sioux City
Northampton
Olean
Boston
Chelsea
Revere
Ypsilanti
Binghamton
Quincy
Chicopee
Westfield
West Springfield
Holyoke
Springfield
Brockton
Cedar Rapids
Elmira
Erie
Carroll
Kingston
Chicago
Campton Hills
Attleboro
Fall River
Beacon
Poughkeepsie
Clinton
Iowa City
Eastlak

Monroe
Brooklyn Center
Brooklyn Park
Massena
Kenmore
Lancaster
Depew
Fredonia
Johnson City
Endicott
Torrington
Bristol
Hartford
New Britain
Groton
New London
Norwich
Bridgeport
Danbury
Norwalk
Shelton
Stamford
Middletown
Ansonia
Derby
Meriden
Naugatuck
New Haven
Waterbury
West Haven
Kiryas Joel
Woodbury
Harrison
Mamaroneck
Mount Kisco
Ossining
Port Chester
Scarsdale
Tarrytown
Dobbs Ferry
Haverstraw
Spring Valley
Suffern
West Haverstraw
Lake Grove
Lindenhurst
Patchogue
Babylon
Floral Park
Freeport
Garden City
Hempstead
Lynbrook
Massapequa Park
Mineola
Rockville Centre
Valley Stream
Westbury
Weirton
Wheeling
Newark
Wilmington
Middletown
Hagerstown
Cumberland
Elkton
Morgantown
Aberdeen
Havre de Grace
Bel Air
Westminster
Frederick
Fairmont
Martinsburg
Clarksburg
Parkersburg
Vienna
Dover
Smyrna
Gaithersburg
Rockville
Takoma Park
Leesburg
Annapolis
Bowie
College Park
Greenbelt
Hyattsville
Laurel
New Carrollton
Herndon
Front Royal
Washington
Easton
Bridgeton
Cambridge
Culpeper
Charleston
Sout

Independence
Jeffersontown
Lawrenceburg
Radcliff
Nicholasville
Henderson
Owensboro
Richmond
Danville
Paducah
Hopkinsville
Middlesborough
Murray
Portland
Hendersonville
Clarksville
Kingsport
Mount Airy
Henderson
Reidsville
Roanoke Rapids
Paris
Rogers
Mountain Home
Johnson City
Mount Juliet
Lebanon
Morristown
Dickson
Cookeville
Oak Ridge
Paragould
High Point
Knoxville
Crossville
Rocky Mount
Lenoir
Murfreesboro
La Vergne
Raleigh
Morganton
Jonesboro
Columbia
Newton
Hickory
Jackson
Russellville
Lewisburg
Kings Mountain
Charlotte
Concord
Kannapolis
Hendersonville
Lawrenceburg
Chattanooga
Red Bank
Kinston
Mount Holly
New Bern
Havelock
Millington
Lakeland
Conway
Cleveland
Monroe
Rock Hill
Clemson
Laurinburg
North Little Rock
Jacksonville
Corinth
Olive Branch
Hernando
Horn Lake
Huntsville
Dalton
Jacksonville
Muscle Shoals
Hot Springs
Decatur
Rome
Oxford
Newberry
Clarksdale
Pine Bluff
Cartersville
Canton
Cullman
Conway
Myrtle Beach
North Myrtle Beach
Columbia
Cayce
College Park
Milton
Roswell
Jo

Holladay
Riverton
Midvale
Murray
Highland
Lehi
Lindon
Orem
Payson
Pleasant Grove
Provo
Fort Morgan
Red Bluff
Longmont
Louisville
Lafayette
Oroville
Northglenn
Golden
Lakewood
Englewood
Greenwood Village
Littleton
Fernley
Marysville
Lone Tree
Grass Valley
Fruita
Grand Junction
Lincoln
Rocklin
Roseville
Fountain
Placerville
Napa
Healdsburg
Rohnert Park
Petaluma
Elk Grove
Folsom
Galt
Rancho Cordova
Montrose
Fairfield
Pueblo
Mill Valley
Novato
Larkspur
Lodi
Ripon
Manteca
Lathrop
El Cerrito
Hercules
Richmond
Martinez
Oakley
Orinda
Pinole
Pittsburg
Pleasant Hill
Lafayette
Riverbank
Modesto
Newman
Oakdale
Patterson
Emeryville
Fremont
Hayward
Livermore
Newark
Oakland
Piedmont
Pleasanton
Madera
East Palo Alto
Foster City
Half Moon Bay
Redwood City
Menlo Park
Millbrae
Pacifica
Livingston
Los Banos
Merced
Fresno
Reedley
Mendota
Parlier
Kerman
Kingsburg
Gilroy
Los Altos
Milpitas
Morgan Hill
Mountain View
Palo Alto
Flagstaff
Ponca City
Miami
Espaֳ±ola
Farmington
Guymon
Hollister
Greenfield
Marina
M

In [39]:
#sanity counting the number of unique Cities
cities_venues.groupby(['City', 'State']).count().shape

(2942, 6)

In [30]:
with open('cities_venues.json', 'w') as json_file:  
    json.dump(cities_venues.to_json(), json_file)

### 4. Explore the Venues data 

In [36]:
with open('cities_venues.json', 'r') as json_file:  
    json_data = json.load(json_file)
    cities_venues = pd.read_json(json_data)

In [41]:
json_data = None
del(json_data)

In [20]:
CLIENT_ID = 'MIU3ZKAVB0NPUPANUFW024CALVKDILZ4MR5SQYIU25HY4E0V' # my Foursquare ID
CLIENT_SECRET = 'CAQFF0BBSKGI55UVWNQJ2FQNF3DF3NRVOQM02Z4EFKKMNOGP' # my Foursquare Secret
VERSION = '20180605' # Foursquare API version

search_radius = 2000
LIMIT = 300

In [43]:
url = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION)
categories_list = requests.get(url).json()["response"]['categories']

In [44]:
# Build a dictionary of categories (category, direct parent category) by running a recrusive function on the categories JSON
categories_dict = {}
def Build_Categories_Tree(clist,cparent=None):
    for category in clist:
        categories_dict[category['name']] = (cparent,None)
        if len(category['categories'])>0:
            Build_Categories_Tree(category['categories'],category['name'])
            
Build_Categories_Tree(categories_list)

Extracting a top level Main Category for each of the categories:

In [45]:
# Update the categories dictionary to add a main parent category for each of the categories
# so that each key in the dictionary is a category and its value is a tuple of: (direct parent categorry, main parent category)

# define a list of categories to be considered main categories for analysis 
main_categories = ('Arts & Entertainment', 'Museum', 'Stadium', 'College & University', 'Event', 'Food', 'Café', 'Coffee Shop',
                   'Bar', 'Nightlife Spot', 'Outdoors & Recreation', 'Professional & Other Places', 'Shop & Service', 'Travel & Transport', 'Hotel', 'Residence')
for cag, cag_direct_parent in categories_dict.items():  
    if (cag in main_categories):
        categories_dict[cag] = (cag_direct_parent[0],cag)
    else:
        cag_parent = cag_direct_parent[0]
        while ((cag_parent is not None) & (cag_parent not in main_categories)):
            cag_parent = categories_dict[cag_parent][0]
        categories_dict[cag] = (cag_direct_parent[0],cag_parent)
print(categories_dict)

{'Arts & Entertainment': (None, 'Arts & Entertainment'), 'Amphitheater': ('Arts & Entertainment', 'Arts & Entertainment'), 'Aquarium': ('Arts & Entertainment', 'Arts & Entertainment'), 'Arcade': ('Arts & Entertainment', 'Arts & Entertainment'), 'Art Gallery': ('Arts & Entertainment', 'Arts & Entertainment'), 'Bowling Alley': ('Arts & Entertainment', 'Arts & Entertainment'), 'Casino': ('Arts & Entertainment', 'Arts & Entertainment'), 'Circus': ('Arts & Entertainment', 'Arts & Entertainment'), 'Comedy Club': ('Arts & Entertainment', 'Arts & Entertainment'), 'Concert Hall': ('Arts & Entertainment', 'Arts & Entertainment'), 'Country Dance Club': ('Arts & Entertainment', 'Arts & Entertainment'), 'Disc Golf': ('Arts & Entertainment', 'Arts & Entertainment'), 'Exhibit': ('Arts & Entertainment', 'Arts & Entertainment'), 'General Entertainment': ('Arts & Entertainment', 'Arts & Entertainment'), 'Go Kart Track': ('Arts & Entertainment', 'Arts & Entertainment'), 'Historic Site': ('Arts & Entertai

In [46]:
# Add a new Main category column to the cities_venues dataframe. set its value based on the main category value from the dictionary.
# I realized that "Cafe" and "Coffee Shop" are actually synonym categories so i merged them to one main category: Cafe.

cities_venues['Venue Main Category'] = cities_venues.apply(lambda row : ('Cafe' if categories_dict[row['Venue Category']][1] in ('Café', 'Coffee Shop') 
                                                                                else categories_dict[row['Venue Category']][1]) ,axis=1)

Lets view the breakdown of these Main Categories for all our venues:

In [47]:
print(cities_venues['Venue Main Category'].value_counts())
print('Total: {}'.format(sum(cities_venues['Venue Main Category'].value_counts())))

Food                           80033
Shop & Service                 47560
Outdoors & Recreation          14272
Bar                             8353
Cafe                            7350
Arts & Entertainment            5070
Hotel                           3635
Travel & Transport              2581
Nightlife Spot                  2126
Museum                          1031
Professional & Other Places      854
Stadium                          611
College & University             346
Residence                         55
Event                              5
Name: Venue Main Category, dtype: int64
Total: 173882


Now Lets see how many unique categories can be curated from all the venues:

In [48]:
print('There are {} uniques categories.'.format(len(cities_venues['Venue Category'].unique())))

There are 641 uniques categories.


In [49]:
# sanity to compare with total count of Main Categories
len(cities_venues)

173882

In [53]:
cities_venues[cities_venues.City=='New York'][cities_venues.State=='NY']['Venue Main Category'].value_counts()

  """Entry point for launching an IPython kernel.


Arts & Entertainment           37
Food                           27
Shop & Service                 15
Outdoors & Recreation           8
Hotel                           6
Professional & Other Places     2
Museum                          2
Travel & Transport              1
Bar                             1
Cafe                            1
Name: Venue Main Category, dtype: int64

In [54]:
cities_venues[cities_venues.City=='New York']

Unnamed: 0,City,State,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Venue Main Category
63299,New York,NY,40.761493,-73.981431,Winter Garden Theatre,40.761405,-73.983555,Theater,Arts & Entertainment
63300,New York,NY,40.761493,-73.981431,Le Bernardin,40.761379,-73.981758,French Restaurant,Food
63301,New York,NY,40.761493,-73.981431,Radio City Music Hall,40.759855,-73.97993,Concert Hall,Arts & Entertainment
63302,New York,NY,40.761493,-73.981431,Equinox West 50th Street,40.761912,-73.984268,Gym,Outdoors & Recreation
63303,New York,NY,40.761493,-73.981431,Del Frisco's Double Eagle Steakhouse,40.75941,-73.98222,Steakhouse,Food
63304,New York,NY,40.761493,-73.981431,MoMA Design Store,40.761312,-73.978032,Gift Shop,Shop & Service
63305,New York,NY,40.761493,-73.981431,Estiatorio Milos,40.763386,-73.979204,Greek Restaurant,Food
63306,New York,NY,40.761493,-73.981431,Broadway Theatre,40.763645,-73.983304,Theater,Arts & Entertainment
63307,New York,NY,40.761493,-73.981431,Top of the Rock Observation Deck,40.759095,-73.979352,Scenic Lookout,Outdoors & Recreation
63308,New York,NY,40.761493,-73.981431,Rainbow Room,40.759196,-73.979671,American Restaurant,Food


## 3. Analyze the Cities

In [55]:
# one hot encoding
cities_onehot = pd.get_dummies(cities_venues[['Venue Main Category']], prefix="", prefix_sep="")

# add City and State columns back to dataframe
cities_onehot['City'] = cities_venues['City']
cities_onehot['State'] = cities_venues['State']

# move City and State columns to the first column
fixed_columns = [cities_onehot.columns[-2]] + [cities_onehot.columns[-1]] + list(cities_onehot.columns[:-2])
cities_onehot = cities_onehot[fixed_columns]
cities_onehot.head()

Unnamed: 0,City,State,Arts & Entertainment,Bar,Cafe,College & University,Event,Food,Hotel,Museum,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Residence,Shop & Service,Stadium,Travel & Transport
0,Minot,ND,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Minot,ND,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
10,Minot,ND,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
100,Grand Forks,ND,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1000,Portland,ME,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [56]:
cities_onehot.shape

(173882, 17)

#### Next, let's group rows by City and State and by taking the mean of the frequency of occurrence of each category

In [57]:
City_grouped = cities_onehot.groupby(['City','State']).mean().reset_index()
City_grouped.head()

Unnamed: 0,City,State,Arts & Entertainment,Bar,Cafe,College & University,Event,Food,Hotel,Museum,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Residence,Shop & Service,Stadium,Travel & Transport
0,Abbeville,LA,0.0,0.0,0.0,0.0,0.0,0.6,0.05,0.0,0.0,0.0,0.05,0.0,0.3,0.0,0.0
1,Aberdeen,MD,0.0,0.0,0.0,0.0,0.0,0.380952,0.0,0.0,0.0,0.071429,0.0,0.0,0.5,0.0,0.047619
2,Aberdeen,SD,0.020833,0.083333,0.041667,0.0,0.0,0.520833,0.020833,0.0,0.0,0.083333,0.0,0.0,0.229167,0.0,0.0
3,Aberdeen,WA,0.074074,0.037037,0.074074,0.0,0.0,0.351852,0.037037,0.018519,0.0,0.018519,0.0,0.0,0.37037,0.018519,0.0
4,Abilene,TX,0.086957,0.043478,0.021739,0.0,0.0,0.413043,0.0,0.043478,0.0,0.0,0.021739,0.0,0.347826,0.0,0.021739


In [60]:
City_grouped.shape

(2942, 17)

In [63]:
df_cities_venues = pd.merge(City_grouped, df_cities, on=['City','State'])
df_cities_venues.head()

Unnamed: 0,City,State,Arts & Entertainment,Bar,Cafe,College & University,Event,Food,Hotel,Museum,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Residence,Shop & Service,Stadium,Travel & Transport,Population_2010,Elevation_in_meters,Latitude,Longitude
0,Abbeville,LA,0.0,0.0,0.0,0.0,0.0,0.6,0.05,0.0,0.0,0.0,0.05,0.0,0.3,0.0,0.0,12257,5,29.97465,-92.134292
1,Aberdeen,MD,0.0,0.0,0.0,0.0,0.0,0.380952,0.0,0.0,0.0,0.071429,0.0,0.0,0.5,0.0,0.047619,14959,24,39.509556,-76.16412
2,Aberdeen,SD,0.020833,0.083333,0.041667,0.0,0.0,0.520833,0.020833,0.0,0.0,0.083333,0.0,0.0,0.229167,0.0,0.0,26091,397,45.464698,-98.486483
3,Aberdeen,WA,0.074074,0.037037,0.074074,0.0,0.0,0.351852,0.037037,0.018519,0.0,0.018519,0.0,0.0,0.37037,0.018519,0.0,16896,3,46.975371,-123.815722
4,Abilene,TX,0.086957,0.043478,0.021739,0.0,0.0,0.413043,0.0,0.043478,0.0,0.0,0.021739,0.0,0.347826,0.0,0.021739,117063,524,32.448736,-99.733144


In [67]:
df_cities_venues.corr()

Unnamed: 0,Arts & Entertainment,Bar,Cafe,College & University,Event,Food,Hotel,Museum,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Residence,Shop & Service,Stadium,Travel & Transport,Population_2010,Elevation_in_meters,Latitude,Longitude
Arts & Entertainment,1.0,0.21317,0.104791,0.084834,0.02041,-0.190097,0.082183,0.21606,0.217047,-0.038143,0.028663,-0.010663,-0.293214,0.105944,-0.039909,0.333895,0.021867,0.083039,0.03867
Bar,0.21317,1.0,0.124849,0.155387,0.029433,-0.232108,0.021809,0.145359,0.271922,-0.03334,0.026284,-0.00077,-0.379577,0.098726,-0.061235,0.087168,-0.024629,0.224759,0.117042
Cafe,0.104791,0.124849,1.0,0.127686,0.039088,-0.053614,0.077342,0.061412,0.221143,-0.061371,-0.019586,0.005684,-0.329667,-0.020272,-0.079285,0.060591,0.070488,0.193595,-0.296637
College & University,0.084834,0.155387,0.127686,1.0,0.062467,0.010478,0.006369,0.084684,0.081048,-0.058233,0.016992,0.034756,-0.172909,-0.006583,-0.071031,0.011671,0.00206,0.043641,0.067374
Event,0.02041,0.029433,0.039088,0.062467,1.0,-0.007136,0.019211,0.007277,0.003485,-0.018141,-0.003674,-0.005042,-0.019613,0.001658,-0.021081,0.015038,0.015818,0.001945,-0.006611
Food,-0.190097,-0.232108,-0.053614,0.010478,-0.007136,1.0,-0.015781,-0.093768,-0.099532,-0.487872,-0.106887,0.032389,-0.336817,-0.092065,-0.190044,-0.023186,-0.035022,-0.228158,-0.081814
Hotel,0.082183,0.021809,0.077342,0.006369,0.019211,-0.015781,1.0,0.140102,0.05139,-0.102822,-0.007156,-0.000879,-0.264883,-0.023048,-0.021497,0.088899,0.088741,-0.122834,-0.103799
Museum,0.21606,0.145359,0.061412,0.084684,0.007277,-0.093768,0.140102,1.0,0.205713,-0.027942,0.062355,-0.019533,-0.245669,0.080194,-0.017672,0.15224,-0.004564,-0.025977,0.079035
Nightlife Spot,0.217047,0.271922,0.221143,0.081048,0.003485,-0.099532,0.05139,0.205713,1.0,-0.04989,0.03934,0.002143,-0.327973,0.085095,-0.027966,0.084502,0.090888,0.133812,-0.001812
Outdoors & Recreation,-0.038143,-0.03334,-0.061371,-0.058233,-0.018141,-0.487872,-0.102822,-0.027942,-0.04989,1.0,0.041972,-0.01803,-0.18168,-0.021448,0.080774,-0.040565,-0.002279,0.077053,0.014336


### Based on the above correlation analysis,there seems to be some correlation (0.33) between a City population and its frequency of "Arts and Entertainments" venues.
### Therfore, I will include it in my clustering analysis

## 4. Cluster Cities

Run *k*-means to cluster the neighborhood into 5 clusters.

In [73]:
# Select features for clustering:
# All categories + Population
X = df_cities_venues.iloc[:,2:-3].values
X

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.22570000e+04],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 4.76190476e-02, 1.49590000e+04],
       [2.08333333e-02, 8.33333333e-02, 4.16666667e-02, ...,
        0.00000000e+00, 0.00000000e+00, 2.60910000e+04],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 7.14285714e-02, 1.32880000e+04],
       [2.04081633e-02, 0.00000000e+00, 4.08163265e-02, ...,
        0.00000000e+00, 2.04081633e-02, 2.44130000e+04],
       [3.50877193e-02, 7.01754386e-02, 3.50877193e-02, ...,
        0.00000000e+00, 1.75438596e-02, 1.41600000e+04]])

In [32]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(X)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.

In [33]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = central_toranto_count_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

In [34]:
toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Venues Count,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,49,0,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Breakfast Spot,Theater,Beer Store
8,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,100,0,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Restaurant,Lingerie Store,Bar,Japanese Restaurant,Tea Room
14,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,100,0,Coffee Shop,Restaurant,Café,Hotel,Breakfast Spot,Clothing Store,Cosmetics Shop,Bakery,Gastropub,American Restaurant
18,M4E,East Toronto,The Beaches,43.676357,-79.293031,4,0,Health Food Store,Coffee Shop,Pub,Neighborhood,Dog Run,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space
19,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306,58,0,Coffee Shop,Cocktail Bar,Cheese Shop,Pub,Italian Restaurant,Steakhouse,Bakery,Seafood Restaurant,Farmers Market,Café


Finally, let's visualize the resulting clusters

In [35]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## 5. Examine Clusters

Now, I will examine each cluster and determine the discriminating venue categories that distinguish each cluster.

#### Cluster 1

In [36]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[2,5] + list(range(7, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Venues Count,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,"Harbourfront, Regent Park",49,Coffee Shop,Café,Bakery,Pub,Park,Mexican Restaurant,Restaurant,Breakfast Spot,Theater,Beer Store
8,"Ryerson, Garden District",100,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant,Restaurant,Lingerie Store,Bar,Japanese Restaurant,Tea Room
14,St. James Town,100,Coffee Shop,Restaurant,Café,Hotel,Breakfast Spot,Clothing Store,Cosmetics Shop,Bakery,Gastropub,American Restaurant
18,The Beaches,4,Health Food Store,Coffee Shop,Pub,Neighborhood,Dog Run,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space
19,Berczy Park,58,Coffee Shop,Cocktail Bar,Cheese Shop,Pub,Italian Restaurant,Steakhouse,Bakery,Seafood Restaurant,Farmers Market,Café
23,Central Bay Street,78,Coffee Shop,Café,Italian Restaurant,Burger Joint,Bar,Sandwich Place,Thai Restaurant,Spa,Ice Cream Shop,Bubble Tea Shop
24,Christie,15,Grocery Store,Café,Park,Italian Restaurant,Convenience Store,Baby Store,Diner,Coffee Shop,Restaurant,Nightclub
29,"Adelaide, King, Richmond",100,Coffee Shop,Bar,Steakhouse,Café,Thai Restaurant,Burger Joint,Gym,Bakery,Hotel,American Restaurant
30,"Dovercourt Village, Dufferin",20,Pharmacy,Discount Store,Supermarket,Bakery,Gym / Fitness Center,Furniture / Home Store,Pool,Music Venue,Middle Eastern Restaurant,Café
35,"Harbourfront East, Toronto Islands, Union Station",100,Coffee Shop,Aquarium,Hotel,Café,Italian Restaurant,Scenic Lookout,Brewery,Pizza Place,Fried Chicken Joint,Bakery


#### Cluster 2

In [37]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[2,5] + list(range(7, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Venues Count,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
58,Lawrence Park,3,Park,Swim School,Bus Line,Yoga Studio,Doner Restaurant,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space
65,"Forest Hill North, Forest Hill West",4,Trail,Park,Jewelry Store,Sushi Restaurant,Yoga Studio,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant
88,Rosedale,4,Park,Playground,Trail,Diner,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


#### Cluster 3

In [38]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[2,5] + list(range(7, toronto_merged.shape[1]))]]

Unnamed: 0,Neighborhood,Venues Count,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
80,"Moore Park, Summerhill East",2,Playground,Tennis Court,Concert Hall,Convenience Store,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant


## 6. Conclusions

Based on the above examinations I conclude that Central Toronto is more homogeneous than Manhattan. The vast majorities of its neighborhoods contain a mixture of Restaurants, Coffee shops and Bars (Cluster 1), while a few of its neighborhoods on its outskirts (Cluster 2) seem to be of resedential character - contain Parks and Trails. There is 1 neighbborhood which i will consider an outlier since only 1 Foursquare Venue was found in it and the algorithem has classified it as Cluster 3.