# Setting up the enviroment #

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.11

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          90 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.49   | 32 KB     | ##

# Importing New York Data Set #

In [2]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

Data downloaded!


In [3]:
neighborhoods_data = newyork_data['features']
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
neighborhoods = pd.DataFrame(columns=column_names)
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)


In [4]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# Creating the map of NY #

In [5]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

# Connecting to Foursquare #

In [6]:
CLIENT_ID = 'QX1RISY1NCHO3XIXWBFYJVJ0I2DMHNSQ4QKGMKGDK2MROOTR' # your Foursquare ID
CLIENT_SECRET = 'HGGI2E30BHU0ZAL0QYLGD0UMU1MQJXJHCX24OU2UE410LDMP' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# Categorizing venue type #

In [7]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [10]:
ny_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
                                   latitudes=neighborhoods['Latitude'],
                                   longitudes=neighborhoods['Longitude']
                                  )
ny_venues.groupby('Neighborhood').count()
# one hot encoding
ny_onehot = pd.get_dummies(ny_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
ny_onehot['Neighborhood'] = ny_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [ny_onehot.columns[-1]] + list(ny_onehot.columns[:-1])
ny_onehot = ny_onehot[fixed_columns]

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [11]:
ny_grouped = ny_onehot.groupby('Neighborhood').mean().reset_index()

In [14]:
num_top_venues = 30

for hood in ny_grouped['Neighborhood']:
    temp = ny_grouped[ny_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})

In [15]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [44]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = ny_grouped['Neighborhood']

for ind in np.arange(ny_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ny_grouped.iloc[ind, :], num_top_venues)


In [45]:
# set number of clusters
kclusters = 5

ny_grouped_clustering = ny_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ny_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:50] 

array([1, 1, 0, 0, 1, 4, 4, 2, 4, 1, 4, 4, 4, 1, 4, 4, 1, 4, 1, 2, 4, 1,
       1, 4, 4, 1, 0, 4, 1, 0, 1, 4, 1, 1, 1, 4, 1, 4, 1, 4, 0, 4, 2, 4,
       4, 1, 1, 4, 4, 4], dtype=int32)

In [46]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ny_merged = ny_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ny_merged = ny_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

ny_merged.head() # check the last columns!

NameError: name 'ny_data' is not defined

In [52]:
ny_merged['Cluster Labels'] = ny_merged['Cluster Labels'].fillna(0).astype(np.int64)
ny_merged.dtypes

Borough                   object
Neighborhood              object
Latitude                 float64
Longitude                float64
Cluster Labels             int64
1st Most Common Venue     object
2nd Most Common Venue     object
3rd Most Common Venue     object
4th Most Common Venue     object
5th Most Common Venue     object
dtype: object

In [53]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ny_merged['Latitude'], ny_merged['Longitude'], ny_merged['Neighborhood'], ny_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# Identifying clusters defined by a cafe as its most top venue #

In [54]:
ny_merged.loc[ny_merged['Cluster Labels'] == 0, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
77,Manhattan Beach,Bus Stop,Sandwich Place,Playground,Beach,Ice Cream Shop
172,Breezy Point,Trail,Bus Stop,Beach,Board Shop,Monument / Landmark
198,New Brighton,Bus Stop,Park,Convenience Store,Bowling Alley,Discount Store
202,Grymes Hill,Bus Stop,Dog Run,Women's Store,Factory,Falafel Restaurant
204,South Beach,Beach,Pier,Deli / Bodega,Bus Stop,Athletics & Sports
207,Port Ivory,,,,,
212,Oakwood,Bar,Bus Station,Bus Stop,Flower Shop,Factory
224,Park Hill,Bus Stop,Coffee Shop,Hotel,Athletics & Sports,Gym / Fitness Center
227,Arlington,Intersection,American Restaurant,Bus Stop,Grocery Store,Women's Store
238,Butler Manor,Pool,Baseball Field,Convenience Store,Bus Stop,Furniture / Home Store


In [55]:
ny_merged.loc[ny_merged['Cluster Labels'] == 1, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Wakefield,Laundromat,Pharmacy,Donut Shop,Gas Station,Sandwich Place
1,Co-op City,Park,Restaurant,Bus Station,Fast Food Restaurant,Pharmacy
2,Eastchester,Caribbean Restaurant,Deli / Bodega,Bus Station,Diner,Bus Stop
5,Kingsbridge,Pizza Place,Bar,Sandwich Place,Latin American Restaurant,Discount Store
6,Marble Hill,Coffee Shop,Sandwich Place,Discount Store,Yoga Studio,Pizza Place
7,Woodlawn,Deli / Bodega,Bar,Pub,Pizza Place,Playground
8,Norwood,Pizza Place,Park,Bank,Deli / Bodega,Pharmacy
10,Baychester,Donut Shop,Music Venue,Fried Chicken Joint,Spanish Restaurant,Supermarket
11,Pelham Parkway,Deli / Bodega,Italian Restaurant,Pizza Place,Plaza,Bus Station
13,Bedford Park,Diner,Deli / Bodega,Pizza Place,Fried Chicken Joint,Supermarket


In [56]:
ny_merged.loc[ny_merged['Cluster Labels'] == 2, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
4,Riverdale,Bus Station,Park,Bank,Plaza,Food Truck
15,Morris Heights,Bus Station,Pharmacy,Moving Target,Grocery Store,Spanish Restaurant
18,West Farms,Bus Station,Donut Shop,Bus Stop,Playground,Scenic Lookout
19,High Bridge,Chinese Restaurant,Sandwich Place,Seafood Restaurant,Pharmacy,Pizza Place
26,Soundview,Chinese Restaurant,Liquor Store,Basketball Court,Fried Chicken Joint,Bus Station
30,Parkchester,Supermarket,Pizza Place,Women's Store,American Restaurant,Asian Restaurant
41,Olinville,Fried Chicken Joint,Caribbean Restaurant,Furniture / Home Store,Laundromat,Metro Station
42,Pelham Gardens,Chinese Restaurant,Bus Station,Pharmacy,Donut Shop,Café
45,Edenwald,Gas Station,Supermarket,Bus Station,Fish Market,Grocery Store
56,East Flatbush,Park,Chinese Restaurant,Department Store,Bakery,Supermarket


In [58]:
ny_merged.loc[ny_merged['Cluster Labels'] == 3, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
192,Somerville,Park,Women's Store,Flea Market,Exhibit,Factory
203,Todt Hill,Park,Women's Store,Flea Market,Exhibit,Factory


In [59]:
ny_merged.loc[ny_merged['Cluster Labels'] == 4, ny_merged.columns[[1] + list(range(5, ny_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
3,Fieldston,River,Playground,Plaza,Women's Store,Fish & Chips Shop
9,Williamsbridge,Bar,Nightclub,Caribbean Restaurant,Soup Place,Flea Market
12,City Island,Harbor / Marina,Grocery Store,Seafood Restaurant,Thrift / Vintage Store,Baseball Field
22,Port Morris,Latin American Restaurant,Peruvian Restaurant,Brewery,Grocery Store,Donut Shop
24,Hunts Point,Bank,Spanish Restaurant,Bakery,Farmers Market,BBQ Joint
27,Clason Point,Park,South American Restaurant,Boat or Ferry,Scenic Lookout,Grocery Store
28,Throgs Neck,Coffee Shop,Deli / Bodega,American Restaurant,Sports Bar,Baseball Field
29,Country Club,Sandwich Place,Comic Shop,Playground,Liquor Store,Flea Market
46,Bay Ridge,Italian Restaurant,Spa,Thai Restaurant,American Restaurant,Chinese Restaurant
47,Bensonhurst,Grocery Store,Ice Cream Shop,Donut Shop,Sushi Restaurant,Chinese Restaurant
