# New York City: Where to live for food-loving tourists

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import time
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

## Data Collection and Cleaning

In [2]:
# Look up the coordinates of Manhattan; they are used later to center the folium maps.
address = 'Manhattan, New York'

# Nominatim's usage policy asks for an identifying user agent, and geopy >= 2.0
# raises a ConfigurationError when none is supplied.
geolocator = Nominatim(user_agent='ny_food_explorer')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the address cannot be resolved.
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Manhattan are 40.7900869, -73.9598295.


In [3]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
with open('newyork_data.json') as json_data:
    dfNY = json.load(json_data)

In [4]:
# every entry under 'features' is read as one neighborhood in the next cell
neighborhoods_data = dfNY['features']

# empty frame with the target columns; the rows are added in the next cell
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
neighborhoods = pd.DataFrame(columns=column_names)

In [5]:
# Build the dataframe in one shot from a list of records. DataFrame.append was
# removed in pandas 2.0, and appending inside a loop copies the whole frame on
# every iteration.
neighborhoods = pd.DataFrame(
    [
        {
            'Borough': data['properties']['borough'],
            'Neighborhood': data['properties']['name'],
            # the coordinate pair is ordered [longitude, latitude]
            'Latitude': data['geometry']['coordinates'][1],
            'Longitude': data['geometry']['coordinates'][0],
        }
        for data in neighborhoods_data
    ],
    columns=column_names,
)

In [6]:
# Preview the first rows as a sanity check.
# NOTE(review): this rebinds dfNY, which held the parsed JSON until now, to the
# neighborhoods dataframe; both names refer to the same object from here on.
dfNY = neighborhoods
dfNY.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [7]:
# keep only the Manhattan rows and renumber them from zero
mandf = neighborhoods.loc[neighborhoods['Borough'].eq('Manhattan')].reset_index(drop=True)
mandf.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [8]:
# installing folium library
!pip install folium
import folium



## Collecting Foursquare data

In [9]:
import os
import warnings

# Foursquare credentials are read from the environment so that they never end up
# in the notebook file or its version history. Keys that were committed in the
# past should be treated as compromised and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # Foursquare Secret
VERSION = '20180605'  # Foursquare API version
LIMIT = 200  # sent as the `limit` query parameter of every request

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn right away so that a later failed request is easy to diagnose.
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'export them before running the Foursquare cells.'
    )

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Collect the venues returned by Foursquare's explore endpoint around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of every location to explore. They are walked in
        parallel with zip(), so iteration stops at the shortest one.
    radius : int, default 500
        Sent as the `radius` query parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but still has these columns, when no venue was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (for example bad credentials).
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of formatting
        # the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(endpoint, params=params, timeout=timeout)
        # Fail with the HTTP error itself rather than a KeyError further down.
        response.raise_for_status()
        results = response.json()['response']['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue may carry no category at all
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor also works when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

In [11]:
# query Foursquare once for every Manhattan neighborhood
manhattan_venues = getNearbyVenues(
    mandf['Neighborhood'],
    mandf['Latitude'],
    mandf['Longitude'],
)

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


In [12]:
# number of venues per category
manhattan_venues['Venue Category'].value_counts()

Italian Restaurant                          128
Coffee Shop                                 124
Café                                         82
American Restaurant                          78
Bakery                                       75
Pizza Place                                  74
Park                                         67
Hotel                                        61
Gym                                          60
Gym / Fitness Center                         59
Cocktail Bar                                 58
Mexican Restaurant                           56
Bar                                          53
French Restaurant                            52
Chinese Restaurant                           48
Spa                                          48
Sushi Restaurant                             48
Sandwich Place                               46
Wine Shop                                    45
Japanese Restaurant                          44
Ice Cream Shop                          

In [13]:
# count the distinct venue categories found across all neighborhoods
print('There are {} uniques categories.'.format(len(manhattan_venues['Venue Category'].unique())))

There are 338 uniques categories.


## Slicing the Foursquare data into the 3 categories for further analysis

In [14]:
# cafés only; reset_index() keeps the previous row label in an 'index' column
manhattan_coffee = manhattan_venues.loc[manhattan_venues['Venue Category'].eq('Café')].reset_index()

In [15]:
# American restaurants only; reset_index() keeps the previous row label in an 'index' column
manhattan_american = manhattan_venues.loc[manhattan_venues['Venue Category'].eq('American Restaurant')].reset_index()

In [16]:
# pizza places only; reset_index() keeps the previous row label in an 'index' column
manhattan_pizza = manhattan_venues.loc[manhattan_venues['Venue Category'].eq('Pizza Place')].reset_index()

### For coffee lovers

In [17]:
# Preview the café venues. A bare groupby('Neighborhood') used to precede this
# line; its result was discarded, so it had no effect and was removed.
manhattan_coffee.head()

Unnamed: 0,index,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,124,Washington Heights,40.851903,-73.9369,Green Juice Cafe,40.851898,-73.934827,Café
1,127,Washington Heights,40.851903,-73.9369,Le Chéile,40.851459,-73.939989,Café
2,168,Washington Heights,40.851903,-73.9369,Filtered Coffee,40.851607,-73.935014,Café
3,185,Washington Heights,40.851903,-73.9369,Pick and Eat,40.847558,-73.938212,Café
4,197,Washington Heights,40.851903,-73.9369,Scoop Cafe,40.849988,-73.935396,Café


In [18]:
# base map centred on the geocoded Manhattan coordinates
map_cof = folium.Map(location=[latitude, longitude], zoom_start=12)

# one small blue dot per neighborhood, with the neighborhood name as popup
for hood_lat, hood_lng, hood_name in zip(mandf['Latitude'], mandf['Longitude'], mandf['Neighborhood']):
    hood_marker = folium.CircleMarker(
        [hood_lat, hood_lng],
        radius=2,
        popup=folium.Popup(hood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    hood_marker.add_to(map_cof)

In [19]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [20]:
# number of clusters to form
kclusters = 6

# the cafés are clustered on their own coordinates only
manhattan_coffee_cluster = manhattan_coffee[['Venue Latitude','Venue Longitude']]

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit makes the labels depend on the installed version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(manhattan_coffee_cluster)
kmeans.labels_[0:10]

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 4], dtype=int32)

In [21]:
# Attach each venue's cluster label as the first column. A label column left
# over from a previous run of this cell is dropped first, because
# DataFrame.insert raises ValueError when the column already exists.
if 'Cluster Labels' in manhattan_coffee.columns:
    manhattan_coffee = manhattan_coffee.drop(columns='Cluster Labels')
manhattan_coffee.insert(0, 'Cluster Labels', kmeans.labels_)

manhattan_coffee.head()

Unnamed: 0,Cluster Labels,index,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,3,124,Washington Heights,40.851903,-73.9369,Green Juice Cafe,40.851898,-73.934827,Café
1,3,127,Washington Heights,40.851903,-73.9369,Le Chéile,40.851459,-73.939989,Café
2,3,168,Washington Heights,40.851903,-73.9369,Filtered Coffee,40.851607,-73.935014,Café
3,3,185,Washington Heights,40.851903,-73.9369,Pick and Eat,40.847558,-73.938212,Café
4,3,197,Washington Heights,40.851903,-73.9369,Scoop Cafe,40.849988,-73.935396,Café


In [22]:
# one hex color per cluster, evenly spaced over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Draw every café as a dot colored by its cluster. KMeans labels run from 0 to
# kclusters - 1, so they index `rainbow` directly; indexing with `cluster-1`
# only worked because index -1 wraps around to the last color.
for lat, lon, poi, cluster in zip(manhattan_coffee['Venue Latitude'], manhattan_coffee['Venue Longitude'], manhattan_coffee['Venue'], manhattan_coffee['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_cof)

# last expression of the cell: renders the map
map_cof

In [23]:
# Listing of cafés per cluster and neighborhood: only the venue name is kept,
# sorted and indexed by (cluster, neighborhood). The bare groupby() that used to
# open this cell discarded its result and was removed.
manhattan_coffee_f = (
    manhattan_coffee
    .drop(columns=['index', 'Neighborhood Latitude', 'Neighborhood Longitude',
                   'Venue Latitude', 'Venue Longitude', 'Venue Category'])
    .sort_values(['Cluster Labels', 'Neighborhood'])
    .set_index(['Cluster Labels', 'Neighborhood'])
)

In [24]:
# display the café listing indexed by cluster and neighborhood
manhattan_coffee_f

Unnamed: 0_level_0,Unnamed: 1_level_0,Venue
Cluster Labels,Neighborhood,Unnamed: 2_level_1
0,Carnegie Hill,Bonjour Crêpes & Wine
0,Carnegie Hill,Corner Cafe and Bakery
0,Carnegie Hill,Bluestone Lane
0,Carnegie Hill,Bocado Café
0,East Harlem,Frenchy Cafe NYC
0,Lenox Hill,B. Cafe - East
0,Lenox Hill,Inès
0,Lenox Hill,Le Gourmet
0,Roosevelt Island,Pyramid Cafe
0,Yorkville,Ottomanelli Cafe


### For American restaurant lovers

In [25]:
# Base map centred on the geocoded Manhattan coordinates. The bare
# groupby('Neighborhood') that used to open this cell discarded its result and
# was removed.
map_am = folium.Map(location=[latitude, longitude], zoom_start=12)

# one small blue dot per neighborhood, with the neighborhood name as popup
for hood_lat, hood_lng, hood_name in zip(mandf['Latitude'], mandf['Longitude'], mandf['Neighborhood']):
    hood_marker = folium.CircleMarker(
        [hood_lat, hood_lng],
        radius=2,
        popup=folium.Popup(hood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    hood_marker.add_to(map_am)

In [26]:
# number of clusters to form
kclusters = 6

# the American restaurants are clustered on their own coordinates only
manhattan_am_cluster = manhattan_american[['Venue Latitude','Venue Longitude']]

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit makes the labels depend on the installed version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(manhattan_am_cluster)
kmeans.labels_[0:10]

array([3, 2, 2, 2, 2, 3, 3, 3, 1, 1], dtype=int32)

In [27]:
# Attach each venue's cluster label as the first column. A label column left
# over from a previous run of this cell is dropped first, because
# DataFrame.insert raises ValueError when the column already exists.
if 'Cluster Labels' in manhattan_american.columns:
    manhattan_american = manhattan_american.drop(columns='Cluster Labels')
manhattan_american.insert(0, 'Cluster Labels', kmeans.labels_)

In [28]:
# one hex color per cluster, evenly spaced over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Draw every American restaurant as a dot colored by its cluster. KMeans labels
# run from 0 to kclusters - 1, so they index `rainbow` directly; indexing with
# `cluster-1` only worked because index -1 wraps around to the last color.
for lat, lon, poi, cluster in zip(manhattan_american['Venue Latitude'], manhattan_american['Venue Longitude'], manhattan_american['Venue'], manhattan_american['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_am)

# last expression of the cell: renders the map
map_am

In [29]:
# Listing of American restaurants per cluster and neighborhood: only the venue
# name is kept, sorted and indexed by (cluster, neighborhood). The bare groupby()
# that used to open this cell discarded its result and was removed.
# NOTE(review): this overwrites manhattan_american with the reduced frame, so the
# cells above cannot be re-run afterwards without rebuilding it first.
manhattan_american = (
    manhattan_american
    .drop(columns=['index', 'Neighborhood Latitude', 'Neighborhood Longitude',
                   'Venue Latitude', 'Venue Longitude', 'Venue Category'])
    .sort_values(['Cluster Labels', 'Neighborhood'])
    .set_index(['Cluster Labels', 'Neighborhood'])
)

In [30]:
# display the American restaurant listing indexed by cluster and neighborhood
manhattan_american

Unnamed: 0_level_0,Unnamed: 1_level_0,Venue
Cluster Labels,Neighborhood,Unnamed: 2_level_1
0,Chelsea,Cookshop
0,Chelsea,Westville Chelsea
0,Chelsea,Foragers Table
0,Clinton,West Bank Cafe
0,Clinton,Chef's Table At Brooklyn Fare
0,Clinton,44 & X
0,Clinton,Legacy Records
0,Hudson Yards,Friedman's
0,Hudson Yards,Legacy Records
0,Hudson Yards,Chef's Table At Brooklyn Fare


### For pizza lovers

In [31]:
# Base map centred on the geocoded Manhattan coordinates. The bare
# groupby('Neighborhood') that used to open this cell discarded its result and
# was removed.
map_p = folium.Map(location=[latitude, longitude], zoom_start=12)

# one small blue dot per neighborhood, with the neighborhood name as popup
for hood_lat, hood_lng, hood_name in zip(mandf['Latitude'], mandf['Longitude'], mandf['Neighborhood']):
    hood_marker = folium.CircleMarker(
        [hood_lat, hood_lng],
        radius=2,
        popup=folium.Popup(hood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    hood_marker.add_to(map_p)

In [32]:
# number of clusters to form
kclusters = 6

# the pizza places are clustered on their own coordinates only
manhattan_p_cluster = manhattan_pizza[['Venue Latitude','Venue Longitude']]

# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit makes the labels depend on the installed version.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(manhattan_p_cluster)
kmeans.labels_[0:10]

array([3, 5, 3, 3, 3, 3, 2, 2, 2, 2], dtype=int32)

In [33]:
# Attach each venue's cluster label as the first column. A label column left
# over from a previous run of this cell is dropped first, because
# DataFrame.insert raises ValueError when the column already exists.
if 'Cluster Labels' in manhattan_pizza.columns:
    manhattan_pizza = manhattan_pizza.drop(columns='Cluster Labels')
manhattan_pizza.insert(0, 'Cluster Labels', kmeans.labels_)

In [34]:
# one hex color per cluster, evenly spaced over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Draw every pizza place as a dot colored by its cluster. KMeans labels run from
# 0 to kclusters - 1, so they index `rainbow` directly; indexing with
# `cluster-1` only worked because index -1 wraps around to the last color.
for lat, lon, poi, cluster in zip(manhattan_pizza['Venue Latitude'], manhattan_pizza['Venue Longitude'], manhattan_pizza['Venue'], manhattan_pizza['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_p)

# last expression of the cell: renders the map
map_p

In [37]:
# Listing of pizza places per cluster and neighborhood: only the venue name is
# kept, sorted and indexed by (cluster, neighborhood). The bare groupby() that
# used to open this cell discarded its result and was removed.
# NOTE(review): this overwrites manhattan_pizza with the reduced frame, so the
# cells above cannot be re-run afterwards without rebuilding it first.
manhattan_pizza = (
    manhattan_pizza
    .drop(columns=['index', 'Neighborhood Latitude', 'Neighborhood Longitude',
                   'Venue Latitude', 'Venue Longitude', 'Venue Category'])
    .sort_values(['Cluster Labels', 'Neighborhood'])
    .set_index(['Cluster Labels', 'Neighborhood'])
)

In [38]:
# display the pizza place listing indexed by cluster and neighborhood
manhattan_pizza

Unnamed: 0_level_0,Unnamed: 1_level_0,Venue
Cluster Labels,Neighborhood,Unnamed: 2_level_1
0,Carnegie Hill,Marinara Pizza Upper East
0,Carnegie Hill,San Matteo Pizza Espresso Bar
0,Carnegie Hill,Roma Pizza
0,Carnegie Hill,Nick's Restaurant & Pizzeria
0,Carnegie Hill,Marco Polo Pizza & Deli
0,Carnegie Hill,Little Luzzo's Pizza
0,Lenox Hill,Donna Margherita
0,Lenox Hill,La Crosta Restaurant & Gourmet Pizzeria
0,Lenox Hill,Numero 28 Pizza
0,Lenox Hill,Pizza Park
