# Battle of Neighborhoods.
Let's assume we want to open a a franchise coffee shop in Manhattan borough of New York City. We have 3 addresses where we can lease retail space. Using Four Square data, we want to decide which of the addresses will attract the largest number of reviews, and which these two franchise brands: Starbucks and Dunkin Donuts will have the highest review score based on the  coffee shop data gathered. 

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [2]:
import urllib.request
f = urllib.request.urlopen('https://cocl.us/new_york_dataset/newyork_data.json') 
with f as json_data:
    newyork_data = json.load(json_data)

In [3]:
neighborhoods_data = newyork_data['features']

In [4]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [5]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [6]:
df_man = neighborhoods[neighborhoods.Borough=='Manhattan'].reset_index(drop=True)
df_man.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [38]:
address = '316 Lafayette St, New York City, NY 10012'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7254535652174, -73.9951589130435.


In [39]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_man['Latitude'], df_man['Longitude'], df_man['Borough'], df_man['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [111]:
#FourSquare credentials
import yaml
FOURSQ_CONFIG_FILE = '4SQ.env'
with open(FOURSQ_CONFIG_FILE, 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.BaseLoader)

# set parameters for FourSquare API calls
CLIENT_ID = config['FOURSQUARE']['CLIENT_ID']
CLIENT_SECRET = config['FOURSQUARE']['CLIENT_SECRET']
VERSION = '20190708' # Foursquare API version
radius = 1000 # 1KM radius
limit = 500 # 1K result limit

In [61]:
def getNearbyVenues(coords, categoryID='', q_name='',radius=500, intent="browse"):
    venues_list=[]
    for lat, lng in coords:
        #print(q_name)
            
        # create the API request URL
        url = ('https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&v={}&ll={},{}&radius={}'
        '&intent={}&categoryId={}'
        '&limit={}'
        '&query={}'
        ).format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            intent,
            categoryID,
            limit,
            q_name
        )
            
        # make the GET request
#        print("URL:"+url)
        url_results = requests.get(url).json()["response"]
#        print(url_results)
        #results=url_results['groups'][0]['items']
        results=url_results['venues']
        # return only relevant information for each nearby venue
        venues_list.append([(
            v['id'],
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],  
            v['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = [ 
                  'Venue ID',
                  'Venue Name', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
# set foursquare category "Coffee Shop" 
fsq_cat_coffeshop ='4bf58dd8d48988d1e0931735'

In [16]:
import math
def shift_coord(lat,lon, dist,bearing):
    R = 6371.0088 #Radius of the Earth
    brng = math.radians(bearing)


    lat1 = math.radians(lat) #lat point converted to radians
    lon1 = math.radians(lon) #long point converted to radians

    lat2 = math.asin( math.sin(lat1)*math.cos(dist/R) +
    math.cos(lat1)*math.sin(dist/R)*math.cos(brng))

    lon2 = lon1 + math.atan2(math.sin(brng)*math.sin(dist/R)*math.cos(lat1),
             math.cos(dist/R)-math.sin(lat1)*math.sin(lat2))

    lat2 = math.degrees(lat2)
    lon2 = math.degrees(lon2)

    return (lat2, lon2)


In [14]:
def cgrid(latitude, longitude, distance, legs):
    ab=[]
    for i in range(-legs,legs):
        if i==0:
            ab.append([latitude, longitude])
            continue
        #ab.append(shift_coord(latitude, longitude, distance*i, j) for j in (0,90,180,270))
        for j in (0,90,180,270):
            ab.append(shift_coord(latitude, longitude, distance*i, j))
        #ab.append(bb)    
    return ab


In [40]:
print("Coord grid for location "+str(latitude)+','+str(longitude))
coords =cgrid(latitude, longitude, 0.25, 2) 
print(coords)


Coord grid for location 40.7254535652174,-73.9951589130435
[(40.72095696339877, -73.9951589130435), (40.72545341331191, -74.00109232208771), (40.729950167036016, -73.9951589130435), (40.72545341331191, -73.9892255039993), (40.72320526430809, -73.9951589130435), (40.72545352724103, -73.998125617569), (40.72770186612671, -73.9951589130435), (40.72545352724103, -73.992192208518), [40.7254535652174, -73.9951589130435], (40.72770186612671, -73.9951589130435), (40.72545352724103, -73.992192208518), (40.72320526430809, -73.9951589130435), (40.72545352724103, -73.998125617569)]


In [41]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=15
                        )

# add markers to map
for lat, lng in coords:
    label = '{}, {}'.format(lat, lng)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [42]:
df_coffee = getNearbyVenues(coords, categoryID=fsq_cat_coffeshop,radius=500)

URL:https://api.foursquare.com/v2/venues/search?client_id=APFO4O5P2WULX55JIHSWD5B4KU2GF5U230AWJFYWXAWX01LE&client_secret=FU2MYLIUVPCUR4BHZ5GGFRCWPIINHULORDFNP3XASR3NYLG1&v=20190708&ll=40.72095696339877,-73.9951589130435&radius=500&intent=browse&categoryId=4bf58dd8d48988d1e0931735&limit=500
URL:https://api.foursquare.com/v2/venues/search?client_id=APFO4O5P2WULX55JIHSWD5B4KU2GF5U230AWJFYWXAWX01LE&client_secret=FU2MYLIUVPCUR4BHZ5GGFRCWPIINHULORDFNP3XASR3NYLG1&v=20190708&ll=40.72545341331191,-74.00109232208771&radius=500&intent=browse&categoryId=4bf58dd8d48988d1e0931735&limit=500
URL:https://api.foursquare.com/v2/venues/search?client_id=APFO4O5P2WULX55JIHSWD5B4KU2GF5U230AWJFYWXAWX01LE&client_secret=FU2MYLIUVPCUR4BHZ5GGFRCWPIINHULORDFNP3XASR3NYLG1&v=20190708&ll=40.729950167036016,-73.9951589130435&radius=500&intent=browse&categoryId=4bf58dd8d48988d1e0931735&limit=500
URL:https://api.foursquare.com/v2/venues/search?client_id=APFO4O5P2WULX55JIHSWD5B4KU2GF5U230AWJFYWXAWX01LE&client_secret=FU2M

In [43]:
#df_coffee.drop_duplicates(inplace=True)
df_coffee =df_coffee.drop_duplicates().set_index("Venue ID")
#.reset_index(inplace=True,drop=True)
df_coffee.shape[0]

122

In [62]:
# get starbucks shops only
df_sb = getNearbyVenues(coords, categoryID=fsq_cat_coffeshop,radius=500, q_name='starbucks')
df_sb =df_sb.drop_duplicates().set_index("Venue ID")
#.reset_index(inplace=True,drop=True)
df_sb.shape[0]

15

In [63]:
# get dunkin' donuts shops only
df_dd = getNearbyVenues(coords, categoryID=fsq_cat_coffeshop,radius=500, q_name='dunkin')
df_dd =df_dd.drop_duplicates().set_index("Venue ID")
#.reset_index(inplace=True,drop=True)
df_dd.shape[0]

5

In [65]:
df_dd["Chain"]="DD"
df_sb["Chain"]="Starbucks"


In [58]:
radius = 1000
limit=500
latitudes=df_man[1:2]['Latitude']
longitudes=df_man[1:2]['Longitude']
df_coffee = getNearbyVenues(names, latitudes, longitudes, categoryID=fsq_cat_coffeshop,radius=2000)
#df_sb = getNearbyVenues(names, latitudes, longitudes, categoryID='4bf58dd8d48988d1e0931735', v_name='Starbucks',radius=1000)
#df_dd = getNearbyVenues(names, latitudes, longitudes, categoryID='4bf58dd8d48988d1e0931735', v_name='Dunkin',radius=1000)

Chinatown 
URL:https://api.foursquare.com/v2/venues/search?client_id=APFO4O5P2WULX55JIHSWD5B4KU2GF5U230AWJFYWXAWX01LE&client_secret=FU2MYLIUVPCUR4BHZ5GGFRCWPIINHULORDFNP3XASR3NYLG1&v=20190708&ll=40.71561842231432,-73.99427936255978&radius=2000&intent=browse&categoryId=4bf58dd8d48988d1e0931735&limit=500


In [80]:
df_coffee.index

Index(['49e63a33f964a52023641fe3', '49ee02e9f964a52010681fe3',
       '49e644c6f964a5202f641fe3', '560d1028498eee2df9f982f0',
       '57470da2498ed71d9912da41', '5bba6050c36588002c310910',
       '4f0f47650cd695a0e54cb438', '4bae332df964a52045913be3',
       '5abfe6bbc0af5705a1bcacdb', '4f22cc3fe4b0e1207fe7c676',
       ...
       '507067abe4b07b49cb026887', '5b28f5836336be002d4ff5f0',
       '5bf853b1628c83002cb5871e', '4a91768cf964a520441a20e3',
       '5abd76874c954c34ab6d0533', '5869bd8fca10703aaaab01af',
       '49cbb835f964a52010591fe3', '58daf7ee51666a12c54ba74a',
       '5c815acb1953f3002ceb5700', '5c124bda9deb7d002c5b14be'],
      dtype='object', name='Venue ID', length=122)

In [93]:
#get all coffe shops that are not starbucks or dunkin donuts 
manhattan_coffeeshops=df_coffee.drop(df_sb.index,errors="ignore").drop(df_dd.index,errors="ignore")
manhattan_coffeeshops["Chain"]="Other"
manhattan_coffeeshops= pd.concat([manhattan_coffeeshops,df_sb,df_dd])

In [94]:
manhattan_coffeeshops

Unnamed: 0_level_0,Venue Name,Venue Latitude,Venue Longitude,Venue Category,Chain
Venue ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
49e63a33f964a52023641fe3,McNally Jackson Books,40.723493,-73.996048,Bookstore,Other
49ee02e9f964a52010681fe3,La Colombe Coffee Roasters,40.723929,-73.996315,Coffee Shop,Other
560d1028498eee2df9f982f0,JOE & THE JUICE,40.722482,-73.997453,Juice Bar,Other
57470da2498ed71d9912da41,Cha Cha Matcha,40.720229,-73.996141,Tea Room,Other
5bba6050c36588002c310910,Cha Cha Matcha,40.725402,-73.994887,Coffee Shop,Other
4f0f47650cd695a0e54cb438,Jack's Wife Freda,40.722022,-73.997528,Mediterranean Restaurant,Other
4bae332df964a52045913be3,Allegro Coffee Roasters,40.723627,-73.991291,Coffee Shop,Other
5abfe6bbc0af5705a1bcacdb,COFFEE & CREAM by Oddfellows,40.724707,-73.994281,Ice Cream Shop,Other
4f22cc3fe4b0e1207fe7c676,Smile To Go,40.719502,-74.00036,Café,Other
52b0a990498eb3a992b89573,Happy Bones,40.720648,-73.997125,Coffee Shop,Other


In [97]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=15)
chain_color = { "Starbucks":"green","DD":"purple","Other":"blue"}

# add markers to map
for lat, lng, venue, chain in zip(manhattan_coffeeshops['Venue Latitude'], manhattan_coffeeshops['Venue Longitude'], manhattan_coffeeshops['Venue Name'],manhattan_coffeeshops['Chain']):
    label = venue
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=chain_color.get(chain),
        fill=True,
        fill_color=chain_color.get(chain),
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [None]:
#get venue stats
def getVenueStats(v_list):
    venues_list=[]
    for ven_id in v_list:
#        print(ven_id)   
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/{}?&v={}&client_id={}&client_secret={}'.format(
            ven_id,
            VERSION,
            CLIENT_ID, 
            CLIENT_SECRET)          
        # make the GET request
#        print("URL:"+url) 
        url_results = requests.get(url).json()["response"]
#        print(url_results)
        #results=url_results['groups'][0]['items']
        results=url_results['venue']
#        print(results)
        # return only relevant information for each nearby venue
        try:  tip_count = results['stats']['tipCount']            
        except KeyError: tip_count = 0
        try:  visit_count = results['stats']['visitsCount']            
        except KeyError: visit_count = 0
        venues_list.append([(results['id'],tip_count, visit_count)])

    venue_stats = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    venue_stats.columns = ['Venue ID', 
                  'tipCount', 
#                  'usersCount', 
#                  'checkinsCount',
                  'visitsCount']
    
    return(venue_stats)

In [131]:
VERSION = '20190708'
df_mmm = getVenueStats(['412d2800f964a520df0c1fe3'])
df_mmm

412d2800f964a520df0c1fe3


Unnamed: 0,Venue ID,tipCount,visitsCount
0,412d2800f964a520df0c1fe3,1782,0


In [134]:
VERSION = '20190708'
df_stat = getVenueStatsRandom(manhattan_coffeeshops.index)
#df_stat = pd.concat([df_sb.join(getVenueStatsRandom(df_sb['Venue ID']).set_index("Venue ID"), on='Venue ID'),
#                     df_dd.join(getVenueStatsRandom(df_dd['Venue ID']).set_index("Venue ID"), on='Venue ID')])
df_stat

Unnamed: 0,Venue ID,tipCount,visitsCount
0,49e63a33f964a52023641fe3,2200,4825
1,49ee02e9f964a52010681fe3,437,4814
2,560d1028498eee2df9f982f0,2272,2162
3,57470da2498ed71d9912da41,4027,124
4,5bba6050c36588002c310910,4841,4089
5,4f0f47650cd695a0e54cb438,4214,805
6,4bae332df964a52045913be3,3248,3019
7,5abfe6bbc0af5705a1bcacdb,2669,1724
8,4f22cc3fe4b0e1207fe7c676,2442,2962
9,52b0a990498eb3a992b89573,3608,2146


In [140]:
df_stat.set_index('Venue ID', inplace=True)

In [133]:
import random
def getVenueStatsRandom(v_list):
    venues_list=[]
    for ven_id in v_list:
        venues_list.append([(ven_id,random.randint(0, 5000), random.randint(0, 5000))])

    venue_stats = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    venue_stats.columns = ['Venue ID', 
                  'tipCount', 
#                  'usersCount', 
#                  'checkinsCount',
                  'visitsCount']
    
    return(venue_stats)

In [154]:
#build a KNN model (nearest neigbour)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
import itertools
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
from sklearn import metrics
%matplotlib inline

In [145]:
result =manhattan_coffeeshops.join(df_stat)
                                   #],verify_integrity=True, ignore_errors=True)
result

Unnamed: 0_level_0,Venue Name,Venue Latitude,Venue Longitude,Venue Category,Chain,tipCount,visitsCount
Venue ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
49e63a33f964a52023641fe3,McNally Jackson Books,40.723493,-73.996048,Bookstore,Other,2200,4825
49ee02e9f964a52010681fe3,La Colombe Coffee Roasters,40.723929,-73.996315,Coffee Shop,Other,437,4814
560d1028498eee2df9f982f0,JOE & THE JUICE,40.722482,-73.997453,Juice Bar,Other,2272,2162
57470da2498ed71d9912da41,Cha Cha Matcha,40.720229,-73.996141,Tea Room,Other,4027,124
5bba6050c36588002c310910,Cha Cha Matcha,40.725402,-73.994887,Coffee Shop,Other,4841,4089
4f0f47650cd695a0e54cb438,Jack's Wife Freda,40.722022,-73.997528,Mediterranean Restaurant,Other,4214,805
4bae332df964a52045913be3,Allegro Coffee Roasters,40.723627,-73.991291,Coffee Shop,Other,3248,3019
5abfe6bbc0af5705a1bcacdb,COFFEE & CREAM by Oddfellows,40.724707,-73.994281,Ice Cream Shop,Other,2669,1724
4f22cc3fe4b0e1207fe7c676,Smile To Go,40.719502,-74.00036,Café,Other,2442,2962
52b0a990498eb3a992b89573,Happy Bones,40.720648,-73.997125,Coffee Shop,Other,3608,2146


In [149]:
# Define which columns should be encoded vs scaled
columns_to_encode = ['Chain']
columns_to_scale  = ['Venue Latitude','Venue Longitude']

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe    = OneHotEncoder(sparse=False)

# Scale and Encode Separate Columns
scaled_columns  = scaler.fit_transform(result[columns_to_scale]) 
encoded_columns =    ohe.fit_transform(result[columns_to_encode])

# Concatenate (Column-Bind) Processed Columns Back Together
X = np.concatenate([scaled_columns, encoded_columns], axis=1)
y = result["visitsCount"]

In [150]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (101, 5) (101,)
Test set: (26, 5) (26,)


In [151]:
from sklearn.neighbors import KNeighborsClassifier
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
from sklearn import metrics
%matplotlib inline

In [152]:
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

mean_acc

array([0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [153]:
for k in range(4,12):  #Train Model and Predict  
  neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
  yhat = neigh.predict(X_test)
  print("For k=",k)
  print("Train set Accuracy: ", metrics.accuracy_score(y_train,neigh.predict(X_train)))
  print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))


For k= 4
Train set Accuracy:  0.26732673267326734
Test set Accuracy:  0.0
For k= 5
Train set Accuracy:  0.22772277227722773
Test set Accuracy:  0.0
For k= 6
Train set Accuracy:  0.1782178217821782
Test set Accuracy:  0.0
For k= 7
Train set Accuracy:  0.1485148514851485
Test set Accuracy:  0.0
For k= 8
Train set Accuracy:  0.1188118811881188
Test set Accuracy:  0.0
For k= 9
Train set Accuracy:  0.10891089108910891
Test set Accuracy:  0.0
For k= 10
Train set Accuracy:  0.09900990099009901
Test set Accuracy:  0.0
For k= 11
Train set Accuracy:  0.09900990099009901
Test set Accuracy:  0.0
