# Battle of Neighborhoods.
Let's assume we want to open a a franchise coffee shop in Manhattan borough of New York City. We have 3 addresses where we can lease retail space. Using Four Square data, we want to decide which of the addresses will attract the largest number of reviews, and which these two franchise brands: Starbucks and Dunkin Donuts will have the highest review score based on the  coffee shop data gathered. 

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [2]:
import urllib.request
f = urllib.request.urlopen('https://cocl.us/new_york_dataset/newyork_data.json') 
with f as json_data:
    newyork_data = json.load(json_data)

In [3]:
neighborhoods_data = newyork_data['features']

In [4]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [5]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [6]:
df_man = neighborhoods[neighborhoods.Borough=='Manhattan'].reset_index(drop=True)
df_man.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [7]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [8]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_man['Latitude'], df_man['Longitude'], df_man['Borough'], df_man['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [62]:
#FourSquare credentials
import yaml
FOURSQ_CONFIG_FILE = '4SQ.env'

with open(FOURSQ_CONFIG_FILE, 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.BaseLoader)

# set parameters for FourSquare API calls
CLIENT_ID = config['FOURSQUARE']['CLIENT_ID']
CLIENT_SECRET = config['FOURSQUARE']['CLIENT_SECRET']
VERSION = '20190708' # Foursquare API version
radius = 1000 # 1KM radius
limit = 500 # 1K result limit

In [141]:
def getNearbyVenues(coords, categoryID='', q_name='',radius=500, intent="browse"):
    venues_list=[]
    for lat, lng in coords:
        #print(q_name)
            
        # create the API request URL
        url = ('https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&v={}&ll={},{}&radius={}'
        '&intent={}&categoryId={}'
        '&limit={}'
        #+'&query='
        ).format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            intent,
            categoryID,
            limit,
#            q_name
        )
            
        # make the GET request
#        print("URL:"+url)
        url_results = requests.get(url).json()["response"]
#        print(url_results)
        #results=url_results['groups'][0]['items']
        results=url_results['venues']
        # return only relevant information for each nearby venue
        venues_list.append([(
            v['id'],
            v['name'], 
            v['location']['lat'], 
            v['location']['lng'],  
            v['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = [ 
                  'Venue ID',
                  'Venue Name', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [60]:
# set foursquare category "Coffee Shop" 
fsq_cat_coffeshop ='4bf58dd8d48988d1e0931735'

In [None]:
#FourSquare limits venue search result to 50. We'll scrape for coffee shops using a grid of coordinates  
def shift_lat(lat, dist):
    return (lat - lower)/length

def shift_lon(lon,lat, dist):
    return (lat - lower)/length


In [74]:
import math
def shift_coord(lat,lon, dist,bearing):
    R = 6378.1 #Radius of the Earth
    brng = math.radians(bearing)


    lat1 = math.radians(lat) #lat point converted to radians
    lon1 = math.radians(lon) #long point converted to radians

    lat2 = math.asin( math.sin(lat1)*math.cos(dist/R) +
    math.cos(lat1)*math.sin(dist/R)*math.cos(brng))

    lon2 = lon1 + math.atan2(math.sin(brng)*math.sin(d/R)*math.cos(lat1),
             math.cos(d/R)-math.sin(lat1)*math.sin(lat2))

    lat2 = math.degrees(lat2)
    lon2 = math.degrees(lon2)

    return (lat2, lon2)


In [117]:
def cgrid(latitude, longitude, distance, legs):
    ab=[]
    for i in range(-legs,legs):
        if i==0:
            ab.append([latitude, longitude])
            continue
        #ab.append(shift_coord(latitude, longitude, distance*i, j) for j in (0,90,180,270))
        for j in (0,90,180,270):
            ab.append(shift_coord(latitude, longitude, distance*i, j))
        #ab.append(bb)    
    return ab


Coord grid for location 40.7127281,-74.0060152
[(40.70374489504663, -74.0060152), (40.71272749400073, -73.82824469929632), (40.72171130495337, -74.0060152), (40.71272749400073, -74.18378570070367), [40.7127281, -74.0060152]]


In [164]:
print("Coord grid for location "+str(latitude)+','+str(longitude))
coords =cgrid(latitude, longitude, 0.5, 1) 
print(coords)


Coord grid for location 40.7127281,-74.0060152
[(40.708236497523316, -74.0060152), (40.71272794850018, -73.82824469808284), (40.71721970247668, -74.0060152), (40.71272794850018, -74.18378570191715), [40.7127281, -74.0060152]]


In [165]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng in coords:
    label = '{}, {}'.format(lat, lng)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [161]:
df_coffee = getNearbyVenues(coords, categoryID=fsq_cat_coffeshop,radius=250)

In [162]:
#df_coffee.drop_duplicates(inplace=True)
df_coffee =df_coffee.drop_duplicates().set_index("Venue ID")
#.reset_index(inplace=True,drop=True)
df_coffee.shape[0]

94

In [58]:
radius = 1000
limit=500
latitudes=df_man[1:2]['Latitude']
longitudes=df_man[1:2]['Longitude']
df_coffee = getNearbyVenues(names, latitudes, longitudes, categoryID=fsq_cat_coffeshop,radius=2000)
#df_sb = getNearbyVenues(names, latitudes, longitudes, categoryID='4bf58dd8d48988d1e0931735', v_name='Starbucks',radius=1000)
#df_dd = getNearbyVenues(names, latitudes, longitudes, categoryID='4bf58dd8d48988d1e0931735', v_name='Dunkin',radius=1000)

Chinatown 
URL:https://api.foursquare.com/v2/venues/search?client_id=APFO4O5P2WULX55JIHSWD5B4KU2GF5U230AWJFYWXAWX01LE&client_secret=FU2MYLIUVPCUR4BHZ5GGFRCWPIINHULORDFNP3XASR3NYLG1&v=20190708&ll=40.71561842231432,-73.99427936255978&radius=2000&intent=browse&categoryId=4bf58dd8d48988d1e0931735&limit=500


In [59]:
manhattan_coffeeshops = df_coffee
manhattan_coffeeshops.shape[0]

50

In [18]:
manhattan_coffeeshops.iloc[0][["Neighborhood Latitude","Neighborhood Longitude"]].astype("float")

Neighborhood Latitude     40.715618
Neighborhood Longitude   -73.994279
Name: 0, dtype: float64

In [None]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=manhattan_coffeeshops.iloc[0][["Venue Latitude", 'Venue Longitude']], zoom_start=12)

# add markers to map
for lat, lng, venue in zip(manhattan_coffeeshops['Venue Latitude'], manhattan_coffeeshops['Venue Longitude'], manhattan_coffeeshops['Venue Name']):
    label = venue
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

In [None]:
#get venue status
def getVenueStats(v_list):
    venues_list=[]
    for ven_id in v_list:
        print(ven_id)   
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/{}?&v={}&client_id={}&client_secret={}'.format(
            ven_id,
            VERSION,
            CLIENT_ID, 
            CLIENT_SECRET)          
        # make the GET request
        #print("URL:"+url)
        url_results = requests.get(url).json()["response"]
        print(url_results)
        #results=url_results['groups'][0]['items']
        results=url_results['venue']
        print(results)
        # return only relevant information for each nearby venue
        try:  visit_count = results['stats']['visitsCount']            
        except KeyError: visit_count = 0
        venues_list.append([(results['id'], visit_count)])

    venue_stats = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    venue_stats.columns = ['Venue ID', 
#                  'tipCount', 
#                  'usersCount', 
#                  'checkinsCount',
                  'visitsCount']
    
    return(venue_stats)

In [None]:
VERSION = '20190708'
#df_sb_stat = getVenueStats(df_sb['Venue ID'])
df_stat = pd.concat([df_sb.join(getVenueStatsRandom(df_sb['Venue ID']).set_index("Venue ID"), on='Venue ID'),
                     df_dd.join(getVenueStatsRandom(df_dd['Venue ID']).set_index("Venue ID"), on='Venue ID')])
df_stat

In [None]:
import random
def getVenueStatsRandom(v_list):
    venues_list=[]
    for ven_id in v_list:
        venues_list.append([(ven_id, random.randint(0, 5000))])

    venue_stats = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    venue_stats.columns = ['Venue ID', 
#                  'tipCount', 
#                  'usersCount', 
#                  'checkinsCount',
                  'visitsCount']
    
    return(venue_stats)

In [None]:
#build a KNN model (nearest neigbour)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
df_sb['Type']="Starbucks"
df_dd['Type']="DD"
result = pd.concat([df_sb,df_dd])
result

In [None]:
# Define which columns should be encoded vs scaled
columns_to_encode = ['Type']
columns_to_scale  = ['Venue Latitude','Venue Longitude']

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe    = OneHotEncoder(sparse=False)

# Scale and Encode Separate Columns
scaled_columns  = scaler.fit_transform(df_stat[columns_to_scale]) 
encoded_columns =    ohe.fit_transform(df_stat[columns_to_encode])

# Concatenate (Column-Bind) Processed Columns Back Together
X = np.concatenate([scaled_columns, encoded_columns], axis=1)
y = df_stat["visitsCount"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=4)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import itertools
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import pandas as pd
import numpy as np
import matplotlib.ticker as ticker
from sklearn import preprocessing
from sklearn import metrics
%matplotlib inline

In [None]:
Ks = 10
mean_acc = np.zeros((Ks-1))
std_acc = np.zeros((Ks-1))
ConfustionMx = [];
for n in range(1,Ks):
    
    #Train Model and Predict  
    neigh = KNeighborsClassifier(n_neighbors = n).fit(X_train,y_train)
    yhat=neigh.predict(X_test)
    mean_acc[n-1] = metrics.accuracy_score(y_test, yhat)

    
    std_acc[n-1]=np.std(yhat==y_test)/np.sqrt(yhat.shape[0])

mean_acc

In [None]:
for k in range(4,12):  #Train Model and Predict  
  neigh = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)
  yhat = neigh.predict(X_test)
  print("For k=",k)
  print("Train set Accuracy: ", metrics.accuracy_score(y_train,neigh.predict(X_train)))
  print("Test set Accuracy: ", metrics.accuracy_score(y_test, yhat))
