In [None]:
# Setup environment
# Installs the third-party packages the notebook needs and imports every
# library used by the later cells.

# standard libraries
import numpy as np                        # library to handle data in a vectorized manner
import pandas as pd                       # library for data analysis
# None removes the display limit, so DataFrames are shown with all columns/rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# geocoding (geopy)
!conda install -c conda-forge geopy --yes  # find latitude/longitude of a place by name
from geopy.geocoders import Nominatim      # convert an address into latitude and longitude values

# JSON files
import json # library to handle JSON files
import requests # library to handle HTTP requests
# NOTE(review): newer pandas releases expose this as pd.json_normalize and
# deprecate the pandas.io.json import path - confirm against the installed version
from pandas.io.json import json_normalize  # transform JSON file into a pandas dataframe

# plotting data
import matplotlib.cm as cm
import matplotlib.colors as colors

# k-means clustering
from sklearn.cluster import KMeans

# map display
!conda install -c conda-forge folium=0.5.0 --yes
import folium                             # map rendering library

# other functions
!pip install lxml                         # for read_html
!pip install beautifulsoup4               # requested by read_html
import itertools

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    scipy-1.3.2                |   py36h921218d_0        18.0 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    numpy-1.17.3               |   py36h95a1406_0         5.2 MB  conda-forge
    libcblas-3.8.0             |      11_openblas       

In [None]:
# Load the table of California's largest cities.
# source: Wikipedia "List of largest California Cities"

xCitiesURL = 'https://en.wikipedia.org/wiki/List_of_largest_California_cities_by_population'

# read_html returns one DataFrame per HTML table found on the page
lCities = pd.read_html(xCitiesURL)

# the first table on the page is the one holding the city list
dfCities = pd.DataFrame(lCities[0])

# show the first rows to verify the dataframe looks correct
dfCities.head(10)

In [None]:
# foursquare credentials
#
# The client ID and secret are read from the environment instead of being
# written into the notebook, so that secrets are never stored in the file or
# echoed into a saved cell output. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.

import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')          # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605'                                            # Foursquare API version

# fail early with a clear message rather than with an opaque API error later
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError('Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
                       'environment variables before running this notebook.')

# confirm that credentials were found without revealing them in the output
print('Foursquare credentials loaded.')
print('CLIENT_ID: ' + CLIENT_ID[:4] + '...')

In [None]:
# function that extracts the category of the venue
# from class example

def get_category_type(row):
    """Return the name of the first category listed for a venue.

    Parameters
    ----------
    row : mapping or pandas.Series
        Record holding the venue's category list under the key 'categories'
        or, when that key is absent, under 'venue.categories'.

    Returns
    -------
    str or None
        The 'name' entry of the first category, or None when the category
        list is empty or None.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; other errors propagate
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [None]:
# get data from foursquare: EV charging stations in the largest California city

nMax = 100                            # set maximum number of responses
# note: for a venues/explore?near query, if a radius is not specified,
# the default is "city-wide," so do not specify a radius for this query

# use first (largest) city in California, Los Angeles
xCity = dfCities.loc[0, 'City'] + ', CA'
print(xCity)

# the foursquare categoryId for "EV Charging Station" is 5032872391d4c4b30a586d64 (from foursquare documentation)
xCharging = '5032872391d4c4b30a586d64'

# pass the query as params so that requests URL-encodes the values
# (the city name contains a space and a comma)
url = 'https://api.foursquare.com/v2/venues/explore'
dStationQuery = {'client_id': CLIENT_ID,
                 'client_secret': CLIENT_SECRET,
                 'v': VERSION,
                 'near': xCity,
                 'categoryId': xCharging,
                 'limit': nMax}

# a timeout keeps the cell from hanging; raise_for_status surfaces HTTP errors
# here instead of as a KeyError when the payload is unpacked below
response = requests.get(url, params=dStationQuery, timeout=30)
response.raise_for_status()
results = response.json()
venues = results['response']['groups'][0]['items']
if not venues:
    raise RuntimeError('Foursquare returned no EV charging stations near ' + xCity)

lStations = json_normalize(venues) # flatten JSON
filtered_columns = ['venue.id', 'venue.name', 'venue.location.lat', 'venue.location.lng', 'venue.location.city']
# reindex keeps the five columns in this order and fills a column that is
# absent from the response with NaN instead of raising KeyError
lStations = lStations.reindex(columns=filtered_columns)
print(lStations.head(10))

In [None]:
# create custom lists of venues in each of several amenity classes:
# restaurants, entertainment, lodging, leisurely shopping, services, 
# links to other forms of transportation
# note: I chose to populate dataframe manually in this notebook in order
# to remove need for a supporting external file

# create tuples of venue categories in category classes, such as
# "long duration shopping" or "entertainment"
# created manually from foursquare.com list of possible categories
# note: I selected these categories as potentially compatible activities
# for concurrent EV charging, based on the idea that these activities would
# typically take over 30 minutes and generally an hour or two, allowing
# sufficient time for at least a partial EV charge
# ideas for improvement: justify categories in each list using average "wait times" 
# from source such as Google Maps or a wait time study, or provide an interface for
# the user to select categories of interest from a list

# --- (category name, foursquare category id) pairs, one list per amenity class ---
lRestaurants = [
    ('Food', '4d4b7105d754a06374d81259'),
    ('Street Food Gathering', '53e0feef498e5aac066fd8a9'),
]
lEntertainment = [
    ('Arts & Entertainment', '4d4b7104d754a06370d81259'),
    ('Boat Rental', '5744ccdfe4b0c0459246b4c1'),
    ('College Stadium', '4bf58dd8d48988d1b4941735'),
    ('College Theater', '4bf58dd8d48988d1ac941735'),
    ('Event', '4d4b7105d754a06373d81259'),
    ('Night Market', '53e510b7498ebcb1801b55d4'),
    ('Outdoors & Recreation', '4d4b7105d754a06377d81259'),
    ('Community Center', '52e81612bcbc57f1066b7a34'),
    ('Cultural Center', '52e81612bcbc57f1066b7a32'),
]
lLodging = [
    ('Hotel', '4bf58dd8d48988d1fa931735'),
]
lShopping = [
    ('Big Box Store', '52f2ab2ebcbc57f1066b8b42'),
    ('Farmers Market', '4bf58dd8d48988d1fa941735'),
    ('Grocery Store', '4bf58dd8d48988d118951735'),
    ('Market', '50be8ee891d4fa8dcc7199a7'),
    ('Organic Grocery', '52f2ab2ebcbc57f1066b8b45'),
    ('Outlet Mall', '5744ccdfe4b0c0459246b4df'),
    ('Shopping Mall', '4bf58dd8d48988d1fd941735'),
    ('Shopping Plaza', '5744ccdfe4b0c0459246b4dc'),
    ('Supermarket', '52f2ab2ebcbc57f1066b8b46'),
    ('Warehouse Store', '52e816a6bcbc57f1066b7a54'),
]
lServices = [
    ('Library', '4bf58dd8d48988d12f941735'),
    ('Medical Center', '4bf58dd8d48988d104941735'),
    ('Internet Cafe', '4bf58dd8d48988d1f0941735'),
]
lTransportation = [
    ('Boat or Ferry', '4bf58dd8d48988d12d951735'),
    ('Bus Station', '4bf58dd8d48988d1fe931735'),
    ('Light Rail Station', '4bf58dd8d48988d1fc931735'),
    ('Metro Station', '4bf58dd8d48988d1fd931735'),
    ('Train Station', '4bf58dd8d48988d129951735'),
    ('Tram Station', '52f2ab2ebcbc57f1066b8b51'),
]

# --- class label repeated once per category, so labels line up row-for-row ---
tRestaurants = ['Restaurants' for _ in lRestaurants]
tEntertainment = ['Entertainment' for _ in lEntertainment]
tLodging = ['Lodging' for _ in lLodging]
tShopping = ['Long Stop Shopping' for _ in lShopping]
tServices = ['Long Stop Services' for _ in lServices]
tTransportation = ['Transportation Park-Ride Links' for _ in lTransportation]

# --- one single-column frame of class labels per class, stacked into dfClasses ---
lClassColumns = ['class']
dfClassR, dfClassE, dfClassL, dfClassS, dfClassV, dfClassT = [
    pd.DataFrame(labels, columns=lClassColumns)
    for labels in (tRestaurants, tEntertainment, tLodging,
                   tShopping, tServices, tTransportation)
]
dfClasses = pd.concat(
    [dfClassR, dfClassE, dfClassL, dfClassS, dfClassV, dfClassT],
    ignore_index=True,
)

# --- one two-column frame of categories per class, stacked into dfCategories ---
lCatColumns = ['cat_name', 'cat_id']
dfCatR, dfCatE, dfCatL, dfCatS, dfCatV, dfCatT = [
    pd.DataFrame(pairs, columns=lCatColumns)
    for pairs in (lRestaurants, lEntertainment, lLodging,
                  lShopping, lServices, lTransportation)
]
dfCategories = pd.concat(
    [dfCatR, dfCatE, dfCatL, dfCatS, dfCatV, dfCatT],
    ignore_index=True,
)

# --- place class labels beside their categories: columns class, cat_name, cat_id ---
dfClassList = pd.concat([dfClasses, dfCategories], axis=1, ignore_index=False)
print(dfClassList)


In [None]:
# display the class / category table as the cell's output
dfClassList

In [None]:
# create list of venue category id's for use in foursquare.com query
# no quotes, no square brackets, comma separated, no extra spaces

# collect the category id's, then glue them together with bare commas
lCatIDs = dfClassList['cat_id'].tolist()
strClassList = ','.join(lCatIDs)
print(strClassList)


In [None]:
# build dataframe of stations and their amenities
# stationID: foursquare's unique ID for each charging station
# amenityID: foursquare's unique ID for each amenity (venue) nearby the charging station
# amenityName: name of amenity 
# amenityLat: latitude of amenity
# amenityLong: longitude of amenity
# amenityCity: city of amenity
# amenityCatID: primary category ID of amenity
# amenityCatName: primary category name of amenity

# setup variables
# note: also uses strClassList, defined earlier
nStations = len(lStations['venue.id'])
nRadius = 400     # radius to other venues: 400 m is approx 1/4 mile
                  # reference: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3377942/
                  # reference: https://www.smartcitiesdive.com/ex/sustainablecitiescollective/pedestrians-and-park-planning-how-far-will-people-walk/24937/
nMax = 50         # maximum number of venues to retrieve

# column order matters: later cells address stationID as column 0 and expect
# the one-hot encoded categories to start after these eight columns
lAmenityColumns = ['stationID', 'amenityID', 'amenityName', 'amenityLat',
                   'amenityLong', 'amenityCity', 'amenityCatID', 'amenityCatName']

print("# of stations: " + str(nStations))

# retrieve nearby venues to each station in lStations
# rows are collected in plain lists and turned into dataframes once at the end,
# instead of growing a dataframe inside the loop
url = 'https://api.foursquare.com/v2/venues/explore'
lAmenityRows = []      # one tuple per (station, amenity) pair
lAmenityCounts = []    # number of nearby amenities, one entry per station
for iStation, (xStationID, xLat, xLong) in enumerate(zip(lStations['venue.id'],
                                                         lStations['venue.location.lat'],
                                                         lStations['venue.location.lng'])):
    print("Station # " + str(iStation) + ": ")

    # params lets requests URL-encode the values; the timeout and status check
    # make a failed request stop the loop with a clear error
    response = requests.get(url,
                            params={'client_id': CLIENT_ID,
                                    'client_secret': CLIENT_SECRET,
                                    'v': VERSION,
                                    'll': '{},{}'.format(xLat, xLong),
                                    'categoryId': strClassList,
                                    'radius': nRadius,
                                    'limit': nMax},
                            timeout=30)
    response.raise_for_status()
    results = response.json()

    # a station with nothing nearby yields an empty list rather than an error
    groups = results['response'].get('groups') or [{}]
    venues = groups[0].get('items') or []

    nAmenities = len(venues)
    lAmenityCounts.append(nAmenities)
    print('# of nearby amenities: ' + str(nAmenities))

    for dItem in venues:
        dVenue = dItem['venue']
        dLocation = dVenue.get('location') or {}
        # the primary category is the first one listed; venues without any
        # category get None for both category fields
        dPrimary = (dVenue.get('categories') or [{}])[0]
        lAmenityRows.append((xStationID,
                             dVenue.get('id'),
                             dVenue.get('name'),
                             dLocation.get('lat'),
                             dLocation.get('lng'),
                             dLocation.get('city'),
                             dPrimary.get('id'),
                             dPrimary.get('name')))
# end for loop

dfAmenities = pd.DataFrame(lAmenityRows, columns=lAmenityColumns)
dfSummary = pd.DataFrame({'stationID': lStations['venue.id'].values,
                          'countAmenities': lAmenityCounts},
                         columns=['stationID', 'countAmenities'])
print(dfAmenities.head(10))


In [None]:
# spread and average of the number of nearby amenities per station,
# followed by the full per-station table
sCounts = dfSummary['countAmenities']
for fStat in (np.std, np.mean):
    print(fStat(sCounts))
dfSummary

In [None]:
# one hot encoding

# The encoded frame is rebuilt from the eight base columns every time, so
# re-running this cell replaces the dummy columns instead of appending a second
# copy of them (which would inflate nCols and shift column positions).
lBaseCols = ['stationID', 'amenityID', 'amenityName', 'amenityLat',
             'amenityLong', 'amenityCity', 'amenityCatID', 'amenityCatName']
dfDummies = pd.get_dummies(dfAmenities['amenityCatName'],
                           prefix='cat', dummy_na=True, prefix_sep='_')
dfAmenities = pd.concat([dfAmenities[lBaseCols], dfDummies], axis=1)
print(dfAmenities.head(20))

# make list of column names & # column names available for use later
lCols = list(dfAmenities.columns)
nCols = len(lCols)
print('# columns in dfAmenities: ' + str(nCols))


In [None]:
# prepare for classification of stations by venue categories

# create dataframe which only contains stationID & one-hot encoded amenity categories
# to be used in classification
# note: due to the way the dataframe was constructed, it is already sorted by stationID

dfSubset = dfAmenities.copy(deep=False)
# column 0 is stationID; columns 8 .. nCols-1 are the one-hot encoded categories
lCols_encoded = [0] + list(np.arange(8, nCols))
dfTestClassify = dfSubset[dfSubset.columns[lCols_encoded]]

# prepare for classification
# one row per station: the mean of each one-hot column is the share of that
# station's nearby amenities falling in the category
dfTestClassify_grouped = dfTestClassify.groupby('stationID').mean().reset_index()
# The feature matrix is taken from the grouped (per-station) frame so that it
# has exactly one row per station, in the same order as
# dfTestClassify_grouped['stationID']. Building it from the per-amenity frame
# would yield one row (and later one cluster label) per amenity instead.
dfTestClassify_noID = dfTestClassify_grouped.drop(columns='stationID')


In [None]:
# k-means clustering

# number of clusters is an arbitrary choice
kClusters = 5

# build the estimator first, then fit it on the feature matrix
kmStations = KMeans(n_clusters=kClusters, random_state=0)
kmStations = kmStations.fit(dfTestClassify_noID)

# pair each stationID with a cluster label, in row order
lGroups = kmStations.labels_
lPairs = zip(dfTestClassify_grouped['stationID'], lGroups)
dfGroups = pd.DataFrame(lPairs, columns=['stationID','Group'])
print(dfGroups.head(20))


In [None]:
# attach the k-means group # to each station's row of amenity category values
dfGroups_byID = dfGroups.set_index('stationID')
dfTestClassify_merged = dfTestClassify_grouped.join(dfGroups_byID, on='stationID')
print(dfTestClassify_merged.head(20))

# attach name and lat/long per stationID
# take the four station columns straight from lStations and rename them
dfLocations = (lStations[['venue.id', 'venue.name',
                          'venue.location.lat', 'venue.location.lng']]
               .rename(columns={'venue.id': 'stationID',
                                'venue.name': 'stationName',
                                'venue.location.lat': 'stationLat',
                                'venue.location.lng': 'stationLong'})
               .reset_index(drop=True))
dfLocations_byID = dfLocations.set_index('stationID')
dfClassify_locations = dfTestClassify_merged.join(dfLocations_byID, on='stationID')
print(dfClassify_locations.head(20))


In [None]:
# display the first five rows of the merged table as the cell's output
dfClassify_locations.head(5)

In [None]:
# count top amenity categories for each stationID

# define function
def return_most_common_venues(row, num_top_venues, prefix='cat_'):
    """Return the labels of the highest-valued category entries in a row.

    Category entries are picked by label rather than by position, so no
    category column is dropped and identifier or metadata entries
    (e.g. 'stationID') are never ranked, wherever they sit in the row.

    Parameters
    ----------
    row : pandas.Series
        One row of a one-hot encoded (or averaged) table; category entries
        have labels starting with `prefix` and hold numeric values.
    num_top_venues : int
        Maximum number of labels to return.
    prefix : str, optional
        Label prefix that identifies category entries (default 'cat_').

    Returns
    -------
    numpy.ndarray
        Up to `num_top_venues` category labels, ordered from highest value to
        lowest. Shorter than `num_top_venues` if the row has fewer category
        entries.
    """
    category_labels = [label for label in row.index if str(label).startswith(prefix)]
    # a row taken from a mixed-type table has dtype object; cast so the sort is numeric
    row_categories = row[category_labels].astype(float)
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

# initialize variables
nTops = 5        # arbitrarily choose how many "top venues" (most common amenity categories) to show
indicators = ['st', 'nd', 'rd']

# create column names according to number of top venues: 1st, 2nd, 3rd, 4th, ...
lColumns = ['stationID']
for ind in np.arange(nTops):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    lColumns.append('{}{} Most Common Category'.format(ind + 1, suffix))

# rank categories per station
# The ranking runs over dfTestClassify_grouped, which holds one row per station
# (the mean of each one-hot column), and over exactly as many rows as that frame
# has, so each stationID is matched with the ranking of its own row.
nCategories = dfTestClassify_grouped.shape[0]     # rows in the grouped frame
lTopRows = []
for iRow in np.arange(nCategories):               # for each row
    sProfile = dfTestClassify_grouped.iloc[iRow, :]
    lTop = list(return_most_common_venues(sProfile, nTops))
    lTop += [None] * (nTops - len(lTop))          # pad when fewer than nTops are returned
    lTopRows.append([sProfile['stationID']] + lTop)
dfTops = pd.DataFrame(lTopRows, columns=lColumns)

# add group #'s to list of top categories
dfTops = dfTops.join(dfGroups.set_index('stationID'), on='stationID')
# move 'Group' (appended last by the join) to sit right after stationID
dfTops = dfTops.reindex(columns=['stationID','Group'] + list(dfTops.columns[1:-1]))

# add # amenities per stationID
dfTops = dfTops.join(dfSummary.set_index('stationID'), on='stationID')
print(dfTops)

In [None]:
# display the first five rows of the top-categories table as the cell's output
dfTops.head(5)

In [None]:
# map results of k-means clustering of charging stations on nearby amenities

# set map starting location
# note: use geopy to get lat/long of Los Angeles
mapAddress = 'Los Angeles, CA'
geolocator = Nominatim(user_agent="TO_explorer")
location = geolocator.geocode(mapAddress)
# geocode returns None when the address cannot be resolved
if location is None:
    raise RuntimeError('Could not geocode map address: ' + mapAddress)
mapLat = location.latitude
mapLong = location.longitude

# create cluster map
map_clusters = folium.Map(location=[mapLat, mapLong], tiles='cartodbpositron',
                          zoom_start=9)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kClusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
# note: rows of dfClassify_locations and dfTops are paired by position
for lat, lon, poi, cluster, countAmenities in zip(dfClassify_locations['stationLat'],
                                  dfClassify_locations['stationLong'],
                                  dfClassify_locations['stationName'],
                                  dfClassify_locations['Group'],
                                  dfTops['countAmenities']):
    label = folium.Popup(str(poi) + ' - Group # ' + str(cluster) + ' - ' +
                         str(countAmenities) + ' nearby amenities', parse_html=True)
    # cluster 0 maps to index -1, i.e. the last color, so every cluster still
    # gets its own color
    xColor = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=2 + countAmenities / 2,      # marker grows with # of nearby amenities
        popup=label,
        color=xColor,
        fill=True,
        fill_color=xColor,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters


<i>Note:</i> On the map above, the size of each circular marker represents the number of nearby amenities.  Larger circles indicate more amenities within 400 m (about 1/4 mile) of the charging station.  Colors indicate the k-means classification groups, which are based on the types of nearby amenities.