In [2]:
# import standard-library modules
import ast
import json
import os
import time #will use this to pause execution for a few seconds
import warnings

# import necessary libraries
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# import API-related libraries
import requests

# suppress warnings
warnings.filterwarnings("ignore")

# Load & Process Dataframes

In [3]:
# Build one frame that carries, per cluster, both its center point and its buffer polygon.

## read the buffer polygons, and the centers with their geometry column renamed to 'center'
galv_buffers = gpd.read_file('galveston_parcel_cluster_buffer.geojson')
galv_centers = (
    gpd.read_file('galveston_parcel_cluster_center.geojson')
    .rename(columns={'geometry': 'center'})
)

## attach the buffer geometry (aligned on the row index) plus three empty
## result columns that the site search fills in later
galv_clusters = galv_centers.assign(
    buffer=galv_buffers['geometry'],
    site_name=np.nan,
    site_location=np.nan,
    site_dist=np.nan,
)
galv_clusters.head()

Unnamed: 0,lat,long,center,buffer,site_name,site_location,site_dist
0,29.275801,-94.814192,POINT (-94.81419 29.27580),"POLYGON ((-94.81019 29.27580, -94.81021 29.275...",,,
1,29.130075,-95.060682,POINT (-95.06068 29.13008),"POLYGON ((-95.05668 29.13008, -95.05670 29.129...",,,
2,29.218909,-94.936524,POINT (-94.93652 29.21891),"POLYGON ((-94.93252 29.21891, -94.93254 29.218...",,,
3,29.32192,-94.786283,POINT (-94.78628 29.32192),"POLYGON ((-94.78228 29.32192, -94.78230 29.321...",,,
4,29.267148,-94.875719,POINT (-94.87572 29.26715),"POLYGON ((-94.87172 29.26715, -94.87174 29.266...",,,


In [4]:
# Distribution points: read the CSV into `dist` and preview the first three rows
dist = pd.read_csv(filepath_or_buffer="complete_galveston.csv")
dist.iloc[:3]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Name,Address,Coordinates,Name + Address,second_best_match,second_best_score
0,0,0,"Moody Gardens Hotel, Spa and Convention Center","7 Hope Boulevard, Galveston","{'lat': 29.2734603, 'lng': -94.85032249999999}","Moody Gardens Hotel, Spa and Convention Center...","Rainforest Cafe5310 Seawall Boulevard, Galveston",86
1,1,1,Rainforest Cafe,"5310 Seawall Boulevard, Galveston","{'lat': 29.2707531, 'lng': -94.8202072}","Rainforest Cafe5310 Seawall Boulevard, Galveston","Moody Gardens Hotel, Spa and Convention Center...",86
2,2,2,Landry's Prime Seafood & Steaks,"5310 Seawall Boulevard, Galveston","{'lat': 29.2707352, 'lng': -94.8202317}",Landry's Prime Seafood & Steaks5310 Seawall Bo...,Landry's Prime Seafood & Steaks5310 Seawall Bl...,88


In [5]:
# Collection points: read the CSV into `clct`
clct = pd.read_csv('complete_galveston2.csv')

# Give clct the same column layout as dist so the two can be concatenated later:
# a placeholder 'Unnamed: 0.1' column filled with '-' goes in first position
# (any existing column of that name is replaced).
placeholder_col = 'Unnamed: 0.1'
clct = clct.drop(columns=placeholder_col, errors='ignore')
clct.insert(loc=0, column=placeholder_col, value=['-'] * len(clct))

# preview
clct.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Name,Address,Coordinates,Name + Address,second_best_match,second_best_score
0,-,2,3018 Texas Clipper Rd Parking,"3018 Texas Clipper Road, Galveston","{'lat': 29.3157226, 'lng': -94.81716879999999}",3018 Texas Clipper Rd Parking3018 Texas Clippe...,"Parking utmb365-379 11th Street, Galveston",86
1,-,3,Parking lot,Galveston,"{'lat': 29.3353876, 'lng': -94.77805839999999}",Parking lotGalveston,"Parking utmb365-379 11th Street, Galveston",86
2,-,4,East Beach Free Parking,"Apffel Park Road, Galveston","{'lat': 29.325156, 'lng': -94.7386875}","East Beach Free ParkingApffel Park Road, Galve...","UTMB - Public Parking, Galveston Hospitals200 ...",86


In [6]:
# Stack distribution and collection points into one frame of all businesses,
# renumbering the rows from 0
complete = pd.concat(objs=(dist, clct), axis=0, ignore_index=True)
complete.iloc[:3]

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Name,Address,Coordinates,Name + Address,second_best_match,second_best_score
0,0,0,"Moody Gardens Hotel, Spa and Convention Center","7 Hope Boulevard, Galveston","{'lat': 29.2734603, 'lng': -94.85032249999999}","Moody Gardens Hotel, Spa and Convention Center...","Rainforest Cafe5310 Seawall Boulevard, Galveston",86
1,1,1,Rainforest Cafe,"5310 Seawall Boulevard, Galveston","{'lat': 29.2707531, 'lng': -94.8202072}","Rainforest Cafe5310 Seawall Boulevard, Galveston","Moody Gardens Hotel, Spa and Convention Center...",86
2,2,2,Landry's Prime Seafood & Steaks,"5310 Seawall Boulevard, Galveston","{'lat': 29.2707352, 'lng': -94.8202317}",Landry's Prime Seafood & Steaks5310 Seawall Bo...,Landry's Prime Seafood & Steaks5310 Seawall Bl...,88


In [7]:
# Some rows have empty cells (e.g. missing coordinates); drop every row that
# has a missing value in any column, then renumber the rows 0..n-1 under an
# index named "order".
complete = (
    complete
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
    .rename_axis('order')
)

In [8]:
# Sanity check: (rows, columns) of the cleaned dataframe after dropping incomplete rows
complete.shape

(2961, 8)

In [9]:
# Split the stringified coordinate dicts in `complete['Coordinates']` into two
# numeric columns, `lats` and `longs`, for the ease of computation.

def _parse_lat_long(raw):
    """Return ``(lat, long)`` parsed from one stringified coordinate dict.

    ``raw`` is a string such as ``"{'lat': 29.27, 'lng': -94.85}"``.  The key
    names are not consistent across rows (some use "lat", some "latitude"), so
    the values are taken by position: first value = latitude, second = longitude.

    Raises ``ValueError``/``SyntaxError`` if ``raw`` is not a valid literal and
    ``IndexError`` if it holds fewer than two values.
    """
    # ast.literal_eval only accepts Python literals, so a malformed or
    # malicious CSV cell cannot execute code the way eval() would.
    values = list(ast.literal_eval(raw).values())
    return values[0], values[1]


lats = []
longs = []

# Iterate over the values directly instead of looking rows up by label, so the
# cell does not depend on the index being exactly 0..n-1.
for raw_coordinate in complete['Coordinates']:
    lat, long = _parse_lat_long(raw_coordinate)
    lats.append(lat)
    longs.append(long)

# create columns for lat and long
complete['lats'] = lats
complete['longs'] = longs

In [10]:
# Turn `complete` into a GeoDataFrame whose point geometry is built from (longs, lats)
site_points = gpd.points_from_xy(complete.longs, complete.lats)
complete = gpd.GeoDataFrame(complete, geometry=site_points)

# Keep only the columns used later; the point geometry takes over the 'Coordinates' name
kept_columns = ['Name', 'Address', 'geometry', 'lats', 'longs']
complete = complete[kept_columns].rename(columns={'geometry': 'Coordinates'})
complete.head(3)

Unnamed: 0_level_0,Name,Address,Coordinates,lats,longs
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"Moody Gardens Hotel, Spa and Convention Center","7 Hope Boulevard, Galveston",POINT (-94.85032 29.27346),29.27346,-94.850322
1,Rainforest Cafe,"5310 Seawall Boulevard, Galveston",POINT (-94.82021 29.27075),29.270753,-94.820207
2,Landry's Prime Seafood & Steaks,"5310 Seawall Boulevard, Galveston",POINT (-94.82023 29.27074),29.270735,-94.820232


# ---------------------------------------------
# Using Mapbox Matrix API

In [11]:
# Access token for the Mapbox Matrix API.
# The token is read from the environment so that no credential is stored in the
# notebook. Set it before starting Jupyter, e.g.
#     export MAPBOX_ACCESS_TOKEN="<your token>"
# NOTE(review): a token was previously committed in this cell; it should be
# rotated/revoked in the Mapbox account since it remains in the file history.
token = os.environ.get("MAPBOX_ACCESS_TOKEN", "").strip()
if not token:
    raise RuntimeError(
        "MAPBOX_ACCESS_TOKEN is not set. Export your Mapbox access token in the "
        "environment before running this notebook."
    )

In [12]:
import time #will use this to pause execution for a few seconds

In [13]:
# Find, for each cluster, the business inside the cluster's buffer polygon that is
# the shortest walk from the cluster centroid. That business serves as the actual
# collection site (the centroids themselves are only theoretical locations).
# Clusters with no business inside their buffer keep NaN in all three result columns.

MAPBOX_MATRIX_URL = "https://api.mapbox.com/directions-matrix/v1/mapbox/walking/"
# The Matrix API accepts at most 25 coordinates per request; one slot is used by
# the centroid (the single source), which leaves 24 for destination sites.
MAX_DESTINATIONS = 24
# The API allows no more than 60 requests per minute, so pause after every request.
REQUEST_PAUSE_SECONDS = 2


def _walking_durations(origin_long, origin_lat, sites):
    """Return the walking duration from one origin to every row of ``sites``.

    ``sites`` must have ``longs`` and ``lats`` columns. The rows are sent to the
    Mapbox Matrix API in batches of ``MAX_DESTINATIONS`` and the results are
    returned as one list in row order. An entry may be ``None`` when the API
    reports no route for that destination.

    Raises ``requests.HTTPError`` on a non-2xx response and ``RuntimeError`` when
    the API answers with a code other than ``"Ok"``.
    """
    durations = []
    for start in range(0, len(sites), MAX_DESTINATIONS):
        batch = sites.iloc[start:start + MAX_DESTINATIONS]

        # "long,lat" of the origin first, then every destination, ';'-separated
        coord = str(origin_long) + "," + str(origin_lat)
        for site_long, site_lat in zip(batch['longs'], batch['lats']):
            coord = coord + ";" + str(site_long) + "," + str(site_lat)

        url = MAPBOX_MATRIX_URL + coord + "?sources=0&annotations=distance,duration"
        response = requests.get(url, params={'access_token': token})
        # Fail loudly here rather than with a confusing KeyError on 'durations'
        response.raise_for_status()
        payload = response.json()
        if payload.get('code') != 'Ok':
            raise RuntimeError(
                "Mapbox Matrix API returned %r: %s"
                % (payload.get('code'), payload.get('message'))
            )

        # Row 0 is the origin; its first entry is origin -> origin, so skip it
        durations.extend(payload['durations'][0][1:])
        time.sleep(REQUEST_PAUSE_SECONDS)
    return durations


# The result columns start out as float NaN; names and geometries need object dtype
for result_col in ('site_name', 'site_location'):
    galv_clusters[result_col] = galv_clusters[result_col].astype(object)

for idx in galv_clusters.index:
    boundary = galv_clusters.at[idx, 'buffer']
    # identify sites within the cluster boundaries
    within_boundary = complete[complete.Coordinates.within(boundary)].reset_index(drop=True)

    # defaults used when nothing is found inside the boundary (or nothing is routable)
    site_name, site_location, site_dist = np.nan, np.nan, np.nan

    if len(within_boundary) > 0:
        durations = _walking_durations(
            galv_clusters.at[idx, 'long'],
            galv_clusters.at[idx, 'lat'],
            within_boundary,
        )
        # float dtype turns any None (no route) into NaN so min/idxmin skip it
        within_boundary = within_boundary.assign(
            Dist=pd.Series(durations, index=within_boundary.index, dtype='float64')
        )

        if within_boundary['Dist'].notna().any():
            # idxmin returns the first row holding the minimum, matching the
            # original "first of the ties" choice
            closest = within_boundary.loc[within_boundary['Dist'].idxmin()]
            site_name = closest['Name']
            site_location = closest['Coordinates']
            # NOTE(review): despite the column name, this value is taken from the
            # API's `durations` matrix, not `distances` - confirm which is intended.
            site_dist = closest['Dist']

    # .at writes into galv_clusters itself; chained `df[col][idx] = ...` may
    # silently write to a temporary copy instead
    galv_clusters.at[idx, 'site_name'] = site_name
    galv_clusters.at[idx, 'site_location'] = site_location
    galv_clusters.at[idx, 'site_dist'] = site_dist

In [14]:
# Split the clusters by whether the in-buffer search assigned a site.
has_site = galv_clusters['site_name'].notnull()

# clusters still without a site; rows renumbered from 0 (old index kept as a column)
galv_clusters_nan = galv_clusters[~has_site].reset_index()
# clusters that already have their nearby business
galv_clusters_found = galv_clusters[has_site]

In [15]:
# Fallback search for clusters that had no site inside their buffer: take the 20
# businesses closest to the centroid by straight-line difference in lat/long,
# ask the Mapbox Matrix API for the walking duration to each, and keep the
# shortest one.

# A real copy: the scratch 'euc_diff' column is written here and never leaks
# into `complete` (a plain `complete_copy = complete` would only alias it).
complete_copy = complete.copy()

# names and geometries need object dtype columns
for result_col in ('site_name', 'site_location'):
    galv_clusters_nan[result_col] = galv_clusters_nan[result_col].astype(object)

for idx in galv_clusters_nan.index:
    print(idx)

    lat = galv_clusters_nan.at[idx, 'lat']
    long = galv_clusters_nan.at[idx, 'long']

    # cheap pre-filter: Euclidean difference in degrees, only used to shortlist
    complete_copy['euc_diff'] = (
        (complete_copy['lats'] - lat) ** 2 + (complete_copy['longs'] - long) ** 2
    ) ** 0.5
    nearby = complete_copy.nsmallest(20, ['euc_diff']).reset_index()
    if len(nearby) == 0:
        # no candidate businesses at all; leave this cluster's results as NaN
        continue

    # "long,lat" of the centroid first, then each candidate, ';'-separated
    coord = str(long) + "," + str(lat)
    for site_long, site_lat in zip(nearby['longs'], nearby['lats']):
        coord = coord + ";" + str(site_long) + "," + str(site_lat)
    url = "https://api.mapbox.com/directions-matrix/v1/mapbox/walking/" +\
            coord + "?sources=0&annotations=distance,duration"

    response = requests.get(url, params={'access_token': token})
    # Fail loudly here rather than with a confusing KeyError on 'durations'
    response.raise_for_status()
    r = response.json()
    if r.get('code') != 'Ok':
        raise RuntimeError(
            "Mapbox Matrix API returned %r: %s" % (r.get('code'), r.get('message'))
        )
    time.sleep(2) # no more than 60 requests per minute, so pause between requests

    # row 0 is the centroid; its first entry is centroid -> centroid, so skip it.
    # float dtype turns any None (no route) into NaN so idxmin skips it.
    distances = r['durations'][0][1:]
    nearby = nearby.assign(
        Dist=pd.Series(distances, index=nearby.index, dtype='float64')
    )
    if not nearby['Dist'].notna().any():
        # none of the candidates is routable; leave this cluster's results as NaN
        continue

    # first row holding the minimum, matching the original tie-breaking
    closest = nearby.loc[nearby['Dist'].idxmin()]

    # .at writes into galv_clusters_nan itself; chained `df[col][idx] = ...`
    # may silently write to a temporary copy instead
    galv_clusters_nan.at[idx, 'site_name'] = closest['Name']
    galv_clusters_nan.at[idx, 'site_location'] = closest['Coordinates']
    # NOTE(review): this value comes from the API's `durations` matrix, not
    # `distances`, despite the column name - confirm which is intended.
    galv_clusters_nan.at[idx, 'site_dist'] = closest['Dist']

0
1
2
3
4
5


In [40]:
# Recombine the two groups of clusters into the finished dataframe
# and write it out as a csv file
cluster_parts = (galv_clusters_found, galv_clusters_nan)
galv_clusters = pd.concat(objs=cluster_parts)
galv_clusters.to_csv(path_or_buf='galv_cluster_collection_sites.csv')

In [41]:
# Also export as GeoJSON for the ease of mapping: keep the cluster coordinates and
# site attributes, and use each site's location as the feature geometry.
export_columns = ['lat', 'long', 'site_name', 'site_location', 'site_dist']
galv_clusters = galv_clusters[export_columns]
galv_clusters = gpd.GeoDataFrame(galv_clusters, geometry=galv_clusters['site_location'])
# site_location now duplicates the geometry column, so drop it before writing
galv_clusters = galv_clusters.drop(columns='site_location')
galv_clusters.to_file('galv_cluster_collection_sites.geojson', driver='GeoJSON')