#### This Jupyter notebook was written by Ziyu. I reran it to update the best sites for map-making.
#### See Ziyu's branch for more details about this notebook.

In [1]:
# import standard-library modules
import ast
import json
import os
import time #will use this to pause execution for a few seconds
import warnings

# import necessary libraries
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib.pyplot as plt

# import API-related libraries
import requests

# suppress warnings
warnings.filterwarnings("ignore")

# Load & Process Dataframes

In [20]:
# Build one frame per cluster holding its centroid and its buffer polygon.
## Both files are in the output directory (written by Galveston_kmean_collection_points_final_60_point.ipynb)
galv_centers = gpd.read_file('galveston_parcel_cluster_center_60.geojson')
galv_buffers = gpd.read_file('galveston_parcel_cluster_buffer_60.geojson')

# keep centroid and buffer side by side (rows are matched on the index)
galv_centers = galv_centers.rename(columns = {'geometry' : 'center'})
galv_clusters = galv_centers.assign(buffer = galv_buffers['geometry'])

# Placeholders for the collection site chosen for each cluster.
# site_name (text) and site_location (geometry) get an object dtype: a float64
# NaN column cannot store those values without an implicit dtype change.
galv_clusters['site_name'] = pd.Series(np.nan, index=galv_clusters.index, dtype=object)
galv_clusters['site_location'] = pd.Series(np.nan, index=galv_clusters.index, dtype=object)
galv_clusters['site_dist'] = np.nan
galv_clusters.head()

Unnamed: 0,lat,long,center,buffer,site_name,site_location,site_dist
0,29.201904,-94.944011,POINT (-94.94401 29.20190),"POLYGON ((-94.94001 29.20190, -94.94003 29.201...",,,
1,29.298048,-94.819682,POINT (-94.81968 29.29805),"POLYGON ((-94.81568 29.29805, -94.81570 29.297...",,,
2,29.131075,-95.061183,POINT (-95.06118 29.13107),"POLYGON ((-95.05718 29.13107, -95.05720 29.130...",,,
3,29.264259,-94.877844,POINT (-94.87784 29.26426),"POLYGON ((-94.87384 29.26426, -94.87386 29.263...",,,
4,29.339165,-94.790654,POINT (-94.79065 29.33916),"POLYGON ((-94.78665 29.33916, -94.78667 29.338...",,,


In [21]:
# Distribution points: read the CSV and preview the first three rows
dist = pd.read_csv(filepath_or_buffer="complete_galveston.csv")
dist.head(n=3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Name,Address,Coordinates,Name + Address,second_best_match,second_best_score
0,0,0,"Moody Gardens Hotel, Spa and Convention Center","7 Hope Boulevard, Galveston","{'lat': 29.2734603, 'lng': -94.85032249999999}","Moody Gardens Hotel, Spa and Convention Center...","Rainforest Cafe5310 Seawall Boulevard, Galveston",86
1,1,1,Rainforest Cafe,"5310 Seawall Boulevard, Galveston","{'lat': 29.2707531, 'lng': -94.8202072}","Rainforest Cafe5310 Seawall Boulevard, Galveston","Moody Gardens Hotel, Spa and Convention Center...",86
2,2,2,Landry's Prime Seafood & Steaks,"5310 Seawall Boulevard, Galveston","{'lat': 29.2707352, 'lng': -94.8202317}",Landry's Prime Seafood & Steaks5310 Seawall Bo...,Landry's Prime Seafood & Steaks5310 Seawall Bl...,88


In [22]:
# Collection points
clct = pd.read_csv(filepath_or_buffer='complete_galveston2.csv')

# Give clct the same column layout as dist so the two frames can be stacked later:
# fill a placeholder 'Unnamed: 0.1' column with '-' and move it to the front.
clct['Unnamed: 0.1'] = ['-' for _ in range(len(clct))]
clct = clct[['Unnamed: 0.1'] + [name for name in clct.columns if name != 'Unnamed: 0.1']]

# preview the result
clct.head(n=3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Name,Address,Coordinates,Name + Address,second_best_match,second_best_score
0,-,2,3018 Texas Clipper Rd Parking,"3018 Texas Clipper Road, Galveston","{'lat': 29.3157226, 'lng': -94.81716879999999}",3018 Texas Clipper Rd Parking3018 Texas Clippe...,"Parking utmb365-379 11th Street, Galveston",86
1,-,3,Parking lot,Galveston,"{'lat': 29.3353876, 'lng': -94.77805839999999}",Parking lotGalveston,"Parking utmb365-379 11th Street, Galveston",86
2,-,4,East Beach Free Parking,"Apffel Park Road, Galveston","{'lat': 29.325156, 'lng': -94.7386875}","East Beach Free ParkingApffel Park Road, Galve...","UTMB - Public Parking, Galveston Hospitals200 ...",86


In [23]:
# Stack distribution and collection points into one frame of all businesses
# (rows are renumbered from 0)
complete = pd.concat(objs=[dist, clct], axis=0, ignore_index=True)
complete.head(n=3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Name,Address,Coordinates,Name + Address,second_best_match,second_best_score
0,0,0,"Moody Gardens Hotel, Spa and Convention Center","7 Hope Boulevard, Galveston","{'lat': 29.2734603, 'lng': -94.85032249999999}","Moody Gardens Hotel, Spa and Convention Center...","Rainforest Cafe5310 Seawall Boulevard, Galveston",86
1,1,1,Rainforest Cafe,"5310 Seawall Boulevard, Galveston","{'lat': 29.2707531, 'lng': -94.8202072}","Rainforest Cafe5310 Seawall Boulevard, Galveston","Moody Gardens Hotel, Spa and Convention Center...",86
2,2,2,Landry's Prime Seafood & Steaks,"5310 Seawall Boulevard, Galveston","{'lat': 29.2707352, 'lng': -94.8202317}",Landry's Prime Seafood & Steaks5310 Seawall Bo...,Landry's Prime Seafood & Steaks5310 Seawall Bl...,88


In [24]:
# Remove every row with a missing value in any column (this is how the rows
# without coordinates get dropped), then renumber the remaining rows.
# NOTE(review): how='any' also drops rows whose only gap is in another column - confirm this is intended.
complete = complete.dropna(axis=0, how='any')

# fresh 0..n-1 index named "order"
complete.index = pd.Index(np.arange(len(complete), dtype='int64'), name='order')

In [25]:
# view the shape of the cleaned dataframe as (rows, columns)
complete.shape

(2961, 8)

In [26]:
# Split the stringified coordinate dictionaries into numeric latitude / longitude
# columns for the ease of computation.

def _parse_lat_long(raw):
    """Return (latitude, longitude) parsed from one `Coordinates` cell.

    raw is the text of a dictionary such as "{'lat': 29.27, 'lng': -94.85}".
    The key names differ between rows (some use "lat", some "latitude"), so
    the values are taken by position: latitude first, longitude second.

    ast.literal_eval only accepts Python literals, so - unlike eval - text read
    from the CSV can never execute code. A cell that is not a valid literal
    raises ValueError or SyntaxError.
    """
    coordinate = ast.literal_eval(raw)
    lat, long = list(coordinate.values())[:2]
    return lat, long


# iterate over the values directly instead of looking each row up by label
parsed = [_parse_lat_long(raw) for raw in complete['Coordinates']]
lats = [lat for lat, _ in parsed]
longs = [long for _, long in parsed]

# create columns for lat and long
complete['lats'] = lats
complete['longs'] = longs

In [27]:
# Turn the frame into a GeoDataFrame with one point per business
# (x = longitude, y = latitude)
business_points = gpd.points_from_xy(complete.longs, complete.lats)
complete = gpd.GeoDataFrame(complete, geometry=business_points)

# keep only the columns used later and expose the point column as 'Coordinates'
wanted_columns = ['Name', 'Address', 'geometry', 'lats', 'longs']
complete = complete[wanted_columns].rename(columns = {'geometry':'Coordinates'})
complete.head(3)

Unnamed: 0_level_0,Name,Address,Coordinates,lats,longs
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"Moody Gardens Hotel, Spa and Convention Center","7 Hope Boulevard, Galveston",POINT (-94.85032 29.27346),29.27346,-94.850322
1,Rainforest Cafe,"5310 Seawall Boulevard, Galveston",POINT (-94.82021 29.27075),29.270753,-94.820207
2,Landry's Prime Seafood & Steaks,"5310 Seawall Boulevard, Galveston",POINT (-94.82023 29.27074),29.270735,-94.820232


In [34]:
# save the cleaned list of all businesses (the "order" index is written as the first column)
complete.to_csv(path_or_buf="All_Galveston_Business.csv")

# ---------------------------------------------
# Using Mapbox Matrix API

In [28]:
# Access token for the Mapbox Matrix API.
# It is read from the environment so that no credential is stored in the notebook.
# Before starting Jupyter, set it with:  export MAPBOX_ACCESS_TOKEN="<your token>"
token = os.environ.get("MAPBOX_ACCESS_TOKEN")
if not token:
    # fail here with a clear message instead of on the first API call
    raise RuntimeError("Set the MAPBOX_ACCESS_TOKEN environment variable to your Mapbox access token.")

In [29]:
import time #will use this to pause execution for a few seconds

In [30]:
# For every cluster, pick the business inside the cluster buffer that is the
# quickest walk from the cluster centroid. That business serves as the actual
# collection site (the centroids themselves are only theoretical locations).
# Clusters with no business inside their buffer keep NaN in the site_* columns.
#
# NOTE(review): the ranking uses the `durations` annotation of the Matrix API
# although the value is stored in `site_dist`; read payload['distances'] instead
# if a distance is what is wanted - confirm.

MATRIX_URL = "https://api.mapbox.com/directions-matrix/v1/mapbox/walking/"
MAX_DESTINATIONS = 24  # the Matrix API takes at most 25 coordinates: 1 source + 24 destinations
REQUEST_PAUSE = 2      # seconds between calls; no more than 60 requests per minute


def _walking_durations(origin_long, origin_lat, places):
    """Return the walking duration from the origin to every row of `places`.

    places must have 'longs' and 'lats' columns. Rows are sent in batches of
    MAX_DESTINATIONS and the results come back as one list in row order.
    Raises requests.HTTPError for a failed request and RuntimeError when the
    response has no usable 'durations' matrix.
    """
    durations = []
    for start in range(0, len(places), MAX_DESTINATIONS):
        batch = places.iloc[start:start + MAX_DESTINATIONS]

        # source first, then the destinations, as "long,lat" pairs joined by ";"
        pairs = [str(origin_long) + "," + str(origin_lat)]
        pairs += [str(x) + "," + str(y) for x, y in zip(batch['longs'], batch['lats'])]
        url = MATRIX_URL + ";".join(pairs) + "?sources=0&annotations=distance,duration"

        response = requests.get(url, params={'access_token': token})
        time.sleep(REQUEST_PAUSE)  # pause after every call, including the last one of a cluster
        response.raise_for_status()
        payload = response.json()
        if 'durations' not in payload:
            raise RuntimeError("Mapbox Matrix API returned no durations: "
                               + str(payload.get('code')) + " " + str(payload.get('message')))

        # row 0 belongs to the source; its first entry is source -> source, so skip it
        durations.extend(payload['durations'][0][1:])

    if len(durations) != len(places):
        raise RuntimeError("Mapbox Matrix API returned an unexpected number of durations")
    return durations


# the chosen name (text) and location (geometry) cannot be stored in float64 columns
for col in ('site_name', 'site_location'):
    galv_clusters[col] = galv_clusters[col].astype(object)

for idx in range(len(galv_clusters)):
    # businesses located inside this cluster's buffer polygon
    boundary = galv_clusters['buffer'][idx]
    within_boundary = complete[complete.Coordinates.within(boundary)].reset_index(drop = True)

    # defaults used when nothing is nearby (filtering can leave a cluster empty)
    site_name = site_location = site_dist = np.nan

    if len(within_boundary) > 0:
        durations = _walking_durations(galv_clusters['long'][idx], galv_clusters['lat'][idx], within_boundary)

        # a missing duration (None) becomes NaN and is ignored when looking for the minimum
        within_boundary = within_boundary.assign(Dist = pd.Series(durations, dtype='float64'))
        if within_boundary['Dist'].notna().any():
            closest = within_boundary.loc[within_boundary['Dist'].idxmin()]
            site_name = closest['Name']
            site_location = closest['Coordinates']
            site_dist = closest['Dist']

    # write with .at: assigning through galv_clusters[col][idx] may only change a temporary copy
    galv_clusters.at[idx, 'site_name'] = site_name
    galv_clusters.at[idx, 'site_location'] = site_location
    galv_clusters.at[idx, 'site_dist'] = site_dist

In [31]:
# Split the clusters by whether a business was found inside the buffer
has_site = galv_clusters['site_name'].notnull()

# clusters still without a site, renumbered from 0 (the old index is kept as a column)
galv_clusters_nan = galv_clusters[~has_site].reset_index()
# clusters for which a nearby business has already been found
galv_clusters_found = galv_clusters[has_site]

In [32]:
# Clusters with no business inside their buffer: shortlist the 20 businesses that
# are closest to the centroid in a straight line, then keep the one with the
# shortest walk according to the Mapbox Matrix API.
# NOTE(review): a duration of 0.0 wins the minimum outright (see the saved output of this cell) - confirm such values are real.

# a real copy of the business frame, so the shortlist column never leaks into `complete`
complete_copy = complete.copy()

# the chosen name (text) and location (geometry) cannot be stored in float64 columns
for col in ('site_name', 'site_location'):
    galv_clusters_nan[col] = galv_clusters_nan[col].astype(object)

for idx in range(len(galv_clusters_nan)):
    print(idx)

    lat = galv_clusters_nan.lat[idx]
    long = galv_clusters_nan.long[idx]

    # straight-line difference in coordinate units, only used to shortlist candidates
    euc_diff = ((complete_copy['lats']-lat)**2 + (complete_copy['longs']-long)**2)**0.5
    nearby = complete_copy.assign(euc_diff = euc_diff).nsmallest(20, ['euc_diff'])
    nearby = nearby.reset_index()

    # source first, then the candidates, as "long,lat" pairs separated by ";"
    coord = str(long) + "," + str(lat)
    for i in range(len(nearby)):
        coord = coord + ";" + str(nearby['longs'][i]) +  "," + str(nearby['lats'][i])
    url = "https://api.mapbox.com/directions-matrix/v1/mapbox/walking/" +\
            coord + "?sources=0&annotations=distance,duration"

    response = requests.get(url, params=({'access_token':token}))
    time.sleep(2) # no more than 60 requests per minute
    response.raise_for_status()
    payload = response.json()
    if 'durations' not in payload:
        raise RuntimeError("Mapbox Matrix API returned no durations: "
                           + str(payload.get('code')) + " " + str(payload.get('message')))

    # row 0 belongs to the source; its first entry is source -> source, so skip it.
    # A missing duration (None) becomes NaN and is ignored by idxmin.
    nearby = nearby.assign(Dist = pd.Series(payload['durations'][0][1:], dtype='float64'))
    if nearby['Dist'].notna().any():
        closest = nearby.loc[nearby['Dist'].idxmin()]

        # write with .at: assigning through frame[col][idx] may only change a temporary copy
        galv_clusters_nan.at[idx, 'site_name'] = closest['Name']
        galv_clusters_nan.at[idx, 'site_location'] = closest['Coordinates']
        galv_clusters_nan.at[idx, 'site_dist'] = closest['Dist']

0
1
2
3
4
5
6


In [48]:
# Use the chosen site as the geometry of each row and keep only the columns needed for mapping.
# Columns are picked by name: positions shift with the extra 'index', 'center' and 'buffer' columns.
galv_clusters_nan = gpd.GeoDataFrame(galv_clusters_nan, geometry = galv_clusters_nan.site_location)
galv_clusters_nan = galv_clusters_nan[['lat', 'long', 'site_name', 'site_dist', 'geometry']]
galv_clusters_nan

Unnamed: 0,lat,long,site_name,site_dist,geometry
0,29.339165,-94.790654,Galveston Naval Museum,140.3,POINT (-94.77940 29.33448)
1,29.245908,-94.917369,3rd Coast Company,973.1,POINT (-94.90493 29.23666)
2,29.228554,-94.922662,The Kislyuk Retreat.,851.3,POINT (-94.91741 29.22441)
3,29.153089,-95.030006,Sunset Cove,484.0,POINT (-95.03115 29.14769)
4,29.225714,-94.940945,Laffite's Cove Nature Society,875.3,POINT (-94.93444 29.21761)
5,29.257354,-94.896649,Sweetwater Cove,371.6,POINT (-94.89778 29.26201)
6,29.341015,-94.810515,Pelican Cut,0.0,POINT (-94.81952 29.34827)


In [46]:
# Keep only the clusters for which a site was found inside the buffer
# (the other clusters are handled separately in galv_clusters_nan).
# site_name is tested because it exists at every stage of the notebook, and
# notna() is a dependable missing-value test where `!= None` is not.
galv_clusters = galv_clusters[galv_clusters["site_name"].notna()]
galv_clusters

Unnamed: 0,lat,long,site_name,site_dist,geometry
0,29.201904,-94.944011,Overkill Guide Service,7.8,POINT (-94.94361 29.20196)
1,29.298048,-94.819682,"Dr. Abbey B. Berenson, MD",158.6,POINT (-94.82179 29.29729)
2,29.131075,-95.061183,Pepper Pavilon,362.3,POINT (-95.06104 29.13331)
3,29.264259,-94.877844,Big M’s Lawn Service,202.1,POINT (-94.87802 29.26680)
5,29.264176,-94.840561,Weis Middle School,1183.8,POINT (-94.83880 29.26502)
7,29.170826,-95.001971,Next Residence LLC,36.7,POINT (-95.00167 29.17158)
8,29.106732,-95.096759,7 Palms,201.5,POINT (-95.09597 29.10564)
9,29.28744,-94.801409,Frazier Painting Company,26.0,POINT (-94.80136 29.28711)
10,29.332274,-94.736772,Galveston Island Horse and Pony Rides,53.5,POINT (-94.73449 29.33362)
11,29.23365,-94.89172,Beachside Village Information Center-Beachside...,98.4,POINT (-94.89144 29.23305)


In [None]:
# Use the chosen site as the geometry of each row and keep only the columns needed for mapping.
# Columns are picked by name: positions shift with the extra 'center' and 'buffer' columns.
galv_clusters = gpd.GeoDataFrame(galv_clusters, geometry = galv_clusters.site_location)
galv_clusters = galv_clusters[['lat', 'long', 'site_name', 'site_dist', 'geometry']]

In [51]:
# Stack both groups of clusters (fallback sites first), renumber the rows
# (the previous index is kept as an 'index' column) and rebuild the GeoDataFrame
galv_clusters_complete = pd.concat([galv_clusters_nan, galv_clusters])
galv_clusters_complete = galv_clusters_complete.reset_index()
galv_clusters_complete = gpd.GeoDataFrame(
    galv_clusters_complete,
    geometry = galv_clusters_complete.geometry,
)
galv_clusters_complete

Unnamed: 0,index,lat,long,site_name,site_dist,geometry
0,0,29.339165,-94.790654,Galveston Naval Museum,140.3,POINT (-94.77940 29.33448)
1,1,29.245908,-94.917369,3rd Coast Company,973.1,POINT (-94.90493 29.23666)
2,2,29.228554,-94.922662,The Kislyuk Retreat.,851.3,POINT (-94.91741 29.22441)
3,3,29.153089,-95.030006,Sunset Cove,484.0,POINT (-95.03115 29.14769)
4,4,29.225714,-94.940945,Laffite's Cove Nature Society,875.3,POINT (-94.93444 29.21761)
5,5,29.257354,-94.896649,Sweetwater Cove,371.6,POINT (-94.89778 29.26201)
6,6,29.341015,-94.810515,Pelican Cut,0.0,POINT (-94.81952 29.34827)
7,0,29.201904,-94.944011,Overkill Guide Service,7.8,POINT (-94.94361 29.20196)
8,1,29.298048,-94.819682,"Dr. Abbey B. Berenson, MD",158.6,POINT (-94.82179 29.29729)
9,2,29.131075,-95.061183,Pepper Pavilon,362.3,POINT (-95.06104 29.13331)


In [52]:
# Drop the leftover 'index' column created by reset_index().
# Dropping by name keeps this cell safe to re-run: a positional selection would
# remove one more real column on every extra run.
galv_clusters_complete = galv_clusters_complete.drop(columns='index', errors='ignore')

In [53]:
# write the selected collection sites to GeoJSON so they can be mapped easily
galv_clusters_complete.to_file(
    filename='galv_cluster_collection_sites_60.geojson',
    driver='GeoJSON',
)

In [54]:
# Draw a circle around every selected site and save the circles for mapping.
# The radius is expressed in the units of the coordinates (longitude / latitude values here).
buffer = galv_clusters_complete.buffer(0.004)

# Work on a real copy: a [:] slice is still tied to the original frame, so
# replacing its geometry column would be a chained assignment.
galv_copy = galv_clusters_complete.copy()
galv_copy["geometry"] = buffer
galv_copy.to_file("galv_cluster_collection_sites_buffer_60.geojson",driver="GeoJSON")

In [55]:
# display the buffered collection sites that were just written to disk
galv_copy

Unnamed: 0,lat,long,site_name,site_dist,geometry
0,29.339165,-94.790654,Galveston Naval Museum,140.3,"POLYGON ((-94.77540 29.33448, -94.77542 29.334..."
1,29.245908,-94.917369,3rd Coast Company,973.1,"POLYGON ((-94.90093 29.23666, -94.90095 29.236..."
2,29.228554,-94.922662,The Kislyuk Retreat.,851.3,"POLYGON ((-94.91341 29.22441, -94.91343 29.224..."
3,29.153089,-95.030006,Sunset Cove,484.0,"POLYGON ((-95.02715 29.14769, -95.02717 29.147..."
4,29.225714,-94.940945,Laffite's Cove Nature Society,875.3,"POLYGON ((-94.93044 29.21761, -94.93046 29.217..."
5,29.257354,-94.896649,Sweetwater Cove,371.6,"POLYGON ((-94.89378 29.26201, -94.89380 29.261..."
6,29.341015,-94.810515,Pelican Cut,0.0,"POLYGON ((-94.81552 29.34827, -94.81554 29.347..."
7,29.201904,-94.944011,Overkill Guide Service,7.8,"POLYGON ((-94.93961 29.20196, -94.93963 29.201..."
8,29.298048,-94.819682,"Dr. Abbey B. Berenson, MD",158.6,"POLYGON ((-94.81779 29.29729, -94.81780 29.296..."
9,29.131075,-95.061183,Pepper Pavilon,362.3,"POLYGON ((-95.05704 29.13331, -95.05706 29.132..."
