# Distance between CoCs
A factory can be placed in a Continuum of Care (CoC) and serves two functions:

1. Provide shelter to people experiencing homelessness (PEH) in that CoC.
2. Serve as a source of temporary housing units (THUs) that can be deployed to other CoCs.

To determine the optimal deployment of factories, we require the distance between each pair of CoCs; from this distance we can calculate a transportation cost that will be used in the optimization framework to strategically deploy THUs and factories. In this work, we consider two ways to calculate the distance between CoCs:

1. Haversine distance
2. Road network distance

## Haversine distance
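The haversine distance is the great-circle (orthodromic) distance between two points on a sphere. Given the latitudes $\phi_1, \phi_2$ and longitudes $\lambda_1, \lambda_2$ of the two points (in radians), with $\Delta \phi = \phi_2 - \phi_1$ and $\Delta \lambda = \lambda_2 - \lambda_1$, we can calculate the distance according to: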
$$
d = 2r \sin^{-1}\left( \sqrt{\frac{1 - \cos(\Delta \phi) + \cos(\phi_1) \cos(\phi_2)\,(1-\cos(\Delta \lambda))}{2}} \right)
$$
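where $r$ is the radius of the Earth (approximately 6,371 km, i.e. about 3,958.8 miles). The code below uses the radius in miles, so the computed distances are in miles.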

In [173]:
import math
# Earth radius R in miles
R = 3958.8


def haversine_dist(lat1, lon1, lat2, lon2, radius=None):
    """Return the great-circle (haversine) distance between two points.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        radius: Sphere radius. Defaults to the module-level ``R`` (read at
            call time), so the result is in the same unit as ``R``.

    Returns:
        The distance along the sphere's surface, in the unit of ``radius``.
    """
    if radius is None:
        radius = R

    # Convert to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    half_delta_latitude = (lat2_rad - lat1_rad) / 2
    half_delta_longitude = (lon2_rad - lon1_rad) / 2

    # Half-angle form of the haversine: sin^2(x/2) == (1 - cos(x)) / 2.
    # It is algebraically identical to the (1 - cos) form but does not lose
    # precision through cancellation when the two points are close together.
    a = (
        math.sin(half_delta_latitude) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(half_delta_longitude) ** 2
    )

    # Rounding can leave `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make asin raise "math domain error".
    a = min(1.0, max(0.0, a))

    return 2 * radius * math.asin(math.sqrt(a))

# Validation
# print(haversine_dist(33.5, -86.67, 30.76, -87.93))

In [201]:
import geopandas as gpd
import os
from pathlib import Path

# Small test subset of states. Swap in the commented-out full lists below to
# process every state. The two lists are paired by position, so they must stay
# in the same order.
state_abbreviations = ["AL", "AK", "AZ"]
states = ['Alabama', 'Alaska', 'Arizona']

# Full lists (state names are spelled with underscores instead of spaces):
# state_abbreviations = [
#     "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
#     "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
#     "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
#     "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
#     "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
# ]
#
# states = [
#     'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
#     'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho',
#     'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
#     'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
#     'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
#     'New_Hampshire', 'New_Jersey', 'New_Mexico', 'New_York',
#     'North_Carolina', 'North_Dakota', 'Ohio', 'Oklahoma', 'Oregon',
#     'Pennsylvania', 'Rhode_Island', 'South_Carolina', 'South_Dakota',
#     'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
#     'West_Virginia', 'Wisconsin', 'Wyoming'
# ]

# (abbreviation, state name) tuples, matched by position in the two lists.
abbreviation_state_pairs = [
    (abbreviation, state)
    for abbreviation, state in zip(state_abbreviations, states)
]

# Validation
# for i in range(len(abbreviation_state_pairs)):
#     print(abbreviation_state_pairs[i])


In [175]:
# # Sample processing of .shp
# current_dir = os.getcwd()
# gdf = gpd.read_file(current_dir + "\CoC_GIS_State_Shapefile_MA\Massachusetts\MA_500\MA_500.shp")
# # gdf.info()

# # print(len(gdf))

# # Compute centroids
# centroids = gdf.geometry.centroid

# # Extract latitudes and longitudes
# longitudes = centroids.x
# latitudes = centroids.y

# # Compute averages
# avg_lon = longitudes.mean()
# avg_lat = latitudes.mean()

# print("Latitude, longitude: ", avg_lat, avg_lon)

# # # Get the centroid of the first geometry
# # centroid = gdf.geometry.iloc[0].centroid

# # # Get latitude and longitude
# # longitude = centroid.x
# # latitude = centroid.y

# # print("Latitude, longitude: ", latitude, longitude)


In [None]:
import warnings

# NOTE(review): this silences every UserWarning for the rest of the session.
# Presumably it is here for the warning geopandas emits when `.centroid` is
# computed in a geographic (lat/lon) CRS; if so, the centroids below are
# planar approximations. TODO confirm the shapefiles' CRS.
warnings.filterwarnings('ignore', category=UserWarning)

# Collect every CoC folder name and the (longitude, latitude) of its shapefile,
# taken as the mean of the centroids of all geometries in the file.
cocs = []
coc_lon_lat = {}
directory = Path.cwd() # Path.cwd().parent
shapefile_root = directory / 'homelessness-prediction' / 'data/coc-shapefiles/2023'
for abbreviation, state in abbreviation_state_pairs:
    # CoC folders normally sit in a sub-folder named after the state; for
    # Wyoming they are read directly from the state's shapefile folder.
    coc_path = shapefile_root / f'CoC_GIS_State_Shapefile_{abbreviation}'
    if state != "Wyoming":
        coc_path = coc_path / state

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for coc in sorted(os.listdir(coc_path)):
        if not coc.startswith(abbreviation + '_'):
            continue
        cocs.append(coc)
        gdf = gpd.read_file(coc_path / coc / f'{coc}.shp')

        # One centroid per geometry; average them if there is more than one.
        centroids = gdf.geometry.centroid
        avg_lon = centroids.x.mean()
        avg_lat = centroids.y.mean()

        coc_lon_lat[coc] = (avg_lon, avg_lat)

numCoCs = len(cocs)
cocs.sort()

# Keep the dict in the same (sorted) order as `cocs`. State order and sorted
# order differ (e.g. "AL" precedes "AK" in the state list), and code that
# iterates the dict while labelling results with `cocs` needs them to agree.
coc_lon_lat = {coc: coc_lon_lat[coc] for coc in cocs}
# Validation
# print(cocs)
# print(numCoCs)
# print(coc_lon_lat)


In [183]:
# Produce the pairwise CoC distance matrix (haversine distance between the
# CoC centroid coordinates stored in `coc_lon_lat`).
import pandas as pd
import numpy as np

# Rows and columns both follow the order of `cocs`, and each coordinate is
# looked up by CoC name. Iterating `coc_lon_lat` directly would use its
# insertion order, which need not match the sorted `cocs` used as labels and
# would silently attach distances to the wrong CoC pairs.
# `coc_lon_lat` values are (longitude, latitude) tuples.
coc_distance_matrix = pd.DataFrame(
    [
        [
            haversine_dist(
                lat1=coc_lon_lat[coc_source][1],
                lon1=coc_lon_lat[coc_source][0],
                lat2=coc_lon_lat[coc_dest][1],
                lon2=coc_lon_lat[coc_dest][0],
            )
            for coc_dest in cocs
        ]
        for coc_source in cocs
    ],
    index=pd.Index(cocs, name='CoC'),
    columns=[f'{coc}' for coc in cocs],
    dtype=float,
)

# Validation 
# print(coc_distance_matrix.shape)
# print(coc_distance_matrix.head())
# print(coc_distance_matrix['AL_504']['AL_505'])
# print(coc_distance_matrix.iloc[4,5]) # should be equivalent to previous
# print(coc_lon_lat['AL_504'])
# print(coc_lon_lat['AL_505'])




In [None]:
# CoC populations
# Read the CoC summary workbook, keeping only the CoC identifier and the
# overall homeless count.
directory = Path.cwd() # Path.cwd().parent
population_file = directory / 'homelessness-prediction/coc-homelessness-data/coc_summary.xlsx'
CoC_populations = pd.read_excel(population_file)[['CoC_Number', 'Overall_Homeless']]

# Drop the row labelled 0 (presumably a non-data row -- TODO confirm).
CoC_populations = CoC_populations.drop(index=0)

# Rewrite identifiers from "XX-NNN" to "XX_NNN" so they match the names in
# `cocs`, then keep only the CoCs that appear in `cocs`.
CoC_populations['CoC_Number'] = CoC_populations['CoC_Number'].str.replace('-', '_', regex=True)
has_known_coc = CoC_populations['CoC_Number'].isin(cocs)
CoC_populations = CoC_populations[has_known_coc]

# Conversely, remove from `cocs` (in place) every CoC without population data.
cocs_with_population = set(CoC_populations['CoC_Number'].values)
cocs[:] = [coc for coc in cocs if coc in cocs_with_population]

# # Validation
# pd.set_option('display.max_rows', None)
# print(CoC_populations)

# Label rows by CoC name
CoC_populations.set_index('CoC_Number', inplace=True)

# More validation
# print(len(cocs))
# print(cocs)
# print(len(CoC_populations))
# print(CoC_populations.head())
# print(CoC_populations.iloc[0])



Overall_Homeless    1023.0
Name: AK_500, dtype: float64
