In [1]:
import pandas as pd
import numpy as np
import os
import math
import pickle
import geopandas as gpd
import csv
from shapely import wkt

In [2]:

# Base directory holding the pipeline results.
data_dir = '/cluster/tufts/hugheslab/datasets/NSF_OD/results_202308_pipeline/'

# One cleaned tract-level dataset folder per time resolution.
annual_path, quarter_path, semi_path = (
    os.path.join(data_dir, folder)
    for folder in ('clean_annual_tract', 'clean_quarter_tract', 'clean_semi_tract')
)

# Only the annual dataset is loaded in this cell.
gdf_annual = gpd.read_file(annual_path)

In [10]:
# Repository root, relative to this notebook.
code_dir = '../../'
# Folder that receives every CASTNet input file written below.
castnet_path = os.path.join(
    code_dir,
    'CASTNet/hughes-CASTNet/Data/MA/',
)

### Add static features as a CSV file

In [12]:
# Build one [geoid, lat, lon] record per tract, using the first row in which
# each geoid appears. drop_duplicates keeps first-occurrence order (the same
# order as gdf_annual['geoid'].unique()) and replaces the previous per-geoid
# boolean filter, which rescanned the whole frame once per unique geoid.
first_rows_per_geoid = gdf_annual.drop_duplicates(subset='geoid', keep='first')
geoid_data = first_rows_per_geoid[['geoid', 'lat', 'lon']].values.tolist()

# Records are one row per tract; the file is written transposed, so that
# row 0 holds the geoids, row 1 the latitudes and row 2 the longitudes.
static_features = pd.DataFrame(geoid_data).transpose()
csv_path = os.path.join(castnet_path, 'static_features.csv')
static_features.to_csv(csv_path, index=False, header=False)

# Geoid row, kept as the reference tract ordering for the cells below.
static_features_locations = static_features.loc[0, :]



### Add location as a txt file

In [24]:
# Tract identifiers in order of first appearance; they must line up with the
# geoid row stored alongside the static features.
geoid_series = gdf_annual['geoid'].unique()
assert (geoid_series == static_features_locations).all()

# One "<1-based position>\t<geoid>" line per tract.
txt_path = os.path.join(castnet_path, 'locations.txt')
with open(txt_path, 'w') as locations_file:
    locations_file.writelines(
        f"{position}\t{tract_id}\n"
        for position, tract_id in enumerate(geoid_series, start=1)
    )
        


### Add SVI as a pickle file

In [25]:
# SVI dynamic features: the key columns followed by the SVI measures.
svi_cols = [
    'geoid', 'year',
    'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
    'svi_pctile',
]
gdf_subset = gdf_annual[svi_cols]

# Wide layout: one row per geoid, columns are a (measure, year) MultiIndex.
gdf_pivoted = gdf_subset.pivot(columns='year', index='geoid')

# The pivoted row order has to agree with the static-feature geoid order.
pivot_matches_static = gdf_pivoted.index == static_features_locations
assert pivot_matches_static.all()

In [42]:
# Target layout of the SVI tensor: (tract, year, feature).
num_unique_geoids = gdf_pivoted.shape[0]
num_years = len(gdf_pivoted.columns.levels[1])
num_values = len(svi_cols) - 2  # everything except 'geoid' and 'year'
gdf_values = gdf_pivoted.values

# Column-major reshape: the loop below checks that feature k of the result is
# the k-th run of num_years consecutive columns of the pivoted table.
numpy_3d_array = gdf_values.reshape(
    (num_unique_geoids, num_years, num_values),
    order='F',
)

for feature_idx in range(num_values):
    year_block = slice(feature_idx * num_years, (feature_idx + 1) * num_years)
    assert (numpy_3d_array[:, :, feature_idx] == gdf_values[:, year_block]).all()

# Persist the tensor for CASTNet.
svi_path = os.path.join(castnet_path, 'svi.pkl')
with open(svi_path, "wb") as svi_file:
    pickle.dump(numpy_3d_array, svi_file)

### Add overdose as a pickle file

In [43]:
# Overdose pickle: array of shape (# years, # locations) holding the number of
# deaths per YEAR in every location.
# NOTE(review): the earlier note here described the expected file as deaths per
# DAY with shape (# days, # locations) and was marked uncertain -- confirm which
# temporal resolution CASTNet expects.

overdose_data = gdf_annual.groupby(['year', 'geoid'])['deaths'].sum().reset_index()

# Build the year x geoid table once and reuse it for both the exported array
# and the column-order check (it was previously computed twice).
overdose_pivot = overdose_data.pivot_table(index='year', columns='geoid', values='deaths')
overdoses_array = overdose_pivot.to_numpy()

# Columns must follow the same tract order as locations.txt.
assert (overdose_pivot.columns == geoid_series).all()

overdose_path = os.path.join(castnet_path, 'overdose.pkl')
with open(overdose_path, "wb") as file:
    pickle.dump(overdoses_array, file)


### Add distances as csv file

In [56]:
import geopandas as gpd
from shapely.geometry import Point

# Load the annual tract shapefile.
tl_shape_path = os.path.join(data_dir, 'clean_annual_tract/clean_annual_tract.shp')
tl_gdf = gpd.read_file(tl_shape_path)

# Coordinates are taken from the year-2000 rows only, indexed by geoid and
# reordered to follow geoid_series.
is_year_2000 = tl_gdf['year'] == 2000
tracts_2000 = tl_gdf[is_year_2000].set_index('geoid')
unique_locations_df = tracts_2000.loc[geoid_series, ['lat', 'lon']]

In [57]:
# Make sure both coordinate columns are numeric before any distance math.
for coord_col in ('lat', 'lon'):
    unique_locations_df[coord_col] = pd.to_numeric(unique_locations_df[coord_col])

# (number of tracts, 2)
unique_locations_df.shape

(1620, 2)

In [58]:
# Display the per-tract coordinate table (index: geoid; columns: lat, lon).
unique_locations_df

Unnamed: 0_level_0,lat,lon
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1
25001010100,42.059829,-70.200407
25001010206,41.922636,-70.015368
25001010208,42.013557,-70.064151
25001010304,41.825108,-69.976203
25001010306,41.859376,-69.982635
...,...,...
25027761100,42.199604,-72.200006
25027761200,42.283702,-71.591633
25027761300,42.239683,-71.701747
25027761401,42.489753,-71.579677


In [59]:
#define haversine function
def haversine(lat1, lon1, lat2, lon2):
    """Calculates the distance between two points using the Haversine formula.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Great-circle distance in kilometers, on a sphere of radius 6371 km.
    """
    R = 6371.0  # Earth's radius in kilometers

    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    dLat = lat2_rad - lat1_rad
    dLon = lon2_rad - lon1_rad
    a = math.sin(dLat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dLon / 2)**2
    # Round-off can leave `a` a hair outside [0, 1] (e.g. for near-antipodal
    # points), which would make the square roots below raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c


In [61]:
# Pairwise haversine distances (in km) between all tracts, in the row order of
# unique_locations_df.
# TODO: check documentation and magnitude on distances

# Pull the coordinates out once. Calling DataFrame.iterrows() inside the inner
# loop rebuilt a Series for every one of the n^2 pairs; the values passed to
# haversine are unchanged.
coordinates = list(zip(
    unique_locations_df['lat'].to_numpy(),
    unique_locations_df['lon'].to_numpy(),
))

distance_matrix = []
for i, (lat_i, lon_i) in enumerate(coordinates):
    # Distances from location i to every location; the distance of a location
    # to itself is 0 and is not computed.
    row_distances = [
        0 if i == j else haversine(lat_i, lon_i, lat_j, lon_j)
        for j, (lat_j, lon_j) in enumerate(coordinates)
    ]
    distance_matrix.append(row_distances)

distances_csv_path = os.path.join(castnet_path, 'distances.csv')
distance_matrix_df = pd.DataFrame(distance_matrix)
distance_matrix_df.to_csv(distances_csv_path, index=False, header=False)


In [62]:
# Display the pairwise distance matrix that was written to distances.csv.
distance_matrix_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1610,1611,1612,1613,1614,1615,1616,1617,1618,1619
0,0.000000,21.600570,12.373388,32.016627,28.654169,38.338016,39.053149,44.878830,52.321736,36.126402,...,158.040223,155.939868,153.844448,152.614397,157.894983,165.624431,117.321474,125.372800,123.137276,125.948033
1,21.600570,0.000000,10.884733,11.319043,7.537983,18.891214,18.440667,25.290806,33.934407,21.143385,...,174.291869,172.931859,170.483654,170.159740,175.136498,182.956939,136.095733,143.567128,143.446139,146.454351
2,12.373388,10.884733,0.000000,22.182047,18.422425,29.559572,29.319813,36.047247,44.460049,30.183974,...,169.468929,167.608607,165.398624,164.448636,169.642097,177.403667,129.457721,137.369810,135.508118,138.320919
3,32.016627,11.319043,22.182047,0.000000,3.847478,7.933406,7.155444,14.064263,22.975708,13.757973,...,178.990570,178.148622,175.464182,175.783263,180.521999,188.379874,142.787935,149.782779,151.428177,154.627577
4,28.654169,7.537983,18.422425,3.847478,0.000000,11.704627,10.908035,17.911114,26.782491,16.191635,...,177.870910,176.844624,174.243207,174.332969,179.158645,187.005838,140.937801,148.108564,149.106071,152.234603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1615,165.624431,182.956939,177.403667,188.379874,187.005838,189.586566,192.796740,193.709798,194.634473,180.788021,...,15.790595,10.794956,14.948344,13.212316,7.871848,0.000000,50.946382,41.271950,60.332441,62.046224
1616,117.321474,136.095733,129.457721,142.787935,140.937801,145.110641,147.952535,149.958319,152.330216,137.023289,...,49.104559,43.362855,43.342024,37.827226,44.017066,50.946382,0.000000,10.299097,22.932813,27.536442
1617,125.372800,143.567128,137.369810,149.782779,148.108564,151.703877,154.684919,156.309540,158.197699,143.341727,...,38.805473,33.214961,33.053110,28.065831,34.084520,41.271950,10.299097,0.000000,29.559774,33.643771
1618,123.137276,143.446139,135.508118,151.428177,149.106071,154.799127,157.237842,160.200697,163.705466,147.519120,...,63.972463,56.192908,57.921715,49.009587,55.364198,60.332441,22.932813,29.559774,0.000000,4.747055
