In [20]:
import pandas as pd
import numpy as np
import os
import math
import pickle
import geopandas as gpd
import csv
from shapely import wkt

In [21]:
# Root folders. NOTE: these are absolute paths on the author's machine; edit
# `user_dir` before running the notebook anywhere else.
user_dir = '/Users/jyontika/Desktop/'
data_dir = os.path.join(user_dir, 'jyontika-MA-data/data/')

# One cleaned tract-level dataset per temporal resolution.
annual_path, quarter_path, semi_path = (
    os.path.join(data_dir, folder_name)
    for folder_name in ('clean_annual_tract', 'clean_quarter_tract', 'clean_semi_tract')
)

# Read each dataset into its own GeoDataFrame (annual, then quarterly, then semi-annual).
gdf_annual, gdf_quarter, gdf_semi = (
    gpd.read_file(dataset_path)
    for dataset_path in (annual_path, quarter_path, semi_path)
)


In [22]:
# Output folder: every CASTNet input file produced by this notebook
# (static features, locations, SVI, overdoses, distances) is written here.
castnet_path = os.path.join(user_dir, 'opioid-overdose-models/CASTNet/hughes-CASTNet/Data/MA/')

### Add static features as a CSV file

In [23]:
# One record per census tract: the first (geoid, lat, lon) row seen for each
# geoid, in order of first appearance -- the same order and the same rows the
# per-geoid filter loop produced. A single drop_duplicates pass replaces
# re-scanning the whole frame once for every geoid.
first_row_per_geoid = gdf_annual[['geoid', 'lat', 'lon']].drop_duplicates(subset='geoid')
geoid_data = first_row_per_geoid.to_numpy().tolist()

# Transpose so the file holds one row per field (geoid, lat, lon) and one
# column per tract.
static_features = pd.DataFrame(geoid_data).transpose()

# Written without header or index: the CSV contains only the raw values.
csv_path = os.path.join(castnet_path, 'static_features.csv')
static_features.to_csv(csv_path, index=False, header=False)



In [24]:
# Sanity check: after the transpose, rows are the three fields (geoid, lat, lon)
# and columns are the individual tracts.
static_features.shape

(3, 1620)

### Add locations as a txt file

In [25]:
# Write the tract lookup table: one "<1-based index><TAB><geoid>" line per
# unique geoid, in order of first appearance in gdf_annual.
# NOTE(review): the SVI and overdose arrays are built from pivots keyed on
# geoid, whose ordering may differ from first-appearance order -- confirm
# that the location axis of every exported file lines up with this table.
geoid_series = gdf_annual['geoid'].unique()
txt_path = os.path.join(castnet_path, 'locations.txt')
with open(txt_path, 'w') as file:
    file.writelines(
        f"{position}\t{tract_id}\n"
        for position, tract_id in enumerate(geoid_series, start=1)
    )
        


In [26]:
# Number of unique tracts, i.e. the number of lines written to locations.txt.
len(geoid_series)

1620

### Add SVI as a pickle file

In [27]:
# Dynamic SVI features: take the key columns plus the five SVI measures from
# the long annual table and pivot to one row per geoid, with one column per
# (measure, year) pair.
svi_key_cols = ['geoid', 'year']
svi_value_cols = ['theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile']
svi_cols = svi_key_cols + svi_value_cols

gdf_subset = gdf_annual[svi_cols]
gdf_pivoted = gdf_subset.pivot(index='geoid', columns='year')

In [28]:
# Build the (geoid, year, feature) SVI array and pickle it.
#
# gdf_pivoted has one row per geoid and MultiIndex columns of
# (value column, year), so each row is grouped by feature first: all years of
# the first feature, then all years of the second, and so on. Reshaping such a
# row straight to (num_years, num_values) would interleave features and years.
# Instead, take the (geoid, year) slab of each feature and stack the slabs on
# a new last axis, which is correct regardless of the column layout.
svi_value_cols = svi_cols[2:]  # everything after the 'geoid' and 'year' keys
num_unique_geoids = len(gdf_pivoted.index)
num_years = len(gdf_pivoted.columns.levels[1])
num_values = len(svi_value_cols)

numpy_3d_array = np.stack(
    [gdf_pivoted[value_col].to_numpy() for value_col in svi_value_cols],
    axis=-1,
)
# Guard against a silent layout change: downstream cells index this array as
# (location, time slot, feature).
assert numpy_3d_array.shape == (num_unique_geoids, num_years, num_values)

svi_path = os.path.join(castnet_path, 'svi.pkl')
with open(svi_path, "wb") as file:
    pickle.dump(numpy_3d_array, file)

In [29]:
# Aggregate the SVI time axis into windows of `agg_window` consecutive slots by
# summing each window (the last window may be shorter than the rest). With a
# window of 1 the result is simply a float copy of numpy_3d_array.
agg_window = 1
num_source_slots = numpy_3d_array.shape[1]
num_agg_slots = int(math.ceil(num_source_slots / float(agg_window)))
svi_agg = np.zeros(shape=(numpy_3d_array.shape[0], num_agg_slots, numpy_3d_array.shape[2]))

# A slice that runs past the end of the axis is clipped automatically, so the
# final (possibly partial) window needs no special case. Every location is
# aggregated at once instead of looping over locations one by one.
for new_time_idx, start_idx in enumerate(range(0, num_source_slots, agg_window)):
    window = numpy_3d_array[:, start_idx:start_idx + agg_window]
    svi_agg[:, new_time_idx] = np.sum(window, axis=1)

num_time_slots = svi_agg.shape[1]

In [30]:
# Sanity check: expected layout is (number of geoids, number of years, number of SVI measures).
numpy_3d_array.shape

(1620, 22, 5)

### Add overdoses as a pickle file

In [31]:
# Overdose deaths per tract and year, pickled as a 2-D array with one row per
# year and one column per geoid.
# NOTE(review): the data here is annual, so the time axis is years; confirm
# whether the downstream model expects a finer (e.g. daily) time axis.
deaths_per_year_tract = gdf_annual.groupby(['year', 'geoid'])['deaths'].sum()
overdose_data = deaths_per_year_tract.reset_index()

deaths_wide = overdose_data.pivot_table(index='year', columns='geoid', values='deaths')
overdoses_array = deaths_wide.to_numpy()

overdose_path = os.path.join(castnet_path, 'overdose.pkl')
with open(overdose_path, "wb") as file:
    pickle.dump(overdoses_array, file)


In [32]:
# Re-orient the overdose counts to (location, year) and allocate the buffer
# for the aggregated overdose series.
num_agg_slots = int(math.ceil(numpy_3d_array.shape[1] / float(1)))

# Rebuild the (location, year) array from overdose_data instead of swapping the
# axes of overdoses_array in place: an in-place swap flips the orientation back
# every time this cell is re-run, whereas this assignment always yields the
# same layout.
# svi_agg is deliberately NOT re-allocated here; zeroing it again would discard
# the aggregated SVI values computed in the earlier cell.
overdoses_array = overdose_data.pivot_table(index='geoid', columns='year', values='deaths').to_numpy()

# One row per location, one column per aggregated time slot; filled in later.
overdose_agg = np.zeros(shape=(overdoses_array.shape[0], num_agg_slots))


In [33]:
# Largest death count for any single tract in any single year
# (the orientation of the array does not affect the maximum).
np.max(overdoses_array)

10.0

### Add distances as a CSV file

In [34]:
import geopandas as gpd
from shapely.geometry import Point

# Read the annual tract shapefile through its explicit .shp path; its
# 'lat'/'lon' columns supply the coordinates for the distance matrix.
tl_shape_path = os.path.join(data_dir, 'clean_annual_tract/clean_annual_tract.shp')
tl_gdf = gpd.read_file(tl_shape_path)


In [35]:
# One row per distinct (lat, lon) pair, in order of first appearance, with both
# columns converted to numeric values.
# .assign() builds a new frame, so the conversion never writes into a filtered
# slice of tl_gdf (plain column assignment on such a slice is the pattern that
# triggers pandas' SettingWithCopyWarning).
# NOTE(review): locations are de-duplicated on coordinates, not on geoid --
# confirm this row order matches the tract order used in the other exports.
unique_locations_df = (
    tl_gdf[['lat', 'lon']]
    .drop_duplicates()
    .assign(
        lat=lambda frame: pd.to_numeric(frame['lat']),
        lon=lambda frame: pd.to_numeric(frame['lon']),
    )
)
unique_locations_df.shape

(1620, 2)

In [36]:
#define haversine function
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points, in kilometers.

    Applies the haversine formula on a sphere of radius 6371 km.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float
        Distance along the sphere's surface in kilometers. NaN inputs
        propagate to a NaN result.
    """
    R = 6371.0  # Earth's radius in kilometers

    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    dLat = lat2_rad - lat1_rad
    dLon = lon2_rad - lon1_rad
    a = math.sin(dLat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dLon / 2)**2

    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    # Plain comparisons are used (rather than min/max) so that NaN stays NaN.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c


In [37]:
# Pairwise haversine distances between all unique locations, written as a
# square CSV (row i, column j = distance from location i to location j) with
# no header or index.
# TODO: check documentation and magnitude on distances

# Pull the coordinates out as plain Python floats once. Calling iterrows() in
# both loops rebuilt a pandas Series for every one of the n*n pairs, which
# dominated the running time.
lat_values = unique_locations_df['lat'].tolist()
lon_values = unique_locations_df['lon'].tolist()
coords = list(zip(lat_values, lon_values))

distance_matrix = []
for i, (lat_i, lon_i) in enumerate(coords):
    # Distances from location i to every location; the distance to itself is 0.
    row_distances = [
        0 if i == j else haversine(lat_i, lon_i, lat_j, lon_j)
        for j, (lat_j, lon_j) in enumerate(coords)
    ]
    distance_matrix.append(row_distances)

distances_csv_path = os.path.join(castnet_path, 'distances.csv')
distance_matrix_df = pd.DataFrame(distance_matrix)
distance_matrix_df.to_csv(distances_csv_path, index=False, header=False)


In [38]:
# Sanity check: the matrix should be square, one row and one column per unique location.
distance_matrix_df.shape

(1620, 1620)