In [75]:
import pandas as pd
import numpy as np
import os
import math
import pickle
import csv

In [55]:
# Restore variables that were persisted earlier with IPython's `%store` magic
# (`-r` reloads a stored variable into this kernel's namespace). They must
# have been stored beforehand, otherwise the assignment below raises NameError.
%store -r gdf_annual_with_svi
%store -r gdf_quarter_with_svi
%store -r gdf_semi_with_svi

# The cells below work on the annual table under a shorter name.
gdf_annual = gdf_annual_with_svi

In [56]:
# Directory that receives every file written by this notebook.
# Set the CASTNET_DATA_PATH environment variable to redirect the output on
# another machine; the fallback is the original hard-coded location, so
# existing runs behave exactly as before.
data_path = os.environ.get(
    'CASTNET_DATA_PATH',
    '/Users/jyontika/Desktop/Python/github_hughes/opioid-overdose-models/CASTNet/hughes-CASTNet/Data/Chicago/',
)
# Display the columns available in the annual table.
gdf_annual.columns

Index(['geoid', 'year', 'deaths', 'STATEFP', 'COUNTYFP', 'TRACTCE', 'NAME',
       'NAMELSAD', 'MTFCC', 'FUNCSTAT', 'ALAND', 'AWATER', 'lat', 'lon',
       'geometry', 'timestep', 'theme_1_pc', 'theme_2_pc', 'theme_3_pc',
       'theme_4_pc', 'svi_pctile'],
      dtype='object')

### Add static features as a CSV file

In [57]:
# Keep the first row seen for every geoid. This yields the same geoids, in the
# same first-appearance order, as iterating over `unique()` and taking
# `.values[0]`, but in one pass instead of re-filtering the whole table once
# per geoid.
first_row_per_geoid = gdf_annual[['geoid', 'lat', 'lon']].drop_duplicates(subset='geoid')

# One [geoid, lat, lon] record per geoid.
geoid_data = [
    list(record)
    for record in first_row_per_geoid.itertuples(index=False, name=None)
]

static_features = pd.DataFrame(geoid_data)
# Written transposed: row 0 = geoids, row 1 = lats, row 2 = lons,
# with one column per geoid.
static_features = static_features.transpose()
csv_path = os.path.join(data_path, 'static_features.csv')
static_features.to_csv(csv_path, index=False, header=False)



### Add location as a txt file

In [58]:
# `gdf_annual` is a long table (one row per geoid *and* year), so the raw
# 'geoid' column repeats every location once per year. De-duplicate so that
# locations.txt holds exactly one line per location; first-appearance order
# is preserved.
geoid_series = gdf_annual['geoid'].drop_duplicates()
txt_path = os.path.join(data_path, 'locations.txt')
with open(txt_path, 'w') as file:
    for geoid in geoid_series:
        file.write(str(geoid) + '\n')


### Add SVI as a pickle file

In [59]:
##SVI  dynamic features organization
svi_cols = ['geoid', 'year', 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile']
gdf_subset = gdf_annual[svi_cols]
gdf_pivoted = gdf_subset.pivot(index='geoid', columns='year')

In [60]:
num_unique_geoids = len(gdf_pivoted.index)
num_years = len(gdf_pivoted.columns.levels[1])
num_values = len(svi_cols[2:])

# `pivot` without `values=` returns MultiIndex columns laid out feature-major:
#   (theme_1_pc, year_1..year_N), (theme_2_pc, year_1..year_N), ...
# Pin that order explicitly so the reshape below never depends on how pandas
# happened to arrange the columns.
feature_year_columns = pd.MultiIndex.from_product(
    [svi_cols[2:], list(gdf_pivoted.columns.levels[1])]
)
wide_values = gdf_pivoted.reindex(columns=feature_year_columns).values

# Each row is feature-major, so it must first be viewed as
# (geoids, features, years); reshaping straight to (geoids, years, features)
# would mix years and features. Swapping the last two axes then gives the
# intended (geoids, years, features) layout with the same final shape.
numpy_3d_array = np.ascontiguousarray(
    wide_values
    .reshape(num_unique_geoids, num_values, num_years)
    .transpose(0, 2, 1)
)

svi_path = os.path.join(data_path, 'svi.pkl')
with open(svi_path, "wb") as file:
    pickle.dump(numpy_3d_array, file)

In [61]:
# Sanity check: expected to be (num_unique_geoids, num_years, num_values).
numpy_3d_array.shape

(1327, 8, 5)

### Add overdose as pickle file

In [62]:
# overdose.pkl stores a 2-D array of death counts: one row per year, one
# column per geoid -- (8, 1327) when this notebook was run.
# NOTE(review): the expected format was thought to be (# days, # locations);
# here the time axis is years -- confirm this granularity is what the model wants.

# Total deaths for every (year, geoid) pair, as a flat table.
deaths_per_year_geoid = gdf_annual.groupby(['year', 'geoid'])['deaths'].sum()
overdose_data = deaths_per_year_geoid.reset_index()

# Years down the rows, geoids across the columns.
overdose_wide = overdose_data.pivot_table(index='year', columns='geoid', values='deaths')
overdoses_array = overdose_wide.to_numpy()

overdose_path = os.path.join(data_path, 'overdose.pkl')
with open(overdose_path, "wb") as file:
    pickle.dump(overdoses_array, file)


### Add distances as csv file

In [112]:
# Distinct (lat, lon) pairs: de-duplicate on the raw column values first,
# then coerce both coordinates to numeric dtypes.
unique_locations_df = (
    gdf_annual[['lat', 'lon']]
    .drop_duplicates()
    .assign(
        lat=lambda frame: pd.to_numeric(frame['lat']),
        lon=lambda frame: pd.to_numeric(frame['lon']),
    )
)


In [113]:
#define haversine function
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometers between two points.

    Uses the haversine formula on a sphere of radius 6371.01 km.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The distance along the sphere's surface, in kilometers.
    """
    R = 6371.01 # Earth radius in kilometers

    # Every trigonometric function in `math` works in radians, so the
    # latitudes themselves must be converted, not only the differences.
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    dLat = phi2 - phi1
    dLon = math.radians(lon2 - lon1)

    a = math.sin(dLat / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(dLon / 2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal or
    # identical points, which would make sqrt() raise; clamp it.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c




In [119]:
# Pairwise distances between all unique locations; row and column order both
# follow `unique_locations_df`.
# Materialize the (index label, row) pairs once so they can be reused for
# both the outer and the inner pass.
labelled_points = list(unique_locations_df.iterrows())

distance_matrix = [
    [
        # A location compared with itself gets 0 without calling haversine.
        0 if label_a == label_b
        else haversine(point_a['lat'], point_a['lon'], point_b['lat'], point_b['lon'])
        for label_b, point_b in labelled_points
    ]
    for label_a, point_a in labelled_points
]

distances_csv_path = os.path.join(data_path, 'distances.csv')
distance_matrix_df = pd.DataFrame(distance_matrix)
distance_matrix_df.to_csv(distances_csv_path, index=False, header=False)
