In [1]:
import pandas as pd
import numpy as np
import os
import math
import pickle
import geopandas as gpd
import csv
from shapely import wkt

#### This file prepares features to input into CASTNet

In [2]:
# Retrieve the cleaned data frames and rebuild them as GeoDataFrames.
user_dir = '/Users/jyontika/Desktop/'
data_dir = os.path.join(user_dir, 'opioid-overdose-models/cook-county/data/')


def load_cook_county_gdf(csv_name, crs='EPSG:4269'):
    """Read one cleaned CSV from ``data_dir`` and return it as a GeoDataFrame.

    The CSVs store geometry as WKT text, so the column is parsed with
    ``shapely.wkt.loads`` before the frame is wrapped in a GeoDataFrame.

    Parameters
    ----------
    csv_name : str
        File name inside ``data_dir``.
    crs : str
        Coordinate reference system to attach. Passed as an authority string
        because the older ``{'init': 'EPSG:4269'}`` dict form is deprecated
        and emits a FutureWarning on every assignment.

    Returns
    -------
    geopandas.GeoDataFrame
    """
    frame = pd.read_csv(os.path.join(data_dir, csv_name))
    frame['geometry'] = frame['geometry'].apply(wkt.loads)
    return gpd.GeoDataFrame(frame, geometry='geometry', crs=crs)


gdf_annual = load_cook_county_gdf('cook_county_gdf_year.csv')
gdf_quarter = load_cook_county_gdf('cook_county_gdf_quarterly.csv')
gdf_semi = load_cook_county_gdf('cook_county_gdf_semiannual.csv')

# Confirm the conversion produced a GeoDataFrame.
type(gdf_semi)

  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)


geopandas.geodataframe.GeoDataFrame

In [3]:
# Directory that receives every CASTNet input file written by the cells below
# (static_features.csv, locations.txt, svi.pkl, overdose.pkl, distances.csv).
castnet_path = os.path.join(user_dir, 'opioid-overdose-models/CASTNet/hughes-CASTNet/Data/Chicago/')

### Add static features as a CSV file

In [4]:
# Static features: for every geoid keep the lat/lon of its first row.
# drop_duplicates keeps the first occurrence of each geoid in order of
# appearance, which matches iterating over gdf_annual['geoid'].unique() and
# taking .values[0] -- but in a single pass instead of re-filtering the whole
# frame once per geoid.
first_rows = gdf_annual.drop_duplicates(subset='geoid', keep='first')

# One [geoid, lat, lon] record per geoid.
geoid_data = [
    [geoid, lat, lon]
    for geoid, lat, lon in zip(
        first_rows['geoid'].values,
        first_rows['lat'].values,
        first_rows['lon'].values,
    )
]

# Transpose so the file has three rows (geoid, lat, lon) and one column per
# geoid, then write it without index or header.
static_features = pd.DataFrame(geoid_data).transpose()
csv_path = os.path.join(castnet_path, 'static_features.csv')
static_features.to_csv(csv_path, index=False, header=False)



In [5]:
# Sanity check: 3 rows (geoid, lat, lon) by one column per unique geoid.
static_features.shape

(3, 1328)

### Add location as a txt file

In [6]:
# Write the location lookup: one "<1-based index><TAB><geoid>" line per unique
# geoid, in the order the geoids first appear in gdf_annual.
geoid_series = gdf_annual['geoid'].unique()
txt_path = os.path.join(castnet_path, 'locations.txt')
with open(txt_path, 'w') as file:
    file.writelines(
        f"{index}\t{geoid}\n"
        for index, geoid in enumerate(geoid_series, start=1)
    )
        


### Add SVI as a pickle file

In [7]:
# SVI dynamic features: keep the geoid/year keys plus the five SVI measures,
# then pivot so each geoid is one row and the columns are a
# (measure, year) MultiIndex.
svi_cols = [
    'geoid', 'year',
    'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc', 'svi_pctile',
]
gdf_subset = gdf_annual.loc[:, svi_cols]
gdf_pivoted = gdf_subset.pivot(columns='year', index='geoid')

In [8]:
# Build the (geoid, year, SVI measure) array and pickle it.
num_unique_geoids = len(gdf_pivoted.index)
num_years = len(gdf_pivoted.columns.levels[1])
num_values = len(svi_cols[2:])

# pivot() without `values` lays the columns out measure-major: every year of
# the first measure, then every year of the second, and so on. The flat column
# axis therefore has to be split as (measure, year) first; reshaping it
# directly to (year, measure) would interleave measures and years. Swapping
# the last two axes afterwards gives the intended (geoid, year, measure) shape.
numpy_3d_array = np.ascontiguousarray(
    gdf_pivoted.values
    .reshape(num_unique_geoids, num_values, num_years)
    .transpose(0, 2, 1)
)

svi_path = os.path.join(castnet_path, 'svi.pkl')
with open(svi_path, "wb") as file:
    pickle.dump(numpy_3d_array, file)

In [9]:
# Aggregate the SVI array over consecutive windows of AGG_WINDOW time steps by
# summing the steps inside each window. With AGG_WINDOW = 1 every aggregated
# slot holds exactly one source step, so svi_agg equals numpy_3d_array.
# (math and numpy are imported in the first cell.)
AGG_WINDOW = 1

num_source_slots = numpy_3d_array.shape[1]
num_agg_slots = int(math.ceil(num_source_slots / float(AGG_WINDOW)))
svi_agg = np.zeros(shape=(numpy_3d_array.shape[0], num_agg_slots, numpy_3d_array.shape[2]))

# All locations are handled at once per window. A slice that runs past the end
# of the time axis is truncated by NumPy, so no explicit end clamp is needed.
for new_time_idx, start_idx in enumerate(range(0, num_source_slots, AGG_WINDOW)):
    end_idx = start_idx + AGG_WINDOW
    svi_agg[:, new_time_idx] = np.sum(numpy_3d_array[:, start_idx:end_idx], axis=1)

num_time_slots = svi_agg.shape[1]

In [10]:
# Number of aggregated time slots (length of svi_agg's second axis).
num_time_slots

8

In [11]:
# Shape of the pickled SVI array: (unique geoids, years, SVI measures).
numpy_3d_array.shape

(1328, 8, 5)

### Add overdose as pickle file

In [12]:
# The overdose pickle holds an array of shape (# years, # locations): the total
# number of deaths per YEAR for every geoid. (The data are grouped by 'year',
# not by day; the original note recorded a shape of (8, 1328).)
#
# NOTE(review): pivot_table orders the columns by sorted geoid, whereas
# locations.txt lists geoids in order of first appearance in gdf_annual --
# confirm gdf_annual is already sorted by geoid so the two agree.
overdose_data = gdf_annual.groupby(['year', 'geoid'])['deaths'].sum().reset_index()

# overdose_data has one row per (year, geoid), so 'sum' just carries each total
# through while keeping the counts as counts. fill_value=0 makes a geoid with
# no row for some year count as zero deaths instead of NaN.
overdoses_array = overdose_data.pivot_table(
    index='year',
    columns='geoid',
    values='deaths',
    aggfunc='sum',
    fill_value=0,
).to_numpy()

overdose_path = os.path.join(castnet_path, 'overdose.pkl')
with open(overdose_path, "wb") as file:
    pickle.dump(overdoses_array, file)


In [13]:
# Largest death total recorded for any single (year, geoid) cell.
np.max(overdoses_array)

24

In [14]:
# Allocate the aggregated overdose array: one row per location, one column per
# aggregated time slot (window of 1, so one slot per source time step).
#
# svi_agg is intentionally NOT re-created here: it was already filled by the
# SVI aggregation cell, and re-zeroing it would discard those values.
num_agg_slots = int(math.ceil(numpy_3d_array.shape[1] / float(1)))

# Orient the overdose array as (locations, time). The swap is guarded on the
# location count so re-running this cell does not flip the axes back.
if overdoses_array.shape[0] != numpy_3d_array.shape[0]:
    overdoses_array = np.swapaxes(overdoses_array, 1, 0)

overdose_agg = np.zeros(shape=(overdoses_array.shape[0], num_agg_slots))


In [15]:
# Inspect the freshly allocated array; it is all zeros because it has only been
# created with np.zeros and nothing has filled it yet.
overdose_agg

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# for loc in range(0, numpy_3d_array.shape[0]):
#         new_time_idx = 0
#         for i in range(0, numpy_3d_array.shape[1], 1):
#             start_idx = i
#             end_idx = i + 1
#             if end_idx > numpy_3d_array.shape[1]:
#                 end_idx = numpy_3d_array.shape[1]
            
#             svi_agg[loc, new_time_idx] = np.sum(numpy_3d_array[loc, start_idx:end_idx], axis=0)
#             overdose_agg[loc, new_time_idx] = np.sum(overdoses_array[loc, start_idx:end_idx])
#             new_time_idx += 1

### Add distances as csv file

In [17]:
import geopandas as gpd
from shapely.geometry import Point

# Read the census tract shapefile and keep only the rows whose county FIPS
# code (COUNTYFP) is '031'.
shapefile_dir = os.path.join(data_dir, 'shapefiles')  # shapefiles are on cluster
tl_shape_path = os.path.join(shapefile_dir, 'tl_2021_17_tract/tl_2021_17_tract.shp')
all_tracts_gdf = gpd.read_file(tl_shape_path)
in_county = all_tracts_gdf['COUNTYFP'] == '031'
tl_gdf = all_tracts_gdf[in_county]


In [18]:
# Remove the tracts that are excluded from the distance matrix.
geoid_to_drop = ['17031990000', '17031381700', '17031980000', '17031980100']
is_dropped = tl_gdf['GEOID'].isin(geoid_to_drop)
tl_gdf = tl_gdf[~is_dropped]

# One row per distinct (INTPTLAT, INTPTLON) pair, with numeric copies of the
# text coordinates added as 'lat' and 'lon'.
unique_locations_df = (
    tl_gdf[['INTPTLAT', 'INTPTLON']]
    .drop_duplicates()
    .assign(
        lat=lambda frame: pd.to_numeric(frame['INTPTLAT']),
        lon=lambda frame: pd.to_numeric(frame['INTPTLON']),
    )
)
unique_locations_df.shape

(1328, 4)

In [19]:
#define haversine function
def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two points (Haversine formula).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.
    radius : float, optional
        Sphere radius; the result is in the same unit. Defaults to 6371.0,
        Earth's radius in kilometers.

    Returns
    -------
    float
        Distance along the sphere's surface. NaN inputs propagate to a NaN
        result.
    """
    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    dLat = lat2_rad - lat1_rad
    dLon = lon2_rad - lon1_rad
    a = math.sin(dLat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dLon / 2)**2

    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise a domain error. Plain
    # comparisons are used (not min/max) so that a NaN passes through untouched.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c


In [20]:
# Pairwise distance matrix between the points in unique_locations_df, using
# the haversine helper on their numeric 'lat'/'lon' columns. Entry [i][j] is
# the distance from row i to row j in kilometers (the helper's default radius);
# the diagonal is 0.
#
# Open items carried over from the original notes:
# TODO: mention the lat and lon points to kyle.
# TODO: decide on the representative center point for each tract. An
#       alternative is the average pairwise distance between deaths in tract A
#       and deaths in tract B (haversine over every pair) -- the code below
#       does NOT do that; it uses one point per row.
# TODO: check documentation and magnitude on distances.
# NOTE(review): rows follow the order of unique_locations_df (shapefile order
#       after filtering), not necessarily the geoid order written to
#       locations.txt -- confirm the two orders agree.

# Pull the coordinates out once as plain Python floats; iterating the frame
# with nested iterrows() would rebuild a Series for every one of the n*n pairs.
lats = unique_locations_df['lat'].tolist()
lons = unique_locations_df['lon'].tolist()
num_locations = len(lats)

distance_matrix = [
    [
        0 if i == j else haversine(lats[i], lons[i], lats[j], lons[j])
        for j in range(num_locations)
    ]
    for i in range(num_locations)
]

distances_csv_path = os.path.join(castnet_path, 'distances.csv')
distance_matrix_df = pd.DataFrame(distance_matrix)
distance_matrix_df.to_csv(distances_csv_path, index=False, header=False)
