In [1]:
import geopandas as gpd

import numpy as np
import pandas as pd
from pandas import IndexSlice as idx
import tensorflow as tf
import sys
import os
import glob

from functools import partial



code_dir = '/cluster/home/kheuto01/code/opioid-overdose-models/perturbations/'
sys.path.append(code_dir)
code_dir = '/cluster/home/kheuto01/code/opioid-overdose-models/diff_bpr'
sys.path.append(code_dir)
from top_k import top_k_idx
#from make_datasets import make_data
from bpr_model import PerturbedBPRModel


code_dir = '/cluster/home/kheuto01/code/opioid-overdose-models/'
sys.path.append(code_dir)
from zinf_gp.metrics import normcdf, fixed_top_X



from perturbations import perturbed
from bpr import bpr_variable_k_no_ties



2023-05-28 21:22:35.537922: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-28 21:22:35.540984: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-28 21:22:35.680955: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-28 21:22:35.682423: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
data_path='/cluster/tufts/hugheslab/datasets/NSF_OD/results_20220606_update/clean_quarter_tract/'

In [3]:
def make_data_quarterly(multiindexed_gdf, first_year, last_year, time_window, feature_cols, train_shape, pred_lag=1):


    xs = []
    ys = []
    quarters = []

    for eval_year in range(first_year, last_year + 1):
        quarters_in_year = multiindexed_gdf[multiindexed_gdf['year']==eval_year].index.unique(level='timestep')
        quarters_in_year.sort_values()
        train_x_df = multiindexed_gdf.loc[idx[:, min(quarters_in_year) - time_window:max(quarters_in_year) - pred_lag], feature_cols]

        

        

        for q,quarter in enumerate(quarters_in_year):
            
            
            train_x_vals = train_x_df.values.reshape(train_shape)
            
            train_y_df = multiindexed_gdf.loc[idx[:,quarter], 'deaths']
            train_y_vals = train_y_df.values

            xs.append(train_x_vals)
            ys.append(train_y_vals)
            quarters.append(np.ones_like(train_y_vals)*q)

    x_BSTD = np.stack(xs, axis=0)
    y_BS = np.stack(ys)

    x_BSTD = tf.convert_to_tensor(x_BSTD, dtype=tf.float32)
    y_BS = tf.convert_to_tensor(y_BS, dtype=tf.float32)

    B, S, T, D = x_BSTD.shape

    assert (B == len(range(first_year, last_year + 1))*pred_lag)
    assert (S == train_shape[0])
    assert (T == time_window)
    assert (D == len(feature_cols))

    # Reshape the training data to flatten the dimensions
    x_BSF_flat = tf.reshape(x_BSTD, (B, S, T * D), )
    # add prediction quarter
    x_BSF_flat = np.concatenate((x_BSF_flat, np.expand_dims(quarters,axis=-1)),axis=-1)


    return x_BSF_flat, y_BS

In [43]:
class PerturbedBPRModel(tf.keras.Model):

    def __init__(self, perturbed_top_k_func, k=100, **kwargs):
        """k should match the k baked into the perturbed top_k func.
        we need k for when performing exact top k in evaluation step."""
        super(PerturbedBPRModel, self).__init__(**kwargs)
        self.perturbed_top_k_func = perturbed_top_k_func
        self.k = k
        self.hidden1 = tf.keras.layers.Dense(25, activation='relu')
        self.hidden2 = tf.keras.layers.Dense(10, activation='relu')
        self.output_layer = tf.keras.layers.Dense(1, activation=None)

    def call(self, inputs):
        intermediate = self.hidden1(inputs)
        intermediate = self.hidden2(intermediate)
        
        outputs = self.output_layer(intermediate)
        # squeeze away feature dimension
        outputs = tf.squeeze(outputs, axis=-1)
        return outputs

    def train_step(self, data):
        # Unpack the data. Its structure depends on your model and
        # on what you pass to `fit()`.
        x, y = data

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)  # Forward pass
            top_100_indicators = self.perturbed_top_k_func(y_pred)
            true_top_100_val, true_top_100_idx = tf.math.top_k(y, k=self.k)

            denominator = tf.reduce_sum(true_top_100_val, axis=-1)
            numerator = tf.reduce_sum(top_100_indicators * y, axis=-1)

            # Compute the loss value
            # (the loss function is configured in `compile()`)
            loss = self.compiled_loss(numerator, denominator, regularization_losses=self.losses)

        # Compute gradients
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        # Update weights
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)
        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}

    def test_step(self, data):
        # Unpack the data
        x, y = data
        # Compute predictions
        y_pred = self(x, training=False)  # Forward pass
        # use discrete topk to simulate making a decision
        _, pred_100_idx = tf.math.top_k(y_pred, k=self.k)
        true_top_100_val, true_top_100_idx = tf.math.top_k(y, k=self.k)

        denominator = tf.reduce_sum(true_top_100_val, axis=-1)
        numerator = tf.reduce_sum(tf.gather(y, pred_100_idx, batch_dims=-1), axis=-1)

        # Compute the loss value
        # (the loss function is configured in `compile()`)
        self.compiled_loss(numerator, denominator, regularization_losses=self.losses)

        # Update the metrics.
        self.compiled_metrics.update_state(y, y_pred)
        return {m.name: m.result() for m in self.metrics}

In [5]:
epochs = 5000
seed = 360
time_window = 5*4
first_train_eval_year = 2014
last_train_eval_year = 2018
#batch_dim_size = last_train_eval_year - first_train_eval_year + 1
first_validation_year = 2019
last_validation_year = 2019
first_test_year = 2020
last_test_year = 2021

tf.random.set_seed(seed)


timestep_col = 'timestep'
geography_col = 'geoid'
outcome_col = 'deaths'

x_idx_cols = [geography_col, 'lat', 'lon', timestep_col,
              'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
              'svi_pctile', 'year',
              'neighbor_t', 'deaths']
y_idx_cols = [geography_col, timestep_col, outcome_col]
#features_only = ['lat', 'lon', timestep_col,
#                 'theme_1_pc', 'theme_2_pc', 'theme_3_pc', 'theme_4_pc',
#                 'svi_pctile',
#                 'neighbor_t', 'deaths']
features_only = ['deaths']

data_gdf = gpd.read_file(data_path)

multiindexed_gdf = data_gdf.set_index(['geoid', 'timestep'])
multiindexed_gdf['timestep'] = multiindexed_gdf.index.get_level_values('timestep')
num_geoids = len(data_gdf['geoid'].unique())

train_shape = (num_geoids, time_window, len(features_only))

train_x_BSF_flat, train_y_BS = make_data_quarterly(multiindexed_gdf, first_train_eval_year, last_train_eval_year,
                                                  time_window, features_only, train_shape, pred_lag=4)

valid_x_BSF_flat, valid_y_BS = make_data_quarterly(multiindexed_gdf, first_validation_year, last_validation_year,
                                         time_window, features_only, train_shape, pred_lag=4)

test_x_BSF_flat, test_y_BS = make_data_quarterly(multiindexed_gdf, first_test_year, last_test_year,
                                       time_window, features_only, train_shape, pred_lag=4)

norm_layer = tf.keras.layers.Normalization()
norm_layer.adapt(train_x_BSF_flat)
train_x_BSF_flat = norm_layer(train_x_BSF_flat)
valid_x_BSF_flat = norm_layer(valid_x_BSF_flat)
test_x_BSF_flat = norm_layer(test_x_BSF_flat)

top_100_idx_func = partial(top_k_idx, k=100)

2023-05-28 21:26:31.265949: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-05-28 21:26:31.266002: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: p1cmp078.pax.tufts.edu
2023-05-28 21:26:31.266012: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: p1cmp078.pax.tufts.edu
2023-05-28 21:26:31.266167: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 515.65.1
2023-05-28 21:26:31.266205: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 515.65.1
2023-05-28 21:26:31.266212: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 515.65.1


In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
reg = LinearRegression().fit(train_x_BSF_flat, train_y_BS)

ValueError: Found array with dim 3. LinearRegression expected <= 2.

In [9]:
train_x_BSF_flat.shape

TensorShape([20, 1620, 21])

In [11]:
train_y_BS

<tf.Tensor: shape=(20, 1620), dtype=float32, numpy=
array([[1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [12]:
import numpy as np

# Generate random sample data
np.random.seed(42)  # Set random seed for reproducibility
batch_size = 37
sequence_length = 1620
feature_dim = 20

x_train = np.random.randn(batch_size, sequence_length, feature_dim)
y_train = np.random.randn(batch_size, sequence_length)

# Print the shapes of the generated data
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)

x_train shape: (37, 1620, 20)
y_train shape: (37, 1620)


In [14]:
# Generate random sample data
np.random.seed(42)  # Set random seed for reproducibility
batch_size = 37
sequence_length = 1620
feature_dim = 20

x_train = np.random.randn(batch_size, sequence_length, feature_dim)
y_train = np.random.randn(batch_size, sequence_length)

# Reshape the data
x_train_reshaped = np.reshape(x_train, (batch_size * sequence_length, feature_dim))
y_train_reshaped = np.reshape(y_train, (batch_size * sequence_length,))

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(1, input_shape=(feature_dim,), use_bias=True)  # 1 unit for prediction, 20 input features + bias
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Print model summary
model.summary()

# Train the model
model.fit(x_train_reshaped, y_train_reshaped, epochs=10, batch_size=32)  # x_train_reshaped: (37 * 1620, 20), y_train_reshaped: (37 * 1620,)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1 (Dense)             (None, 1)                 21        
                                                                 
Total params: 21
Trainable params: 21
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2af31e7d9580>

In [16]:
x_train_reshaped.shape

(59940, 20)

In [18]:
x_train[0,:5,5]

array([-0.23413696,  0.11092259, -0.71984421,  1.35624003, -0.50175704])

In [19]:
x_train_reshaped[:5,5]

array([-0.23413696,  0.11092259, -0.71984421,  1.35624003, -0.50175704])

In [20]:
train_x_BSF_flat

<tf.Tensor: shape=(20, 1620, 21), dtype=float32, numpy=
array([[[-0.30601794, -0.2999008 , -0.30842423, ..., -0.44685635,
         -0.45583406, -1.3416407 ],
        [-0.30601794, -0.2999008 , -0.30842423, ..., -0.44685635,
         -0.45583406, -1.3416407 ],
        [-0.30601794, -0.2999008 , -0.30842423, ..., -0.44685635,
         -0.45583406, -1.3416407 ],
        ...,
        [-0.30601794, -0.2999008 , -0.30842423, ...,  1.507537  ,
         -0.45583406, -1.3416407 ],
        [-0.30601794, -0.2999008 , -0.30842423, ..., -0.44685635,
         -0.45583406, -1.3416407 ],
        [-0.30601794, -0.2999008 , -0.30842423, ..., -0.44685635,
         -0.45583406, -1.3416407 ]],

       [[-0.30601794, -0.2999008 , -0.30842423, ..., -0.44685635,
         -0.45583406, -0.4472136 ],
        [-0.30601794, -0.2999008 , -0.30842423, ..., -0.44685635,
         -0.45583406, -0.4472136 ],
        [-0.30601794, -0.2999008 , -0.30842423, ..., -0.44685635,
         -0.45583406, -0.4472136 ],
        ...

In [36]:
thing = np.random.uniform(size=(1620,20))
params = np.arange(20)

In [37]:
multed = thing*params

In [38]:
multed.shape

(1620, 20)

In [39]:
multed

array([[ 0.        ,  0.4293966 ,  0.73677363, ..., 11.29133254,
        12.34185542, 16.24624365],
       [ 0.        ,  0.23881087,  0.39448319, ...,  2.63869672,
        16.71658145,  0.73630669],
       [ 0.        ,  0.23456134,  0.02409557, ...,  9.59706617,
         4.70654871,  5.5814408 ],
       ...,
       [ 0.        ,  0.19647467,  1.86064454, ...,  5.35120039,
         2.53750528,  5.58366202],
       [ 0.        ,  0.50174264,  0.19006992, ..., 15.61504942,
         3.5434477 ,  3.28939923],
       [ 0.        ,  0.96734246,  1.20831224, ..., 10.52995387,
        12.58761931,  8.56894516]])

In [46]:
PerturbedBPRModel(top_100_idx_func,name='hi').name

'hi'