In [3]:
%load_ext autoreload
%autoreload 2
import geopandas as gpd
import numpy as np
import pandas as pd
from pandas import IndexSlice as idx
import os

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import HistGradientBoostingRegressor


def fast_bpr(true_val, pred_val, K=100, bootstrap_samples=1000, rng=None):
    """
    Score how much of the best achievable top-K total the predictions capture.

    The K locations with the largest predicted values are selected, the true
    values at those locations are summed, and that sum is divided by the sum
    of the K largest true values. When several locations share the lowest
    predicted value that still makes the top K, the remaining spots are filled
    by drawing from the tied locations at random, and the ratio is averaged
    over ``bootstrap_samples`` such draws.

    :param true_val: Pandas Series of true values indexed on location
    :param pred_val: Pandas Series of predicted values indexed on location
        (same index labels as ``true_val``)
    :param K: Number of locations to consider; capped at the number of
        locations available
    :param bootstrap_samples: Number of samples to take when evaluating ties
    :param rng: Optional ``numpy.random.Generator`` (or ``RandomState``) used
        for tie-breaking. When omitted, NumPy's global random state is used.
    :return: Mean ratio over the tie-breaking draws
    :raises ValueError: if there are no locations, or ``K`` or
        ``bootstrap_samples`` is smaller than 1
    """
    num_locations = len(pred_val)
    if num_locations == 0:
        raise ValueError("pred_val must contain at least one location")
    if K < 1:
        raise ValueError("K must be at least 1")
    if bootstrap_samples < 1:
        raise ValueError("bootstrap_samples must be at least 1")

    # Asking for more spots than there are locations would otherwise force
    # the same location to be counted more than once.
    K = min(K, num_locations)

    top_K_predicted = pred_val.sort_values(ascending=False).iloc[:K]
    top_K_true = true_val.sort_values(ascending=False).iloc[:K]
    denominator = top_K_true.sum()

    # Locations strictly above the cutoff are always selected; everything
    # equal to the cutoff competes for the remaining spots.
    cutoff = top_K_predicted.min()
    undisputed_top_predicted = top_K_predicted[top_K_predicted > cutoff]
    num_tied_spots = K - len(undisputed_top_predicted)
    tied_top_predicted = pred_val[pred_val == cutoff]

    undisputed_total = true_val.loc[undisputed_top_predicted.index].sum()
    tied_true = true_val.loc[tied_top_predicted.index].to_numpy()
    num_tied = len(tied_true)

    if num_tied == num_tied_spots:
        # Every tied location is selected, so there is nothing to randomise.
        return np.mean((undisputed_total + tied_true.sum()) / denominator)

    random_source = np.random if rng is None else rng

    # Each draw picks the tied locations WITHOUT replacement: a location can
    # occupy at most one of the K spots.
    tied_totals = np.array([
        tied_true[random_source.permutation(num_tied)[:num_tied_spots]].sum()
        for _ in range(bootstrap_samples)
    ])

    bootstrapped_ratio = np.mean((undisputed_total + tied_totals) / denominator)

    return bootstrapped_ratio

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# Load the pipeline's per-location, per-year results table.
results_csv_path = "/cluster/tufts/hugheslab/datasets/NSF_OD/results_202308_pipeline/results.csv"
results = pd.read_csv(results_csv_path)

In [6]:
# Preview the loaded table. This cell previously referenced `marks_results`,
# a name that no cell in this notebook defines, so it failed on a fresh kernel.
results

Unnamed: 0.1,Unnamed: 0,geoid,year,observed,predicted
0,1,25001010100,2013,0,0.206030
1,2,25001010100,2014,1,0.284624
2,3,25001010100,2015,0,0.369421
3,4,25001010100,2016,0,0.345503
4,5,25001010100,2017,0,0.345062
...,...,...,...,...,...
14575,14576,25027761402,2017,0,0.307114
14576,14577,25027761402,2018,1,0.296427
14577,14578,25027761402,2019,0,0.348472
14578,14579,25027761402,2020,0,0.437946


In [14]:
# Sub-sampling configuration for the uncertainty estimates below.
removed_locations = 250        # locations left out of every subsample
bpr_uncertainty_samples = 50   # subsamples drawn per evaluation year
seed = 360                     # seed for the subsample generator

# Distinct locations in the results table, and the size of each subsample.
num_locations = results['geoid'].unique().size
num_sampled = num_locations - removed_locations

In [23]:
rng = np.random.default_rng(seed=seed)
# fast_bpr breaks ties using NumPy's global random state, which the generator
# above does not control; seed it as well so reruns give the same numbers.
np.random.seed(seed)

results_over_time = []
true_deaths = []
output_deaths=[]
for year in range(2020, 2021+1):
    # Select this year's rows once and take both columns from the same slice.
    year_results = results[results['year'] == year]
    evaluation_deaths = year_results['observed']
    predicted_deaths = year_results['predicted']
    true_deaths.append(evaluation_deaths)
    output_deaths.append(predicted_deaths)

    results_over_samples = []
    for _ in range(bpr_uncertainty_samples):
        # Positional subsample of locations, drawn without replacement and
        # applied identically to the observed and predicted series.
        sampled_indicies = rng.choice(range(num_locations), size=num_sampled, replace=False)
        results_over_samples.append(fast_bpr(evaluation_deaths.iloc[sampled_indicies], predicted_deaths.iloc[sampled_indicies]))

    results_over_time.append(results_over_samples)

In [29]:

# Average the two evaluation years sample-by-sample, then summarise the
# resulting values with their mean and 2.5th / 97.5th percentiles.
first_year_bpr = np.array(results_over_time[0])
second_year_bpr = np.array(results_over_time[1])
bpr_samples_both_years = (first_year_bpr + second_year_bpr) / 2

bpr_mean_pct = np.mean(bpr_samples_both_years) * 100
bpr_lower_pct, bpr_upper_pct = np.percentile(bpr_samples_both_years, [2.5, 97.5]) * 100

print(f"""Zeroes model (Mean, 95% CI): {bpr_mean_pct:.1f},
      ({bpr_lower_pct:.1f}-
       {bpr_upper_pct:.1f})""")


Zeroes model (Mean, 95% CI): 62.0,
      (60.3-
       64.0)


In [34]:
# Summarise RMSE and MAE from the per-year observed and predicted series
# collected above; each result is a (mean, (lower, upper)) pair.
# NOTE(review): calculate_metrics is defined in a later cell of this notebook,
# so that cell has to be executed before this one.
rmse_results, mae_results = calculate_metrics(true_deaths, output_deaths)

In [36]:
# Split each (mean, interval) pair into its parts, then report both metrics.
(rmse_mean, rmse_conf_interval), (mae_mean, mae_conf_interval) = rmse_results, mae_results

print_results(metric_name="RMSE for Zeroes Model",
              mean_value=rmse_mean,
              confidence_interval=rmse_conf_interval)
print_results(metric_name="MAE for Zeroes Model",
              mean_value=mae_mean,
              confidence_interval=mae_conf_interval)


RMSE for Zeroes Model (Mean, 95% CI): 1.31, (1.27-1.35)
MAE for Zeroes Model (Mean, 95% CI): 0.92, (0.89-0.95)


In [33]:

import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt



def calculate_metrics(evaluation_deaths, predicted_deaths,
                      num_locations_removed = 250, confidence_level=0.95, seed=360):
    """
    Compute pooled RMSE and MAE with a percentile interval.

    For every year a random subset of locations is drawn (without
    replacement), and RMSE / MAE are computed on that subset for each of the
    year's prediction samples. All year-by-sample values are pooled; the mean
    and the central ``confidence_level`` percentile interval of the pool are
    returned.

    @param evaluation_deaths: sequence with one pandas Series of observed
        values per year; every Series must have the same length
    @param predicted_deaths: sequence of pandas Series of predictions, laid
        out year by year: the first ``len(predicted_deaths) // num_years``
        entries belong to the first year, the next group to the second year,
        and so on. Each Series must line up positionally with the matching
        entry of ``evaluation_deaths``.
    @param num_locations_removed: number of locations left out of each
        year's random subset
    @param confidence_level: width of the percentile interval, clamped to
        the range [0, 1]
    @param seed: seed for the generator that draws the location subsets
    @return: ``(rmse_mean, (rmse_lower, rmse_upper)),
        (mae_mean, (mae_lower, mae_upper))``
    @raise ValueError: if there are no years, the number of prediction
        series is not a positive multiple of the number of years, the yearly
        series differ in length, or no locations would remain after removal

    Note: with a single prediction series per year the pool holds one value
    per year, so the interval only reflects the spread between years.
    """
    rng = np.random.default_rng(seed=seed)

    num_years = len(evaluation_deaths)
    if num_years == 0:
        raise ValueError("evaluation_deaths must contain at least one year")

    num_predictions = len(predicted_deaths)
    if num_predictions == 0 or num_predictions % num_years != 0:
        raise ValueError(
            "predicted_deaths must hold the same positive number of series for every year")
    num_uncertainty_samples = num_predictions // num_years

    # Every year must cover the same number of locations so that one set of
    # positional indices is valid for all of them.
    distinct_lengths = {len(year_deaths) for year_deaths in evaluation_deaths}
    if len(distinct_lengths) != 1:
        raise ValueError("every entry of evaluation_deaths must have the same length")
    num_locations = distinct_lengths.pop()  # 1328 for cook county, 1620 for MA

    num_sampled = num_locations - num_locations_removed
    if num_sampled <= 0:
        raise ValueError("num_locations_removed must be smaller than the number of locations")

    # Pooled metric values across all years and prediction samples.
    mae_over_samples = []
    rmse_over_samples = []

    for year_index in range(num_years):
        # One location subset per year, shared by all of that year's samples.
        sampled_indices = rng.choice(range(num_locations), size=num_sampled, replace=False)
        current_eval_deaths = evaluation_deaths[year_index].iloc[sampled_indices]

        # This year's predictions occupy one contiguous group of entries.
        first_sample = year_index * num_uncertainty_samples
        for sample_offset in range(num_uncertainty_samples):
            current_predicted_deaths = \
                predicted_deaths[first_sample + sample_offset].iloc[sampled_indices]
            mae_over_samples.append(
                mean_absolute_error(current_eval_deaths, current_predicted_deaths))
            rmse_over_samples.append(
                sqrt(mean_squared_error(current_eval_deaths, current_predicted_deaths)))

    joint_rmse_mean = np.mean(rmse_over_samples)
    joint_mae_mean = np.mean(mae_over_samples)

    # Central interval: cut the same percentage off each tail.
    confidence_level = max(0, min(confidence_level, 1))
    lower_percentile = (1 - confidence_level) * 100 / 2
    upper_percentile = 100 - (1 - confidence_level) * 100 / 2

    joint_rmse_lower = np.percentile(rmse_over_samples, lower_percentile)
    joint_rmse_upper = np.percentile(rmse_over_samples, upper_percentile)

    joint_mae_lower = np.percentile(mae_over_samples, lower_percentile)
    joint_mae_upper = np.percentile(mae_over_samples, upper_percentile)

    return (joint_rmse_mean, (joint_rmse_lower, joint_rmse_upper)), \
        (joint_mae_mean, (joint_mae_lower, joint_mae_upper))



###HELPER function to print results
def print_results(metric_name, mean_value, confidence_interval, confidence_level=0.95):
    """Print one metric as ``<name> (Mean, <level>% CI): <mean>, (<low>-<high>)``.

    ``confidence_interval`` is indexed as ``[0]`` (lower bound) and ``[1]``
    (upper bound); ``confidence_level`` is a fraction and is shown as a whole
    percentage. Mean and bounds are printed with two decimals.
    """
    level_percent = confidence_level * 100
    lower_bound = confidence_interval[0]
    upper_bound = confidence_interval[1]

    header = f"{metric_name} (Mean, {level_percent:.0f}% CI): {mean_value:.2f}, "
    bounds = f"({lower_bound:.2f}-{upper_bound:.2f})"
    print(header + bounds)
