# Step 2b: Temporal Aggregation of baseline and future years. 

**********************************************
version 10

**********************************************
1. Aggregate monthly timeseries data to Aqueduct benchmark years (2014, 2019, 2030, 2050, 2080) for every month (12 total per benchmark year)
    - Filter by month
    - Filter by period
    - Run 10-year trailing moving average
    - Run Theil Sen regression (capped by min and max of 10-year average)
2. Find extremely small values based on flux value and arid/low water use thresholds, create a column to track
3. Adjust supply and demand to baseline for every month using ADDITIVE DELTA ONLY (12 total per benchmark year). Three baseline options:
      - b4_2019 = Aqueduct 4.0 2019 *** Chosen one ***
4. Group Delta Basins by pooling supply and demand
5. Finalize supply and demand


In [1]:
import os, datetime
import geopandas as gpd
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import TheilSenRegressor
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings("ignore")

In [2]:
# INPUT PATHS AND RUN-WIDE LISTS
# Root folder that every other path in this notebook hangs off
rootPATH = r'\Projections\Final_Data\Data'

# 1. Aqueduct 4.0 monthly time series: call ts4PATH(<pfaf_id>) to get the CSV path for one catchment
ts4PATH = os.path.join(
    rootPATH, "Aqueduct40", 'step1_spatial_aggregation', "data_by_catchment", "data", "{}.csv"
).format

# 2. Delta basin groupings (from /Aqueduct30/processData/Y2018M07D25_RH_Basin_Manual_Step_V01)
dltbasinPATH = os.path.join(rootPATH, "Aqueduct40", "util", "hybas_deltas.csv")

# 3. Catchment area table
areaPATH = os.path.join(rootPATH, "Aqueduct40", "util", "hybas_area.csv")

# Global climate models (first entry is the baseline forcing used by get_base_data)
gcmFolders = [
    'gswp3-w5e5',
    'gfdl-esm4',
    'ipsl-cm6a-lr',
    'mpi-esm1-2-hr',
    'mri-esm2-0',
    'ukesm1-0-ll',
]

# Future scenarios (plus the historical run)
scenFolders = [
    'historical',
    'ssp126',
    'ssp370',
    'ssp585',
]

# Zero-padded month codes "01" .. "12"
months = ['{:02d}'.format(x) for x in range(1, 13)]

# Outpaths
outPATH = r'\Projections\Final_Data\Data\Aqueduct40\step2_temporal_aggregation'


# Monthly working files: call e.g. sdPATH("01") or sdPATH("01-exploded") to get the file for one month
sdPATH = os.path.join(outPATH, 'working', 'step1_supply-demand', 'M{}.csv').format
bcPATH = os.path.join(outPATH, 'working', 'step2_bias-correction', 'M{}.csv').format
dlPATH = os.path.join(outPATH, 'working', 'step3_pool-deltas', 'M{}.csv').format

# Area / delta-id lookup written once per run
arPATH = os.path.join(outPATH, 'working', 'step3_pool-deltas', 'area_delta.csv')

# Annual working files
ysdSTATSPATH = os.path.join(outPATH, 'working', 'step1_supply-demand', 'Y2019-2080_stats.csv')
# Folder name aligned with the other step-1 paths ('step1_supply-demand'); it was spelled
# 'step1_supplydemand', which pointed at a different directory than sdPATH / ysdSTATSPATH.
ysdPATH = os.path.join(outPATH, 'working', 'step1_supply-demand', 'Y2019-2080.csv')
ybcPATH = os.path.join(outPATH, 'working', 'step2_bias-correction', 'Y2019-2080.csv')
ydlPATH = os.path.join(outPATH, 'working', 'step3_pool-deltas', 'Y2019-2080_{}.csv').format

# Final deliverables: fnPATH('annual') / fnPATH('monthly')
fnPATH =  os.path.join(outPATH, 'final', 'Aqueduct40_supplydemand_{}-exploded-additive.csv').format


# Universal Data (read in once)

In [3]:
# 1. Create a list of available PFs from monthly folder in playground.
#    File names are "<pfaf_id>.csv"; entries without a .csv extension are skipped so a stray
#    file in the folder cannot break the int() conversion.
available_pfs = [
    int(os.path.splitext(x)[0])
    for x in os.listdir(os.path.dirname(ts4PATH("")))
    if x.endswith(".csv")
]

# 2. Define list of pfafs to focus on
list_pfs = available_pfs

# 3. Define Arid and low water use thresholds
aridThres = 0.03
lowuseThres = 0.012

# 4. Define type of supply data to use (tsr = theil sen regressed data)
bastat = 'tsr-'

# 5. Read in Area
area = pd.read_csv(areaPATH, index_col = 'pfaf_id')
# DataFrame.drop_duplicates() ignores the index, so calling it directly would also drop
# *different* catchments that happen to share the same area value. Bring pfaf_id into the
# comparison so only rows repeated in both id and area are removed.
area = area.reset_index().drop_duplicates().set_index('pfaf_id')
area.columns = ['area_m2']


# 6. Read in delta basins. Calculate total area in these places
# Read in delta groupings
df_db = pd.read_csv(dltbasinPATH, index_col = ['PFAF_ID'])
df_db.index.name = 'pfaf_id'
# Catchments with a delta_id belong to a delta
in_delta = df_db['delta_id'].notna()
# Find list of delta watersheds
yes_deltas = df_db.index[in_delta]
# Filter by deltas only
df_db = df_db[in_delta]
# Add Area per watershed
df_db = pd.merge(df_db, area, how = 'left', left_index = True, right_index = True)
# Calculate total area per delta
delta_area = df_db.groupby(['delta_id'])[['area_m2']].sum()
# Merge area with delta basin definition
area2 = pd.merge(area, df_db.filter(['delta_id']), how = 'left', left_index = True, right_index = True)
# Now, merge summed delta areas. Non-delta areas will be blank (suffix = d)
area2 = pd.merge(area2, delta_area, how = 'left', left_on = 'delta_id', right_index = True, suffixes = ('_c', "_d"))
# Combine two columns. Use pooled area for deltas, and catchment area for non-deltas
area2['area_m2'] = area2['area_m2_d'].fillna(area2['area_m2_c'])
# Set delta id for non-delta catchments as -1
area2['delta_id'] = area2['delta_id'].fillna(-1)
area2 = area2.filter(['area_m2', 'delta_id'])

In [4]:
# Save the area lookup (columns area_m2 and delta_id, indexed by pfaf_id) to the pool-deltas working folder
area2.to_csv(arPATH)

# 1. Calculate monthly supply and demand for each milestone year

## Functions

In [5]:
# 1. Define periods for each milestone year.
#    Each milestone maps to the (first year, last year) of the window it is summarised over.
#    slice_starts[year](month) / slice_ends[year](month) return "<YYYY>-<month>-01" date strings.
_period_years = {
    "2014": ("1979", "2014"),
    "2019": ("1979", "2019"),
    "2030": ("2015", "2045"),
    "2050": ("2036", "2065"),
    "2080": ("2066", "2095"),
}

slice_starts = {milestone: (first + "-{}-01").format for milestone, (first, _last) in _period_years.items()}

slice_ends = {milestone: (last + "-{}-01").format for milestone, (_first, last) in _period_years.items()}

# 2. List of functions

def segment_id_list(lst, n):
    """Generate consecutive slices of lst, each n items long (the last one may be shorter)."""
    yield from (lst[lo:lo + n] for lo in range(0, len(lst), n))
        
def get_pf_data(pf):
    """
    PURPOSE: Load the time-series CSV of one catchment (path built with ts4PATH(pf))
    RETURNS: data frame indexed by the parsed 'time' column
    """
    raw = pd.read_csv(ts4PATH(pf), index_col = None)
    # Parse the timestamps first, then promote the column to the index
    parsed = raw.assign(time = pd.to_datetime(raw['time']))
    return parsed.set_index(['time'])

def theilsen(X, y, idx):
    '''
    PURPOSE: To capture trend using Theil Sen regression over the period. A Theil Sen is better at removing anomalies than an OLS
    VARIABLES:
                1. X is the array containing time (independent variable), one value per observation
                2. y is the series (or array) containing the moving average data
                3. idx is the index location of the milestone year
    RETURNS: Regressed value at milestone year, capped to the [min, max] range of y
    '''
    # Single feature -> 2-D design matrix of shape (n_samples, 1)
    x_train = np.asarray(X).reshape(-1, 1)
    # The regressor fits one target, so y is passed 1-D (n_samples,) rather than as a column vector
    y_train = np.asarray(y).ravel()
    # Fit regression
    reg = TheilSenRegressor().fit(x_train, y_train)
    # Evaluate the fitted line at every training date
    y_hat = reg.predict(x_train)
    # Cap results to min and max of the Y dataset (bounds computed once, applied to the whole array)
    y_hat_adj_tsr = np.clip(y_hat, y_train.min(), y_train.max())
    # Pull value from Period Date
    return y_hat_adj_tsr[idx]

def _summarize_milestone(df_in, raw_cols, year, month, row_label):
    '''
    PURPOSE: Shared worker for get_base_data and get_gcm_data. Summarise the selected columns of one
             catchment at one milestone year for one calendar month.
    VARIABLES:
                1. df_in is the catchment-specific data frame (datetime index, with a 'month' column)
                2. raw_cols is the list of columns to summarise
                3. year is the milestone year (must be a key of slice_starts / slice_ends)
                4. month is the month
                5. row_label is the index label given to the single output row
    RETURNS: One-row dataframe with "raw-", "avg-", "std-" and "tsr-" prefixed copies of raw_cols as columns.
    RAISES:  KeyError if year has no period defined; IndexError if the milestone date is not in the data.
    '''
    ordinalFactor = 1000000.0
    # Find the period's start, end, and milestone date
    sdate = slice_starts[str(year)](month)
    edate = slice_ends[str(year)](month)
    pdate = '{}-{}-01'.format(year, month)
    # Filter the data by the selected month and period limits (done once; every statistic below
    # works on this same window)
    in_window = (df_in['month'] == int(month)) & (df_in.index >= sdate) & (df_in.index <= edate)
    df_f = df_in.loc[in_window, raw_cols]

    def as_row(series, prefix):
        # Series indexed by column name -> single-row frame with prefixed column names
        return series.to_frame(row_label).transpose().add_prefix(prefix)

    # -- RAW = value at milestone year
    at_milestone = df_f[df_f.index == pdate]
    if at_milestone.empty:
        raise IndexError('No data at milestone date {} for columns {}'.format(pdate, raw_cols))
    df_raw = as_row(at_milestone.iloc[0], "raw-")
    # AVG = average across period
    df_avg = as_row(df_f.mean(), "avg-")
    # STD = standard deviation across period
    df_std = as_row(df_f.std(), "std-")
    # TSR = Theil Sen regression across period
    # - First, create trailing rolling average over 10 rows (the frame holds a single calendar
    #   month, so presumably one row per year -> 10-year average; confirm input is monthly)
    df_roll = df_f.rolling(10, center = False).mean().dropna(subset = raw_cols)
    # - Then, create ordinal date for Theil Sen (scaled down by ordinalFactor)
    X = (df_roll.index.map(pd.Timestamp.toordinal) / ordinalFactor).values
    # - Then, find list location for the period date
    idx = df_roll.index.get_loc(pdate)
    # Run Theil Sen across columns
    df_tsr = as_row(df_roll[raw_cols].apply(lambda x: theilsen(X = X, y = x, idx = idx)), "tsr-")
    return pd.concat([df_raw, df_avg, df_std, df_tsr], axis = 1)


def get_base_data(df_in, dtype, year, month):
    '''
    PURPOSE: To summarize baseline (gswp3-w5e5 historical) data at the milestone year.
             Statistics include: raw, average, standard deviation, and Theil Sen Regressed over moving average
    VARIABLES:
                1. df_in is the catchment-specific data frame
                2. dtype is the data type (ww, wn, or ba)
                3. year is the milestone year
                4. month is the month
    RETURNS: Dataframe with data type per statistic as columns, and one row labelled "b4_<year>".
    '''
    gcms = ['gswp3-w5e5']
    raw_cols = ['{}_{}_{}'.format(dtype, x, 'historical') for x in gcms]
    return _summarize_milestone(df_in, raw_cols, year, month, row_label = "b4_" + str(year))


def get_gcm_data(df_in, dtype, year, month, scen):
    '''
    PURPOSE: To summarize projected data at milestone years.
             Statistics include: raw, average, standard deviation, and Theil Sen Regressed over moving average
    VARIABLES:
                1. df_in is the catchment-specific data frame
                2. dtype is the data type (ww, wn, or ba)
                3. year is the milestone year
                4. month is the month
                5. scen is ssp code (or 'historical')
    RETURNS: Dataframe with datatype per gcm per statistic as columns, and one row labelled "<year>".
    '''
    gcms = ['gfdl-esm4','ipsl-cm6a-lr','mpi-esm1-2-hr','mri-esm2-0','ukesm1-0-ll']
    raw_cols = ['{}_{}_{}'.format(dtype, x, scen) for x in gcms]
    return _summarize_milestone(df_in, raw_cols, year, month, row_label = str(year))

def pull_gcm_scen(pf, df_in, month, dtype, scen):
    '''
    PURPOSE: Stack the GCM summaries of all milestone years (2014, 2030, 2050, 2080) for one scenario.
             The 2014 row comes from the "historical" run; its columns are renamed to the SSP code so
             they line up with the future rows.
    VARIABLES:
                1. pf = catchment id (not used inside this function)
                2. df_in is the catchment-specific data frame
                3. month is the month
                4. dtype is ww, wn, or ba
                5. scen is ssp code
    RETURNS: Dataframe with data type per gcm for scen per statistic as columns, and milestone years as rows.
    '''
    # 2014: historical run, relabelled so it falls within the requested SSP
    hist_2014 = get_gcm_data(df_in = df_in, dtype = dtype, year = 2014, month = month, scen = 'historical')
    hist_2014 = hist_2014.rename(columns = lambda s: s.replace("historical", scen))
    # 2030 / 2050 / 2080: the scenario's own run
    future_rows = [
        get_gcm_data(df_in = df_in, dtype = dtype, year = yr, month = month, scen = scen)
        for yr in (2030, 2050, 2080)
    ]
    return pd.concat([hist_2014] + future_rows, axis = 0)

def run_data_collection(list_pfs, month):
    '''
    PURPOSE: Run the entire process to summarize time series for different datatypes for different scenarios into
        simple milestone year summaries. Align baseline and 2014 historical data under each SSP so data sits cleanly in 1 row per scen per gcm
    VARIABLES:
                1. list_pfs = list of catchments to process
                2. month is the month
    RETURNS: nothing. Saves one file for the month at sdPATH(month)
    '''
    scens = ['ssp126', 'ssp370', 'ssp585']
    gcms = ['gfdl-esm4','ipsl-cm6a-lr','mpi-esm1-2-hr','mri-esm2-0','ukesm1-0-ll']
    dtypes = ['ba', 'ww', 'wn']
    stat_names = ['raw', 'avg', 'std', 'tsr']

    def create_supplydemand_aggregation(pf, month):
        df_pfin = get_pf_data(pf)
        # - STEP 1: GET BASELINE DATA (one block of raw/avg/std/tsr columns per data type)
        df_bse = pd.concat([get_base_data(df_in = df_pfin, dtype = d, year = 2019, month = month) for d in dtypes], axis = 1)
        # - STEP 2: GET GCM HISTORIC (2014) AND FUTURE DATA for every data type and scenario
        df_fut = pd.concat([pull_gcm_scen(pf = pf, df_in = df_pfin, month = month, dtype = d, scen = s)
                            for d in dtypes for s in scens], axis = 1)
        # - STEP 3: SHAPE THE BASELINE DATA TO FIT SEAMLESSLY WITH GCM DATA
        # Reindexing adds an all-NaN 'b4_2019' row that is filled from the baseline below
        df_fut = df_fut.reindex(['b4_2019', '2014', '2030', '2050', '2080'])
        fut_cols = ['{}-{}_{}_{}'.format(i, j, k, l) for l in scens for k in gcms for j in dtypes for i in stat_names]
        df_fut = df_fut[fut_cols]
        # Repeat base once per GCM x SSP combination so it lines up column-for-column with fut_cols
        df_nbse = pd.concat([df_bse] * (len(scens) * len(gcms)), axis=1, ignore_index=True)
        df_nbse.columns = fut_cols
        # Fill in baseline data in future table
        df_merged = df_fut.fillna(df_nbse)
        # - STEP 4: ASSIGN INDEX TO DATA OF PF, PERIOD, AND MONTH
        df_merged.index.name = 'period'
        df_merged['month'] = int(month)
        df_merged['pfaf_id'] = pf
        df_merged.reset_index(inplace = True)
        df_merged.set_index(['pfaf_id', 'period', 'month'], inplace = True)
        return df_merged

    # Number of parallel workers; catchments are also processed in batches of this size
    n_workers = 40
    df_fs = []
    oid_count = 0
    # Run each batch of catchments in parallel and collect the per-catchment tables
    for run_count, oids in enumerate(tqdm(segment_id_list(lst=list_pfs, n=n_workers)), start=1):
        df_fs.extend(Parallel(n_jobs=n_workers)(delayed(create_supplydemand_aggregation)(p, month) for p in oids))
        oid_count += len(oids)
        print('- - - - - run number', run_count, "\n- - - - - - Remaining catchments:", len(list_pfs) - oid_count)
    # Turn output into 1 csv
    df_final = pd.concat(df_fs)
    # Save under the month passed in (previously this read a global loop variable)
    df_final.to_csv(sdPATH(month))
    del df_final


## Run

In [None]:
# For every month for every catchment, turn time series into milestone years
# Takes a long time...
# Writes one file per month via run_data_collection.
# NOTE(review): `m` is a notebook-level global that functions in this notebook may read directly;
# check for such uses before renaming the loop variable.
for m in months:
    run_data_collection(list_pfs = list_pfs, month = m)


# 2. Restructure data and add arid & low water use thresholds

In [168]:
for m in months:
    df_sd = pd.read_csv(sdPATH(m), index_col = ['pfaf_id', 'period', 'month'])
    # Go long: every scenario column becomes its own row
    df_melt = df_sd.melt(ignore_index = False)
    # Split the old column name: characters 0-5 hold "<stat>-<dtype>", everything from position 7 on is the scenario
    df_melt['dtype'] = df_melt['variable'].str[0:6]
    df_melt['scen'] = df_melt['variable'].str[7:]
    # Go wide again on the statistic only, so scenarios stay as rows
    df_pv = df_melt.reset_index().pivot(index = ['pfaf_id', 'period', 'month', 'scen'], columns = 'dtype', values = 'value')
    # Only keep TSR, AVG, and STD values
    pv_cols = [ '{}-{}'.format(s,d) for d in ['ba', 'ww', 'wn'] for s in ['tsr', 'avg', 'std']]
    df_pv = df_pv.filter(pv_cols)
    # Add area (in m2) to data frame
    df_f = df_pv.reset_index().merge(area, how = 'left', left_on = 'pfaf_id', right_index = True)
    df_f = df_f.set_index(['pfaf_id', 'period', 'month', 'scen'])
    # Flux (m/month) = Million m3 * 1 million / m2
    flux_ba = df_f['tsr-ba'].divide(df_f['area_m2']) * 1e6
    flux_ww = df_f['tsr-ww'].divide(df_f['area_m2']) * 1e6
    # Yes/no indicators (1 = True).
    # flx-*  : flux below the monthly share (1/12) of the arid / low-use threshold
    df_f['flx-ba'] = np.where(flux_ba < aridThres/12.0, 1, 0)
    df_f['flx-ww'] = np.where(flux_ww < lowuseThres/12.0, 1, 0)
    # flx2-* : flux below half of that monthly share (more conservative)
    df_f['flx2-ba'] = np.where(flux_ba < aridThres/24.0, 1, 0)
    df_f['flx2-ww'] = np.where(flux_ww < lowuseThres/24.0, 1, 0)
    df_f.drop(['area_m2'], axis = 1, inplace = True)
    df_f.to_csv(sdPATH(m + "-exploded"))

# 3. Run Bias Correction

In [5]:
def run_bias_correction(futdf, year, hisdf , basdf):
    """
    PURPOSE: Additive-delta bias correction of the Theil Sen values for one milestone year:
             bias-corrected = (future GCM - historic GCM) + baseline, floored at 0.
             Prints summary diagnostics for each data type.
    VARIABLES:
                1. futdf is the future GCM data, indexed by (pfaf_id, month, scen), with tsr-ba / tsr-ww / tsr-wn columns
                2. year is the milestone year written to the 'period' index level
                3. hisdf is the GCM historic data with the same index and tsr columns
                4. basdf is the baseline data with the same index and tsr columns
    RETURNS: Dataframe indexed by (pfaf_id, period, month, scen) holding only the bcr-* columns
    """
    # Merge GCM Historic and Baseline values as COLUMNS next to future data
    gcmref = pd.merge(futdf, hisdf, how = 'left', left_index = True, right_index = True, suffixes = ('', '_hist'))
    gcmref = pd.merge(gcmref, basdf, how = 'left', left_index = True, right_index = True, suffixes = ('', '_base'))

    for d in ['ba', 'ww', 'wn']:
        gcmfut, gcmhis, bashis = 'tsr-{}'.format(d), 'tsr-{}_hist'.format(d), 'tsr-{}_base'.format(d)
        bcrfut = 'bcr-{}'.format(d)
        # - 1 - (Future GCM - Historic) + Base
        gcmref[bcrfut] = (gcmref[gcmfut].subtract(gcmref[gcmhis])).add(gcmref[bashis])
        # - 2 - Set negatives = 0
        gcmref[bcrfut] = gcmref[bcrfut].mask(gcmref[bcrfut] < 0, 0)
        # - 3 - Diagnostics: relative change of the bias-corrected value against the baseline.
        # A zero baseline with a positive corrected value gives +inf; report that as 0. The result is
        # assigned back explicitly, because an in-place replace on a column selection is not
        # guaranteed to write through to the frame.
        gcmref['compare'] = (gcmref[bcrfut].subtract(gcmref[bashis])).divide(gcmref[bashis]).replace(np.inf, 0)
        print('{} {} min and max Baseline: {:,.2f}-{:,.2f}'.format(year, d, gcmref[bashis].min(), gcmref[bashis].max()))
        print('{} {} min and max BC Future: {:,.2f}-{:,.2f}'.format(year, d, gcmref[bcrfut].min(), gcmref[bcrfut].max()))
        print('{} {} min and max percent change: {}-{}'.format(year, d, gcmref['compare'].min(), gcmref['compare'].max()))
        print("Increase in {}: {:,.0f}".format(d, len(gcmref[gcmref['compare'] > 0])))
        print("Decrease in {}: {:,.0f}".format(d, len(gcmref[gcmref['compare'] < 0])))

    # Export only the bias-corrected values
    gcmref['period'] = year
    df_final = gcmref.reset_index().set_index(['pfaf_id', 'period', 'month', 'scen'])
    df_final = df_final[[x for x in df_final if 'bcr' in x]]
    return df_final

In [6]:
for m in months:
    # Read in supply demand data
    df_sd = pd.read_csv(sdPATH(m + "-exploded"), index_col = ['pfaf_id', 'period', 'month', 'scen'])
    # Bias correction only needs the columns that are neither averages nor standard deviations
    bc_cols = [x for x in df_sd.columns if not ('avg' in x or 'std' in x)]

    # Separate data by time period (period level dropped, columns limited to bc_cols)
    period_labels = df_sd.index.get_level_values('period')
    by_period = {
        p: df_sd[period_labels == p].droplevel(['period']).filter(bc_cols)
        for p in ['b4_2019', '2014', '2030', '2050', '2080']
    }
    bashist = by_period['b4_2019']
    gcmhist = by_period['2014']

    # Run Bias Correction Function for each future milestone, then stack the results
    bc_tables = [
        run_bias_correction(futdf = by_period[str(yr)], year = yr, hisdf = gcmhist, basdf = bashist)
        for yr in (2030, 2050, 2080)
    ]
    df = pd.concat(bc_tables)

    # Now, add to supply and demand data
    df_all = df_sd.reset_index()
    # The 2014 GCM-historic rows were only needed as the reference for the delta
    df_all = df_all[df_all.period != '2014']
    # Baseline no longer needs to be tied to the future scenarios: relabel its scenario and
    # collapse the copies (baseline had been repeated for every future scenario)
    not_baseline = df_all['period'] != 'b4_2019'
    df_all['scen'] = df_all['scen'].where(not_baseline, 'baseline_hist')
    df_all.drop_duplicates(subset = ['pfaf_id', 'period', 'scen'], inplace = True)
    # Relabel base period as 2019 and set as integer
    df_all['period'] = df_all['period'].replace('b4_2019', '2019').astype(int)
    df_all.set_index(['pfaf_id', 'period', 'month', 'scen'], inplace= True)
    # Merge Original data with new bias correction data
    df_merge = pd.merge(df_all, df, how= 'left', left_index = True, right_index = True).reset_index()
    # For the baseline (2019) rows, the bias-corrected columns simply carry the TSR values
    is_2019 = df_merge['period'] == 2019
    for d in ['ba', 'ww', 'wn']:
        df_merge['bcr-' + d] = df_merge['bcr-' + d].mask(is_2019, df_merge['tsr-' + d])
    df_merge.set_index(['pfaf_id', 'period', 'month', 'scen'], inplace= True)
    df_merge.to_csv(bcPATH(m + "-exploded-additive"))

2030 ba min and max Baseline: 0.00-1,191,698.61
2030 ba min and max BC Future: 0.00-1,215,437.30
2030 ba min and max percent change: -1.0-3.919631936457998e+41
Increase in ba: 142,475
Decrease in ba: 89,231
2030 ww min and max Baseline: 0.00-3,789.26
2030 ww min and max BC Future: 0.00-4,056.33
2030 ww min and max percent change: -1.0-3937.772421265366
Increase in ww: 215,197
Decrease in ww: 18,833
2030 wn min and max Baseline: 0.00-987.38
2030 wn min and max BC Future: 0.00-1,064.62
2030 wn min and max percent change: -1.0-2703.0877391633094
Increase in wn: 215,731
Decrease in wn: 18,299
2050 ba min and max Baseline: 0.00-1,191,698.61
2050 ba min and max BC Future: 0.00-1,219,449.56
2050 ba min and max percent change: -1.0-7.516102379691402e+41
Increase in ba: 147,465
Decrease in ba: 84,344
2050 ww min and max Baseline: 0.00-3,789.26
2050 ww min and max BC Future: 0.00-4,098.53
2050 ww min and max percent change: -1.0-6266.314884928143
Increase in ww: 217,335
Decrease in ww: 16,690
20

2080 ba min and max Baseline: 0.00-1,876,452.92
2080 ba min and max BC Future: 0.00-1,892,402.49
2080 ba min and max percent change: -1.0-4.7783235473120287e+39
Increase in ba: 139,821
Decrease in ba: 92,151
2080 ww min and max Baseline: 0.00-4,996.31
2080 ww min and max BC Future: 0.00-5,732.63
2080 ww min and max percent change: -1.0-29618.968917670205
Increase in ww: 211,190
Decrease in ww: 22,830
2080 wn min and max Baseline: 0.00-2,225.86
2080 wn min and max BC Future: 0.00-2,904.26
2080 wn min and max percent change: -1.0-63671.74416248028
Increase in wn: 206,428
Decrease in wn: 27,602
2030 ba min and max Baseline: 0.00-1,746,744.97
2030 ba min and max BC Future: 0.00-1,779,403.23
2030 ba min and max percent change: -1.0-2.4241739815311274e+39
Increase in ba: 135,899
Decrease in ba: 95,938
2030 ww min and max Baseline: 0.00-4,455.36
2030 ww min and max BC Future: 0.00-4,909.20
2030 ww min and max percent change: -1.0-26034.66280497663
Increase in ww: 211,000
Decrease in ww: 23,04

2050 ba min and max Baseline: 0.00-1,071,130.68
2050 ba min and max BC Future: 0.00-1,130,013.35
2050 ba min and max percent change: -1.0-7.947761417142356e+39
Increase in ba: 130,294
Decrease in ba: 101,435
2050 ww min and max Baseline: 0.00-2,916.13
2050 ww min and max BC Future: 0.00-2,772.45
2050 ww min and max percent change: -1.0-16556.468841146376
Increase in ww: 217,358
Decrease in ww: 16,667
2050 wn min and max Baseline: 0.00-1,380.10
2050 wn min and max BC Future: 0.00-1,091.84
2050 wn min and max percent change: -1.0-4245.785151237745
Increase in wn: 216,208
Decrease in wn: 17,822
2080 ba min and max Baseline: 0.00-1,071,130.68
2080 ba min and max BC Future: 0.00-1,062,310.81
2080 ba min and max percent change: -1.0-5.156734778321536e+39
Increase in ba: 125,342
Decrease in ba: 106,477
2080 ww min and max Baseline: 0.00-2,916.13
2080 ww min and max BC Future: 0.00-2,682.20
2080 ww min and max percent change: -1.0-30642.599763123337
Increase in ww: 212,386
Decrease in ww: 21,6

# 4. Pool delta basins

In [7]:
# Pool supply, demand and consumption by delta, per Aqueduct 3.0
def adjust_delta_regions(df_in, period, month):
    """
    PURPOSE: For one period and month, replace the values of every delta catchment with the
             totals pooled over its delta (per scenario). Non-delta catchments are left unchanged.
    VARIABLES:
                1. df_in is the bias-corrected data, indexed by (period, month, pfaf_id, scen)
                2. period is the milestone year to select
                3. month is the month (converted with int() for the lookup)
    RETURNS: Dataframe indexed by (pfaf_id, period, month, scen). For delta catchments only the
             pooled bcr / avg / std columns are filled; their other columns are NaN.
    """
    # Select the period and month. Work on a copy so blanking the delta rows below
    # can never write back into the caller's frame.
    df_a = df_in.loc[period].loc[int(month)].copy()
    avg_cols = ['{}-{}'.format(s, d) for d in ['ba', 'ww', 'wn'] for s in ['bcr', 'avg', 'std']]
    # Merge delta IDs and area with adjusted data. This will only keep watersheds in deltas
    df_dlts = pd.merge(df_db.filter(['delta_id', 'area_m2']), df_a, how = 'left', left_index = True, right_index = True)
    # Find total supply, demand and consumption within each delta, per scenario
    df_dg = df_dlts.groupby(['delta_id', 'scen'])[avg_cols].sum()
    # Now, merge pooled totals back onto the individual pfaf_ids
    df_dlts_t = pd.merge(df_dlts.reset_index().filter(['pfaf_id', 'delta_id', 'scen']), df_dg, how = 'left', left_on = ['delta_id', 'scen'], right_index = True)
    df_dlts_t.drop(['delta_id'], axis = 1, inplace = True)
    df_dlts_t.set_index(['pfaf_id', 'scen'], inplace= True)

    # Blank out the delta catchments in the selected data ...
    df_a[df_a.index.get_level_values('pfaf_id').isin(yes_deltas)] = np.nan
    # ... and fill the blanks with the pooled data
    df_fixed = df_a.fillna(df_dlts_t)
    # Add period and month back to dataframe
    df_fixed.reset_index(inplace = True)
    df_fixed['period'] = period
    # Use the month argument (previously this read a global loop variable)
    df_fixed['month'] = month
    return df_fixed.set_index(['pfaf_id', 'period', 'month', 'scen'])


In [8]:

# NOTE: `m` is a notebook-level name that other code may read; it is kept as the loop variable.
for m in months:
    # Read in bias corrected data
    df_adj = pd.read_csv(bcPATH(m + "-exploded-additive"), index_col = [ 'period', 'month', 'pfaf_id', 'scen'])

    # Pool the delta catchments for every milestone year and stack the results
    df_deltas = pd.concat([
        adjust_delta_regions(df_in = df_adj, period = yr, month = m)
        for yr in (2019, 2030, 2050, 2080)
    ])
    df_deltas.sort_index(inplace = True)

    # Finally, drop TSR columns and recalc fluxes using BCR columns
    keep_cols = ['{}-{}'.format(s, d) for d in ['ba', 'ww', 'wn'] for s in ['bcr', 'avg', 'std']]
    df_clean = df_deltas.filter(keep_cols)
    # Add area (in m2) and delta id to data frame
    df_f = df_clean.reset_index().merge(area2, how = 'left', left_on = 'pfaf_id', right_index = True)
    df_f = df_f.set_index(['pfaf_id', 'period', 'month', 'scen'])

    # Flux = Million m3 * 1 million / m2
    flux_ba = df_f['bcr-ba'].divide(df_f['area_m2']) * 1e6
    flux_ww = df_f['bcr-ww'].divide(df_f['area_m2']) * 1e6
    # Yes/no indicators (1 = True).
    # flx-*  : flux below the monthly share (1/12) of the arid / low-use threshold
    df_f['flx-ba'] = np.where(flux_ba < aridThres/12.0, 1, 0)
    df_f['flx-ww'] = np.where(flux_ww < lowuseThres/12.0, 1, 0)
    # flx2-* : flux below half of that monthly share (more conservative)
    df_f['flx2-ba'] = np.where(flux_ba < aridThres/24.0, 1, 0)
    df_f['flx2-ww'] = np.where(flux_ww < lowuseThres/24.0, 1, 0)
    df_f.drop(['area_m2'], axis = 1, inplace = True)
    df_f.to_csv(dlPATH(m + "-exploded-additive"))

# Finalize data

In [9]:
# Read the pooled table of every month and stack them into 1 table
month_tables = [
    pd.read_csv(dlPATH(m + "-exploded-additive"), index_col = ['pfaf_id', 'period', 'month', 'scen'])
    for m in months
]
df_m = pd.concat(month_tables)

In [10]:
# Now, calculate annual data. First, drop flux and arid columns. They'll need to be redone
# (the first 9 columns are the bcr / avg / std values for ba, ww and wn)
df_mf = df_m.iloc[:, 0:9]
avg_cols = df_mf.columns
# Sum month by milestone year
df_a = df_mf.reset_index().groupby(['pfaf_id', 'period', 'scen'])[avg_cols].sum()
# Find monthly average and STD for supply
df_avg= df_mf.reset_index().groupby(['pfaf_id', 'period', 'scen'])['avg-ba'].mean().to_frame(name = 'm_avg')
df_std= df_mf.reset_index().groupby(['pfaf_id', 'period', 'scen'])['avg-ba'].std().to_frame(name = 'm_std')
# Merge into 1
df_a = pd.concat([df_a, df_avg, df_std], axis = 1)

# Now, add flux and arid definitions
# Add area (in m2) and delta id to data frame
df_f = pd.merge(df_a.reset_index(), area2, how = 'left', left_on = 'pfaf_id', right_index = True).set_index(['pfaf_id', 'period', 'scen'])
# Flux = Million m3 * 1 million / m2
df_f['flx-ba'] = df_f['bcr-ba'].divide(df_f['area_m2']) * 1e6
df_f['flx-ww'] = df_f['bcr-ww'].divide(df_f['area_m2']) * 1e6
# Turn flux into a yes/no indicator.
# If value is less than the (annual) arid/lowuse threshold, set as 1 (True)
df_f['flx-ba'] = np.where(df_f['flx-ba'] < aridThres, 1, 0)
df_f['flx-ww'] = np.where(df_f['flx-ww'] < lowuseThres, 1, 0)
df_f.drop(['area_m2'], axis = 1, inplace = True)
# Arid AND low water use flag. Built in a single assignment: the chained form
# df_f['ar'][mask] = 1 may write to a temporary copy and leave df_f unchanged.
df_f['ar'] = np.where((df_f['flx-ba'] == 1) & (df_f['flx-ww'] == 1), 1, 0)


# Redo Annual STD stats for non-delta basins
# Now read in yearly stats to get better version of standard deviation (created over annual timeframe, not a sum of monthly)
df_y = pd.read_csv(ysdSTATSPATH, index_col = ['pfaf_id', 'period'])
# Melt data so scenario columns become rows
df_melt = df_y.melt(ignore_index = False)
# Create new fields to track data stats and scenarios
df_melt['dtype'] = df_melt['variable'].apply(lambda x: x[0:6])
df_melt['scen'] = df_melt['variable'].apply(lambda x: x[7:])
# Rename baseline
df_melt['scen'] = df_melt['scen'].mask(df_melt['scen'] == 'gswp3-w5e5_historical', 'baseline_hist')
# Pivot data stats back to columns, keep scenarios as rows
df_pv = pd.pivot(data = df_melt.reset_index(), values = 'value', columns = 'dtype', index = ['pfaf_id', 'period', 'scen'])
df_pv.dropna(inplace=True)

# Non-delta catchments carry delta_id == -1 (set when the area lookup is built). The mask used to
# test for 0, which does not select them, so the annual STD was never swapped in.
non_delta = df_f['delta_id'] == -1
df_f['std-ba'] = df_f['std-ba'].mask(non_delta, np.nan)
df_f['std-ww'] = df_f['std-ww'].mask(non_delta, np.nan)

# Fill the blanked values from the annual statistics (aligned on pfaf_id / period / scen)
df_f['std-ba'] = df_f['std-ba'].fillna(df_pv['std-ba'])
df_f['std-ww'] = df_f['std-ww'].fillna(df_pv['std-ww'])


df_f.to_csv(fnPATH('annual'))

In [11]:
# Attach the annual arid flag ('ar') to every monthly row, matching on catchment, period and scenario
ar_flag = df_f.filter(['ar'])
df_month = df_m.reset_index().merge(ar_flag, how = 'left', left_on = ['pfaf_id', 'period', 'scen'], right_index = True)
df_month = df_month.set_index(['pfaf_id', 'period', 'month', 'scen'])

df_month.to_csv(fnPATH('monthly'))