# Step 5a: Clean withdrawal data per basin/state intersect

1. Restructure key_years_corr.csv (the official dataset with withdrawal values per basin/state intersect) so that there is a column per sector/gcm/scen. This aligns the data with the official Aqueduct indicator structure.

2. This script then creates a bias-corrected version of the irrigation data at the basin-state level. This output will be used in the weighted aggregation to country and state indicators ("basin_state_withdrawals.csv")

3. Finally, this script creates the official withdrawal volume per sector per administrative unit. Bias-correction is done at the country and state level to mitigate the impact of small historical values

In [1]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
# ---- Paths ----
# Root folder for the country-ranking step
crtyROOT = r'\Projections\Final_Data\Data\Aqueduct40\step5_country_rankings'

# Input: key_years_corr.csv is the official dataset with withdrawal values per
# basin/state intersect. It was produced during the zonal statistics stage using DataBricks.
keysPATH = os.path.join(crtyROOT, "key_years_corr.csv")

# Outputs: restructured (raw) and bias-corrected basin-state tables
wwrawPATH = os.path.join(crtyROOT, "withdrawals_basin-states-raw.csv")
wwbcPATH = os.path.join(crtyROOT, "withdrawals_basin-states-bias_corrected.csv")
# Output per admin level: wwadPATH is the bound str.format of the path template,
# so wwadPATH(tag) returns the full path with the tag filled in
wwadPATH = os.path.join(crtyROOT, "withdrawals_{}.csv").format

# ---- Scenario and climate-model tags used to build column names ----
scenFolders = ['ssp126', 'ssp370', 'ssp585']
gcmFolders = [
    'gfdl-esm4',
    'ipsl-cm6a-lr',
    'mpi-esm1-2-hr',
    'mri-esm2-0',
    'ukesm1-0-ll',
]

# 1. Restructure basin-state data, create column for every sector/gcm/scen

The values in key_years_corr.csv represent million cubic meters of annual demand per sector per milestone year. Irrigation is the only sector that is calculated from PCR-GLOBWB model outputs (because it is a function of climate). Domestic and industrial demand are projected by SSP and are the same across the 5 GCMs per scenario. Livestock withdrawal data ends in 2019; all future values equal the 2019 value.

This script will create a clean version that has a column for every sector/gcm/scen combination for all milestone years for all basin/state IDs. The only exception is the GCM-historic combination: there we only keep irrigation, because that is the only sector in need of bias-correction.

In [3]:
# Load the original table, discard every column whose name contains "Unnamed",
# and index the rows by basin/state ID and year
df_sdog = pd.read_csv(keysPATH)
named_cols = [col for col in df_sdog.columns if "Unnamed" not in col]
df_sd = df_sdog.loc[:, named_cols].set_index(['basin_stat', 'year'])


def _strip_raw_suffix(raw_name):
    """Drop the file-name residue from a raw irrigation column name."""
    return raw_name.replace("_alldata.csv", "").replace("girrww_irr", "girrww")


# Column-name bookkeeping
# - - Future GCM irrigation: raw names and their cleaned equivalents (scenario-major order)
irrsspcols = ['girrww_irr_{}_{}_alldata.csv'.format(g, s) for s in scenFolders for g in gcmFolders]
fixedirrssp = [_strip_raw_suffix(col) for col in irrsspcols]
print("Cleaned irrigation SSP columns:\n", fixedirrssp)
# - - Historic GCM irrigation + baseline for all sectors. Baseline columns have no
# - - model tag in the raw data, so tag them with gswp3-w5e5
histcols = [col for col in df_sd.columns if 'historical' in col]
fixedhist = [_strip_raw_suffix(col).replace("ww_historical", "ww_gswp3-w5e5_historical") for col in histcols]
print("Cleaned historic columns:\n", fixedhist)
# - - Baseline columns only (all sectors)
basehist = [col for col in fixedhist if "gswp3" in col]
# Old-name -> clean-name lookups
ssp_correction_dic = dict(zip(irrsspcols, fixedirrssp))
hist_correction_dic = dict(zip(histcols, fixedhist))

# Apply both renames, then keep only the cleaned historic and irrigation-SSP columns
df_sd_clean = df_sd.rename(columns = ssp_correction_dic).rename(columns = hist_correction_dic)
df_sdf = df_sd_clean.filter(items = fixedhist + fixedirrssp)

# The SSP future data for domestic, industry and livestock was not kept above; build it now.
# - - Domestic and industry have a single column per SSP (they were inputs to the PCR-GLOBWB model).
# - - Each SSP column is copied once per GCM-tagged name, so data can be pulled by SSP & GCM.
for sector_tag in ['dom', 'ind']:
    # The irrigation SSP names already enumerate the 5 GCMs per scenario; swap the sector token
    newssp = [name.replace('irr', sector_tag) for name in fixedirrssp]
    for ssp_tag in ['ssp1', 'ssp3', 'ssp5']:
        for target in newssp:
            # 'ssp1' matches the ssp126 names, 'ssp3' the ssp370 names, 'ssp5' the ssp585 names
            if ssp_tag in target:
                df_sdf[target] = df_sd_clean['g{}ww_{}'.format(sector_tag, ssp_tag)]
# - - Livestock has no SSP files: copy the historic column into every gcm/scen name (15 in total)
newssp = [name.replace('irr', 'liv') for name in fixedirrssp]
for target in newssp:
    df_sdf[target] = df_sd_clean['glivww_gswp3-w5e5_historical']

# Blank out cells that mix periods: SSP columns in the historic years (2014, 2019)
# and historic columns in the future years
row_years = df_sdf.index.get_level_values('year')
ssp_columns = [col for col in df_sdf.columns if "ssp" in col]
historic_columns = [col for col in df_sdf.columns if "historical" in col]
df_sdf.loc[row_years.isin([2014, 2019]), ssp_columns] = np.nan
df_sdf.loc[row_years > 2019, historic_columns] = np.nan

# Final column order: baseline, historic GCM irrigation, then every sector/gcm/scen
all_ssp = ['g{}ww_{}_{}'.format(d, g, s) for d in ['dom', 'ind', 'liv', 'irr'] for s in scenFolders for g in gcmFolders]
irrhis = ['girrww_{}_historical'.format(g) for g in gcmFolders]
df_sdf = df_sdf.filter(items = basehist + irrhis + all_ssp)
# Saving is disabled here; later cells read this table back from wwrawPATH
# df_sdf.to_csv(wwrawPATH)

Cleaned irrigation SSP columns:
 ['girrww_gfdl-esm4_ssp126', 'girrww_ipsl-cm6a-lr_ssp126', 'girrww_mpi-esm1-2-hr_ssp126', 'girrww_mri-esm2-0_ssp126', 'girrww_ukesm1-0-ll_ssp126', 'girrww_gfdl-esm4_ssp370', 'girrww_ipsl-cm6a-lr_ssp370', 'girrww_mpi-esm1-2-hr_ssp370', 'girrww_mri-esm2-0_ssp370', 'girrww_ukesm1-0-ll_ssp370', 'girrww_gfdl-esm4_ssp585', 'girrww_ipsl-cm6a-lr_ssp585', 'girrww_mpi-esm1-2-hr_ssp585', 'girrww_mri-esm2-0_ssp585', 'girrww_ukesm1-0-ll_ssp585']
Cleaned historic columns:
 ['gdomww_gswp3-w5e5_historical', 'gindww_gswp3-w5e5_historical', 'girrww_gfdl-esm4_historical', 'girrww_ipsl-cm6a-lr_historical', 'girrww_mpi-esm1-2-hr_historical', 'girrww_mri-esm2-0_historical', 'girrww_ukesm1-0-ll_historical', 'glivww_gswp3-w5e5_historical', 'girrww_gswp3-w5e5_historical']


In [4]:
# Spot-check the restructured table for one basin/state ID (all of its years)
sample_basin_state = '111011-EGY.15_1'
print(df_sdf.loc[sample_basin_state])

      gdomww_gswp3-w5e5_historical  gindww_gswp3-w5e5_historical  \
year                                                               
2014                      8.282047                     38.554481   
2019                      9.940389                     42.884429   
2030                           NaN                           NaN   
2050                           NaN                           NaN   
2080                           NaN                           NaN   

      glivww_gswp3-w5e5_historical  girrww_gswp3-w5e5_historical  \
year                                                               
2014                      0.002875                      0.000000   
2019                      0.002891                      7.388355   
2030                           NaN                           NaN   
2050                           NaN                           NaN   
2080                           NaN                           NaN   

      girrww_gfdl-esm4_historical  girrww_ipsl

# 2. Create final dataset for weighted irrigation

We will replace irrigation with a bias-corrected version, and then calculate total demand per basin-state ID

In [5]:
# Reload the restructured (raw) basin-state table from wwrawPATH, indexed by ID and year
df_sdf = pd.read_csv(wwrawPATH).set_index(['basin_stat', 'year'])

In [6]:
def _irrigation_long(year, cols, value_name):
    """Return the columns `cols` of df_sdf for one `year`, melted to long form and indexed by basin/state ID."""
    in_year = df_sdf.index.get_level_values('year') == year
    return df_sdf.loc[in_year, cols].droplevel('year').melt(ignore_index = False, value_name = value_name)


# Three buckets: 2019 baseline, 2014 historic GCM, and future GCM per milestone year
b19 = df_sdf.loc[df_sdf.index.get_level_values('year') == 2019, 'girrww_gswp3-w5e5_historical'].droplevel('year')
b19 = b19.rename('b19')
h14 = _irrigation_long(2014, irrhis, 'h14')
f30 = _irrigation_long(2030, fixedirrssp, 'fp')
f50 = _irrigation_long(2050, fixedirrssp, 'fp')
f80 = _irrigation_long(2080, fixedirrssp, 'fp')

# Column names are sector_gcm_ssp: split them so gcm and ssp become their own columns
for df in (h14, f30, f50, f80):
    df[['sector', 'gcm', 'ssp']] = df['variable'].str.split('_', expand = True)
    df.drop(columns = ['variable', 'sector'], inplace = True)
# For the historic frame the third token is just "historical", so drop it
h14.drop(columns = ['ssp'], inplace = True)

# One row per basin/state and GCM, holding the baseline (b19) and historic GCM (h14) values
df_his = pd.merge(b19, h14, how = 'left', left_index = True, right_index = True)


In [7]:
def run_bias_correction(df_fp, year, gid = 'basin_stat', df_hist = None):
    """Bias-correct future GCM irrigation withdrawals for one milestone year.

    Correction applied per polygon, GCM and scenario:
        corr = (GCM future - GCM historic) + baseline, with negative results set to 0.

    Parameters
    ----------
    df_fp : DataFrame
        Future GCM values in long form, indexed by `gid`, with columns
        'fp' (value), 'gcm' and 'ssp'.
    year : int
        Milestone year the values in `df_fp` belong to; written into the
        'year' level of the returned index.
    gid : str, default 'basin_stat'
        Name of the polygon ID used as merge key and as first index level.
    df_hist : DataFrame, optional
        Baseline/historic table indexed by `gid` with columns 'b19', 'h14'
        and 'gcm'. When omitted, the notebook-level `df_his` is used.

    Returns
    -------
    DataFrame
        Indexed by (`gid`, 'year') with one 'girrww_<gcm>_<ssp>' column per
        GCM/scenario combination holding the corrected values.
    """
    if df_hist is None:
        # Backward-compatible default: the table built earlier in the notebook
        df_hist = df_his
    # Attach the future values to every (polygon, gcm) row of the historic table
    df_bias = pd.merge(df_hist.reset_index(), df_fp.reset_index(), how = 'left', on = [gid, 'gcm'])
    # Bias correction = (GCM future - GCM historic) + baseline
    df_bias['corr'] = df_bias['fp'].subtract(df_bias['h14']).add(df_bias['b19'])
    # A projected decrease larger than the baseline would give a negative withdrawal; clamp to 0
    df_bias.loc[(df_bias['corr'] < 0), 'corr'] = 0
    # Rebuild the sector_gcm_ssp column name and restore the year
    df_bias['column'] = 'girrww_' + df_bias['gcm'] + "_" + df_bias['ssp']
    df_bias['year'] = year
    # Back to wide form: one column per gcm/ssp
    df_bias_clean = pd.pivot(data = df_bias, values = 'corr', columns = 'column', index = [gid, 'year'])
    return df_bias_clean

# Bias-correct each future milestone year and stack the results
bc30 = run_bias_correction(df_fp = f30, year = 2030)
bc50 = run_bias_correction(df_fp = f50, year = 2050)
bc80 = run_bias_correction(df_fp = f80, year = 2080)
df_bc = pd.concat([bc30, bc50, bc80], axis = 0)

# Add newly adjusted irrigation data to other sectoral data. Keep all GSWP baseline data and all non-irr SSP data.
# Filter all_ssp in place rather than using a set difference: set iteration order is
# not stable across sessions, which would shuffle the column order of the saved CSV.
irr_ssp_names = set(fixedirrssp)
df_keep = df_sdf[basehist + [col for col in all_ssp if col not in irr_ssp_names]]
df_final = pd.concat([df_keep, df_bc], axis = 1)

# Finally, calculate total withdrawal: the sum of every column sharing the same model/scenario tag
df_final['gtotww_gswp3-w5e5_historical'] = df_final.loc[:, [x for x in df_final.columns if "historical" in x]].sum(axis = 1)
gcms = ['gfdl-esm4', 'ipsl-cm6a-lr', 'mpi-esm1-2-hr', 'mri-esm2-0', 'ukesm1-0-ll']
for g in gcms:
    for scen in ['ssp126', 'ssp370', 'ssp585']:
        tag = g + "_" + scen
        df_final['gtotww_{}'.format(tag)] = df_final.loc[:, [x for x in df_final.columns if tag in x]].sum(axis = 1)

# The sum of all-NaN historic cells is 0; restore NaN for the future years
df_final.loc[df_final.index.get_level_values('year') > 2019, 'gtotww_gswp3-w5e5_historical'] = np.nan

In [8]:
# Write the basin-state table (bias-corrected irrigation, other sectors, totals) to wwbcPATH
df_final.to_csv(path_or_buf = wwbcPATH)

# 3. Sum withdrawal by Country and State; perform bias correction once data is aggregated

In [11]:
def _state_id(basin_state_id):
    """Return the token after the first '-' of a basin/state ID."""
    return basin_state_id.split("-")[1]


# Read the raw table with a flat index (basin_stat and year stay ordinary columns)
df_sdf = pd.read_csv(wwrawPATH)
# Withdrawal columns all start with 'g'. Collect them BEFORE the gid_* columns are
# added below, otherwise those ID columns would be picked up as well.
gcols = [col for col in df_sdf.columns if col[0] == 'g']
# State ID is the part after the basin ID; country ID is its first three characters
df_sdf['gid_1'] = df_sdf['basin_stat'].map(_state_id)
df_sdf['gid_0'] = df_sdf['gid_1'].map(lambda state_id: state_id[0:3])

In [301]:
# # gid = 'gid_0'
# gid = 'gid_1'

In [12]:
def _melt_irrigation_year(df_adm, year, cols, value_name):
    """Return columns `cols` of `df_adm` for one `year` in long form with parsed 'gcm' and 'ssp' tokens."""
    at_year = df_adm.index.get_level_values('year') == year
    long = df_adm.loc[at_year, cols].droplevel('year').melt(ignore_index = False, value_name = value_name)
    # Column names are sector_gcm_ssp; the sector token is not needed downstream
    long[['sector', 'gcm', 'ssp']] = long['variable'].str.split('_', expand = True)
    return long.drop(['variable', 'sector'], axis = 1)


def _bias_correct_admin_irrigation(df_his, df_fp, year, gid):
    """Return corr = (GCM future - GCM historic) + baseline, floored at 0, as a wide table for one year."""
    # Merge selected future data with historic data based on polygon and gcm
    df_bias = pd.merge(df_his.reset_index(), df_fp.reset_index(), how = 'left', on = [gid, 'gcm'])
    df_bias['corr'] = df_bias['fp'].subtract(df_bias['h14']).add(df_bias['b19'])
    # A projected decrease larger than the baseline would give a negative withdrawal; clamp to 0
    df_bias.loc[(df_bias['corr'] < 0), 'corr'] = 0
    # Rebuild the sector_gcm_ssp column name, restore the year, and pivot back to wide form
    df_bias['column'] = 'girrww_' + df_bias['gcm'] + "_" + df_bias['ssp']
    df_bias['year'] = year
    return pd.pivot(data = df_bias, values = 'corr', columns = 'column', index = [gid, 'year'])


def _add_total_withdrawal(df_mc):
    """Add the 'gtotww_*' total columns to `df_mc` in place (sum of all columns sharing a model/scenario tag)."""
    df_mc['gtotww_gswp3-w5e5_historical'] = df_mc.loc[:, [x for x in df_mc.columns if "historical" in x]].sum(axis = 1)
    gcms = ['gfdl-esm4', 'ipsl-cm6a-lr', 'mpi-esm1-2-hr', 'mri-esm2-0', 'ukesm1-0-ll']
    for g in gcms:
        for scen in ['ssp126', 'ssp370', 'ssp585']:
            tag = g + "_" + scen
            df_mc['gtotww_{}'.format(tag)] = df_mc.loc[:, [x for x in df_mc.columns if tag in x]].sum(axis = 1)
    # The sum of all-NaN historic cells is 0; restore NaN for the future years
    df_mc.loc[df_mc.index.get_level_values('year') > 2019, 'gtotww_gswp3-w5e5_historical'] = np.nan


def find_admin_demand(gid, outTAG):
    """Aggregate withdrawals to an administrative level, bias-correct irrigation, and save to CSV.

    Reads the notebook-level `df_sdf`, `gcols`, `irrhis`, `fixedirrssp`,
    `basehist` and `all_ssp`, and writes the result to `wwadPATH(outTAG)`.

    Parameters
    ----------
    gid : str
        Column of `df_sdf` holding the admin ID to aggregate by
        (the notebook uses 'gid_0' and 'gid_1').
    outTAG : str
        Tag inserted into the output file name.

    Returns
    -------
    None
    """
    # Sum each sector-gcm-scen by admin unit and year
    df_adm = df_sdf.groupby([gid, 'year'])[gcols].sum().reset_index()
    # Summing turns all-NaN cells into 0, so re-blank the cells that mix periods:
    # - - SSP columns in the historic years (2014, 2019)
    df_adm.loc[df_adm.year.isin([2014, 2019]), [x for x in df_adm.columns if "ssp" in x]] = np.nan
    # - - historic columns in the years after 2019
    df_adm.loc[df_adm.year > 2019, [x for x in df_adm.columns if "historical" in x]] = np.nan
    df_adm.set_index([gid, 'year'], inplace = True)

    # Bias correction inputs: 2019 baseline and 2014 historic GCM values per admin unit
    b19 = df_adm.loc[df_adm.index.get_level_values('year') == 2019, 'girrww_gswp3-w5e5_historical'].droplevel('year')
    b19 = b19.rename('b19')
    # For the historic columns the parsed 'ssp' token is just "historical"; drop it
    h14 = _melt_irrigation_year(df_adm, 2014, irrhis, 'h14').drop(['ssp'], axis = 1)
    df_his = pd.merge(b19, h14, how = 'left', left_index = True, right_index = True)

    # Correct each future milestone year and stack the results
    corrected = [
        _bias_correct_admin_irrigation(df_his, _melt_irrigation_year(df_adm, yr, fixedirrssp, 'fp'), yr, gid)
        for yr in (2030, 2050, 2080)
    ]
    df_bc = pd.concat(corrected, axis = 0)

    # Add newly adjusted irrigation data to other sectoral data. Keep all GSWP baseline data and all non-irr SSP data
    irr_ssp_names = set(fixedirrssp)
    df_keep = df_adm[basehist + [col for col in all_ssp if col not in irr_ssp_names]]
    df_fixed = pd.concat([df_keep, df_bc], axis = 1)

    # Restructure so the 2019 baseline falls under each SSP, which makes baseline-vs-future comparison easier
    df_m = df_fixed.melt(ignore_index = False).reset_index()
    df_m[['sector', 'gcm', 'ssp']] = df_m['variable'].str.split('_', expand = True)
    # Separate baseline from future. Copy the 2019 slice because it is modified below
    # (writing into a filtered slice triggers pandas' SettingWithCopyWarning).
    m19 = df_m[df_m['year'] == 2019].copy()
    mfp = df_m[df_m['year'] > 2019].set_index([gid, 'year', 'variable'])
    # Relabel the baseline so it sorts ahead of every GCM name, then forward-fill its
    # value into the (empty) 2019 cells of the SSP rows of the same admin unit and sector
    m19.loc[m19['gcm'] == 'gswp3-w5e5', 'gcm'] = 'base'
    m19 = m19.sort_values(by = [gid, 'sector', 'gcm', 'ssp']).ffill()
    m19.set_index([gid, 'year', 'variable'], inplace = True)
    # Merge together; the future-year baseline rows (still tagged gswp3-w5e5) are dropped
    mall = pd.concat([mfp, m19], axis = 0)
    mall_clean = mall.loc[(mall['gcm'] != 'gswp3-w5e5'), ['value']].reset_index()

    # Reshape so sector-gcm-ssp are columns again
    df_mc = pd.pivot(data = mall_clean, values = 'value', columns = 'variable', index = [gid, 'year'])

    # Finally, calculate totals
    _add_total_withdrawal(df_mc)

    # Save with sector-gcm-ssp as columns. wwadPATH already includes crtyROOT, so it
    # must not be joined onto crtyROOT a second time.
    df_mc.to_csv(wwadPATH(outTAG))
    # Exploded version (unique row for every sector-gcm-ssp) is currently disabled:
#     df_exp = df_mc.melt(ignore_index = False)
#     df_exp[['sector', 'gcm', 'ssp']] = df_exp['variable'].str.split('_',  expand=True)
#     df_exp.to_csv(wwadPATH(outTAG + '-exploded'))

In [13]:
# Produce one output file per admin level: countries (gid_0) and provinces (gid_1)
for admin_col, file_tag in [('gid_0', 'countries'), ('gid_1', 'provinces')]:
    find_admin_demand(gid = admin_col, outTAG = file_tag)