# Step 4: Finalize data for download

This script brings the annual PFAF baseline and future indicators into clean tables that contain the raw, score, category, and label.

The baseline table will use the same naming mechanism as Aq 3.0
The future table will use this format: 
  bau30_ws_x_r -> SSP 3 7.0 for 2030 water stress raw value

The indicators included as of now are: water stress (ws), water depletion (wd), interannual variability (iv or iav), and seasonal variability (sv or sev)

In [2]:
import os, datetime
import geopandas as gpd
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [3]:
# PATHS!
# Every *PATH name that ends in `.format` is a bound `str.format` method:
# call it with the missing piece (e.g. aq4PATH('annual')) to get the final path.

# root
rootPATH = r'\Projections\Final_Data\Data'

# 1. Hydrobasin 6
hy6PATH = os.path.join(rootPATH, "shapes", "hybas_lev06_v1c_merged_fiona_V04.shp")

# 2. Country boundaries
ad0PATH = os.path.join(rootPATH, "shapes", "gadm36_0.shp")

# 3. Aqueduct 4.0 data
aq4PATH = os.path.join(rootPATH, "Aqueduct40", "step3_calculate_indicators", "final", "Aqueduct40_indicators_{}-additive.csv").format

# 4. Aqueduct 3.0 data
# Both files are derived from aq3ROOT so they cannot drift apart from it.
# (The annual path used to be spelled 'C:Aqueduct3\...', i.e. without the
# backslash after the drive letter, which Windows resolves relative to the
# current directory of drive C: instead of the drive root.)
aq3ROOT = r'C:\Aqueduct3\download\Y2019M07D12_Aqueduct30_V01\baseline'
aq3PATH = os.path.join(aq3ROOT, 'annual', 'arcmap', 'y2019m07d12_aqueduct30_v01.gdb')
aq3mPATH = os.path.join(aq3ROOT, 'monthly', 'arcmap', 'y2019m07d12_rh_aqueduct30_data_download_monthly_v01.shp')

# 5. Category Weighting Scheme
wghtPATH = os.path.join(rootPATH, "Aqueduct40", "util", "aq40_weights_enhanced.xlsx")

# 6. UTILITY
gdf_m_PATH = os.path.join(rootPATH, "Aqueduct40", "util", 'aq40_monthly_geometry.shp')
gdf_y_PATH = os.path.join(rootPATH, "Aqueduct40", "util", 'aq40_annual_geometry.shp')

# SAVE LOCATIONS
outPATH = os.path.join(rootPATH, 'Aqueduct40', 'step4_final_data_download')
# basePATH fills the same value into the sub-folder and the file name.
basePATH = os.path.join(outPATH, 'baseline', '{0}', 'y2023m07d05_sk_Aqueduct40_indicators_{0}.csv').format
futPATH = os.path.join(outPATH, 'future', 'annual', 'y2023m07d05_sk_Aqueduct40_indicators_{}.csv').format
cartoPATH = os.path.join(outPATH, 'carto', 'y2023m07d05_sk_Aqueduct40_indicators_{}.csv').format

# GCM
# PCR-GLOBWB Indicators
# Anchored at rootPATH like every other project path (it used to be written
# without the leading separator, which made it relative to the working directory).
gcmPATH = os.path.join(rootPATH, 'Aqueduct40', 'step3_calculate_indicators')
inPATH = os.path.join(gcmPATH, 'working', 'Aqueduct40_indicators_{}-additive.csv').format

# Fix column names
# Lookup tables from the short future-style tokens to the baseline column tokens.
baseline_inds = {'ws': 'bws', 'wd': 'bwd', 'iv': 'iav', 'sv': 'sev'}
baseline_typs = {'_r': '_raw', '_s': '_sco', '_c': '_cat', '_l': '_lab'}
baseline_annual_typs = {'_r': '_raw', '_s': '_score', '_c': '_cat', '_l': '_label'}

# Geospatial Data

In [4]:
# PFAF FOUND TWICE: 353020 IN BOTH ASIA AND NORTHAMERICA (1 catchment cross 2 boundaries). Set all data = No Data.
# NOTE(review): this list is not referenced anywhere else in the visible part of
# the notebook - confirm whether the masking it describes still has to be applied.
no_data_pfafs = [353020]

# Load the level-6 basins, keep only the id and the geometry, and key the
# table by the id under the lower-case name used by the indicator CSVs.
hy6 = gpd.read_file(hy6PATH).filter(['PFAF_ID', 'geometry']).set_index('PFAF_ID')
hy6.index.rename('pfaf_id', inplace = True)

# Define projection from input data
hy6_crs = hy6.crs

#  Baseline Monthly

## CARTO

In [13]:
# Monthly baseline indicators, keyed by catchment id. The table is expected to
# carry `year` and `month` columns (they become index levels below).
df_bm = pd.read_csv(aq4PATH('monthly'), index_col = 'pfaf_id')
# Left join on the basin table so catchments without indicator rows are kept.
df_gbm = pd.merge(hy6, df_bm, how = 'left', left_index = True, right_index = True)
# Set No Data as -9999
df_gbm.replace(np.nan, -9999, inplace = True)
# Relabel No Datas
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form updates a temporary object and is not
# guaranteed to reach df_gbm (pandas Copy-on-Write / chained assignment).
for l in [x for x in df_gbm.columns if "lab" in x]:
    df_gbm[l] = df_gbm[l].replace(-9999, 'No Data')

# Save CSV version, one row per (pfaf_id, year, month), without the geometry
df_gbm = df_gbm.reset_index().set_index(['pfaf_id', 'year', 'month'])
df_gbm.drop(['geometry'], axis = 1, inplace = True)
df_gbm.to_csv(cartoPATH('monthly'))

In [14]:
# QA RESULTS
# For every label, print the min/max of raw, score and category so the label
# boundaries can be checked by eye.
inds = ['bws', 'bwd', 'iav']
for i in inds:
    l = "{}_label".format(i)
    r = "{}_raw".format(i)
    s = "{}_score".format(i)
    d = "{}_cat".format(i)
    # Several columns must be selected with a list: the tuple form
    # groupby(...)[r, s, d] is rejected by pandas >= 2.0.
    df = df_gbm.groupby(l)[[r, s, d]].agg(['min', 'max'])
    print(df)

                            bws_raw                 bws_score               \
                                min          max          min          max   
bws_label                                                                    
Arid and Low Water Use     1.000000     1.000000     5.000000     5.000000   
Extremely High (>80%)      0.800284  9999.000000     4.000512     5.000000   
High (40-80%)              0.400018     0.799914     3.000065     3.999845   
Low (<10%)                 0.000000     0.099996     0.000000     0.999937   
Low - Medium (10-20%)      0.100002     0.199887     1.000028     1.999185   
Medium - High (20-40%)     0.200032     0.399962     2.000231     2.999863   
No Data                -9999.000000 -9999.000000 -9999.000000 -9999.000000   

                       bws_cat          
                           min     max  
bws_label                               
Arid and Low Water Use    -1.0    -1.0  
Extremely High (>80%)      4.0     4.0  
High (40-80%) 

## DATA DOWNLOAD

In [15]:
# Reshape the long monthly table (one row per catchment/year/month) into a wide
# one (one row per catchment, one column set per month) using the Aqueduct
# naming structure.
ind_cols = list(df_gbm.columns)
month_level = df_gbm.index.get_level_values('month')
month_data = []
for month_number in range(1, 13):
    # Two-digit month token, e.g. '01'
    m = '{:02d}'.format(month_number)
    # Rows of this month only, with the date levels removed from the index
    df_clean = df_gbm[month_level == month_number].droplevel(level = ['month', 'year'])
    # Add month to column name (ex bws_raw becomes bws_01_raw); the slicing
    # relies on a 3-letter indicator prefix followed by '_'
    df_clean.columns = [name[0:4] + m + name[3:] for name in ind_cols]
    month_data.append(df_clean)
# Put the twelve monthly column sets side by side and save
df_bm = pd.concat(month_data, axis = 1)
df_bm.to_csv(basePATH('monthly'))

# Baseline Annual

## 1. Read in Aqueduct 3.0 database

In [16]:
# Annual layer of the Aqueduct 3.0 geodatabase, keyed by (string_id, aq30_id)
gdf_3 = gpd.read_file(aq3PATH, layer = 'annual').set_index(['string_id', 'aq30_id'])

## 2. Read in Aqueduct 4.0 annual results

In [17]:
# Annual Aqueduct 4.0 indicators, keyed by catchment id
df_4 = pd.read_csv(aq4PATH('annual'), index_col = 'pfaf_id')
# Merge to spatial data (which will have more catchments than we had data for)
df_gba = pd.merge(hy6, df_4, how = 'left', left_index = True, right_index = True)
# Set No Data as -9999
df_gba.replace(np.nan, -9999, inplace = True)
# Relabel No Datas
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place form updates a temporary object and is not
# guaranteed to reach df_gba (pandas Copy-on-Write / chained assignment).
for l in [x for x in df_gba.columns if "lab" in x]:
    df_gba[l] = df_gba[l].replace(-9999, 'No Data')

# Geometry-free table sorted by catchment id; new_cols records which indicator
# columns Aqueduct 4.0 provides.
df_ba_final = df_gba.drop(['geometry'], axis = 1).sort_index()
new_cols = df_ba_final.columns.tolist()

## 3. Add Aq4 results to database

In [18]:
# Start from the Aqueduct 3.0 table minus the PCR-GLOBWB indicator columns
# that Aqueduct 4.0 recalculates
gdf_4 = gdf_3.drop(new_cols, axis = 1)
# Remove the w_awr (weighted aggregate) columns as well
gdf_4 = gdf_4.drop([name for name in gdf_4.columns if "w_awr" in name], axis = 1)
# Merge new data to dataframe based on catchment
gdf_4 = pd.merge(gdf_4, df_ba_final, how = 'left', left_on = 'pfaf_id', right_index = True)
# Replace missing data: numeric column families get -9999, labels get 'No Data'.
# Columns are matched by substring, in this order.
no_data_fill = (("_raw", -9999), ("_score", -9999), ("_cat", -9999), ("_label", 'No Data'))
for token, fill in no_data_fill:
    matching = [name for name in gdf_4.columns if token in name]
    gdf_4.loc[:, matching] = gdf_4.loc[:, matching].fillna(fill)

## 4. Fix Drought and GTD Labels

In [19]:
def category(score, df_in):
    """Turn a score column into integer-valued category bins.

    Parameters
    ----------
    score : name of the column in `df_in` that holds the scores.
    df_in : DataFrame containing that column.

    Returns
    -------
    DataFrame with a single column named 'cat' (same index as `df_in`) that
    holds floor(score), except that a floored value of exactly 5 is folded
    into category 4.
    """
    floored = np.floor(df_in[score])
    # A score of exactly 5 belongs to the top bin rather than a bin of its own
    folded = floored.mask(floored == 5, 4)
    return folded.to_frame(name = 'cat')

# Category -> label text for drought risk
drr_labels = dict([
    (-9999, 'No Data'),
    (0, 'Low (0.0-0.2)'),
    (1, 'Low - Medium (0.2-0.4)'),
    (2, 'Medium (0.4-0.6)'),
    (3, 'Medium - High (0.6-0.8)'),
    (4, 'High (0.8-1.0)'),
])

# Redo raw values. They were mistakenly set to equal score in Aq 3.0.
# raw = score / 5, with the no-data sentinel carried over unchanged.
drr_missing = gdf_4['drr_score'] == -9999
gdf_4['drr_raw'] = (gdf_4['drr_score'] / 5).mask(drr_missing, -9999)

# Limit score to 0 to 5, then restore -9999 wherever raw is no data
# (the lower bound would otherwise have turned the sentinel into 0)
drr_limited = gdf_4['drr_score'].mask(gdf_4['drr_score'] < 0, 0)
drr_limited = drr_limited.mask(drr_limited > 5, 5)
gdf_4['drr_score'] = drr_limited.mask(gdf_4['drr_raw'] == -9999, -9999)

# Set category
gdf_4['drr_cat'] = category(score = 'drr_score', df_in = gdf_4)['cat']
# Set Label
gdf_4['drr_label'] = gdf_4['drr_cat'].map(drr_labels)
In [20]:
def gtd_score(r):
    """Convert a groundwater-table-decline raw value into a 0-5 score.

    The mapping is piecewise linear:
      * NaN or the -9999 sentinel -> -9999
      * r < 0                     -> r + 1, floored at 0
      * 0 <= r < 4                -> 0.5 * r + 1
      * r >= 4                    -> 0.25 * r + 2, capped at 5
    """
    if np.isnan(r) or r == -9999:
        return -9999
    if r < 0:
        return max(0, r+1)
    if r < 4:
        return 0.5 * r + 1
    # Below r = 12 the line stays under 5, so the cap only bites at the top
    return min(0.25*r + 2, 5)
        
# Category -> label text for groundwater table decline.
# NOTE(review): category -9999 maps to 'Insignificant Trend', so rows whose
# gtd_raw is the -9999 sentinel get that label too - confirm this is intended.
gtd_labels = dict([
    (-9999, 'Insignificant Trend'),
    (0, 'Low (<0 cm/y)'),
    (1, 'Low - Medium (0-2 cm/y)'),
    (2, 'Medium - High (2-4 cm/y)'),
    (3, 'High (4-8 cm/y)'),
    (4, 'Extremely High (>8 cm/y)'),
])

# Remember which rows carry the 'Insignificant Trend' label before the label
# column is recalculated; those rows are overwritten again at the end.
gtd_insignificant = gdf_4['gtd_label'] == 'Insignificant Trend'

# Recalculate score, category and label from the raw value
gdf_4['gtd_score'] = gdf_4['gtd_raw'].apply(gtd_score)
gdf_4['gtd_cat'] = category(score = 'gtd_score', df_in = gdf_4)['cat']
gdf_4['gtd_label'] = gdf_4['gtd_cat'].map(gtd_labels)

# Label insignificant trends and blank out their score and category
for gtd_col, gtd_flag in (('gtd_label', 'Insignificant Trend'), ('gtd_score', -9999), ('gtd_cat', -9999)):
    gdf_4[gtd_col] = gdf_4[gtd_col].mask(gtd_insignificant, gtd_flag)

## 5. Fix Coastal Flooding Labels

In [21]:
# Catchments whose coastal flood raw value is exactly 0 are treated as "no risk":
# score becomes 0, category becomes -1 and the label becomes 'No Risk'.
# NOTE(review): an earlier comment described the score as -1; the code assigns 0 -
# confirm which one is intended.
cfr_no_risk = gdf_4['cfr_raw'] == 0
for cfr_col, cfr_flag in (('cfr_score', 0), ('cfr_cat', -1), ('cfr_label', 'No Risk')):
    gdf_4[cfr_col] = gdf_4[cfr_col].mask(cfr_no_risk, cfr_flag)

## 6. Add weighting scheme

In [22]:
def weight_scores(r, QS):
    """Score a weighted average by linear interpolation between breakpoints.

    Parameters
    ----------
    r  : weighted raw value.
    QS : sequence of six ascending breakpoints; QS[k-1]..QS[k] is the range
         that maps onto scores k-1..k.

    Returns
    -------
    -9999 when `r` is NaN, 5 when `r` is at or above QS[5], otherwise the
    interpolated score inside the first bin whose upper breakpoint exceeds `r`.
    """
    if np.isnan(r):
        return -9999
    for upper in range(1, 6):
        if r < QS[upper]:
            lower = upper - 1
            return (r - QS[lower])/(QS[upper] - QS[lower]) + lower
    return 5


# Category -> label text for the weighted aggregate scores
weight_labels = dict([
    (-9999, 'No data'),
    (0, 'Low (0-1)'),
    (1, 'Low - Medium (1-2)'),
    (2, 'Medium - High (2-3)'),
    (3, 'High (3-4)'),
    (4, 'Extremely High (4-5)'),
])

# Breakpoints handed to weight_scores, one list per aggregate.
# https://github.com/wri/aqueduct30_data_download/blob/master/metadata.md#quantile-linear-interpolation
qan, qal, rrr, ovr = (
    [0, 0.72, 1.09, 1.60, 2.34, 5],
    [0, 1.45, 2.20, 2.92, 3.83, 5],
    [0, 0.30, 1.39, 2.81, 3.93, 5],
    [0, 1.01, 1.61, 2.10, 2.68, 5],
)

These methods reflect the approach taken in Aqueduct 3.0
There are three things we need to calculate:

1. Weighted group averages per industry. Find the fraction that each indicator represents within the group (QAN, QAL, RRR) (first, indicators with no data are dropped)

2. Weighted overall averages per industry (looking at all indicators).  First, we need to find the fraction that each indicator represents within the overall (all 13 indicators) (first, indicators with no data are dropped)

3. Fraction of indicators present. The averages have dropped null values. If too many indicators are missing, we need to mask out the overall scores. 

In [23]:
# Builds the w_awr_* (weighted aggregate) columns: per catchment and industry,
# a weighted average of the indicator scores for each group (QAN, QAL, RRR) and
# overall (TOT), plus the share of the weight that had data behind it.

# First, read in weights
df_w = pd.read_excel(wghtPATH, sheet_name = 'export_csv')
df_w = df_w.filter(['group_short', 'industry_short', 'indicator_short', 'weight_abs', 'weight_fraction'])

# Select only the score values from the full dataset and turn indicators from columns to rows
df_raw = gdf_4.loc[:,[x for x in gdf_4.columns if "_score" in x]]
df_raw = df_raw.melt(ignore_index = False).reset_index()
# Create new upper case version of indicator name (ex: bws_score = BWS)
df_raw['indicator'] = df_raw['variable'].apply(lambda x: x[0:3].upper())
# Turn the -9999 no-data sentinel into NaN, then cap any score above 5 at 5
df_raw.replace(-9999, np.nan, inplace = True)
df_raw['value'] = df_raw['value'].mask(df_raw['value'] > 5, 5)
# Merge weights to each indicator for every geometry
df_merge = pd.merge(df_w, df_raw, how = 'outer', left_on = 'indicator_short', right_on = 'indicator')

# 1. & 2. WEIGHTED GROUP AVERAGES PER INDUSTRY & OVERALL AVERAGES PER INDUSTRY
# Set weights equal to NAN if no data is available
df_merge.loc[df_merge['value'].isna(), ['weight_abs', 'weight_fraction']] = np.nan
# Find weight fraction per group and per overall
# Find total weight per group/cat; and per cat (aka overall)
group_weights = df_merge.groupby(['string_id', 'aq30_id', 'group_short', 'industry_short'])['weight_abs'].sum().to_frame(name = 'grp_wght')
overall_weights = df_merge.groupby(['string_id', 'aq30_id', 'industry_short'])['weight_abs'].sum().to_frame(name = 'tot_wght')
# Merge to data to find fractions
df_merge = pd.merge(df_merge, group_weights, how = 'left', left_on = ['string_id', 'aq30_id', 'group_short', 'industry_short'], right_index = True)
df_merge = pd.merge(df_merge, overall_weights, how = 'left', left_on = ['string_id', 'aq30_id', 'industry_short'], right_index = True)
# Calculate fraction for group and overall
df_merge['grp_fraction'] = df_merge['weight_abs'].divide(df_merge['grp_wght'])
df_merge['tot_fraction'] = df_merge['weight_abs'].divide(df_merge['tot_wght'])
# Multiply the value by the weight
df_merge['grp_value'] = df_merge['grp_fraction'].multiply(df_merge['value'])
df_merge['tot_value'] = df_merge['tot_fraction'].multiply(df_merge['value'])
# Sum weighted values by group-industry; standardized name of fraction and value columns
# NOTE(review): pandas `sum` skips NaN and returns 0 for a group whose values are
# all NaN, so a group with no data at all ends up with a raw value of 0 rather than
# NaN here - confirm that is intended (weight_scores only returns -9999 for NaN).
df_avg_grp = df_merge.groupby(['string_id', 'aq30_id', 'group_short', 'industry_short'])[['grp_fraction', 'grp_value']].sum().reset_index()
df_avg_grp.rename(columns = {'grp_fraction': 'fraction', 'grp_value': 'value'}, inplace = True)
# Sum weighted values by overall industry; standardized name of fraction and value columns
df_avg_tot = df_merge.groupby(['string_id', 'aq30_id', 'industry_short'])[['tot_fraction', 'tot_value']].sum().reset_index()
df_avg_tot.rename(columns = {'tot_fraction': 'fraction', 'tot_value': 'value'}, inplace = True)
df_avg_tot['group_short'] = 'TOT'
# Pivot the data so Groups are columns and industries are rows (with string ID)
df_piv_grp = pd.pivot_table(data = df_avg_grp, values = ['fraction', 'value'], index = ['string_id', 'aq30_id', 'industry_short'], columns = ['group_short'], aggfunc ='sum')
df_piv_tot = pd.pivot_table(data = df_avg_tot, values = ['fraction', 'value'], index = ['string_id', 'aq30_id', 'industry_short'], columns = ['group_short'], aggfunc ='sum')
# Merge together
df_piv = pd.concat([df_piv_grp, df_piv_tot], axis = 1)
# Just keep values
df_val = df_piv['value']
# Rename to raw
df_val = df_val.add_suffix('_RAW')
# Calculate score
df_val['QAL_SCORE'] = df_val['QAL_RAW'].apply(lambda x: weight_scores(x, qal))
df_val['QAN_SCORE'] = df_val['QAN_RAW'].apply(lambda x: weight_scores(x, qan))
df_val['RRR_SCORE'] = df_val['RRR_RAW'].apply(lambda x: weight_scores(x, rrr))
df_val['TOT_SCORE'] = df_val['TOT_RAW'].apply(lambda x: weight_scores(x, ovr))

# Create categories
for i in ['QAL', 'QAN', 'RRR', 'TOT']:
    sc = i + "_SCORE"
    ct = i + "_CAT"
    lb = i + "_LABEL"
    df_val[ct] = category(score = sc, df_in = df_val)['cat']
    df_val[lb] = df_val[ct].map(weight_labels)

    
# 3. FRACTION OF DATA AVAILABLE
# weight_fraction was blanked for indicators without data above, so these sums
# are the share of the weight that is backed by data.
frac_grp = df_merge.groupby(['string_id', 'aq30_id', 'group_short', 'industry_short'])['weight_fraction'].sum().to_frame(name = 'value').reset_index()
# The separator before WEIGHT_FRACTION is required: without it the group columns
# come out as e.g. w_awr_def_qanweight_fraction, which does not follow the
# <group>_weight_fraction pattern used for TOT just below.
frac_grp['group_short'] = frac_grp['group_short'].apply(lambda x: x + "_WEIGHT_FRACTION")
frac_tot = df_merge.groupby(['string_id', 'aq30_id',  'industry_short'])['weight_fraction'].sum().to_frame(name = 'value').reset_index()
frac_tot['group_short'] = 'TOT_WEIGHT_FRACTION'
print("Percentage of locations without enough data to average:", len(frac_tot[frac_tot.value < 0.75]) / len(frac_tot)*100)
# Melt the values data
df_val_melt = df_val.melt(ignore_index = False).reset_index()
# Combine values and fractions
df_combo = pd.concat([df_val_melt, frac_grp, frac_tot], axis = 0)
df_combo['column'] = 'W_AWR_' + df_combo['industry_short'] + "_" + df_combo['group_short'] 

# Flip data so group-industry-indicators are columns
df_warw = pd.pivot(data = df_combo, values = 'value', columns = 'column', index = ['string_id', 'aq30_id'])
df_warw.columns = [x.lower() for x in df_warw.columns]

# Set values to floats
for i in df_warw.columns:
    if "label" not in i:
        df_warw[i] = df_warw[i].astype(float)
        
# If the overall (TOT) weight fraction of an industry is less than 0.75, set
# all of that industry's aggregates to no data
industry_list = [x.lower() for x in list(df_w['industry_short'].unique())]
for i in industry_list:
    print(i)
    # Create fraction name
    fr = 'w_awr_{}_tot_weight_fraction'.format(i)
    # Find all industry-related scores, raws, and cats
    num_cols = ['w_awr_{}_{}_{}'.format(i, t, c) for c in ['raw', 'score', 'cat'] for t in ['qan', 'qal', 'rrr', 'tot']]
    # Find all industry-related labels
    lab_cols = ['w_awr_{}_{}_label'.format(i, t) for t in ['qan', 'qal', 'rrr', 'tot']]
    # Set numbers = -9999 and label to No Data
    df_warw.loc[df_warw[fr] <0.75, num_cols] = -9999
    df_warw.loc[df_warw[fr] <0.75, lab_cols] = 'No data'

Percentage of locations without enough data to average: 16.75459083875865
def
agr
fnb
che
elp
smc
ong
min
con
tex


## 7. Create CARTO table for weights

In [24]:
# Indicator abbreviation -> group it is assigned to in the CARTO table
cat_groups = {
    indicator: group
    for group, members in (
        ('qan', ('bws', 'bwd', 'cfr', 'rfr', 'iav', 'sev', 'gtd', 'drr')),
        ('qal', ('cep', 'ucw')),
        ('rrr', ('rri', 'udw', 'usa')),
    )
    for indicator in members
}

# Only keep indicator scores
score_columns = [name for name in gdf_4.columns if "score" in name]
df_c = gdf_4.loc[:, score_columns]
# Melt so every (location, indicator) pair becomes its own row
df_cm = df_c.melt(ignore_index = False)
# Drop no datas
df_cm = df_cm.loc[df_cm['value']!= -9999, :]
# First three characters of the old column name are the indicator abbreviation
df_cm['indicator'] = df_cm['variable'].apply(lambda name: name[0:3])
df_cm['group_short'] = df_cm['indicator'].apply(cat_groups.get)
# Final layout: score, indicator, group_short
df_cm = df_cm.rename(columns = {'value': 'score'}).drop('variable', axis = 1)

## 8. Merge all pieces together, and sort to match original Aq 3 data

In [25]:
# Column order of the Aqueduct 3.0 table without its last three columns
# (presumably the shape length/area and geometry fields - TODO confirm)
aq3_columns = gdf_3.columns[0:-3]
# Put indicators and weighted aggregates side by side, then keep only the
# Aqueduct 3.0 columns, in Aqueduct 3.0 order. `filter` silently skips any
# requested column that is not present.
df_4_final = pd.concat([gdf_4, df_warw], axis = 1).filter(aq3_columns)

## 9. Save

In [26]:
# Save the CSV versions: the annual table goes to both the baseline download
# folder and the CARTO folder; the per-indicator score table goes to CARTO only.
csv_outputs = (
    (df_4_final, basePATH('annual')),
    (df_4_final, cartoPATH('annual')),
    (df_cm, cartoPATH('custom')),
)
for out_frame, out_path in csv_outputs:
    out_frame.to_csv(out_path)
# DROP GEOM AND SHAPE

In [122]:
# # Save location ID and geometry to separate separate shapefile to rebuild as geodatabase in ArcMAP
# gdf_shp = gdf_3.filter(['Shape_Length', 'Shape_Area', 'geometry'])
# # Define projection from input data
# aq_crs = gdf_3.crs 
# gdf_shp.to_file(gdf_y_PATH)

## 10. QA

In [43]:
# Distinct indicator codes present in the weights table
df_w['indicator_short'].unique()

array(['BWS', 'BWD', 'GTD', 'IAV', 'SEV', 'DRR', 'RFR', 'CFR', 'UCW',
       'CEP', 'UDW', 'USA', 'RRI'], dtype=object)

In [44]:
# For every indicator, print the min/max of raw, score and category per label
# so the label boundaries can be checked by eye.
inds = ['bws', 'bwd', 'gtd', 'iav', 'sev', 'drr', 'rfr', 'cfr', 'ucw',
       'cep', 'udw', 'usa', 'rri']
for i in inds:
    # Several columns must be selected with a list: the tuple form
    # groupby(...)[a, b, c] is rejected by pandas >= 2.0.
    df = df_4_final.groupby(i + "_label")[[i + '_raw', i + "_score", i + '_cat']].agg(['min', 'max'])
    print(df)

                            bws_raw                 bws_score               \
                                min          max          min          max   
bws_label                                                                    
Arid and Low Water Use     1.000000     1.000000     5.000000     5.000000   
Extremely High (>80%)      0.803479  9999.000000     4.006260     5.000000   
High (40-80%)              0.400318     0.799223     3.001146     3.998599   
Low (<10%)                 0.000000     0.099904     0.000000     0.998614   
Low - Medium (10-20%)      0.100020     0.199952     1.000294     1.999654   
Medium - High (20-40%)     0.200010     0.399624     2.000069     2.998643   
No Data                -9999.000000 -9999.000000 -9999.000000 -9999.000000   

                       bws_cat          
                           min     max  
bws_label                               
Arid and Low Water Use    -1.0    -1.0  
Extremely High (>80%)      4.0     4.0  
High (40-80%) 

                           udw_raw                 udw_score               \
                               min          max          min          max   
udw_label                                                                   
Extremely High (>20%)     0.200096     0.693025     4.000696     5.000000   
High (10-20%)             0.100019     0.199712     3.000274     3.997923   
Low (<2.5%)               0.000000     0.024995     0.000000     0.999687   
Low - Medium (2.5-5%)     0.025016     0.049902     1.000898     1.997177   
Medium - High (5-10%)     0.050041     0.099957     2.001173     2.999385   
No Data               -9999.000000 -9999.000000 -9999.000000 -9999.000000   

                      udw_cat          
                          min     max  
udw_label                              
Extremely High (>20%)     4.0     4.0  
High (10-20%)             3.0     3.0  
Low (<2.5%)               0.0     0.0  
Low - Medium (2.5-5%)     1.0     1.0  
Medium - High (5-10%)     

# Future

## DATA DOWNLOAD

In [5]:
# Read in the future (projection) indicators, keyed by catchment id
df_4f = pd.read_csv(aq4PATH('future'), index_col = 'pfaf_id')
# Read in the Aqueduct 3 monthly shapefile; only its (fid, pfaf_id) keys are used
gdf_3m = gpd.read_file(aq3mPATH).set_index(['fid', 'pfaf_id'])
# Set to same shape as Aq 3 data: one row per Aqueduct 3 key, with the future
# indicators attached by pfaf_id (keys without data keep empty values)
aq3_keys = gdf_3m.index.to_frame(index = False)
df_4f_final = pd.merge(aq3_keys, df_4f, how = 'left', left_on = 'pfaf_id', right_index = True)
df_4f_final = df_4f_final.set_index(['fid', 'pfaf_id'])
df_4f_final.to_csv(futPATH('future'))

In [7]:
# QA of the future-value ("x") columns: per label, print the min/max of raw,
# score and category for every indicator / scenario / year combination.
inds = ['ws', 'wd', 'iv', 'sv']
years = ['30']
scens = ['bau']

for i in inds:
    for s in scens:
        for y in years:
            la = '{}{}_{}_x_l'.format(s,y,i)
            ra = '{}{}_{}_x_r'.format(s,y,i)
            sc = '{}{}_{}_x_s'.format(s,y,i)
            ca = '{}{}_{}_x_c'.format(s,y,i)
            # Several columns must be selected with a list: the tuple form
            # groupby(...)[ra, sc, ca] is rejected by pandas >= 2.0.
            df = df_4f_final.groupby(la)[[ra, sc, ca]].agg(['min', 'max'])
            print(df)

                       bau30_ws_x_r              bau30_ws_x_s            \
                                min          max          min       max   
bau30_ws_x_l                                                              
Arid and low water use     1.000000     1.000000     5.000000  5.000000   
Extremely high (>80%)      0.802074  9999.000000     4.003736  5.000000   
High (40-80%)              0.400100     0.799942     3.000360  3.999896   
Low (<10%)                 0.000000     0.099997     0.000000  0.999958   
Low-medium (10-20%)        0.100267     0.199945     1.003844  1.999603   
Medium-high (20-40%)       0.200328     0.399897     2.002361  2.999628   

                       bau30_ws_x_c       
                                min  max  
bau30_ws_x_l                              
Arid and low water use         -1.0 -1.0  
Extremely high (>80%)           4.0  4.0  
High (40-80%)                   3.0  3.0  
Low (<10%)                      0.0  0.0  
Low-medium (10-20%)    

In [8]:
# QA of the change-from-baseline ("y") columns: per label, print the min/max
# category for every indicator / scenario / year combination.
inds = ['ba', 'ww', 'ws', 'wd', 'iv', 'sv']
years = ['30']
scens = ['bau']

for i in inds:
    for s in scens:
        for y in years:
            # Column names share one stem and differ only in the last letter:
            # l = label, r = raw, s = score, c = category
            stem = '{}{}_{}_y_'.format(s,y,i)
            la, ra, sc, ca = (stem + kind for kind in 'lrsc')
            df = df_4f_final.groupby(la)[ca].agg(['min', 'max'])
            print(df)

                             min     max
bau30_ba_y_l                            
1.2x decrease               -1.0    -1.0
1.2x increase                1.0     1.0
1.4x decrease               -2.0    -2.0
1.4x increase                2.0     2.0
1.7x or greater decrease    -3.0    -3.0
1.7x or greater increase     3.0     3.0
Near normal                  0.0     0.0
No Data                  -9999.0 -9999.0
                             min     max
bau30_ww_y_l                            
1.2x decrease               -1.0    -1.0
1.2x increase                1.0     1.0
1.4x decrease               -2.0    -2.0
1.4x increase                2.0     2.0
1.7x or greater decrease    -3.0    -3.0
1.7x or greater increase     3.0     3.0
Near normal                  0.0     0.0
No Data                  -9999.0 -9999.0
                          min  max
bau30_ws_y_l                      
1.4x decrease            -1.0 -1.0
1.4xincrease              1.0  1.0
2.0x decrease            -2.0 -2.0
2.0x 

## CARTO

In [9]:
# Lookup tables that expand the short column-name tokens
# (<scenario><year>_<indicator>_<type>_<data>) into the CARTO values.
scenario_names = dict(bau = 'business_as_usual', opt = 'optimistic', pes = 'pessimistic')
indicator_names = dict(
    ww = 'water_demand',
    ba = 'water_supply',
    ws = 'water_stress',
    wd = 'water_depletion',
    sv = 'seasonal_variability',
    iv = 'interannual_variability',
)
type_names = dict(x = 'future_value', y = 'change_from_baseline')
data_names = dict(r = 'value', l = 'label')

In [10]:
# Only keep raw and label columns (matched by the '_r' / '_l' token)
raw_and_label = [name for name in df_4f.columns if ('_r' in name) or ('_l' in name)]
df_4ff = df_4f.loc[:, raw_and_label]
# Melt data so columns become rows
df_melt = df_4ff.melt(ignore_index = False)
# Split old column name into its four "_"-separated tokens:
# 0 = scenario + year, 1 = indicator, 2 = type, 3 = data kind
name_tokens = df_melt['variable'].str.split('_',  expand=True)
df_melt[[0, 1 , 2 , 3]] = name_tokens
# Expand each token into its CARTO value
df_melt['scenario'] = df_melt[0].apply(lambda token: scenario_names.get(token[0:3]))
df_melt['year'] = df_melt[0].apply(lambda token: '20{}'.format(token[3:]))
df_melt['indicator'] = df_melt[1].apply(indicator_names.get)
df_melt['type'] = df_melt[2].apply(type_names.get)
df_melt['data'] = df_melt[3].apply(data_names.get)

# Pivot data so raw and labels are side-by-side again
pivot_keys = ['pfaf_id', 'indicator', 'year', 'scenario', 'type']
df_pv = df_melt.reset_index().pivot(index = pivot_keys, columns = ['data'], values = 'value').reset_index()
# Save in original order
df_final = df_pv.set_index(['pfaf_id']).filter(['indicator', 'value', 'label', 'year', 'scenario', 'type'])

In [11]:
# Write the long-format future table to the CARTO folder
df_final.to_csv(path_or_buf = cartoPATH('future'))

## CREATE CATEGORIES

In [71]:
inds = ['ws', 'wd', 'iv', 'sv']
years = ['30', '50', '80']
scens = ['bau', 'opt', 'pes']

# Flatten the index once; every lookup below reads from the same frame.
df_f = df_4f_final.reset_index()


def _category_tables(kind):
    """Collect the distinct (label, category) pairs for one column type.

    `kind` is the type token in the column names ('x' or 'y'). One small table
    is returned per indicator / scenario / year combination, sorted by category
    and re-indexed from 0 so the tables can be placed side by side.
    """
    tables = []
    for i in inds:
        for s in scens:
            for y in years:
                la = '{}{}_{}_{}_l'.format(s,y,i,kind)
                ca = '{}{}_{}_{}_c'.format(s,y,i,kind)
                pairs = df_f.filter([la, ca]).drop_duplicates().sort_values(by = ca)
                tables.append(pairs.reset_index(drop = True))
    return tables


# Future-value columns
tablesx = _category_tables('x')
# Change-from-baseline columns
tablesy = _category_tables('y')

In [72]:
# Place the per-combination label/category tables side by side, one wide table
# per column type
x_tables, y_tables = (pd.concat(tables, axis = 1) for tables in (tablesx, tablesy))

In [77]:
# All label/category lookup tables in one frame, written next to the future data.
# futPATH has a single placeholder, so one argument fully determines the file name.
df_thresholds = pd.concat((x_tables, y_tables), axis = 1)
df_thresholds.to_csv(futPATH('categories'))

In [78]:
# Display the column names of the future table (notebook inspection only; nothing is assigned)
df_4f_final.columns

Index(['bau30_ws_x_r', 'bau30_ws_x_s', 'bau30_ws_x_c', 'bau30_ws_x_l',
       'bau30_ws_x_u', 'bau50_ws_x_r', 'bau50_ws_x_s', 'bau50_ws_x_c',
       'bau50_ws_x_l', 'bau50_ws_x_u',
       ...
       'pes80_wd_y_r', 'pes80_wd_y_s', 'pes80_ws_y_c', 'pes80_ws_y_l',
       'pes80_ws_y_r', 'pes80_ws_y_s', 'pes80_ww_y_c', 'pes80_ww_y_l',
       'pes80_ww_y_r', 'pes80_ww_y_s'],
      dtype='object', length=396)