# Step 2a: Create Catchment Time Series Files
This script creates one file for each pfaf catchment, which includes all supply and demand data from all scenarios and climate models. 

This script also calculates water stress using two methods, but those results can be ignored: water stress is now calculated after temporal aggregation of the supply and demand data (step 2b).

All output data is in million m3/month 

In [0]:
!pip install tqdm
import os, datetime, warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from calendar import monthrange
from tqdm import tqdm

## Paths & Parameters

In [0]:
# Root folder of the pfaf6 zonal statistics; all input paths in this notebook are built from it.
aq4PATH = '/dbfs/mnt/pgb-data-lake/aqueduct_dev/pcrglobwb_aqueduct_2021/version_2021-09-16/run_202205/zonal_statistics/pfaf6/'
# Folder from which the catchment master lists are read when the master files are created.
utilsPATH = '/dbfs/mnt/pgb-data-lake/aqueduct_dev/pcrglobwb_aqueduct_2021/version_2021-09-16/run_202205/zonal_statistics/pfaf6/catchment_master/utils/'

# Indicator folders
# NOTE: order matters -- the functions below address this list by position:
# [0] irrigation demand, [1] non-irrigation demand, [2] discharge, [3] runoff.
indFolders = ['demand_irr_resample_5',
              'demand_nonirr_resample_5',
              'discharge',
              'runoff_resample_5'] 

# Global climate models
# NOTE: the first entry is treated as the baseline model; the functions below only
# process its 'historical' scenario, while every other model is processed for all scenarios.
gcmFolders = ['gswp3-w5e5',
              'gfdl-esm4',
              'ipsl-cm6a-lr',
              'mpi-esm1-2-hr',
              'mri-esm2-0',
              'ukesm1-0-ll',
             ]

# Future scenarios
# NOTE: the first entry ('historical') is addressed by position as scenFolders[0].
scenFolders = ['historical',
               'ssp126',
               'ssp370',
               'ssp585',
              ]

# Functions

### Functions that only need to be run once

In [0]:
def get_all_catchments():
    '''
    PURPOSE: Use runoff baseline run results to get all catchments including their 
             duplicates information (e.g., one pfaf basin crossing two or more M regions).
    OUTPUT:  Two csv files written to utilsPATH (columns: pfaf_id, mRegion, Duplicated):
                1. mregion_catchment_master_list_duplicated_only.csv     - pfaf_ids found in 2+ region files
                2. mregion_catchment_master_list_non-duplicated_only.csv - pfaf_ids found in exactly one region file
             Nothing is returned.
    '''
    folderPath = os.path.join(aq4PATH, '{}/{}/{}/'.format(indFolders[3], gcmFolders[0], scenFolders[0]))
    dfs = []
    for f in os.listdir(folderPath):
        fp = os.path.join(folderPath, f)
        if not os.path.isfile(fp):
            continue
        # The region name is the part of the file name before the first dot
        mRegion = f.split('.')[0]
        # One row per catchment per region; assign() returns a new frame, so nothing is written into a slice
        df = pd.read_csv(fp)[['pfaf_id']].drop_duplicates(subset=['pfaf_id']).assign(mRegion=mRegion)
        dfs.append(df)
    dfm = pd.concat(dfs)
    # keep=False flags every occurrence of a pfaf_id that shows up in more than one region
    dfm['Duplicated'] = dfm.duplicated(subset=['pfaf_id'], keep=False)
    # Write through utilsPATH so this writer and the readers that use utilsPATH always agree on the location
    dfm[dfm['Duplicated']].to_csv(os.path.join(utilsPATH, 'mregion_catchment_master_list_duplicated_only.csv'), index=None)
    dfm[~dfm['Duplicated']].to_csv(os.path.join(utilsPATH, 'mregion_catchment_master_list_non-duplicated_only.csv'), index=None)

### Frequently used functions

In [0]:
def time_reformatter(row):
    '''
    PURPOSE: Rebuild a date object from the text in the row's 'time' field (the datetime type is lost
             when data is saved to csv). Only the year and month are read; the day is always set to 1.
    '''
    parts = row['time'].split('-')
    return datetime.date(int(parts[0]), int(parts[1]), 1)


def get_days(row):
    '''
    PURPOSE: Return how many days are in the month of the row's 'time' value.
    '''
    stamp = row['time']
    # monthrange gives (weekday of the first day, number of days); only the latter is needed
    _, n_days = monthrange(stamp.year, stamp.month)
    return n_days


def ws_basic(row, ba, ww):
    '''
    PURPOSE: Water stress for one row = row[ww] / row[ba]; a zero row[ba] yields 1.0 instead of dividing.
    VARIABLES:
                1. row is the record holding both values
                2. ba is the field name of the denominator (supply)
                3. ww is the field name of the numerator (demand)
    '''
    supply = row[ba]
    demand = row[ww]
    return 1.0 if supply == 0.0 else demand / supply


def livww_to_neg(row, field, resample_size):
    '''
    PURPOSE: Unit adjustment for the livww column. Files for future projections carry no livww data and
             have the placeholder -9999 instead; the placeholder is passed through untouched. Any other
             value is multiplied by the row's 'days' and divided by resample_size squared.
    '''
    value = row[field]
    if value != -9999:
        return value * row['days'] / (resample_size * resample_size)
    return -9999


def discharge_replace_neg(row, field):
    '''
    PURPOSE: Return row[field] with any negative value clamped to zero; all other values pass through.
    '''
    value = row[field]
    return 0 if value < 0 else value


def month_getter(row):
    '''
    PURPOSE: Return the month number of the row's 'time' value.
    '''
    return row['time'].month


def year_getter(row):
    '''
    PURPOSE: Return the year of the row's 'time' value.
    '''
    return row['time'].year


def catchment_data_by_model_by_scen(pfafid, mregion, model, scen):
    '''
    PURPOSE: Get data for a catchment of a given model and scenario.
    VARIABLES:
                1. pfafid is the catchment id; used in the demand file names and to filter the 'pfaf_id'
                   column of the discharge and runoff files
                2. mregion is the region name used to build the input file names
                3. model is the climate model folder name
                4. scen is the scenario folder name
    RETURNS: A dataframe indexed by 'time' with one column per variable, each named '<variable>_<model>_<scen>':
             girrww, girrwn, gdomww, gdomwn, gindww, gindwn, glivww, discharge, runoff.
    '''
    
    # mregion-pfafid type file names (one file per catchment; used for the demand data)
    mpfn = '{}_{}.csv'.format(mregion, pfafid)
    # mregion type file names (one file per region; used for discharge and runoff)
    mfn = '{}.csv'.format(mregion)
    
    # GET DATA
    # Get irr demand
    folderPath = os.path.join(aq4PATH, '{}/{}/{}/'.format(indFolders[0], model, scen))
    fp = os.path.join(folderPath, mpfn)
    dfirr = pd.read_csv(fp)
    dfirr = dfirr[['time', 'girrww', 'girrwn']]
    dfirr['time'] = dfirr.apply(lambda row: time_reformatter(row), axis=1)
    dfirr.set_index('time', inplace=True)
    dfirr.columns = ['girrww_{}_{}'.format(model, scen), 'girrwn_{}_{}'.format(model, scen)]
    
    # Get non irr demand
    # NOTE: this folder path is built from the scenario only -- there is no model level in it
    folderPath = os.path.join(aq4PATH, '{}/{}/'.format(indFolders[1], scen))
    fp = os.path.join(folderPath, mpfn)
    dfnonirr = pd.read_csv(fp)
    if scen == scenFolders[0]:
        dfnonirr = dfnonirr[['time', 'gdomww', 'gdomwn', 'gindww', 'gindwn', 'glivww']]
    else:
        # glivww is only read for the historical scenario; other scenarios get the -9999 placeholder
        dfnonirr = dfnonirr[['time', 'gdomww', 'gdomwn', 'gindww', 'gindwn']]
        dfnonirr['glivww'] = -9999
    dfnonirr['time'] = dfnonirr.apply(lambda row: time_reformatter(row), axis=1)
    dfnonirr.set_index('time', inplace=True)
    dfnonirr.columns = ['gdomww_{}_{}'.format(model, scen), 'gdomwn_{}_{}'.format(model, scen), 'gindww_{}_{}'.format(model, scen), 'gindwn_{}_{}'.format(model, scen), 'glivww_{}_{}'.format(model, scen)]
    
    # Get discharge (the region file holds many catchments; keep only the rows of this one)
    folderPath = os.path.join(aq4PATH, '{}/{}/{}/'.format(indFolders[2], model, scen))
    fp = os.path.join(folderPath, mfn)
    dfdisc = pd.read_csv(fp)
    dfdisc = dfdisc[dfdisc['pfaf_id'] == pfafid]
    dfdisc = dfdisc[['time', 'discharge_Mm3month', 'days']]
    dfdisc['time'] = dfdisc.apply(lambda row: time_reformatter(row), axis=1)
    dfdisc.set_index('time', inplace=True)
    dfdisc.columns = ['discharge_{}_{}'.format(model, scen), 'days']
    
    # Get runoff (same per-region file layout as discharge)
    folderPath = os.path.join(aq4PATH, '{}/{}/{}/'.format(indFolders[3], model, scen))
    fp = os.path.join(folderPath, mfn)
    dfrun = pd.read_csv(fp)
    dfrun = dfrun[dfrun['pfaf_id'] == pfafid]
    dfrun = dfrun[['time', 'runoff']]
    dfrun['time'] = dfrun.apply(lambda row: time_reformatter(row), axis=1)
    dfrun.set_index('time', inplace=True)
    dfrun.columns = ['runoff_{}_{}'.format(model, scen)]
    
    # merge 4 dfs, and update the days column in case it is empty due to non-existing runoff data
    # (the 'days' values read from the discharge file are overwritten here with values computed from 'time')
    dfm = pd.concat([dfirr, dfnonirr, dfdisc, dfrun], axis=1)
    dfm.reset_index(inplace=True)
    dfm['days'] = dfm.apply(lambda row: get_days(row), axis=1)
    dfm.set_index('time', inplace=True)
    
    # DATA ADJUSTMENTS
    # The same factor appears in the 'resample_5' folder names; every resampled variable is divided by its square
    resample_size = 5
    # 1. Adjust the irrigation demand data unit from km3 to million m3,
    #    and account for resampling scale change
    firrww = 'girrww_{}_{}'.format(model, scen)
    firrwn = 'girrwn_{}_{}'.format(model, scen)
    dfm[firrww] = (dfm[firrww] * 1000) / (resample_size * resample_size)
    dfm[firrwn] = (dfm[firrwn] * 1000) / (resample_size * resample_size)
    
    # 2. Adjust non-irrigation demand data unit from million-m3-per-day to million m3 by multiplying data by the number of days in the given month,
    #    and account for resampling scale change
    fdomww = 'gdomww_{}_{}'.format(model, scen)
    fdomwn = 'gdomwn_{}_{}'.format(model, scen)
    findww = 'gindww_{}_{}'.format(model, scen)
    findwn = 'gindwn_{}_{}'.format(model, scen)
    flivww = 'glivww_{}_{}'.format(model, scen)
    dfm[fdomww] = (dfm[fdomww] * dfm['days']) / (resample_size * resample_size)
    dfm[fdomwn] = (dfm[fdomwn] * dfm['days']) / (resample_size * resample_size)
    dfm[findww] = (dfm[findww] * dfm['days']) / (resample_size * resample_size)
    dfm[findwn] = (dfm[findwn] * dfm['days']) / (resample_size * resample_size)
    # livww gets the same adjustment, except that the -9999 placeholder is left as-is
    dfm[flivww] = dfm.apply(lambda row: livww_to_neg(row, field=flivww, resample_size=resample_size), axis=1)
    
    # 3. Replace negative discharge values with zeros
    fdisc = 'discharge_{}_{}'.format(model, scen)
    dfm[fdisc] = dfm.apply(lambda row: discharge_replace_neg(row, field=fdisc), axis=1)
    
    # 4. Adjust resampling factors for runoff data
    frun = 'runoff_{}_{}'.format(model, scen)
    dfm[frun] = dfm[frun] / (resample_size * resample_size)
    # 'days' was only a helper for the unit conversions above; it is not part of the output
    dfm.drop(['days'], axis=1, inplace=True)
    return dfm


def ten_year_moving_ols(df, xname, yname, window=10):
    '''
    PURPOSE: To smooth the data using a moving linear regression (10 points per window by default).
             Rows where yname is NaN are skipped. For every window of consecutive valid rows, a straight
             line is fitted and its value at the last point of the window is kept, clamped to the
             min/max of the y values inside that window.
    VARIABLES:
                1. df is the dataframe that contains your dependent and independent variables
                2. xname is the column name of your x or independent variable in the dataframe
                3. yname is the column name of your y or dependent variable in the dataframe
                4. window is the number of valid rows used for each regression, the default value is 10
    RETURNS: df with one extra column named 'm10ols-<yname>'. The first (window - 1) valid rows, and all
             rows where yname is NaN, hold NaN in that column. The input dataframe is not modified.
    '''
    valid = df.dropna(axis=0, how='any', subset=[yname])
    x_all = valid[xname].values
    y_all = valid[yname].values
    n_valid = len(y_all)
    # Start from all-NaN so that a series shorter than the window simply stays NaN
    # (a fixed-length NaN prefix would not match the number of rows in that case)
    smoothed = np.full(n_valid, np.nan)
    for end in range(window, n_valid + 1):
        x = x_all[end - window:end]
        y = y_all[end - window:end]
        coeffs = np.polyfit(x, y, 1)
        wols = np.polyval(coeffs, x[-1])
        # Keep the estimate inside the range observed within the window
        if wols < np.min(y):
            wols = np.min(y)
        elif wols > np.max(y):
            wols = np.max(y)
        smoothed[end - 1] = wols
    outYname = 'm10ols-{}'.format(yname)
    # Build the result as its own Series (indexed like the valid rows) instead of writing into a slice of df
    result = pd.Series(smoothed, index=valid.index, name=outYname)
    return pd.concat([df, result], axis=1)
  

def just_an_ols(df, xname, yname):
    '''
    PURPOSE: To estimate what the latest water demand level with precipitation effects removed by applying a linear regression over the entire time series.
             Rows where yname is NaN are left out of the fit. Fitted values are clamped to the min/max of the observed y values.
    VARIABLES:
                1. df is the dataframe that contains your dependent and independent variables
                2. xname is the column name of your x or independent variable in the dataframe
                3. yname is the column name of your y or dependent variable in the dataframe
    RETURNS: df with one extra column named 'regols-<yname>'; rows where yname is NaN hold NaN in that column.
             The input dataframe is not modified.
    '''
    valid = df.dropna(axis=0, how='any', subset=[yname])
    y = valid[yname].values
    x = valid[xname].values
    if len(y) == 0:
        # Nothing to fit (the column is empty or entirely NaN): the output column is all NaN
        y_hat_adj = np.array([], dtype=float)
    else:
        coeffs = np.polyfit(x, y, 1)
        y_hat_adj = np.clip(np.polyval(coeffs, x), np.min(y), np.max(y))
    outYname = 'regols-{}'.format(yname)
    # Build the result as its own Series (indexed like the valid rows) instead of writing into a slice of df
    result = pd.Series(y_hat_adj, index=valid.index, name=outYname, dtype=float)
    return pd.concat([df, result], axis=1)


def aq_m10_ols_pre_and_post_construct(df, xname, yname, month_gone=108):
    '''
    PURPOSE: To run the 10-year moving ols separately for each calendar month (1 through 12) and
             stitch the twelve month-specific results back into one column covering all rows.
    VARIABLES:
                1. df is the dataframe that contains your dependent and independent variables, plus a 'month' column
                2. xname is the column name of your x or independent variable in the dataframe
                3. yname is the column name of your y or dependent variable in the dataframe
                4. month_gone is the number of leading rows set to NaN because of the 10-year moving ols,
                   the default value is 108 (= 9 * 12)
    RETURNS: The same df object with the smoothed column added in place.
    '''
    per_month = []
    for month in range(1, 13):
        month_rows = df[df['month'] == month]
        fitted = ten_year_moving_ols(df=month_rows, xname=xname, yname=yname)
        # The smoothed series is the last column of the returned frame
        out_col = fitted.columns.values[-1]
        # Re-align to the full frame: rows belonging to other months become NaN
        aligned = pd.concat([df, fitted[[out_col]]], axis=1)
        per_month.append(aligned[out_col].values)
    # Each row has a value in at most one of the twelve arrays, so nansum simply merges them
    # (rows with no value at all come out as 0, not NaN)
    combined = np.nansum(per_month, axis=0)
    combined[:month_gone] = np.nan
    df[out_col] = combined
    return df


def aq_reg_ols_ws_estimate(df, xname, yname):
    '''
    PURPOSE: To run the simple whole-series ols separately for each calendar month (1 through 12) and
             stitch the twelve month-specific results back into one column covering all rows.
    VARIABLES:
                1. df is the dataframe that contains your dependent and independent variables, plus a 'month' column
                2. xname is the column name of your x or independent variable in the dataframe
                3. yname is the column name of your y or dependent variable in the dataframe
    RETURNS: The same df object with the regression column added in place.
    '''
    per_month = []
    for month in range(1, 13):
        month_rows = df[df['month'] == month]
        fitted = just_an_ols(df=month_rows, xname=xname, yname=yname)
        # The regression output is the last column of the returned frame
        out_col = fitted.columns.values[-1]
        # Re-align to the full frame: rows belonging to other months become NaN
        aligned = pd.concat([df, fitted[[out_col]]], axis=1)
        per_month.append(aligned[out_col].values)
    # Each row has a value in at most one of the twelve arrays, so nansum simply merges them
    # (rows with no value at all come out as 0, not NaN)
    df[out_col] = np.nansum(per_month, axis=0)
    return df


def water_stress_calcs(df, model, scen):
    '''
    PURPOSE: To calculate water stress for one model/scenario pair, from the raw supply and demand
             columns as well as from their 10-year moving ols versions.
    VARIABLES:
                1. df is a dataframe with 'time', 'month', 'year', 'ba_<model>_<scen>' and 'ww_<model>_<scen>' columns
                2. model is the climate model name used in the column names
                3. scen is the scenario name used in the column names
    RETURNS: A dataframe with six columns: ws, m10ols-ba, m10ols-ww, m10ols-ws, regols-ws, regols-m10ols-ws
             (each name ending in '_<model>_<scen>').
    '''
    tag = '{}_{}'.format(model, scen)
    ba = 'ba_' + tag
    ww = 'ww_' + tag
    ws = 'ws_' + tag
    bao = 'm10ols-' + ba
    wwo = 'm10ols-' + ww
    wso = 'm10ols-' + ws
    wsr = 'regols-' + ws
    wsor = 'regols-m10ols-' + ws
    df = df[[ba, ww, 'time', 'month', 'year']]

    # 'ws': water stress from the raw demand and supply
    df[ws] = df.apply(ws_basic, axis=1, ba=ba, ww=ww)

    # x variable for the regressions: the ordinal day number of each date, scaled down
    ordinals = df['time'].map(pd.Timestamp.toordinal)
    df['dateord'] = ordinals / 1000000.0

    # 'wwo' then 'bao': 10-year moving ols of demand and of supply
    for raw_col in (ww, ba):
        df = aq_m10_ols_pre_and_post_construct(df=df, xname='dateord', yname=raw_col)

    # 'wso': water stress from ols-demand and ols-supply
    df[wso] = df.apply(ws_basic, axis=1, ba=bao, ww=wwo)

    # Two water stress options, both a linear regression over the whole series:
    #   option 1 ('wsr')  regresses 'ws', which comes from raw ww and ba
    #   option 2 ('wsor') regresses 'wso', which comes from m10ols-ww and m10ols-ba
    for stress_col in (ws, wso):
        df = aq_reg_ols_ws_estimate(df=df, xname='dateord', yname=stress_col)
    return df[[ws, bao, wwo, wso, wsr, wsor]]


def create_catchment_monthly_water_stress(dfp):
    '''
    PURPOSE: To add the water stress columns of every model/scenario pair to a catchment master dataframe.
    VARIABLES:
                1. dfp is the catchment master dataframe indexed by 'time'. NOTE: it is modified in place
                   (its index is reset and 'month' and 'year' columns are added).
    RETURNS: A new dataframe holding the columns of dfp followed by the water stress columns of each pair.
    '''
    dfp.reset_index(inplace=True)
    dfp['month'] = dfp.apply(month_getter, axis=1)
    dfp['year'] = dfp.apply(year_getter, axis=1)
    dfs = [dfp]
    for m in gcmFolders:
        # The baseline model (first in the list) only has a historical run
        scens = scenFolders[:1] if m == gcmFolders[0] else scenFolders
        for s in scens:
            print(m,s)
            dfs.append(water_stress_calcs(df=dfp, model=m, scen=s))
    dfm = pd.concat(dfs, axis=1)
    # Reassign nan to the rows where a column should not have data; the nansum operations used
    # upstream treat nan as zeros. Row positions are counted from the start of the time series:
    #   - baseline historical columns have no data from row 720 onward
    #   - other historical columns have no data from row 660 onward
    #   - scenario columns have no data before row 660
    # Positional .iloc writes are used because assigning through dfm[c][...] is chained
    # assignment, which is not guaranteed to modify dfm itself.
    for pos, c in enumerate(dfm.columns):
        if c in ('time', 'month', 'year'):
            continue
        if 'gswp3-w5e5_historical' in c:
            dfm.iloc[720:, pos] = np.nan
        elif 'historical' in c:
            dfm.iloc[660:, pos] = np.nan
        else:
            dfm.iloc[:660, pos] = np.nan
    return dfm


def create_catchment_master(pfafid, mregion):
    '''
    PURPOSE:
             1. To pull all data for the given catchment by scenario by model
             2. To assemble scenario-model specific data into one giant master file
             3. Calculate total water supply (ba), demand (ww), and consumption (wn)
    RETURNS: A dataframe indexed by 'time' with all raw columns plus 'ba_', 'ww_' and 'wn_' totals
             for every model/scenario pair.
    '''
    # Every (model, scenario) pair to process: the baseline model (first in the list) only has
    # a historical run, every other model has all scenarios
    runs = []
    for model in gcmFolders:
        scenarios = [scenFolders[0]] if model == gcmFolders[0] else scenFolders
        for scen in scenarios:
            runs.append((model, scen))

    frames = []
    for model, scen in runs:
        print(model, scen)
        frames.append(catchment_data_by_model_by_scen(pfafid=pfafid, mregion=mregion, model=model, scen=scen))
    dfm = pd.concat(frames, axis=1)
    dfm.reset_index(inplace=True)

    # fill livww blanks from 2015 through 2019 for baseline column:
    # values up to Dec 2014 are kept, the 12 months of 2014 are repeated for 5 years,
    # and the remaining 972 rows are set to nan
    base_liv = 'glivww_gswp3-w5e5_historical'
    up_to_2014 = dfm['time'] <= datetime.date(2014, 12, 1)
    liv2014 = dfm[(dfm['time'] >= datetime.date(2014, 1, 1)) & up_to_2014][base_liv]
    kept = list(dfm[up_to_2014][base_liv].values)
    repeated = list(np.resize(liv2014, 12 * 5))
    blank_tail = list(np.full((972,), np.nan))
    dfm[base_liv] = kept + repeated + blank_tail

    # fill livww blanks from 2015 through 2100 for all gcm-ssp liv columns; gcm-historical columns need no fixing:
    # 660 leading nan rows, then the 12 months of 2014 repeated for every year from 2015 through 2100
    ssp_liv_cols = [c for c in dfm.columns.values if 'liv' in c and 'historical' not in c]
    blank_head = list(np.full((660,), np.nan))
    ssp_liv_values = blank_head + list(np.resize(liv2014, 12 * (2100 - 2015 + 1)))
    for col in ssp_liv_cols:
        dfm[col] = ssp_liv_values

    # calculate total supply, demand, consumption (nansum treats nan as zero)
    for model, scen in runs:
        ms = '{}_{}'.format(model, scen)
        dfm['ba_' + ms] = np.nansum([dfm['discharge_' + ms], dfm['runoff_' + ms]], axis=0)
        dfm['ww_' + ms] = np.nansum([dfm['girrww_' + ms], dfm['gdomww_' + ms], dfm['gindww_' + ms], dfm['glivww_' + ms]], axis=0)
        # the consumption total uses glivww for the livestock term
        dfm['wn_' + ms] = np.nansum([dfm['girrwn_' + ms], dfm['gdomwn_' + ms], dfm['gindwn_' + ms], dfm['glivww_' + ms]], axis=0)
    dfm.set_index('time', inplace=True)
    return dfm
  
    
def apply_additive_delta(suffixes, df_in):
    '''
    PURPOSE: To add delta-adjusted scenario columns: for every variable, non-baseline model and
             future scenario,
                 adj-<var>_<model>_<scen> = <var>_gswp3-w5e5_historical + (<var>_<model>_<scen> - <var>_<model>_historical)
             The arithmetic is done row by row, so a row is nan wherever any of the three inputs is nan.
    VARIABLES:
                1. suffixes is an iterable of variable names (the part of the column name before the model)
                2. df_in is the dataframe holding the columns listed above; it is not modified
    RETURNS: A copy of df_in with the 'adj-' columns added.
    '''
    df_copy = df_in.copy()
    for x in suffixes:
        base_his_col = '{}_{}_{}'.format(x, 'gswp3-w5e5', 'historical')
        for y in gcmFolders[1:]:
            futr_his_col = '{}_{}_{}'.format(x, y, 'historical')
            for z in scenFolders[1:]:
                futr_ssp_col = '{}_{}_{}'.format(x, y, z)
                adj_ssp_col = "adj-" + futr_ssp_col
                # Parentheses (not a list literal) so that the delta stays a Series and the sum is element-wise
                df_copy[adj_ssp_col] = df_copy[base_his_col] + (df_copy[futr_ssp_col] - df_copy[futr_his_col])
    return df_copy

### Run script below to create master files for all catchments
Notes & recommendations:
1. Non-duplicated catchments and duplicated ones are processed separately (in sequence), and can be run independently. All outputs are currently being saved to the catchment_master/data folder (which is of course something you can change if you want to). A duplicate catchment is a catchment that falls in 2+ PCR-GLOBWB regions.
2. You can divide the non-duplicated catchment list into smaller chunks, and run them with multiple 4-core clusters in parallel. One way to do that is to subset the "df_nd" dataframe.
3. I wouldn't recommend dividing up duplicated catchments: doing so risks placing the same catchment in different chunks, which would cause the file merging to fail.

In [0]:
# Confirm/update output data folder
outPATH = os.path.join(aq4PATH, 'catchment_master/data')


def _mask_unavailable_periods(df, cols):
    '''
    PURPOSE: To reassign nan values to the rows where a column should not have data; the nansum/nanmax
             operations treat nan as zeros. Row positions are counted from the start of the time series:
                 - baseline historical columns have no data from row 720 onward
                 - other historical columns have no data from row 660 onward
                 - scenario columns have no data before row 660
             df is modified in place. Positional .iloc writes are used because assigning through
             df[c][...] is chained assignment, which is not guaranteed to modify df itself.
    '''
    for c in cols:
        pos = df.columns.get_loc(c)
        if 'gswp3-w5e5_historical' in c:
            df.iloc[720:, pos] = np.nan
        elif 'historical' in c:
            df.iloc[660:, pos] = np.nan
        else:
            df.iloc[:660, pos] = np.nan


# First, creating master files for non-duplicated catchments 
print('Creating master files for non-duplicated catchments...\n\n')
df_nd = pd.read_csv(os.path.join(utilsPATH, 'mregion_catchment_master_list_non-duplicated_only.csv'))
pids, mrs = df_nd['pfaf_id'].values, df_nd['mRegion'].values
for i, (p, m) in enumerate(zip(pids, mrs)):
    print('**************************************\npfaf_id-{}: running...\n{} done, {} remaining\n'.format(p, i, len(pids) - i))
    dfpm = create_catchment_master(pfafid=p, mregion=m)
    _mask_unavailable_periods(dfpm, dfpm.columns.values)
    print('\nNow calculating water stress...')
    dfpm = create_catchment_monthly_water_stress(dfp=dfpm)
    dfpm.to_csv(os.path.join(outPATH, '{}.csv'.format(p)), index=None)
    print('\npfaf_id-{}: file saved\n'.format(p))

# Second, creating master files for duplicated catchments
print('Creating master files for duplicated catchments...\n\n')
df_d = pd.read_csv(os.path.join(utilsPATH, 'mregion_catchment_master_list_duplicated_only.csv'))
upids = df_d['pfaf_id'].unique()
# NOTE: only the first 230 duplicated catchments are processed here; adjust the slice to process others
dup_pids = upids[0:230]
for i, p in enumerate(tqdm(dup_pids)):
    # Progress is counted against the duplicated catchments being processed, not the non-duplicated list
    print('**************************************\npfaf_id-{}: running...\n{} done, {} remaining\n'.format(p, i, len(dup_pids) - i))
    mrs = df_d[df_d['pfaf_id'] == p]['mRegion'].values
    dfs = []
    for m in mrs:
        print('\nWorking on region {}'.format(m))
        dfs.append(create_catchment_master(pfafid=p, mregion=m))
    # Merge the per-region frames into the first one
    df0 = dfs[0]
    cols = df0.columns.values
    # Demand columns: the per-region values are merged by taking the maximum
    max_cols = [x for x in cols if any(key in x for key in ('irr', 'dom', 'ind', 'liv', 'ww', 'wn'))]
    # Supply columns: the per-region values are merged by summing
    sum_cols = [x for x in cols if any(key in x for key in ('runoff', 'discharge', 'ba'))]
    for dfi in dfs[1:]:
        for c in sum_cols:
            df0[c] = np.nansum([df0[c], dfi[c]], axis=0)
        for c in max_cols:
            df0[c] = np.nanmax([df0[c], dfi[c]], axis=0)
    # Mask once after merging: the masked rows end up nan no matter what the merge produced there,
    # and this also covers a catchment that turns out to have a single region
    _mask_unavailable_periods(df0, cols)
    print('\nNow calculating water stress...')
    df0 = create_catchment_monthly_water_stress(dfp=df0)
    df0.to_csv(os.path.join(outPATH, '{}.csv'.format(p)), index=None)
    print('\npfaf_id-{}: file saved\n'.format(p))