# Step 1:  Create supply (discharge)
This script summarizes the PCR-GLOBWB NetCDF outputs for supply by the Hydrobasin 6 catchments. We use a lookup table created by University of Utrecht to select the maximum discharge in each catchment by the inflow and outflow points


## Discharge (renewable water flowing into sub-basin with upstream consumption removed)
discharge = All inflows - false outflows


## After this script:
This script already converts discharge from m3/s to million m3/month (the `discharge_Mm3month` column); the results still need to be added to runoff

# Setup

## Libraries

In [0]:
!pip install tqdm
!pip install rtree
!pip3 install numpy
!pip3 install pandas
!pip3 install scipy
!pip3 install geopandas
!pip3 install rasterio
!pip3 install rasterstats
!pip3 install rioxarray
!pip3 install netcdf4
!pip install psutil
!pip install dask
!pip3 install xarray
import psutil
import rioxarray
import rasterio
import geopandas as gpd
import rasterstats as rstats
import netCDF4, os, subprocess, re, time, datetime, json
import numpy as np, pandas as pd
import netCDF4 as nc
from rasterio import Affine
from rasterio.enums import Resampling
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
import dask
import xarray
import gc
from joblib import Parallel, delayed


In [0]:
# !pip install tqdm
# !pip install rtree
# !pip3 install numpy
# !pip3 install pandas
# !pip3 install scipy
# !pip3 install geopandas
# !pip3 install xarray
# !pip3 install rasterio
# !pip3 install rasterstats
# !pip3 install rioxarray
# !pip3 install netcdf4
# !pip install psutil
# !pip install dask
# import psutil
# import xarray
# import rioxarray
# import rasterio
# import geopandas as gpd
# import rasterstats as rstats
# import netCDF4, os, subprocess, re, time, datetime, json
# import numpy as np, pandas as pd
# import netCDF4 as nc
# from rasterio import Affine
# from rasterio.enums import Resampling
# import matplotlib.pyplot as plt
# import math
# from tqdm import tqdm
# import dask
# import gc
# from joblib import Parallel, delayed


## Functions & Data Locations

In [0]:
def memory_usage():
    '''
    PURPOSE: Print the memory currently used by this Python process
    INPUTS:
        None
    OUTPUTS:
        None returned. The value is printed.
    '''
    # First field of psutil's memory_info() tuple for the current process
    # (presumably the resident set size in bytes -- confirm against psutil docs)
    current_process = psutil.Process(os.getpid())
    used = current_process.memory_info()[0]
    # Divide by 2 ** 20 to scale the raw value down before printing
    print('- - - Current memory usage is:', used / float(2 ** 20))
    

def find_supply_paths(gcm, scen, m):
    '''
    PURPOSE: Build the path to the monthly-average discharge NetCDF for one model run
    INPUTS:
        gcm: global climate model (folder name)
        scen: scenario (folder name)
        m: region of the world (folder name)
    OUTPUTS:
        dPATH: path to the discharge NetCDF (discharge_monthAvg_output.nc)
    '''
    path_template = '/dbfs/mnt/pgb-data-lake/pcrglobwb_output1/pcrglobwb_aqueduct_2021/version_2021-09-16/{0}/{1}/{2}/{3}/netcdf/discharge_monthAvg_output.nc'

    # The path contains the first simulated year: 1960 for any scenario whose
    # name contains "historical", 2015 for everything else
    if 'historical' in scen:
        start_folder = 'begin_from_1960'
    else:
        start_folder = 'begin_from_2015'

    # gswp3-w5e5 output always sits in the "historical-reference" folder,
    # whatever scenario was requested; other models use the scenario name
    if gcm == 'gswp3-w5e5':
        scenario_folder = 'historical-reference'
    else:
        scenario_folder = scen

    dPATH = path_template.format(gcm, scenario_folder, start_folder, m)
    return dPATH

def read_NETCDF(ncPATH):
    '''
    PURPOSE: Read in NetCDF and return an Xarray dataset with standardized
             lat/lon coordinate names, spatial dimensions defined and the
             CRS written as EPSG:4326
    INPUTS:
        ncPATH: path to netCDF in Data Lake
    OUTPUTS:
        ds: Xarray dataset
    RAISES:
        IndexError if no coordinate name contains "lat" or "lon"
    '''
    # Read in array
    ds = xarray.open_dataset(ncPATH)
    # Find coordinate names: the first coordinate whose name contains
    # "lat" / "lon" is taken as the latitude / longitude coordinate
    coord_names = list(ds.coords.keys())
    lat_variable = [x for x in coord_names if "lat" in x][0]
    lon_variable = [x for x in coord_names if "lon" in x][0]
    # Standardize lat and lon names
    ds = ds.rename({lon_variable: 'lon', lat_variable: 'lat'})
    # Set spatial dimensions and projection
    ds = ds.rio.set_spatial_dims('lon', 'lat')
    ds.rio.write_crs("epsg:4326", inplace=True)
    # NOTE: the band-name list that used to be built here was never used or
    # returned, and removing 'spatial_ref' from it raised ValueError whenever
    # that variable was absent, so it has been dropped.
    return ds

  
def run_zonal_stats_for_supply(gcm, scen, m):
    '''
    PURPOSE: Find the discharge at the catchment level
             discharge = sum of discharge at inflow points
                         - sum of discharge at "false" outflow points
    INPUTS:
        gcm: global climate model
        scen: scenario
        m: region

    OUTPUTS:
        None returned. CSV containing zonal statistics will save to
        newROOT(gcm, scen) + newNAME(m), indexed by time and pfaf_id.

    NOTES:
        Uses the module-level lookups df_i (inflow points), df_o (outflow
        points) and df_id (HYBAS_ID to pfaf_id), plus the newROOT / newNAME
        path formatters.
    '''

    # Find paths for supply data
    discharge_path = find_supply_paths(gcm, scen, m)

    print("- - - - Step 1: Read in supply data (discharge, inflows, outflows)")
    # Read in NetCDFs
    ds_dis = read_NETCDF(discharge_path)

    print("- - - - Step 2: Turn Xarray into Pandas")
    df_dis = ds_dis.to_dataframe()
    df_dis.reset_index(inplace = True)
    # The dataset is not needed once it has been flattened to a table
    del ds_dis

    print("- - - - Step 3: Round coordinates to match lookup")
    df_dis['lat'] = round(df_dis['lat'], 3)
    df_dis['lon'] = round(df_dis['lon'], 3)

    print("- - - - Step 4: Merge discharge with inflow IDs")
    # Cells that are not inflow points get a missing geo_id and are dropped by
    # the groupby, so only inflow cells contribute to the per-catchment sum
    df_inflows = pd.merge(df_dis, df_i, how = 'left', left_on = ['lat', 'lon'], right_on = ['lat', 'lon'])
    df_catchment_ins = df_inflows.groupby(['time', 'geo_id'])['discharge'].sum().to_frame(name = 'discharge_inflow')
    del df_inflows

    print("- - - - Step 5: Merge discharge with outflow IDs")
    # Only outflow points flagged true == 0 ("false" outflows) are subtracted
    df_o_false = df_o[df_o['true'] == 0]
    df_outfows = pd.merge(df_dis, df_o_false, how = 'left', left_on = ['lat', 'lon'], right_on = ['lat', 'lon'])
    df_catchment_outs = df_outfows.groupby(['time', 'geo_id'])['discharge'].sum().to_frame(name = 'discharge_outflow')
    del df_o_false
    del df_outfows
    del df_dis

    print("- - - - Step 6: Merge inflows and outflows together")
    # Outer join keeps catchments that only have inflows or only have outflows;
    # the missing side is treated as zero flow
    df_flows = pd.merge(df_catchment_ins, df_catchment_outs, how = 'outer', left_index = True, right_index = True)
    df_flows.replace(np.nan, 0, inplace = True)
    del df_catchment_ins
    del df_catchment_outs

    print("- - - - Step 7: Calculate total discharge")
    df_flows['discharge_m3s'] = df_flows['discharge_inflow'] - df_flows['discharge_outflow'] 
    df_flows.reset_index(inplace = True)

    print("- - - - Step 8: Add PFAF IDs")
    df_flows = pd.merge(df_flows, df_id, how = 'left', left_on = 'geo_id', right_on = 'HYBAS_ID')

    print("- - - - Step 9: Filter out non-regional ID")
    # Keep only the pfaf_ids listed in this region's lookup table
    geogidlookupPATH = '/dbfs/mnt/pgb-data-lake/aqueduct_dev/aux-boundaries/m_region-pfaf6-lookups/{0}_pfaf6_lookup.csv'.format
    df_pfs = pd.read_csv(geogidlookupPATH(m))
    pfs = df_pfs['pfaf_id'].tolist()
    df_flows = df_flows[df_flows['pfaf_id'].isin(pfs)]

    print("- - - - Step 10: Clean data")
    # Filter data
    df_discharge = df_flows.filter(['time', 'pfaf_id', 'discharge_m3s'])
    # Find number of days per month
    df_discharge['days'] = df_discharge['time'].dt.daysinmonth
    # Convert m3/sec to million m3 per month
    # m3/sec * 86400sec/1dy * xdays/1month * 1Million/1e6
    df_discharge['discharge_Mm3month'] = df_discharge['discharge_m3s'] * 86400 * df_discharge['days'] / 1e6
    df_discharge.set_index(['time', 'pfaf_id'], inplace = True)
    del df_flows
    outPATH = newROOT(gcm, scen) + newNAME(m)
    # to_csv does not create missing folders, so make sure the
    # gcm/scenario output folder exists before writing
    os.makedirs(os.path.dirname(outPATH), exist_ok = True)
    df_discharge.to_csv(outPATH)
    del df_discharge
    gc.collect()
    
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# ! - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - - UNIVERSAL DATA - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - - !
# 1. Regional ("M") folders: M0000001 through M0000053
mFolders = ['M{:07d}'.format(region_number) for region_number in range(1, 54)]

# Global climate models to process
# (alternative for the historical run: gcmFolders = ['gswp3-w5e5'])
gcmFolders = [
    'gfdl-esm4',
    'ipsl-cm6a-lr',
    'mpi-esm1-2-hr',
    'mri-esm2-0',
    'ukesm1-0-ll',
]

# Future scenarios to process
# (alternative for the historical run: scenFolders = ['historical'])
scenFolders = [
    'ssp126',
    'ssp370',
    'ssp585',
]

# 2. HyBAS6 ID to PFAF ID lookup (df_id)
# Matches HYBAS_ID to pfaf_id (the flow lookups are keyed by HYBAS ID)
hybasidlookupPATH =  '/dbfs/mnt/pgb-data-lake/aqueduct_dev/aux-river_networks/flows/hybas_to_pfaf_lookup.csv'
df_id = pd.read_csv(hybasidlookupPATH, header = 0, index_col = 0)
# Add a lowercase integer pfaf_id column, then keep only the columns used later
df_id = df_id.assign(pfaf_id = df_id['PFAF_ID'].astype(int))
df_id = df_id.filter(items = ['pfaf_id', 'area_m2_30spfaf06'])
# Drop the pfaf that shares two HYBAS IDs (watershed bridges Asia & North
# America. No PCR GLOBWB data, so it does not need to be kept)
df_id = df_id.loc[df_id['pfaf_id'] != 353020]


# 3. Inflow and outflow lookup tables (df_i, df_o)
def _read_flow_points(points_path):
    '''Read a ";"-delimited points table (header on the second line) and strip whitespace from its column names.'''
    points = pd.read_csv(points_path, header = 1, delimiter = ';')
    points.columns = [column_name.strip() for column_name in points.columns]
    return points


iPATH =  '/dbfs/mnt/pgb-data-lake/aqueduct_dev/aux-river_networks/flows/hybas_topology_05min/inflow_points.txt'
oPATH =  '/dbfs/mnt/pgb-data-lake/aqueduct_dev/aux-river_networks/flows/hybas_topology_05min/outflow_points.txt'
df_i = _read_flow_points(iPATH)
df_o = _read_flow_points(oPATH)


# 4. Output path formatters {root: 0 = GCM; 1 = SCEN, name: 0 = m region}
newROOT = '/dbfs/mnt/pgb-data-lake/aqueduct_dev/pcrglobwb_aqueduct_2021/version_2021-09-16/run_202205/zonal_statistics/pfaf6/discharge/{0}/{1}/'.format
newNAME = '{0}.csv'.format

In [0]:
# Full run: every scenario x climate model x region

for scen in scenFolders:
    print("- - ", scen)
    for gcm in gcmFolders:
        # gswp3-w5e5 is only processed for the historical scenario
        if gcm == 'gswp3-w5e5' and scen != 'historical':
            continue
        print("- ", gcm)
        for m in tqdm(mFolders):
            # Skip a region only when ITS output CSV already exists.
            # Checking just the gcm/scenario folder would skip every
            # remaining region as soon as the folder had been created.
            test_file_exist = newROOT(gcm, scen) + newNAME(m)
            if os.path.exists(test_file_exist):
                continue
            print("- - -", m)
            run_zonal_stats_for_supply(gcm = gcm, scen = scen, m = m)