# Step 1: Create irrigation demand
This script summarizes the PCR-GLOBWB NetCDF outputs for irrigation demand by Hydrobasin level 6 catchment. For each GCM/scenario we convert the fluxes to volumes, resample the grid 5x5, and clip the time series to each catchment polygon. Results are saved as one file per region-polygon pair.


## Gross demand
girrww = estimate_irrigation_demand (km3/month)
 
 ## Gross consumption
 girrwn = (girrww * (evaporation_from_irrigation * area)) / (girrww + (precipitation_at_irrigation * area)), with girrwn set to 0 wherever girrww is not greater than 0
 
 ## Next steps
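 - Convert km3/month into million m3/month
 - Adjust the catchment sums to account for the 5x5 resample (each source cell is repeated as 25 identical cells, so sums of the resampled grid are inflated)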

# Setup

## Libraries

In [0]:
!pip install tqdm
!pip install rtree
!pip3 install numpy
!pip3 install pandas
!pip3 install scipy
!pip3 install geopandas
!pip3 install xarray
!pip3 install rasterio
!pip3 install rasterstats
!pip3 install rioxarray
!pip3 install netcdf4
!pip install psutil
!pip install dask
import psutil
import xarray
import rioxarray
import rasterio
import geopandas as gpd
import rasterstats as rstats
import netCDF4, os, subprocess, re, time, datetime, json
import numpy as np, pandas as pd
import netCDF4 as nc
from rasterio import Affine
from rasterio.enums import Resampling
import matplotlib.pyplot as plt
import math
from tqdm import tqdm
import dask
import gc
from joblib import Parallel, delayed


## Functions & Data Locations

In [0]:
def memory_usage():
    """Print the current process's memory usage, expressed in units of 2**20 bytes."""
    bytes_per_unit = float(2 ** 20)
    # First field of psutil's memory_info() for this process
    # (presumably the resident set size - confirm against the psutil docs)
    used_bytes = psutil.Process(os.getpid()).memory_info()[0]
    print('- - - Current memory usage is:', used_bytes / bytes_per_unit)
    

def find_irr_paths(gcm, scen):
    '''
    PURPOSE: Build the path to the monthly irrigation demand NetCDF for a GCM / scenario pair
    INPUTS:
        gcm: global climate model
        scen: scenario name
    OUTPUTS:
        Full path (string) to the compressed irrigation demand file
    '''
    # The file name embeds the period covered: historical runs vs. future runs
    period = '1960-2014' if 'historical' in scen else '2015-2100'
    scen_label = scen
    # The gswp3-w5e5 model is stored under its own scenario label and period
    if gcm == 'gswp3-w5e5':
        period, scen_label = '1960-2019', 'historical-reference'
    folder = '/dbfs/mnt/pgb-data-lake/pcrglobwb_input/version_2021-09-16/edwin_irrigation_demand/'
    file_template = 'estimateIrrigationDemandVolume_monthTot_output_{0}_km3_per_month_{1}_{2}_compressed.nc'
    return folder + file_template.format(period, gcm, scen_label)

def find_et_paths(gcm, scen, m):
    '''
    PURPOSE: Build the paths to the regional evaporation and precipitation NetCDFs
    INPUTS:
        gcm: global climate model
        scen: scenario name
        m: regional (M) folder name
    OUTPUTS:
        Tuple of (evaporation_from_irrigation path, precipitation_at_irrigation path)
    '''
    # Folder name encodes the first simulated year
    if 'historical' in scen:
        start_tag = 'begin_from_1960'
    else:
        start_tag = 'begin_from_2015'
    # The gswp3-w5e5 model is stored under its own scenario label
    if gcm == 'gswp3-w5e5':
        scen_label = 'historical-reference'
    else:
        scen_label = scen
    netcdf_dir = '/dbfs/mnt/pgb-data-lake/pcrglobwb_output1/pcrglobwb_aqueduct_2021/version_2021-09-16/{0}/{1}/{2}/{3}/netcdf/'.format(gcm, scen_label, start_tag, m)
    evap_file = netcdf_dir + 'evaporation_from_irrigation_monthTot_output.nc'
    prec_file = netcdf_dir + 'precipitation_at_irrigation_monthTot_output.nc'
    return evap_file, prec_file

def read_NETCDF(ncPATH):
    '''
    PURPOSE: Read in a NetCDF and return an Xarray Dataset with standardized 'lon'/'lat'
             coordinate names, spatial dimensions set, and an EPSG:4326 CRS written
    INPUTS:
        ncPATH: path to netCDF in Data Lake
    OUTPUTS:
        ds: Xarray Dataset
    RAISES:
        IndexError if no coordinate name contains "lat" or "lon"
    '''
    # Read in array
    ds = xarray.open_dataset(ncPATH)
    # Find coordinate names (the first coordinate whose name contains "lat" / "lon" is used)
    coord_names = list(ds.coords.keys())
    lat_variable = [x for x in coord_names if "lat" in x][0]
    lon_variable = [x for x in coord_names if "lon" in x][0]
    # Standardize lat and lon names
    ds = ds.rename({lon_variable: 'lon', lat_variable: 'lat'})
    # Set spatial dimensions and projection
    ds = ds.rio.set_spatial_dims('lon', 'lat')
    # NOTE(review): bare property access kept from the original call sequence;
    # it does not appear to be required before write_crs - confirm before removing
    ds.rio.crs
    ds.rio.write_crs("epsg:4326", inplace=True)
    return ds

def fillnas(da):
    """Return `da` with null cells replaced by 0, with spatial dims and CRS re-applied."""
    # xarray.where is used instead of the fillna methods, which did not behave well here
    zero_filled = xarray.where(da.isnull(), 0, da)
    # Re-declare the spatial dimensions and projection on the new object
    zero_filled = zero_filled.rio.set_spatial_dims('lon', 'lat')
    zero_filled.rio.crs
    zero_filled.rio.write_crs("epsg:4326", inplace=True)
    return zero_filled

def resample_xarray(ds, downscale_factor):
    '''
    PURPOSE: Resample an Xarray object onto a finer grid so zonal statistics can be more accurate
    INPUTS:
        ds: Xarray to resample
        downscale_factor: 1-dimensional factor by which the number of rows and columns is multiplied.
        Ex: 10 would turn each pixel into 100 smaller, identical pixels (10X10)
    OUTPUTS:
        Resampled Xarray, with its x/y dimensions renamed to lon/lat
    '''
    # Target grid shape is (rows, columns)
    target_shape = (
        ds.rio.height * downscale_factor,
        ds.rio.width * downscale_factor,
    )
    # Same CRS, finer grid; nearest-neighbour resampling
    finer = ds.rio.reproject(
        ds.rio.crs,
        shape=target_shape,
        resampling=Resampling.nearest,
    )
    # reproject names the spatial dimensions x/y; restore the lon/lat convention
    return finer.rename({'x': 'lon', 'y': 'lat'})


def segment_id_list(lst, n):
    """Lazily split lst into consecutive slices holding at most n items each."""
    chunk_starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in chunk_starts)
        
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# ! - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - - UNIVERSAL DATA - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - -  - - !
# 1. Regional ("M") folder names: M0000001 through M0000053
mFolders = ['M' + str(region_no).zfill(7) for region_no in range(1, 54)]

# Global climate models to process
# (alternative single-model run: gcmFolders = ['gswp3-w5e5'])
gcmFolders = ['gfdl-esm4', 'ipsl-cm6a-lr', 'mpi-esm1-2-hr', 'mri-esm2-0', 'ukesm1-0-ll']

# Scenarios
scenFolders = ['historical', 'ssp126', 'ssp370', 'ssp585']

# 2. Hydrobasin 6 polygons, indexed by catchment ID
geog_id = 'pfaf_id'
shapePATH = '/dbfs/mnt/pgb-data-lake/aqueduct_dev/aux-boundaries/hydro_basin_lv6/aq3_pfaf_basins.shp'
hy6 = gpd.read_file(shapePATH, crs="epsg:4326")
hy6.columns = hy6.columns.str.lower()
project_crs = hy6.crs  # WGS84 aka epsg 4326
hy6 = hy6.set_index(geog_id)
# Region-to-catchment lookup table; call with the M region name to get the CSV path
geogidlookupPATH = '/dbfs/mnt/pgb-data-lake/aqueduct_dev/aux-boundaries/m_region-pfaf6-lookups/{0}_pfaf6_lookup.csv'.format

# 3. Grid cell area
areaPATH = '/dbfs/mnt/pgb-data-lake/aqueduct_dev/aux-boundaries/global_area_5arcmin.nc'
area_band = 'global_cellsize_m2_05min.tif'
ds_area = read_NETCDF(areaPATH)

# 4. Output folder template (0 = resample; 1 = GCM; 2 = SCEN)
newROOT = '/dbfs/mnt/pgb-data-lake/aqueduct_dev/pcrglobwb_aqueduct_2021/version_2021-09-16/run_202205/zonal_statistics/pfaf6/demand_irr_resample_{0}/{1}/{2}/'.format
# 5. Output file name template (0 = M region; 1 = PFAF ID)
newNAME = '{0}_{1}.csv'.format

In [0]:


def run_irrigation(gcm, scen, m, resample_size, list_pfs, n_workers=40):
    '''
    PURPOSE: For one GCM / scenario / M region, compute gross irrigation demand (girrww) and
             gross consumption (girrwn), resample the grid, and write one CSV of spatially
             summed values for every catchment in list_pfs
    INPUTS:
        gcm: global climate model
        scen: scenario
        m: M region folder name
        resample_size: 1-dimensional resample factor passed to resample_xarray
        list_pfs: list of pfaf_id values (index labels of hy6) to process
        n_workers: number of parallel joblib jobs; also the number of catchments per batch.
                   Must be a positive integer (default 40)
    OUTPUTS:
        None. One CSV per catchment is written to
        newROOT(resample_size, gcm, scen) + newNAME(m, pfaf_id)
    RAISES:
        ValueError if n_workers is less than 1
    '''
    if n_workers < 1:
        raise ValueError('n_workers must be a positive integer')

    # Worker: clip the resampled grid by one catchment and write its spatial sums
    def clip_by_pfaf_id(p):
        # Select 1 polygon (label slice keeps a one-row table rather than a Series)
        my_geom = hy6.loc[p:p, :]
        # Clip NetCDF by polygon
        stime = time.time()
        clipped = ds_rs.rio.clip(my_geom.geometry, project_crs, drop=False)
        print('- - - - Clipped NetCDF in {}'.format(time.time()-stime))
        memory_usage()
        # Sum contents across lat and long
        df_t = clipped.sum(dim = ['lon', 'lat']).to_dataframe()
        del clipped
        # Add geometry ID
        df_t[geog_id] = p
        outPATH = newROOT(resample_size, gcm, scen) + newNAME(m, p)
        df_t.to_csv(outPATH)

    mstime = time.time()
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    # Step 1 - Read in global 5 arcmin dataset for irrigation demand.
    print("Step 1: Reading in global data")
    irrPATH = find_irr_paths(gcm, scen)
    ds_ww = read_NETCDF(irrPATH)
    ww_band = 'girrww'
    ds_ww = ds_ww.rename(name_dict={'estimate_irrigation_demand': ww_band})
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    # Step 2 - Read in region-specific evaporation and precipitation data
    print("Step 2: Reading in regional data")
    evpPATH, prcPATH = find_et_paths(gcm, scen, m)
    ds_ev = read_NETCDF(evpPATH)
    ds_pr = read_NETCDF(prcPATH)
    ev_band, pr_band = 'evaporation_from_irrigation', 'precipitation_at_irrigation'
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    # Step 3 - Convert evap and precip from m/month to km3/month
    print("Step 3: Convert regional data to km3")
    # Put the cell-area grid on the regional grid (nearest coordinate within 0.01)
    grid_area = ds_area.reindex_like(ds_ev, method='nearest', tolerance=0.01)
    ds_m = xarray.merge([ds_ev, ds_pr, grid_area])
    # depth * cell area (m2, per the band name) gives m3; 1e9 m3 = 1 km3
    ds_m = ds_m.assign(evap_km3 = (ds_m[ev_band] * ds_m[area_band])/1e9)
    ds_m = ds_m.assign(prec_km3 = (ds_m[pr_band] * ds_m[area_band])/1e9)
    ds_m = ds_m.drop_vars([ev_band, pr_band, area_band])
    del ds_ev, ds_pr
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    # Step 4 - Merge evap and precip with demand (all in km3)
    print("Step 4: Merging global data to regional data")
    # join='left' keeps the regional coordinates; demand missing there is filled with 0
    ds_box = xarray.merge([ds_m, ds_ww], join='left', fill_value=0)
    del ds_ww
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    # Step 5 - Calculate consumption
    print("Step 5: Calculating consumption")
    stime = time.time()
    # girrwn = girrww * evap / (girrww + prec) where demand is positive, otherwise 0
    ds_box = ds_box.assign(girrwn=(xarray.where(ds_box[ww_band] > 0, (ds_box[ww_band] * ds_box['evap_km3']) / (ds_box[ww_band] + ds_box['prec_km3']) , 0 )))
    ds_box = ds_box.drop_vars(["evap_km3", "prec_km3"])
    ds_box = ds_box.rio.set_spatial_dims('lon', 'lat')
    ds_box.rio.crs
    ds_box.rio.write_crs(ds_m.rio.crs, inplace = True)
    print("Step 5: Completed in {}".format(time.time() - stime))
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    # Step 6 - Resample data and fix NAs
    print("Step 6: Resampling data")
    stime = time.time()
    # NOTE(review): resample_xarray repeats each source value across the finer cells and the
    # sums written below are not rescaled here; the notebook's "Next steps" lists that
    # adjustment as a later step
    ds_rs = resample_xarray(ds_box, resample_size)
    ds_rs = ds_rs.chunk({"lon": 100, "lat": 100})
    for x in [ww_band, 'girrwn']:
        ds_rs[x].attrs['_FillValue'] = 0.0
    print("Step 6: Completed in {}".format(time.time() - stime))
    ds_rs = ds_rs.rio.set_spatial_dims('lon', 'lat')
    ds_rs.rio.crs
    ds_rs.rio.write_crs(ds_m.rio.crs, inplace = True)
    del ds_m
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
    # Step 7 - Loop through watersheds
    print("Step 7: Starting clip loop")
    memory_usage()
    oid_count = 0
    # Segment catchments into batches of n_workers for the parallel process
    objectids_list = segment_id_list(lst=list_pfs, n=int(n_workers))
    # Clip and sum by polygon, one batch at a time
    for run_count, oids in enumerate(tqdm(objectids_list), start=1):
        memory_usage()
        Parallel(n_jobs=n_workers)(delayed(clip_by_pfaf_id)(p) for p in oids)
        oid_count = len(oids) + oid_count
        print('- - - - - run number', run_count, "\n- - - - - - Remaining catchments:", len(list_pfs) - oid_count)
        # Free memory between batches
        gc.collect()
    del ds_rs
    gc.collect()
    endtime = time.time() - mstime
    print('Region {0} done in {1}'.format(m, endtime))

# Run

In [0]:
# USER SELECTIONS!!!
scen_sel = 'ssp585' # Scenario
rs_size = 5 # Resample Size


print("Start {0} scen_sel with {1} x {1} resample".format(scen_sel, rs_size))
for gcm in gcmFolders:
    print("Start GCM:{0}".format(gcm))
    # The output folder depends only on resample size, GCM and scenario, not on the region
    test_root = os.path.dirname(newROOT(rs_size, gcm, scen_sel))
    # Create the folder on a first run so the listing below and the CSV writes do not fail
    os.makedirs(test_root, exist_ok=True)
    for m in mFolders:
        # Catchments already written for this region (files are named <region>_<pfaf_id>.csv)
        complete_files = os.listdir(test_root)
        complete_pfs = {int(x.split("_")[1].replace(".csv", "")) for x in complete_files if (m in x)}
        # Read full list of PFs for this region from the lookup table
        df_pf = pd.read_csv(geogidlookupPATH(m))
        all_pfs = set(df_pf['pfaf_id'].tolist())
        # Only catchments without an output file still need processing
        unfinished_pfs = sorted(all_pfs - complete_pfs)
        print("Region {0} has {1} out of {2} watersheds left".format(m, len(unfinished_pfs), len(all_pfs)))
        if not unfinished_pfs:
            continue
        run_irrigation(gcm = gcm, m = m, scen = scen_sel,  resample_size = rs_size, list_pfs = unfinished_pfs)