In [4]:
from pathlib import Path
import xarray as xr
import pandas as pd
import numpy as np

from functions import *
from luts import *

data_dir = Path("/beegfs/CMIP6/jdpaul3/hydroviz_data/stats")

### Split & filter 
Split the collection of CSVs into two file groups based on geometry (`seg` = stream segment polyline geometry, and `hru` = hydrological response unit geometry, i.e., watershed). Filter out the files we don't want: the Maurer files, the "diff" files, and any files with stats computed over the full historical range (1952-2005).

In [2]:
# Gather the stat CSVs for each geometry type: files matching `dynamic*`
# first, followed by files matching `static*`.
seg_files = [
    csv_path
    for pattern in ("dynamic*seg*.csv", "static*seg*.csv")
    for csv_path in data_dir.glob(pattern)
]

hru_files = [
    csv_path
    for pattern in ("dynamic*hru*.csv", "static*hru*.csv")
    for csv_path in data_dir.glob(pattern)
]

In [3]:
seg_files = filter_files(seg_files)
hru_files = filter_files(hru_files)

Removed 317 files from list; 314 files remain out of 631 original files.
Removed 317 files from list; 314 files remain out of 631 original files.


### Extract statistic names, geometry IDs, and coordinates

Get the stat abbreviations from the keys of `stat_vars_dict`; we will use these as variables when building our `xarray` dataset. Also get the geometry IDs from the ID column of the first file in each list; we will use these as coordinates in our `xarray` datasets.

In [7]:
stat_vars = list(stat_vars_dict.keys())
print(len(stat_vars))
print(stat_vars)

23
['dh3', 'dh15', 'dl3', 'dl16', 'fh1', 'fl1', 'fl3', 'ma12', 'ma13', 'ma14', 'ma15', 'ma16', 'ma17', 'ma18', 'ma19', 'ma20', 'ma21', 'ma22', 'ma23', 'ra1', 'ra3', 'th1', 'tl1']


In [8]:
# Geometry IDs (as strings) are read from the first file of each group only.
# NOTE(review): this presumes every CSV in a group lists the same IDs in the
# same row order as that first file — confirm before relying on it.
seg_ids = pd.read_csv(seg_files[0]).seg_id.astype(str).tolist()
hru_ids = pd.read_csv(hru_files[0]).hru_id.astype(str).tolist()

In [9]:
# Unique coordinate values parsed from each file group, keyed by geometry type.
geom_coords_dict = {
    "seg": get_unique_coords(seg_files),
    "hru": get_unique_coords(hru_files),
}
geom_coords_dict

{'seg': {'lcs': ['dynamic', 'static'],
  'models': ['ACCESS1-0',
   'BNU-ESM',
   'CCSM4',
   'GFDL-ESM2G',
   'GFDL-ESM2M',
   'IPSL-CM5A-LR',
   'IPSL-CM5A-MR',
   'MIROC-ESM',
   'MIROC-ESM-CHEM',
   'MIROC5',
   'MRI-CGCM3',
   'NorESM1-M',
   'bcc-csm1-1'],
  'scenarios': ['historical', 'rcp26', 'rcp45', 'rcp60', 'rcp85'],
  'variants': ['r1i1p1'],
  'eras': ['1976_2005', '2016_2045', '2046_2075', '2071_2100']},
 'hru': {'lcs': ['dynamic', 'static'],
  'models': ['ACCESS1-0',
   'BNU-ESM',
   'CCSM4',
   'GFDL-ESM2G',
   'GFDL-ESM2M',
   'IPSL-CM5A-LR',
   'IPSL-CM5A-MR',
   'MIROC-ESM',
   'MIROC-ESM-CHEM',
   'MIROC5',
   'MRI-CGCM3',
   'NorESM1-M',
   'bcc-csm1-1'],
  'scenarios': ['historical', 'rcp26', 'rcp45', 'rcp60', 'rcp85'],
  'variants': ['r1i1p1'],
  'eras': ['1976_2005', '2016_2045', '2046_2075', '2071_2100']}}

Apart from the geometry IDs, the two groups share identical coordinates. We won't use the `variants` coord since only one variant (`r1i1p1`) is represented here.

In [11]:
seg_ds = create_empty_dataset(geom_coords_dict["seg"], seg_ids)
seg_ds

In [8]:
# hru_ds = create_empty_dataset(geom_coords_dict["hru"], hru_ids)
# hru_ds

In [9]:
len(hru_ids)

109951

### Populate empty NC files

In [10]:
#seg_ds = xr.open_dataset("/beegfs/CMIP6/jdpaul3/hydroviz/empty_seg.nc")
#hru_ds = xr.open_dataset("/beegfs/CMIP6/jdpaul3/hydroviz/empty_hru.nc")

In [25]:
seg_files[0:10]

[PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_GFDL-ESM2M_rcp60_r1i1p1_seg_2046_2075.csv'),
 PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_NorESM1-M_rcp26_r1i1p1_seg_2046_2075.csv'),
 PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_IPSL-CM5A-LR_rcp26_r1i1p1_seg_2046_2075.csv'),
 PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_IPSL-CM5A-MR_rcp26_r1i1p1_seg_2071_2100.csv'),
 PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_MIROC-ESM-CHEM_rcp45_r1i1p1_seg_2071_2100.csv'),
 PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_GFDL-ESM2G_rcp85_r1i1p1_seg_2071_2100.csv'),
 PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_IPSL-CM5A-MR_rcp60_r1i1p1_seg_2016_2045.csv'),
 PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_ACCESS1-0_rcp85_r1i1p1_seg_2071_2100.csv'),
 PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_CCSM4_rcp26_r1i1p1_seg_2046_2075.csv'),
 PosixPath('/beegfs/CMIP6/jdpaul3/

In [26]:

def populate_dataset(ds, files):
    """Write the statistics from each CSV into the matching slice of ``ds``, in place.

    File names are expected to follow
    ``<lc>_<model>_<scenario>_<variant>_<geom>_<start>_<end>.csv``. The ``lc``,
    ``model`` and ``scenario`` parts, plus the ``<start>_<end>`` era translated
    through ``era_lookup``, select where each stat column of the file is stored.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset holding one variable per name in ``stat_vars``, selectable by
        ``lc``, ``model``, ``scenario`` and ``era`` and with a ``geom_id``
        dimension. It is modified in place.
    files : iterable of pathlib.Path or str
        CSV files to load. Files whose names have too few ``_``-separated
        parts are reported with a printed message and skipped.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the era parsed from a file name is not a key of ``era_lookup``.
    ValueError
        If a CSV's row count differs from the dataset's ``geom_id`` length, or
        if ``pandas.read_csv`` cannot find every ``stat_vars`` column.
    """
    # TODO: add a step to confirm that the parsed coords all actually exist in the xarray dataset

    # Values are written by row position, so every CSV must have one row per geom_id.
    # NOTE(review): assumes the CSV rows are in the same order as ds's geom_id coord — confirm.
    n_geoms = ds.sizes["geom_id"]

    for file in files:
        # parse this filename to find coords where data should go
        name = Path(file).name
        parts = name.split("_")
        try:
            lc, model, scenario = parts[0], parts[1], parts[2]
            era = "_".join([parts[5], parts[6].split(".")[0]])
        except IndexError:
            # Too few "_"-separated parts: not a stats file we know how to place.
            print(f"Error parsing file: {name}")
            continue

        # Resolve the target slice before the (slow) CSV read so an unknown era fails fast.
        target = {"lc": lc, "model": model, "scenario": scenario, "era": era_lookup[era]}

        # only read in the columns we want, and use actual NaNs
        df = pd.read_csv(file, usecols=stat_vars).replace(-99999, np.nan)

        if len(df) != n_geoms:
            raise ValueError(
                f"{name} has {len(df)} rows but the dataset has {n_geoms} geom_id values"
            )

        for stat in df.columns:
            # Hand xarray a plain array so the positional (not index-aligned) assignment is explicit.
            ds[stat].loc[target] = df[stat].to_numpy()
        


In [28]:
populate_dataset(seg_ds, seg_files)

100%|██████████| 314/314 [01:52<00:00,  2.79it/s]


In [29]:
seg_ds

In [30]:
seg_ds["dh3"].sel({"lc": "dynamic", "model": "GFDL-ESM2M", "scenario": "rcp60", "era": "mid_century", "geom_id": "1000"}).load().values

array(382.66)

In [19]:
# dynamic_GFDL-ESM2M_rcp60_r1i1p1_seg_2046_2075.csv
seg_ds["dh3"].sel({"lc": "dynamic", "model": "GFDL-ESM2M", "scenario": "rcp60", "era": "mid_century"}).load().values

array([1003.7, 1897.2, 5853. , ...,    nan,    nan,    nan])

In [20]:
# dynamic_NorESM1-M_rcp26_r1i1p1_seg_2046_2075
seg_ds["dh3"].sel({"lc": "dynamic", "model": "NorESM1-M", "scenario": "rcp26", "era": "mid_century"}).load().values

array([ 806.7, 1703.7, 5115.4, ...,    nan,    nan,    nan])

In [21]:
# dynamic_IPSL-CM5A-LR_rcp26_r1i1p1_seg_2046_2075.csv
seg_ds["dh3"].sel({"lc": "dynamic", "model": "IPSL-CM5A-LR", "scenario": "rcp26", "era": "mid_century"}).load().values

array([ 839.19, 1667.7 , 4998.1 , ...,     nan,     nan,     nan])

In [23]:
# dynamic_IPSL-CM5A-LR_rcp26_r1i1p1_seg_2071_2100.csv
seg_ds["dh3"].sel({"lc": "dynamic", "model": "IPSL-CM5A-LR", "scenario": "rcp26", "era": "late_century"}).load().values


array([nan, nan, nan, ..., nan, nan, nan])

In [28]:
# parse this filename to find coords where data should go
seg_files[0]

PosixPath('/beegfs/CMIP6/jdpaul3/hydroviz_data/stats/dynamic_GFDL-ESM2M_rcp60_r1i1p1_seg_2046_2075.csv')

In [29]:
# Load the first stream-segment CSV with the -99999 fill value swapped for NaN.
# Later cells write its columns into the xarray dataset along the geom_id dimension.
df = pd.read_csv(seg_files[0]).replace(-99999, np.nan)
df

Unnamed: 0,seg_id,ma3,ma4,ma12,ma13,ma14,ma15,ma16,ma17,ma18,...,spr_dur7,spr_freq,spr_ord,sum_mag,sum_dur3,sum_dur7,sum_freq,sum_ord,sum_cv,lf1
0,1,192.27,147.78,64.59,19.182,53.315,444.23,240.61,101.62,73.375,...,1397.8,9.5,111.5,0.020236,1.5913,5.7061,7.5,256.5,0.996485,71.0
1,2,117.47,100.76,389.30,328.330,595.210,1147.90,479.31,288.17,180.410,...,3302.6,6.5,103.0,0.015262,3.1210,3.8311,7.5,261.5,0.835755,23.5
2,3,145.66,122.20,704.50,444.750,856.800,3208.40,1265.20,597.88,399.090,...,9258.7,9.0,109.0,0.008106,5.5370,10.9130,4.0,258.0,0.795020,35.5
3,4,141.67,119.15,760.04,500.360,932.200,3346.60,1338.10,633.36,419.520,...,9590.6,9.0,109.0,0.008157,6.1173,12.5420,4.5,258.0,0.776000,29.0
4,5,116.26,100.93,456.57,398.950,729.260,1378.10,569.92,340.81,212.900,...,3901.6,6.5,103.0,0.014536,3.5317,4.4323,7.5,262.0,0.768025,18.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56535,56536,,,,,,,,,,...,,,,,,,,,,
56536,56537,,,,,,,,,,...,,,,,,,,,,
56537,56538,,,,,,,,,,...,,,,,,,,,,
56538,56539,,,,,,,,,,...,,,,,,,,,,


In [31]:
# Inspect the current values of the first stat column, ma3, for this file's slice
# (expected to be all NaN while the dataset is still empty).
# NOTE(review): `ma3` is not among the 23 names printed from stat_vars_dict earlier
# in this notebook — confirm seg_ds still carries this variable on a fresh run.
seg_ds["ma3"].sel({"lc": "dynamic", "model": "GFDL-ESM2M", "scenario": "rcp60", "era": "mid_century"}).load()

In [32]:
#now use indexing (instead of sel or isel) to set new values
seg_ds["ma3"].loc[{"lc": "dynamic", "model": "GFDL-ESM2M", "scenario": "rcp60", "era": "mid_century"}] = 1

In [33]:
# view again to be sure they are set
seg_ds["ma3"].sel({"lc": "dynamic", "model": "GFDL-ESM2M", "scenario": "rcp60", "era": "mid_century"}).load()

In [34]:
#lets try with real values now
seg_ds["ma3"].loc[{"lc": "dynamic", "model": "GFDL-ESM2M", "scenario": "rcp60", "era": "mid_century"}] = df['ma3']

In [35]:
seg_ds["ma3"].sel({"lc": "dynamic", "model": "GFDL-ESM2M", "scenario": "rcp60", "era": "mid_century"}).load()