# Aggregate gridded data to HRU using areal averaging

### Output Format:

year month day hour minute second hru1 hru2 hru3

1996 10 1 0 0 0 0.004 0.050 0.070

1996 10 2 0 0 0 0.500 0.040 0.100

In [37]:
import rasterio as rs
import geopandas as gpd
from netCDF4 import Dataset
import os
import datetime

In [2]:
def process_tiffs(df):
    reg = df.reg
    nhru = df.nhruID
    
    fl = './data/nhrus/AEA_tiffs/HUC_%s_nhruID_%s.tiff'%(reg,nhru)
    
    if os.path.isfile(fl): # only proceed if the tiff exists
        with rs.open(fl) as ds:
            rast = ds.read(1)

        n,m = rast.shape
        rast.shape = n*m
        rast = rast[rast!=0] # remove no data cells
        k = float(len(rast))

        cells = np.unique(rast)
        #print(len(cells))
        percents = []
        for cell in cells:
            percents.append(len(rast[rast==cell])/k) # divide by the total cells in the basin to get the propotion of each cell in the basin

        cells = list(cells)
        return cells,percents
    
    else:
        return [],[]

In [27]:
def compute_contributions(fl,test=False):
    tmp = gpd.read_file(fl)
    dat = pd.DataFrame() # first generate a list of hrus and their grid cell contributions
    dat['nhruID'] = tmp.hru_id_nat
    dat['reg_hruID'] = tmp.hru_id_reg
    reg = fl.split('_')[-2]
    dat['reg'] = tmp.region
    cells,percents = zip(*dat.apply(process_tiffs,axis=1)) # run the aggregation function
    dat['cells'] = cells # insert results back into the dataframe
    dat['percents'] = percents

    if (reg == '08') | (reg == '10U'): # if either of these regions occur
        dat2 = pd.read_pickle('./data/reg%s_unclipped.pcl'%reg) # load the missing data
        
        for nhru in dat2.nhruID: # remove the overlapping rows from the data frame
            dat = dat[dat.nhruID != nhru]
        
        dat = dat.append(dat2) # merge the two dataframes
    
    # write some tests
    if test:
        # do all the percentages equal very close to 1
        def test_percent(df):
            perc = np.sum(df.percents)
            if 1-perc > 0.0001:
                print('percent does not sum: %s: %s'%(1-perc,df.nhruID))
            
        dat.apply(test_percent,axis=1)
        # is the length of the original df the same as the produced one
        if len(tmp) - len(dat) > 0:
            print('data frames are different lengths')
    
    dat.to_pickle('./data/nhru_contrib/huc_%s_cell_contrib.pcl'%reg)
    print('%s Complete!'%reg)

In [28]:
# list regions
regions = glob.glob('./data/nhrus/clean_AEA/nhru_*_clean.shp')

In [29]:
[compute_contributions(reg,test=True) for reg in regions]

07 Complete!
percent does not sum: 1.0: 40826
percent does not sum: 1.0: 40827
percent does not sum: 1.0: 40891
percent does not sum: 1.0: 40908
percent does not sum: 1.0: 41064
08 Complete!
09 Complete!
10L Complete!
10U Complete!
11 Complete!


[None, None, None, None, None, None]

### Note:
HUC8 has a few issues...

# Crop the radar data by each HRU:

In [34]:
dat = pd.read_pickle('./data/nhru_contrib/huc_07_cell_contrib.pcl') # load some trial data
dat.sort_values('reg_hruID',inplace=True)

In [35]:
dat.head()

Unnamed: 0,nhruID,reg_hruID,reg,cells,percents
0,32610,1,7,"[721, 722]","[0.6408839779005525, 0.35911602209944754]"
1,32611,2,7,"[737, 738, 739, 740, 741]","[0.1377341251713111, 0.40566468707172226, 0.30..."
2,32612,3,7,"[735, 736, 737, 738]","[0.04225352112676056, 0.09859154929577464, 0.6..."
3,32613,4,7,"[737, 738, 739, 740, 741]","[0.0007342143906020558, 0.27386196769456683, 0..."
4,32614,5,7,[717],[1.0]


In [31]:
fl = './stage4_map_daily_20041220-20150107.nc'
ds = Dataset(fl,'r')

In [38]:
# compute the dates
time = ds.variables['time']
print('Time Units: %s'%time.units)
timeoffset = time.units[-20:] # trip the string
strt = pd.to_datetime(timeoffset) # convert string into datetime object
time = np.array(ds.variables['time'])

def compute_time(time,offset):
    dt = datetime.timedelta(hours=time)
    time = offset+dt
    return str(time.date())

times = np.vectorize(compute_time)(time,strt)

Time Units: Hour since 2001-12-31T23:00:00Z


In [42]:
def year(index): return index.year
def month(index): return index.month
def day(index): return index.day

In [57]:
# prepair the output data frame
out = pd.DataFrame()
out['datetime'] = pd.DatetimeIndex(times)
out.index = pd.DatetimeIndex(out.datetime)
out['year'] = out.index.map(year)
out['month'] = out.index.map(month)
out['day'] = out.index.map(day)
out['hour'] = 0
out['minute'] = 0
out['second'] = 0

for hru in dat.reg_hruID: # create space for each HRU
    out['hru_%s'%hru] = -9999
    
del out['datetime'] # clean up

In [63]:
m,k,l = ds.variables['Total_precipitation_surface_1_Hour_Accumulation'].shape # get the dimensions of the precip data

In [82]:
idx = np.arange(1,(m*k)+1)

In [83]:
idx.max()

3232389

In [61]:
for i in range(m): # iterate through slices of the dataset
    rast = np.array(ds.variables['Total_precipitation_surface_1_Hour_Accumulation'][i,:,:]) # pull a slice
    rast.shape = (k*l)

3669

In [62]:
len(out)

3669

In [48]:
hru

8205