# Aggregate gridded data to HRU using areal averaging

### Output Format:

year month day hour minute second hru1 hru2 hru3

1996 10 1 0 0 0 0.004 0.050 0.070

1996 10 2 0 0 0 0.500 0.040 0.100

In [1]:
import rasterio as rs
import geopandas as gpd
from netCDF4 import Dataset
import os
import datetime
import progressbar as bar

In [2]:
def process_tiffs(df):
    reg = df.reg
    nhru = df.nhruID
    
    fl = './data/nhrus/AEA_tiffs/HUC_%s_nhruID_%s.tiff'%(reg,nhru)
    
    if os.path.isfile(fl): # only proceed if the tiff exists
        with rs.open(fl) as ds:
            rast = ds.read(1)

        n,m = rast.shape
        rast.shape = n*m
        rast = rast[rast!=0] # remove no data cells
        k = float(len(rast))

        cells = np.unique(rast)
        #print(len(cells))
        percents = []
        for cell in cells:
            percents.append(len(rast[rast==cell])/k) # divide by the total cells in the basin to get the propotion of each cell in the basin

        cells = list(cells)
        return cells,percents
    
    else:
        return [],[]
    
def process_tiffs_12(df):
    reg = df.reg
    nhru = df.reg_hruID
    
    fl = './data/nhrus/AEA_tiffs/HUC_%s_nhruID_%s.tiff'%(reg,nhru)
    
    if os.path.isfile(fl): # only proceed if the tiff exists
        with rs.open(fl) as ds:
            rast = ds.read(1)

        n,m = rast.shape
        rast.shape = n*m
        rast = rast[rast!=0] # remove no data cells
        k = float(len(rast))

        cells = np.unique(rast)
        #print(len(cells))
        percents = []
        for cell in cells:
            percents.append(len(rast[rast==cell])/k) # divide by the total cells in the basin to get the propotion of each cell in the basin

        cells = list(cells)
        return cells,percents
    
    else:
        return [],[]

In [3]:
def compute_contributions(fl,test=False):
    tmp = gpd.read_file(fl)
    dat = pd.DataFrame() # first generate a list of hrus and their grid cell contributions
    dat['nhruID'] = tmp.hru_id_nat
    dat['reg_hruID'] = tmp.hru_id_reg
    reg = fl.split('_')[-2]
    dat['reg'] = tmp.region
    cells,percents = zip(*dat.apply(process_tiffs,axis=1)) # run the aggregation function
    dat['cells'] = cells # insert results back into the dataframe
    dat['percents'] = percents

    if (reg == '08') | (reg == '10U') | (reg == '04'): # if either of these regions occur
        dat2 = pd.read_pickle('./data/reg%s_unclipped.pcl'%reg) # load the missing data
        
        for nhru in dat2.nhruID: # remove the overlapping rows from the data frame
            dat = dat[dat.nhruID != nhru]
        
        dat = dat.append(dat2) # merge the two dataframes
    
    # write some tests
    if test:
        # do all the percentages equal very close to 1
        def test_percent(df):
            perc = np.sum(df.percents)
            if 1-perc > 0.0001:
                print('percent does not sum: %s: %s'%(1-perc,df.nhruID))
            
        dat.apply(test_percent,axis=1)
        # is the length of the original df the same as the produced one
        if len(tmp) - len(dat) > 0:
            print('data frames are different lengths')
    
    dat.to_pickle('./data/nhru_contrib/huc_%s_cell_contrib.pcl'%reg)
    print('%s Complete!'%reg)
    
def compute_contributions_12(fl,test=False):
    '''for region 12'''
    tmp = gpd.read_file(fl)
    dat = pd.DataFrame() # first generate a list of hrus and their grid cell contributions
    dat['reg_hruID'] = tmp.hru_id
    reg = fl.split('_')[-2]
    dat['reg'] = tmp.region
    cells,percents = zip(*dat.apply(process_tiffs_12,axis=1)) # run the aggregation function
    dat['cells'] = cells # insert results back into the dataframe
    dat['percents'] = percents
    
    # write some tests
    if test:
        # do all the percentages equal very close to 1
        def test_percent(df):
            perc = np.sum(df.percents)
            if 1-perc > 0.0001:
                print('percent does not sum: %s: %s'%(1-perc,df.nhruID))
            
        dat.apply(test_percent,axis=1)
        # is the length of the original df the same as the produced one
        if len(tmp) - len(dat) > 0:
            print('data frames are different lengths')
    
    dat.to_pickle('./data/nhru_contrib/huc_%s_cell_contrib.pcl'%reg)
    print('%s Complete!'%reg)

In [4]:
# list regions
regions = glob.glob('./data/nhrus/clean_AEA/nhru_*_clean.shp')

In [6]:
# run region 12
compute_contributions_12(regions[-1],test=True)

12 Complete!


In [7]:
[compute_contributions(reg,test=True) for reg in regions]

07 Complete!
08 Complete!
09 Complete!
10L Complete!
10U Complete!
11 Complete!


[None, None, None, None, None, None]

# Crop the radar data by each HRU:

In [13]:
# load the precip data
fl = './stage4_map_daily_20041220-20150107.nc'
ds = Dataset(fl,'r')
m,k,l = ds.variables['Total_precipitation_surface_1_Hour_Accumulation'].shape # get the dimensions of the precip data

In [14]:
# unsure if this is needed

# load the index raster
#idx = np.load('./data/hrap_grid_index.npy')
#idx.shape = k*l 

In [15]:
# compute the dates
time = ds.variables['time']
#print('Time Units: %s'%time.units)
timeoffset = time.units[-20:] # strip the string
strt = pd.to_datetime(timeoffset) # convert string into datetime object
time = np.array(ds.variables['time'])

def compute_time(time,offset):
    dt = datetime.timedelta(hours=time)
    time = offset+dt
    return str(time.date())

times = np.vectorize(compute_time)(time,strt)

In [16]:
def year(index): return index.year
def month(index): return index.month
def day(index): return index.day

In [26]:
def compute_precip(df,datetime=[],rast=[],out=[]):
    '''
    Compute precip for an hru based on its contributing grid cells
    '''
    
    precip = rast[df.cells]
    percents = np.array(df.percents)
    
    weighted_precip = np.sum(precip*percents) # precip in mm
    weighted_precip *= 0.0393701 # mm > inches

    out.loc[datetime,'hru_%s'%df.reg_hruID] = weighted_precip # insert into the out data frame

In [27]:
def generate_output(fl):
    reg = fl.split('_')[-2] # extract the region
    print('Starting region %s...'%reg)
    dat = pd.read_pickle('./data/nhru_contrib/huc_%s_cell_contrib.pcl'%reg) # load the contributing cells and percentages
    dat.sort_values('reg_hruID',inplace=True,ascending=True) # sort by regional hru
    
    # prepair the output data frame
    out = pd.DataFrame()
    out['datetime'] = pd.DatetimeIndex(times)
    out.index = pd.DatetimeIndex(out.datetime)
    out['year'] = out.index.map(year)
    out['month'] = out.index.map(month)
    out['day'] = out.index.map(day)
    out['hour'] = 0
    out['minute'] = 0
    out['second'] = 0

    for hru in dat.reg_hruID: # create space for each HRU
        out['hru_%s'%hru] = -999

    del out['datetime'] # clean up
    
    pb = bar.ProgressBar(min_value=0,max_value=m)

    for i in range(m): # iterate through slices of the dataset
        rast = np.array(ds.variables['Total_precipitation_surface_1_Hour_Accumulation'][i,:,:]) # pull a slice
        rast.shape = (k*l) # reshape the dataset in the say way as the index values

        dat.apply(compute_precip,axis=1,datetime=times[i],rast=rast,out=out) # compute precip for each hru for the time slice
        pb.update(i)
    
    
    out.to_csv('./data/hru_%s_stage_4_precip.cbh'%reg,sep=' ',header=False,index=False,float_format='%.2f')
    out.to_pickle('./data/hru_%s_stage_4_precip.pcl'%reg)
    
    print('Region %s complete!'%reg)

In [28]:
# run only region 12
generate_output(regions[-1])

Starting region 12...


 99% (3668 of 3669) |############################################################################################################################ | Elapsed Time: 7:54:59 ETA: 0:00:07

Region 12 complete!


In [15]:
for fl in regions[-1:]:
    generate_output(fl)

Starting region 11...


 99% (3668 of 3669) |############################################################################ | Elapsed Time: 7:03:55 ETA: 0:00:06

Region 11 complete!


In [14]:
regions[-1:]

['./data/nhrus/clean_AEA/nhru_11_clean.shp']