In [1]:
from pyogrio import read_dataframe,write_dataframe
import geopandas as gpd
import os,glob,sys,time,re
import pandas as pd
import numpy as np
import multiprocessing as mp
from parallel_pandas import ParallelPandas
from tqdm.auto import tqdm

# get basin boundary files

In [25]:
basins = glob.glob('../../data/GRIT/full_catchment/GRIT_full_catchment_*_EPSG8857_simplify_final.gpkg')
if not os.path.exists('../basin_boundary'):
    os.mkdir('../basin_boundary')
for basin in basins:
    gdf = read_dataframe(basin)
    
    # difference between ohdb_darea and grit_darea less than 20%
    gdf['bias'] = np.abs(gdf.grit_darea - gdf.ohdb_darea_hydrosheds) / gdf.ohdb_darea_hydrosheds * 100
    gdf1 = gdf.loc[gdf.bias<=20,:]
    
    # darea greater than 125 km2 to ensure at least one grid cell
    gdf1 = gdf1.loc[gdf1.grit_darea>=125,:]

    gdf1['segment_id'] = gdf1.segment_id.astype(int).astype(str)
    gdf1['reach_id'] = gdf1.segment_id.astype(int).astype(str)
    gdf1 = gdf1.rename(columns={'grit_darea':'gritDarea','ohdb_darea_hydrosheds':'ohdbDarea1','ohdb_darea':'ohdbDarea0'})
    
    # save
    basin1 = os.path.basename(basin)
    write_dataframe(gdf1, f'../basin_boundary/{basin1[:-5]}'+'_125km2.gpkg')
    gdf1 = gdf1.to_crs('epsg:4326')
    basin1 = re.sub('EPSG8857','EPSG4326',basin1)
    write_dataframe(gdf1, f'../basin_boundary/{basin1[:-5]}'+'_125km2.shp')
    print(basin)

../../data/GRIT/full_catchment/GRIT_full_catchment_AS_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_AF_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_SA_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_EU_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_NA_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_SI_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_SP_EPSG8857_simplify_final.gpkg


# select OHDB gauge and calculate streamflow indices

In [2]:
def cleanQ(df):
    # eliminate invalid records
    df1 = df.loc[df.Q.apply(lambda x: not isinstance(x, str)),:]
    df2 = df.loc[df.Q.apply(lambda x: isinstance(x, str)),:]
    try:
        df2 = df2.loc[df2.Q.str.match('\d+'),:]
    except:
        pass
    df = pd.concat([df1, df2])
    df['Q'] = df.Q.astype(np.float32)
    return df

def del_unreliableQ(df):
    '''observations less than 0 were flagged as
        suspected, and (b) observations with more than ten consecutive
        equal values greater than 0 were flagged as suspected'''
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date').set_index('date')
    index = pd.date_range(df.index[0], df.index[-1], freq = 'D')
    df = df.reindex(index).fillna(0)
    df1 = df.diff()
    df1 = df1.where(df1==0, 1).diff()
    start = np.where(df1.values==-1)[0]
    end = np.where(df1.values==1)[0]
    if len(start) == 0 or len(end) == 0:
        # must no less than zero
        df = df.loc[df.Q>=0,:]
        return (df)
    if start[0] > end[0]:
        start = np.array([0]+start.tolist())
    if start[-1] > end[-1]:
        end = np.array(end.tolist()+[df1.shape[0]+10])
    duration = end - start
    start = start[duration>=10]
    end = end[duration>=10]
    del_idx = np.array([item for a,b in zip(start,end) for item in np.arange(a+1,b+2).tolist()])
    del_idx = del_idx[del_idx<df.shape[0]]
    if len(del_idx) > 0:
        df.drop(df.index[del_idx], inplace = True)
    # must no less than zero
    df = df.loc[df.Q>=0,:]
    return (df)

def main(ohdb_id):
    df = pd.read_csv(os.environ['DATA']+f'/data/OHDB/OHDB_v0.2.3/OHDB_data/discharge/daily/{ohdb_id}.csv')
    # read
    df = cleanQ(df)
    # quality check
    df = del_unreliableQ(df)
    # only retain records with at least 328 observations (90%) are required
    tmp = df.resample('Y')['Q'].agg(countDay = lambda x:x.shape[0])
    if tmp.loc[tmp.countDay>=328,:].shape[0] == 0:
        return
    years = tmp.loc[tmp.countDay>=328,:].index.year
    df = df.loc[df.index.year.isin(years),:]
    
    # only use data from year1 to year2
    df = df.loc[(df.index.year>=1982)&(df.index.year<=2023),:]
    df['year'] = df.index.year
    
    # count observations for each year
    df1 = df.groupby('year')['Q'].agg(countDay = lambda x:x.shape[0], 
                                    Qmax7 = lambda x:x.rolling(7).mean().max(),
                                    Qmin7 = lambda x:x.rolling(7).mean().min(),
                                    Qmax7date = lambda x:x.rolling(7).mean().idxmax(),
                                    Qmin7date = lambda x:x.rolling(7).mean().idxmin(),
                                    )
    # AMS must be greater than zero
    df1 = df1.loc[df1.Qmax7>0,:]
    if df1.shape[0] == 0:
        return
    
    df1 = df1.reset_index()
    # only retain gauge with at least 20 years of AMS during 1982-2023
    if df1.shape[0] < 20:
        return
    df1['ohdb_id'] = ohdb_id
    print(ohdb_id)
    return (df1)

if __name__ == '__main__':
    if not os.path.exists('../OHDB_metadata_subset.csv'):
        # select gauges that have good basin boundary
        df = pd.read_csv('../../data/OHDB/OHDB_v0.2.3/OHDB_metadata/OHDB_metadata.csv')
        df1 = []
        for fname in glob.glob('../basin_boundary/GRIT*8857*'):
            gdf = read_dataframe(fname, read_geometry = False)
            print(gdf.shape, gdf.ohdb_id.unique().shape)
            df1.append(df.loc[df.ohdb_id.isin(gdf.ohdb_id.unique()),:])
        df1 = pd.concat(df1)
        df1.to_csv('../OHDB_metadata_subset.csv', index = False)
    else:
        df1 = pd.read_csv('../OHDB_metadata_subset.csv')
    print(df1.shape)
    ohdb_ids = df1.ohdb_id.values
    pool = mp.Pool(48)
    df = pool.map(main, ohdb_ids)
    df = pd.concat(df)
    df.to_csv('../dis_OHDB_Qmin7_Qmax7_1982-2023.csv', index = False)

(27421, 16)
OHDB_007000288
OHDB_001000181
OHDB_016000002
OHDB_011001091OHDB_001000662

OHDB_007004150
OHDB_011000063
OHDB_015000547
OHDB_016000003
OHDB_007004151OHDB_005001403
OHDB_012002123OHDB_006002503


OHDB_001000663
OHDB_011001092
OHDB_001000182
OHDB_011000066
OHDB_012000616
OHDB_006004294
OHDB_015000556
OHDB_007000297
OHDB_016000005
OHDB_007004152
OHDB_006002505OHDB_001000183

OHDB_001000664
OHDB_011001094OHDB_009000771OHDB_006004296


OHDB_016000008
OHDB_007000300
OHDB_006000301OHDB_007004155

OHDB_007001110
OHDB_011000070
OHDB_001000184
OHDB_006002511
OHDB_001000665
OHDB_009000773
OHDB_011001095OHDB_007001111OHDB_007004158


OHDB_012002141
OHDB_011000071
OHDB_012000630
OHDB_006002524
OHDB_001000185
OHDB_006004303
OHDB_001000666
OHDB_006002531
OHDB_011000072
OHDB_001000667OHDB_007004159

OHDB_001000186
OHDB_011001096
OHDB_007000303
OHDB_006004307
OHDB_016000013
OHDB_015000587OHDB_009000777

OHDB_012002144OHDB_011000076
OHDB_006002541

OHDB_012000640OHDB_007004160
OHDB_001000668

OHDB_015000115OHDB_007003762OHDB_007000486


OHDB_006005076OHDB_009000072
OHDB_016000351
OHDB_012001283
OHDB_007004324OHDB_003000224


OHDB_006003094OHDB_001000033

OHDB_011000279
OHDB_006001266OHDB_011001386

OHDB_001000306
OHDB_007000488
OHDB_007003764
OHDB_001000034OHDB_012001284

OHDB_003000225
OHDB_016000353OHDB_006003095
OHDB_007000070
OHDB_007003765OHDB_009000079
OHDB_011001387


OHDB_006001268
OHDB_011000290OHDB_006005086

OHDB_001000307OHDB_003000232

OHDB_015000132
OHDB_007004325OHDB_016000354

OHDB_001000036
OHDB_009000080OHDB_006003099OHDB_012001285


OHDB_011001389OHDB_007000491
OHDB_007000071
OHDB_006001270

OHDB_011000293OHDB_016000355OHDB_006005088

OHDB_003000233
OHDB_001000308
OHDB_015000136

OHDB_007004326
OHDB_011001391
OHDB_001000037OHDB_006001276

OHDB_006003101OHDB_012001287
OHDB_003000241
OHDB_001000309OHDB_007000493
OHDB_007000072OHDB_011000294




OHDB_006005101OHDB_016000358OHDB_015000141

OHDB_007004327OHDB_011001393
OHDB_007000496
OHDB_011000297
OHDB_001000

OHDB_011000498
OHDB_011001589
OHDB_007003997
OHDB_002000162
OHDB_007008330OHDB_001000400
OHDB_009000337

OHDB_007007986OHDB_006001937OHDB_012001734

OHDB_001000134

OHDB_007003998OHDB_011001594OHDB_011000500OHDB_002000166
OHDB_007000260OHDB_009000344



OHDB_006003651OHDB_001000401


OHDB_010000424OHDB_007008900

OHDB_007007988OHDB_009000346

OHDB_011000501
OHDB_001000135
OHDB_007003999
OHDB_007008332
OHDB_011001595
OHDB_007000261OHDB_006003655

OHDB_006001941OHDB_001000402

OHDB_002000187
OHDB_007007989
OHDB_007004000
OHDB_011000504
OHDB_001000136
OHDB_006003670
OHDB_012001738OHDB_007000262
OHDB_001000403OHDB_011001600OHDB_002000191



OHDB_009000351OHDB_007007991OHDB_007008333


OHDB_010000452OHDB_007004001
OHDB_007008901

OHDB_011000505OHDB_001000137

OHDB_007000263
OHDB_007007992OHDB_007001084
OHDB_006001953

OHDB_002000196OHDB_006003685

OHDB_007004002OHDB_011000507OHDB_001000138


OHDB_010000472
OHDB_001000405OHDB_007008902OHDB_012001756


OHDB_009000368
OHDB_007000265OHDB_007008

OHDB_007009585OHDB_006004230OHDB_007008992


OHDB_007004123OHDB_001000502


OHDB_006002411OHDB_009000628
OHDB_008001059OHDB_011000030

OHDB_005001198
OHDB_011000702
OHDB_007008135
OHDB_009000629
OHDB_006002440OHDB_007008993
OHDB_001000503OHDB_007010285
OHDB_007009586


OHDB_007004124
OHDB_008001060

OHDB_008001729OHDB_011000031
OHDB_009000630
OHDB_006004242
OHDB_007008994OHDB_011000704
OHDB_007008461

OHDB_007008137
OHDB_007004125OHDB_007009587OHDB_007010286OHDB_001000504



OHDB_011000032
OHDB_009000636
OHDB_008001076
OHDB_007008463
OHDB_006002463
OHDB_008001732OHDB_012000432
OHDB_011000707OHDB_007008995


OHDB_007004127OHDB_007008138
OHDB_008001077OHDB_006002467

OHDB_006004244OHDB_009000637


OHDB_001000505
OHDB_011000034
OHDB_007009589
OHDB_007008465OHDB_007010287

OHDB_011000710
OHDB_007008996
OHDB_009000638
OHDB_007008141OHDB_007004128

OHDB_011000035OHDB_007008466OHDB_012000441OHDB_001000506



OHDB_007009591
OHDB_008002475
OHDB_008001741OHDB_009000639OHDB_006002474
OHDB_0110007

OHDB_007009216
OHDB_001000641
OHDB_007009832OHDB_008004166

OHDB_008000572OHDB_007009217

OHDB_008002205OHDB_014004662
OHDB_014002455
OHDB_014000130

OHDB_001000642
OHDB_007009833
OHDB_007009219OHDB_014006393

OHDB_008000591
OHDB_014000133OHDB_008004182

OHDB_014004669
OHDB_001000643
OHDB_007009221
OHDB_007009834OHDB_008000592

OHDB_014002468
OHDB_001000644OHDB_008000596

OHDB_007009223
OHDB_014000136OHDB_008004190
OHDB_007009835OHDB_014004683


OHDB_007009224OHDB_001000645OHDB_014006403


OHDB_008001592
OHDB_008000607
OHDB_014000140
OHDB_007009836OHDB_001000646OHDB_007008716OHDB_014004684



OHDB_007009225
OHDB_014002488
OHDB_008000613
OHDB_007008717
OHDB_014000144
OHDB_001000648OHDB_007009226

OHDB_007009838
OHDB_007008718
OHDB_008002224
OHDB_007009227
OHDB_014004691
OHDB_001000649
OHDB_014002498OHDB_007009840

OHDB_008004207
OHDB_008001598
OHDB_014000148OHDB_007008719OHDB_007009229


OHDB_014006420
OHDB_008000621
OHDB_014004733
OHDB_001000650OHDB_007009850
OHDB_007009230

OHDB_01400

OHDB_014019930
OHDB_014003453
OHDB_007009373OHDB_014011390

OHDB_014016168
OHDB_014000982OHDB_014012518
OHDB_014009845
OHDB_007008838

OHDB_007009374OHDB_014017688

OHDB_014005367OHDB_014007925

OHDB_007010049
OHDB_014003458
OHDB_014019933
OHDB_007008839OHDB_014016172OHDB_014001008



OHDB_014019109OHDB_007009375
OHDB_014017702
OHDB_007010050
OHDB_014021787
OHDB_014012536
OHDB_007008841
OHDB_014003461
OHDB_014016180OHDB_014001018OHDB_014005383

OHDB_014008080OHDB_007009376


OHDB_014017705
OHDB_014009868
OHDB_007008842OHDB_014011402

OHDB_014021788
OHDB_014001020OHDB_014003465

OHDB_014019942
OHDB_007009378
OHDB_014017706
OHDB_014012540OHDB_014005388OHDB_007010053

OHDB_014016186

OHDB_007008843
OHDB_007009381
OHDB_014017709
OHDB_014008171
OHDB_014009879
OHDB_014019124
OHDB_014021792
OHDB_014005390
OHDB_014001026
OHDB_014012552OHDB_014019945OHDB_014016189


OHDB_014003467
OHDB_007009382
OHDB_007008844OHDB_007010060
OHDB_014009880
OHDB_014011412

OHDB_014021794
OHDB_014019946OHDB_014008

OHDB_014026574OHDB_014025694


OHDB_014022168
OHDB_014018210
OHDB_014021364OHDB_014024326

OHDB_014020339
OHDB_014023136OHDB_014011809
OHDB_014004101

OHDB_014016662
OHDB_014025695OHDB_014019610

OHDB_014008928
OHDB_014014968
OHDB_014010666
OHDB_014024327
OHDB_014023141
OHDB_014026587OHDB_014008936
OHDB_014020345

OHDB_014025697
OHDB_014006049OHDB_014022174

OHDB_014024328
OHDB_014011814
OHDB_014018215OHDB_014016669

OHDB_014008941
OHDB_014004106OHDB_014026590

OHDB_014022176OHDB_014010676

OHDB_014006053
OHDB_014020347
OHDB_014023147
OHDB_014024330
OHDB_014014971
OHDB_014004108
OHDB_014018217OHDB_014021375
OHDB_014026594
OHDB_014008952
OHDB_014025704

OHDB_014016690
OHDB_014011821OHDB_014014974OHDB_014010680OHDB_014023149


OHDB_014027679
OHDB_014008957

OHDB_014024335
OHDB_014025712OHDB_014016692OHDB_014020356


OHDB_014006076OHDB_014021378

OHDB_014018224
OHDB_014004114
OHDB_014010688
OHDB_014023151OHDB_014008962

OHDB_014011822
OHDB_014020363
OHDB_014024340
OHDB_014025729OHDB_01402

OHDB_007004506
OHDB_014030373
OHDB_014028229OHDB_014023582

OHDB_014018722
OHDB_007004950OHDB_014026136

OHDB_014029065
OHDB_014031406
OHDB_014023585
OHDB_014032539OHDB_014030375

OHDB_014027174OHDB_014018731

OHDB_014034785
OHDB_007004512
OHDB_014029066
OHDB_014033748
OHDB_014031407
OHDB_007004951
OHDB_014026138OHDB_014023587
OHDB_014022805

OHDB_014034788
OHDB_014018734
OHDB_014028242
OHDB_014025335
OHDB_014032567
OHDB_014029067
OHDB_014033749
OHDB_014031408
OHDB_014035667
OHDB_007004953
OHDB_014025338
OHDB_014035676
OHDB_014032568
OHDB_014023590
OHDB_014018740
OHDB_014031411
OHDB_014025344OHDB_007004954

OHDB_014028249
OHDB_014026141
OHDB_014022823
OHDB_014023594
OHDB_014027199OHDB_014032580

OHDB_014018750OHDB_014031412OHDB_014035682


OHDB_014023596
OHDB_014026142
OHDB_014025360
OHDB_014028253
OHDB_014022830
OHDB_014033768
OHDB_007004956OHDB_014029077

OHDB_014034852
OHDB_014032584
OHDB_014018751
OHDB_014025363OHDB_014026143
OHDB_014031414

OHDB_014027208
OHDB_014023597OHDB_014028

OHDB_007004634
OHDB_014035304
OHDB_004001365
OHDB_014036384
OHDB_014033317
OHDB_007003376
OHDB_007006245
OHDB_007004635OHDB_004003913

OHDB_014034279
OHDB_004001366
OHDB_014031033OHDB_014031953
OHDB_014036386OHDB_004002654

OHDB_007005661
OHDB_007005081

OHDB_004004843
OHDB_004003914

OHDB_014033331OHDB_004000600OHDB_014034284

OHDB_004001371
OHDB_004002655
OHDB_007005662
OHDB_014036392
OHDB_007005082
OHDB_007004641OHDB_004003915
OHDB_007006250OHDB_007003566
OHDB_014031035


OHDB_004001373
OHDB_004002658
OHDB_014035316
OHDB_007005083OHDB_014034287

OHDB_004000604OHDB_007005664
OHDB_014033346

OHDB_007003597
OHDB_004004850
OHDB_004003916
OHDB_014036411
OHDB_007006253
OHDB_014034292
OHDB_004002659
OHDB_004000605
OHDB_007005084OHDB_007005665OHDB_007004643


OHDB_007006254OHDB_004003919

OHDB_014031037
OHDB_004002660
OHDB_014033361
OHDB_007006255OHDB_007003626

OHDB_004003921
OHDB_004001393
OHDB_007004644
OHDB_004000610
OHDB_007005089
OHDB_007005668
OHDB_004001394
OHDB_007005090
OHDB_00400

OHDB_004007244
OHDB_007006446
OHDB_004004373
OHDB_007005245
OHDB_004003189
OHDB_007004771
OHDB_004001154
OHDB_007006447
OHDB_007005246
OHDB_004001995
OHDB_007004773
OHDB_004001156
OHDB_007005247
OHDB_007006448
OHDB_004001996
OHDB_004001161
OHDB_007004774
OHDB_004003192
OHDB_004001165
OHDB_004001997
OHDB_007005248
OHDB_004004381
OHDB_007004775
OHDB_004007460
OHDB_004003194
OHDB_007006031
OHDB_007005249
OHDB_007004776
OHDB_007006032
OHDB_004004384
OHDB_007006454
OHDB_007005250
OHDB_007004777OHDB_004004387

OHDB_007006455
OHDB_007005251
OHDB_004007464
OHDB_004001181
OHDB_004004397
OHDB_007004778
OHDB_007006456
OHDB_007006036
OHDB_004007465
OHDB_007005253
OHDB_004002015OHDB_007004779

OHDB_007006827
OHDB_007006458
OHDB_004004400
OHDB_007007240
OHDB_004001188
OHDB_004003217
OHDB_007006040
OHDB_007005254
OHDB_004002016
OHDB_007004780
OHDB_007007241
OHDB_004001189
OHDB_007006829
OHDB_004003218
OHDB_004002019
OHDB_007005255OHDB_007007242

OHDB_004001194
OHDB_004003219
OHDB_004002022
OHDB_00700

# subset basin boundary files again

In [10]:
df = pd.read_csv('../dis_OHDB_Qmin7_Qmax7_1982-2023.csv')
ohdb_ids = df.ohdb_id.unique()
for fname in glob.glob('../basin_boundary/GRIT*8857*'):
    gdf = read_dataframe(fname)
    gdf = gdf.loc[gdf.ohdb_id.isin(ohdb_ids),:]
    write_dataframe(gdf, fname[:-5]+'_subset.gpkg')
    print(fname)

../basin_boundary/GRIT_full_catchment_AS_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_AF_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_SA_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_EU_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_NA_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_SI_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_SP_EPSG8857_simplify_final_125km2.gpkg
