In [1]:
from pyogrio import read_dataframe,write_dataframe
import geopandas as gpd
import os,glob,sys,time,re
import pandas as pd
import numpy as np
import multiprocessing as mp
from parallel_pandas import ParallelPandas
from tqdm.auto import tqdm
ParallelPandas.initialize(n_cpu=24, split_factor=24)

### get basin boundary files

In [25]:
# Filter the GRIT catchment polygons (one GeoPackage per region, EPSG:8857) and
# write the retained basins both in EPSG:8857 (.gpkg) and EPSG:4326 (.shp).
basins = glob.glob('../../data/GRIT/full_catchment/GRIT_full_catchment_*_EPSG8857_simplify_final.gpkg')
out_dir = '../basin_boundary'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair and cannot race.
os.makedirs(out_dir, exist_ok=True)
for basin in basins:
    gdf = read_dataframe(basin)

    # relative difference (%) between the GRIT and the OHDB (hydrosheds) drainage area
    gdf['bias'] = np.abs(gdf.grit_darea - gdf.ohdb_darea_hydrosheds) / gdf.ohdb_darea_hydrosheds * 100

    # keep basins whose area difference is at most 20% and whose GRIT area is
    # at least 125 km2 (to ensure at least one grid cell);
    # .copy() so the column assignments below do not write into a slice of gdf
    gdf1 = gdf.loc[(gdf.bias<=20)&(gdf.grit_darea>=125),:].copy()

    gdf1['segment_id'] = gdf1.segment_id.astype(int).astype(str)
    # NOTE(review): reach_id is filled from segment_id, not from a reach_id
    # column -- this looks like a copy-paste slip; confirm before relying on it.
    gdf1['reach_id'] = gdf1.segment_id.astype(int).astype(str)
    # shorter column names (presumably to fit the shapefile field-name limit -- confirm)
    gdf1 = gdf1.rename(columns={'grit_darea':'gritDarea','ohdb_darea_hydrosheds':'ohdbDarea1','ohdb_darea':'ohdbDarea0'})

    # save: equal-area version first, then the reprojected lon/lat version
    stem = os.path.splitext(os.path.basename(basin))[0]
    write_dataframe(gdf1, f'{out_dir}/{stem}_125km2.gpkg')
    gdf1 = gdf1.to_crs('epsg:4326')
    stem = re.sub('EPSG8857','EPSG4326',stem)
    write_dataframe(gdf1, f'{out_dir}/{stem}_125km2.shp')
    print(basin)

../../data/GRIT/full_catchment/GRIT_full_catchment_AS_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_AF_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_SA_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_EU_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_NA_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_SI_EPSG8857_simplify_final.gpkg
../../data/GRIT/full_catchment/GRIT_full_catchment_SP_EPSG8857_simplify_final.gpkg


In [4]:
# Scratch check: Series.where keeps the values that satisfy the condition and
# replaces every other entry with NaN.
df = pd.DataFrame({'a':[2,1,0],'b':[-23,-1,0]})
non_negative = df['b'] >= 0
df['b'] = df['b'].where(non_negative, np.nan)
df

Unnamed: 0,a,b
0,2,
1,1,
2,0,0.0


### select OHDB gauges and calculate streamflow indices (Qmax7 and Qmin7)

In [None]:
def cleanQ(df):
    '''Drop records whose Q cannot be interpreted as a number and cast Q to float32.

    Rows whose Q is not a string are kept as they are; rows whose Q is a string
    are kept only if the string starts with a digit. The returned frame lists
    the non-string rows first, followed by the retained string rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column ``Q``.

    Returns
    -------
    pandas.DataFrame
        New frame (the input is not modified) with ``Q`` as float32.
    '''
    # eliminate invalid records
    is_text = df.Q.apply(lambda x: isinstance(x, str))
    numeric_rows = df.loc[~is_text,:]
    text_rows = df.loc[is_text,:]
    # the .str accessor is only valid when there are string values, so test
    # for that explicitly instead of swallowing every exception
    if not text_rows.empty:
        text_rows = text_rows.loc[text_rows.Q.str.match(r'\d+'),:]
    out = pd.concat([numeric_rows, text_rows])
    out['Q'] = out.Q.astype(np.float32)
    return out

def del_unreliableQ(df):
    '''Remove suspect daily discharge observations.

    Two rules are applied after the series is put on a gap-free daily index:
    (a) observations less than 0 are removed, and
    (b) every observation that belongs to a run of more than ten consecutive
        equal values is removed (the whole run, and nothing but the run).

    Days missing from the input (and NaN values) are filled with 0 before the
    run detection, so a gap longer than ten days forms a run of zeros and is
    removed by rule (b).
    NOTE(review): shorter gaps therefore stay in the result as Q = 0, and runs
    of genuine zero flow longer than ten days are removed as well -- confirm
    that both are intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date`` (parseable by pandas.to_datetime)
        and ``Q``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Retained records, indexed by date.
    '''
    df = df.copy()  # do not overwrite the caller's 'date' column
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date').set_index('date')
    if df.empty:
        return df
    index = pd.date_range(df.index[0], df.index[-1], freq = 'D')
    df = df.reindex(index).fillna(0)

    # run-length encode Q: a new run starts wherever the value differs from
    # the previous day (the first day always starts a run)
    q = df['Q'].to_numpy()
    new_run = np.ones(q.shape[0], dtype = bool)
    new_run[1:] = q[1:] != q[:-1]
    run_id = np.cumsum(new_run) - 1
    run_len = np.bincount(run_id)[run_id]  # length of the run each day belongs to

    # keep days in runs of at most ten equal values, and must no less than zero
    keep = (run_len <= 10) & (q >= 0)
    return df.loc[keep,:]

def main(par, scale = 'year'):
    '''Compute 7-day maximum / minimum flow indices for one OHDB gauge.

    Parameters
    ----------
    par : sequence
        ``(ohdb_id, Darea)``. ``ohdb_id`` selects the daily discharge csv;
        ``Darea`` is the drainage area used for the event-independence window
        (presumably km2, it is multiplied by 0.386102 before the log -- confirm).
    scale : {'year', 'season'}
        Aggregate per year, or per (season, year) with May-October labelled
        'summer' and the remaining months 'winter'.

    Returns
    -------
    pandas.DataFrame or None
        One row per year (or season/year) with Qmax7, Qmin7, their dates and
        ``ohdb_id``; ``None`` when the gauge has fewer than 20 usable years or
        no rows remain.

    Raises
    ------
    Exception
        If ``scale`` is neither 'year' nor 'season'.
    '''
    ohdb_id, Darea = par
    # read
    df = pd.read_csv(os.environ['DATA']+f'/data/OHDB/OHDB_v0.2.3/OHDB_data/discharge/daily/{ohdb_id}.csv')
    df = cleanQ(df)
    # quality check
    df = del_unreliableQ(df)

    # number of retained daily records per calendar year (a plain groupby
    # avoids the pandas-version-dependent yearly resample alias)
    n_per_year = df.groupby(df.index.year)['Q'].size()
    # a year is usable when it has at least 328 observations (~90%)
    if not (n_per_year >= 328).any():
        return
    years = n_per_year.index[(n_per_year >= 328) & (n_per_year.index >= 1982)].tolist()
    # 2023 is accepted with only 300 observations; add it once only, so that it
    # is not counted twice towards the 20-year requirement below
    if 2023 not in years and n_per_year.get(2023, 0) >= 300:
        years.append(2023)
    df = df.loc[df.index.year.isin(years),:]
    # only retain gauge with at least 20 usable years from 1982 onwards
    if len(years) < 20:
        return

    # reindex to a gap-free daily axis so the rolling window spans calendar days
    newindex = pd.date_range(df.index.values[0], df.index.values[-1], freq = 'D')
    df = df.reindex(newindex)
    # 7-day moving average; windows touching a missing day become NaN and are dropped
    df = df.rolling(7).mean().dropna()
    df['year'] = df.index.year
    df['season'] = 'winter'
    df.loc[(df.index.month>=5)&(df.index.month<=10),'season'] = 'summer'

    # count of 7-day means plus their extremes and the dates of the extremes
    stats = dict(countDay = 'size',
                 Qmax7 = 'max',
                 Qmin7 = 'min',
                 Qmax7date = 'idxmax',
                 Qmin7date = 'idxmin')
    if scale == 'year':
        df1 = df.groupby('year')['Q'].agg(**stats)
    elif scale == 'season':
        df1 = df.groupby(['season','year'])['Q'].agg(**stats)
        df1 = df1.loc[df1.countDay>=60,:] # at least 60 days of records to calculate seasonal extremes
    else:
        raise Exception('scale must be season or year')
    df1['Qmax7date'] = pd.to_datetime(df1['Qmax7date'])
    df1['Qmin7date'] = pd.to_datetime(df1['Qmin7date'])

    if scale == 'season':
        # keep events independent: drop an event that follows the previous one
        # (in date order) by fewer than `thres` days
        thres = 5 + np.log(Darea * 0.386102) # thres for Qmax7
        df1_Qmax = df1[['Qmax7','Qmax7date']].sort_values('Qmax7date')
        df1_Qmax = df1_Qmax.loc[~(df1_Qmax.Qmax7date.diff().dt.days<thres),:]
        thres = 30 # thres for Qmin7
        df1_Qmin = df1[['Qmin7','Qmin7date']].sort_values('Qmin7date')
        df1_Qmin = df1_Qmin.loc[~(df1_Qmin.Qmin7date.diff().dt.days<thres),:]
        # outer join on (season, year); countDay is not carried over
        df1 = pd.concat([df1_Qmax, df1_Qmin], axis = 1)

    # Qmax7 and Qmin7 must be greater than zero
    df1['Qmax7'] = df1.Qmax7.where(df1.Qmax7>0, np.nan)
    df1['Qmin7'] = df1.Qmin7.where(df1.Qmin7>0, np.nan)

    # q is the median of the positive 7-day mean flows of the retained years
    # (df holds the smoothed series at this point, not the raw daily values)
    q = df.loc[df.Q>0,'Q'].quantile(0.5)
    # Qmax7 cannot be lower than that median
    df1['Qmax7'] = df1.Qmax7.where(df1.Qmax7 >= q, np.nan)
    # Qmin7 cannot be greater than that median
    df1['Qmin7'] = df1.Qmin7.where(df1.Qmin7 <= q, np.nan)

    if df1.shape[0] == 0:
        return
    df1 = df1.reset_index()
    df1['ohdb_id'] = ohdb_id
    print(ohdb_id)
    return (df1)

if __name__ == '__main__':
    # How the gauge list was originally built (kept for reference):
    # if not os.path.exists('../data/OHDB_metadata_subset.csv'):
    #     # select gauges that have good basin boundary
    #     df = pd.read_csv('../../data/OHDB/OHDB_v0.2.3/OHDB_metadata/OHDB_metadata.csv')
    #     df1 = []
    #     for fname in glob.glob('../basin_boundary/GRIT*8857*'):
    #         gdf = read_dataframe(fname, read_geometry = False)
    #         print(gdf.shape, gdf.ohdb_id.unique().shape)
    #         df1.append(df.loc[df.ohdb_id.isin(gdf.ohdb_id.unique()),:])
    #     df1 = pd.concat(df1)
    #     df1.to_csv('../OHDB_metadata_subset.csv', index = False)
    # else:
    # NOTE(review): other cells of this notebook read/write '../basin_attributes.csv'
    # and '../dis_OHDB_Qmin7_Qmax7_1982-2023.csv' -- confirm which paths are current.
    df1 = pd.read_csv('../data/basin_attributes.csv')
    print(df1.shape)
    # one (ohdb_id, drainage area) pair per gauge
    pars = df1[['ohdb_id','gritDarea']].values.tolist()
    # the context manager shuts the worker processes down even if map() fails
    with mp.Pool(48) as pool:
        results = pool.map(main, pars)
    # main() returns None for gauges that do not pass the record-length filters
    results = [res for res in results if res is not None]
    if len(results) == 0:
        raise ValueError('no gauge passed the selection criteria')
    df = pd.concat(results)
    df.to_csv('../data/dis_OHDB_Qmin7_Qmax7_1982-2023_filter.csv', index = False)
    # number of years whose Qmin7 / Qmax7 were blanked out by the filters
    print(df.Qmin7.isna().sum(), df.Qmax7.isna().sum())

(10717, 62)
OHDB_011000604OHDB_014030677
OHDB_008000953OHDB_007008819OHDB_014031282OHDB_008004429OHDB_014030718
OHDB_006002577
OHDB_011001039OHDB_011001432
OHDB_014020473OHDB_007004811OHDB_008001574

OHDB_007008845OHDB_003000053OHDB_006003449OHDB_001000705OHDB_009000271
OHDB_009000845OHDB_006004441

OHDB_014021493
OHDB_006004656OHDB_007005041OHDB_014003918
OHDB_014011858OHDB_007009635

OHDB_014010699OHDB_014000054
OHDB_007009994
OHDB_014023472OHDB_012000686OHDB_007009733



OHDB_014039706
OHDB_011001343
OHDB_014011019OHDB_012002737


OHDB_002000683OHDB_014018150OHDB_007009859






OHDB_011000924

OHDB_011000556OHDB_007009037

OHDB_014031283OHDB_012001995

OHDB_014022595OHDB_011000484OHDB_011000701OHDB_007006237OHDB_010000050OHDB_014034284OHDB_003000163



OHDB_007008093
OHDB_014012189


OHDB_007008488


OHDB_014019391OHDB_007008628OHDB_011001096



OHDB_007006087OHDB_006004659OHDB_006002623OHDB_006001966OHDB_007009991
OHDB_006002925OHDB_014030602OHDB_014022562OHDB_014030989OHDB_011000

Process ForkPoolWorker-89:
Process ForkPoolWorker-86:
Process ForkPoolWorker-87:
Process ForkPoolWorker-70:
Process ForkPoolWorker-51:
Process ForkPoolWorker-69:
Process ForkPoolWorker-84:
Process ForkPoolWorker-96:
Process ForkPoolWorker-90:
Process ForkPoolWorker-85:
Process ForkPoolWorker-63:
Process ForkPoolWorker-82:
Process ForkPoolWorker-72:
Process ForkPoolWorker-88:
Process ForkPoolWorker-83:
Process ForkPoolWorker-65:
Process ForkPoolWorker-94:
Process ForkPoolWorker-74:
Process ForkPoolWorker-81:
Process ForkPoolWorker-61:
Process ForkPoolWorker-66:
Process ForkPoolWorker-78:
Process ForkPoolWorker-95:
Process ForkPoolWorker-49:
Process ForkPoolWorker-77:
Process ForkPoolWorker-55:
Process ForkPoolWorker-53:
Process ForkPoolWorker-93:
Process ForkPoolWorker-79:
Process ForkPoolWorker-75:
Process ForkPoolWorker-67:
Process ForkPoolWorker-58:
Process ForkPoolWorker-76:
Process ForkPoolWorker-52:
Process ForkPoolWorker-62:
Process ForkPoolWorker-92:
Process ForkPoolWorker-71:
P

### subset basin boundary files again

In [10]:
# Keep only the basins whose gauge made it into the streamflow-index table.
df = pd.read_csv('../dis_OHDB_Qmin7_Qmax7_1982-2023.csv')
ohdb_ids = df.ohdb_id.unique()
for fname in sorted(glob.glob('../basin_boundary/GRIT*8857*')):
    root, ext = os.path.splitext(fname)
    # the glob also matches the *_subset.gpkg files written below; skip them
    # so that re-running the cell does not create *_subset_subset.gpkg
    if ext != '.gpkg' or root.endswith('_subset'):
        continue
    gdf = read_dataframe(fname)
    gdf = gdf.loc[gdf.ohdb_id.isin(ohdb_ids),:]
    write_dataframe(gdf, root+'_subset.gpkg')
    print(fname)

../basin_boundary/GRIT_full_catchment_AS_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_AF_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_SA_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_EU_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_NA_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_SI_EPSG8857_simplify_final_125km2.gpkg
../basin_boundary/GRIT_full_catchment_SP_EPSG8857_simplify_final_125km2.gpkg


### use catch_mean_GLHYMPS_GLiM.py to get catchment average subsurface characteristics

### calculate number of upstream dams

In [None]:
# Count the GDAT dams that fall inside each catchment polygon.
gdf_dam = read_dataframe('../../data/geography/GDAT_data_v1/data/GDAT_v1_dams.shp')
# same CRS as the catchments (the authority name is 'epsg', not 'espg')
gdf_dam = gdf_dam.to_crs('epsg:8857')
gdf = read_dataframe('../basin_boundary/GRIT_full_catchment_all_EPSG8857_simplify_final_125km2_subset.gpkg')
# one row per (dam, catchment) pair that intersects
join = gpd.overlay(gdf_dam, gdf)
# groupby(...)[col].count() is a Series, so it is renamed by passing the new
# name directly (Series.rename has no `columns` argument)
join = join.groupby('ohdb_id')['Feature_ID'].count().rename('dam_num')
# catchments without any dam are missing from the count: add them with 0
all_ids = pd.Index(gdf.ohdb_id.values, name = 'ohdb_id')
join = join.reindex(all_ids).fillna(0).astype(int).reset_index()
join.to_csv('../geography/dam_num.csv', index = False)

### extract average meteorological conditions over the 7 days (Qmax7) or 30 days (Qmin7) up to and including the event date

In [2]:
def func_meteo(x, tw = 7):
    '''Average the meteorological columns over the `tw` days ending on the event date.

    Meant to be applied per gauge (e.g. through groupby(...).p_apply) so the
    averages can be computed in a parallel manner.

    Parameters
    ----------
    x : pandas.DataFrame
        Daily rows with datetime columns ``date`` and ``Qdate`` (event date);
        every column other than ``date``, ``Qdate``, ``ohdb_id`` and ``tmp``
        is averaged.
    tw : int
        Window length in days; rows with ``-tw < date - Qdate <= 0`` days are used.

    Returns
    -------
    pandas.Series
        Column-wise mean over the window (NaN when no row falls inside it).
    '''
    # day offset relative to the event, kept in a local Series so that the
    # caller's frame does not get an extra column
    lag = (x.date - x.Qdate).dt.days
    in_window = (lag > -tw) & (lag <= 0)
    # errors='ignore': the helper columns may be absent (e.g. a group key that
    # is not passed in)
    x = x.loc[in_window,:].drop(columns=['date','Qdate','ohdb_id','tmp'], errors='ignore').mean()
    return x

# Event table: one row per gauge and year with the Qmax7 / Qmin7 values and dates.
df_flood = pd.read_csv('../data/dis_OHDB_Qmin7_Qmax7_1982-2023.csv')
df_flood['Qmax7date'] = pd.to_datetime(df_flood['Qmax7date'])
df_flood['Qmin7date'] = pd.to_datetime(df_flood['Qmin7date'])

for year in range(1982, 2024):
    # Only the Qmin7 file is produced below (the Qmax7 part is commented out),
    # so only that file decides whether the year is already done. Add the
    # Qmax7 file back to this check if the Qmax7 block is re-enabled.
    out_Qmin7 = f'../data/Qmin7_meteo30_{year}.csv'
    if os.path.exists(out_Qmin7):
        continue
    fname = f'../ee_era5_land/ERA5_Land_daily_meteorology_for_OHDB_10717_stations_{year}.csv'
    df = pd.read_csv(fname)
    # wide -> long: column names are '<YYYYMMDD>_<variable>'
    df = df.melt(id_vars = 'ohdb_id')
    df['date'] = df.variable.str[:4]+'-'+df.variable.str[4:6]+'-'+df.variable.str[6:8]
    df['meteo'] = df.variable.str[9:]
    df = df.drop(columns=['variable'])
    # long -> one row per gauge and day, one column per meteorological variable
    df = df.pivot_table(index = ['ohdb_id','date'], columns = 'meteo', values = 'value').reset_index()
    df['date'] = pd.to_datetime(df.date.values)

    df_flood0 = df_flood.loc[df_flood.year==year,:]
    # NOTE(review): only this year's meteorology is loaded, so the window of an
    # event early in January is cut off at 1 January -- confirm this is acceptable.

    # df_Qmax7 = df.merge(df_flood0[['ohdb_id','Qmax7date']].rename(columns={'Qmax7date':'Qdate'}), on = 'ohdb_id')
    # print(df_Qmax7.shape)
    # df_Qmax7 = df_Qmax7.groupby('ohdb_id').p_apply(lambda x:func_meteo(x,tw=7)).reset_index().assign(year=year)
    # df_Qmax7 = df_Qmax7.merge(df_flood0[['ohdb_id','Qmax7','Qmax7date']], on = 'ohdb_id')
    # df_Qmax7.to_csv(f'../data/Qmax7_meteo_{year}.csv', index = False)

    # attach the event date to every daily row of the gauge, then average the
    # 30 days up to and including the event
    df_Qmin7 = df.merge(df_flood0[['ohdb_id','Qmin7date']].rename(columns={'Qmin7date':'Qdate'}), on = 'ohdb_id')
    print(df_Qmin7.shape)
    df_Qmin7 = df_Qmin7.groupby('ohdb_id').p_apply(lambda x:func_meteo(x,tw=30)).reset_index().assign(year=year)
    df_Qmin7 = df_Qmin7.merge(df_flood0[['ohdb_id','Qmin7','Qmin7date']], on = 'ohdb_id')
    df_Qmin7.to_csv(out_Qmin7, index = False)

(2998475, 10)


<LAMBDA> DONE:   0%|          | 0/8215 [00:00<?, ?it/s]

(3028405, 10)


<LAMBDA> DONE:   0%|          | 0/8297 [00:00<?, ?it/s]

(3113196, 10)


<LAMBDA> DONE:   0%|          | 0/8506 [00:00<?, ?it/s]

(3116370, 10)


<LAMBDA> DONE:   0%|          | 0/8538 [00:00<?, ?it/s]

(3166010, 10)


<LAMBDA> DONE:   0%|          | 0/8674 [00:00<?, ?it/s]

(3184625, 10)


<LAMBDA> DONE:   0%|          | 0/8725 [00:00<?, ?it/s]

(3189690, 10)


<LAMBDA> DONE:   0%|          | 0/8715 [00:00<?, ?it/s]

(3196305, 10)


<LAMBDA> DONE:   0%|          | 0/8757 [00:00<?, ?it/s]

(3220395, 10)


<LAMBDA> DONE:   0%|          | 0/8823 [00:00<?, ?it/s]

(3216380, 10)


<LAMBDA> DONE:   0%|          | 0/8812 [00:00<?, ?it/s]

(3308274, 10)


<LAMBDA> DONE:   0%|          | 0/9039 [00:00<?, ?it/s]

(3315295, 10)


<LAMBDA> DONE:   0%|          | 0/9083 [00:00<?, ?it/s]

(3325150, 10)


<LAMBDA> DONE:   0%|          | 0/9110 [00:00<?, ?it/s]

(3351065, 10)


<LAMBDA> DONE:   0%|          | 0/9181 [00:00<?, ?it/s]

(3399408, 10)


<LAMBDA> DONE:   0%|          | 0/9288 [00:00<?, ?it/s]

(3434285, 10)


<LAMBDA> DONE:   0%|          | 0/9409 [00:00<?, ?it/s]

(3455820, 10)


<LAMBDA> DONE:   0%|          | 0/9468 [00:00<?, ?it/s]

(3508015, 10)


<LAMBDA> DONE:   0%|          | 0/9611 [00:00<?, ?it/s]

(3521652, 10)


<LAMBDA> DONE:   0%|          | 0/9622 [00:00<?, ?it/s]

(3556195, 10)


<LAMBDA> DONE:   0%|          | 0/9743 [00:00<?, ?it/s]

(3603645, 10)


<LAMBDA> DONE:   0%|          | 0/9873 [00:00<?, ?it/s]

(3556195, 10)


<LAMBDA> DONE:   0%|          | 0/9743 [00:00<?, ?it/s]

(3618642, 10)


<LAMBDA> DONE:   0%|          | 0/9887 [00:00<?, ?it/s]

(3610215, 10)


<LAMBDA> DONE:   0%|          | 0/9891 [00:00<?, ?it/s]

(3562400, 10)


<LAMBDA> DONE:   0%|          | 0/9760 [00:00<?, ?it/s]

(3537580, 10)


<LAMBDA> DONE:   0%|          | 0/9692 [00:00<?, ?it/s]

(3623400, 10)


<LAMBDA> DONE:   0%|          | 0/9900 [00:00<?, ?it/s]

(3559115, 10)


<LAMBDA> DONE:   0%|          | 0/9751 [00:00<?, ?it/s]

(3598535, 10)


<LAMBDA> DONE:   0%|          | 0/9859 [00:00<?, ?it/s]

(3568240, 10)


<LAMBDA> DONE:   0%|          | 0/9776 [00:00<?, ?it/s]

(3529704, 10)


<LAMBDA> DONE:   0%|          | 0/9644 [00:00<?, ?it/s]

(3507650, 10)


<LAMBDA> DONE:   0%|          | 0/9610 [00:00<?, ?it/s]

(3456915, 10)


<LAMBDA> DONE:   0%|          | 0/9471 [00:00<?, ?it/s]

(3420415, 10)


<LAMBDA> DONE:   0%|          | 0/9371 [00:00<?, ?it/s]

(3375252, 10)


<LAMBDA> DONE:   0%|          | 0/9222 [00:00<?, ?it/s]

(3369315, 10)


<LAMBDA> DONE:   0%|          | 0/9231 [00:00<?, ?it/s]

(3306170, 10)


<LAMBDA> DONE:   0%|          | 0/9058 [00:00<?, ?it/s]

(3132795, 10)


<LAMBDA> DONE:   0%|          | 0/8583 [00:00<?, ?it/s]

(2968992, 10)


<LAMBDA> DONE:   0%|          | 0/8112 [00:00<?, ?it/s]

(2708300, 10)


<LAMBDA> DONE:   0%|          | 0/7420 [00:00<?, ?it/s]

(2071010, 10)


<LAMBDA> DONE:   0%|          | 0/5674 [00:00<?, ?it/s]

(1275675, 10)


<LAMBDA> DONE:   0%|          | 0/3495 [00:00<?, ?it/s]

In [3]:
# Load the merged Qmin7 modelling table and show its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/Qmin7_final_dataset.csv')
df.head(n=5)

  df = pd.read_csv('../data/Qmin7_final_dataset.csv')


Unnamed: 0,ohdb_id,snow_depth_water_equivalent,snowmelt_sum,surface_net_solar_radiation_sum,temperature_2m_max,temperature_2m_min,total_evaporation_sum,total_precipitation_sum,year,Q,...,gritDarea,domain,ohdbDarea1,ohdbDarea0,sedimentary,plutonic,volcanic,metamorphic,climate,climate_label
0,OHDB_001000001,-7.345365000000001e-25,0.0,15993770.0,303.437983,294.652104,-0.004396,0.003298,1983,8.096,...,1089.430087,SA,971.846,,0.995808,0.0,0.004192,0.0,1.0,tropical
1,OHDB_001000001,-7.345365000000001e-25,0.0,15339360.0,301.596786,293.96171,-0.004192,0.00644,1984,8.148857,...,1089.430087,SA,971.846,,0.995808,0.0,0.004192,0.0,1.0,tropical
2,OHDB_001000001,-7.345365000000001e-25,0.0,13745420.0,301.389596,294.772102,-0.003658,0.005402,1985,9.951714,...,1089.430087,SA,971.846,,0.995808,0.0,0.004192,0.0,1.0,tropical
3,OHDB_001000001,-7.345365000000001e-25,0.0,14381440.0,300.937256,294.528471,-0.003639,0.005614,1986,11.428143,...,1089.430087,SA,971.846,,0.995808,0.0,0.004192,0.0,1.0,tropical
4,OHDB_001000001,-7.345365000000001e-25,0.0,17666170.0,303.54542,295.500225,-0.004882,0.004115,1988,10.212143,...,1089.430087,SA,971.846,,0.995808,0.0,0.004192,0.0,1.0,tropical


### merge basin attributes

In [None]:
# Combine every per-attribute csv in ../geography into one table with one row
# per gauge, then attach the OHDB metadata.

# depth range found in the file name -> suffix that keeps the soil layers apart
layer_suffix = {
    '0-5cm': '_layer1',
    '5-15cm': '_layer2',
    '15-30cm': '_layer3',
    '30-60cm': '_layer4',
    '60-100cm': '_layer5',
    '100-200cm': '_layer6',
}

# sorted() makes the column order of the result reproducible
fnames = sorted(glob.glob('../geography/*csv'))
df_all = []
for fname in fnames:
    df = pd.read_csv(fname)
    if df.shape[0] == 1:
        # a single row with one column per gauge: turn the gauges into the index
        df = df.T
    elif 'ohdb_id' in df.columns:
        # one row per gauge; decided by the column itself, not by a fixed row count
        df = df.set_index('ohdb_id')
    else:
        raise ValueError(f'{fname}: expected a single row or an ohdb_id column')
    # attribute name = file name up to the first underscore (+ layer suffix)
    name = os.path.basename(fname).split('_')[0]
    for depth, suffix in layer_suffix.items():
        if depth in fname:
            name = name + suffix
            break
    if df.shape[1] == 1:
        df.columns = [name]
    df_all.append(df)
df_all = pd.concat(df_all, axis = 1)
# transposed files can carry non-gauge rows; keep gauge rows only
# (na=False: a non-string label counts as "not a gauge")
is_gauge = df_all.index.str.contains('OHDB', na = False)
df_all = df_all.loc[is_gauge,:].rename_axis('ohdb_id').reset_index()

# merge metadata
df_meta = pd.read_csv('../OHDB_metadata_subset.csv')
df_all = df_all.merge(df_meta, on = 'ohdb_id')

# NOTE(review): the streamflow-index cell reads '../data/basin_attributes.csv'
# -- confirm which location is current.
df_all.to_csv('../basin_attributes.csv', index = False)


In [6]:
# Run the external script merge_dataset_for_modeling.py in a shell, with "Qmin7"
# as its only argument.
# NOTE(review): presumably this builds ../data/Qmin7_final_dataset.csv, which is
# read by another cell of this notebook -- confirm against the script.
!python merge_dataset_for_modeling.py Qmin7