In [12]:
import os,glob,sys,re
import numpy as np
import pandas as pd
import joblib
import xgboost as xgb
import hydroeval as he
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
import cupy as cp
import warnings
warnings.filterwarnings("ignore")

In [68]:
fname = '../data/Qmin7_final_dataset_seasonal_multi_MSWX_meteo.csv'
target = 'Qmin7'
model = 'xgb'

# define outName to better name the ouput files
outName = re.sub('final_dataset_', '', os.path.basename(fname).split('.')[0])


In [69]:
# read dataset
df = pd.read_csv(fname)

# define outName to better name the ouput files
outName = re.sub('final_dataset_', '', os.path.basename(fname).split('.')[0])

# create label-encoding variable for gauge id
x = pd.DataFrame({'ohdb_id':df.ohdb_id.unique(),'gauge_id':np.arange(df.ohdb_id.unique().shape[0])})
df = df.merge(x, on = 'ohdb_id')
df.gauge_id = df.gauge_id.astype(str)

# create label-encoding variable for country id
x = pd.DataFrame({'country':df.country.unique(),'country_id':np.arange(df.country.unique().shape[0])})
df = df.merge(x, on = 'country')
df.country_id = df.country_id.astype(str)

# Create a binary feature to indicate whether the temperature is below freezing
df['freeze'] = np.where(df.tmax_3 < 0, True, False)

# create label-encoding variable for dam purpose
x = pd.DataFrame({'Main_Purpose':df.Main_Purpose.unique(),'Main_Purpose_id':np.arange(df.Main_Purpose.unique().shape[0])})
df = df.merge(x, on = 'Main_Purpose')
df.Main_Purpose_id = df.Main_Purpose_id.astype(str)

# create label-encoding variable for season 
x = pd.DataFrame({'season':df.season.unique(),'season_id':np.arange(df.season.unique().shape[0])})
df = df.merge(x, on = 'season')
df.season_id = df.season_id.astype(str)
df.climate = df.climate.astype(str)
df.year = df.year.astype(np.float32)

0         1985.0
1         1985.0
2         1993.0
3         1993.0
4         2016.0
           ...  
753273    2019.0
753274    2020.0
753275    2020.0
753276    2021.0
753277    2021.0
Name: year, Length: 753278, dtype: float32

In [38]:
df.shape

(753278, 120)

In [63]:
predictors = [  'BDTICM', 'elevation', 'slope', 'aridity', 
                'sedimentary', 'plutonic', 'volcanic', 'metamorphic',
                'clay_layer1', 'clay_layer6', 'clay_layer3', 'clay_layer4', 'clay_layer2', 'clay_layer5',
                'sand_layer1', 'sand_layer6', 'sand_layer3', 'sand_layer4', 'sand_layer2', 'sand_layer5',
                'silt_layer1', 'silt_layer6', 'silt_layer3', 'silt_layer4', 'silt_layer2', 'silt_layer5',
                'Porosity_x', 'logK_Ice_x',

                'ohdb_latitude', 'ohdb_longitude', 'year', 'month', 'gauge_id', 'country_id', 'freeze', 'climate',

                # 'swe_3', 'swmelt_3', 'srad_3', 't2max_3', 't2min_3', 'evap_3', 'pr_3',
                # 'swe_7', 'swmelt_7', 'srad_7', 't2max_7', 't2min_7', 'evap_7', 'pr_7',
                # 'swe_15', 'swmelt_15', 'srad_15', 't2max_15', 't2min_15', 'evap_15', 'pr_15',
                # 'swe_30', 'swmelt_30', 'srad_30', 't2max_30', 't2min_30', 'evap_30', 'pr_30',

                'lwd_3', 'p_3', 'pres_3', 'relhum_3', 'swd_3', 'spechum_3', 'tmax_3', 'tmin_3', 'wind_3', 
                'lwd_7', 'p_7', 'pres_7', 'relhum_7', 'swd_7', 'spechum_7', 'tmax_7', 'tmin_7', 'wind_7', 
                'lwd_15', 'p_15', 'pres_15', 'relhum_15', 'swd_15', 'spechum_15', 'tmax_15', 'tmin_15', 'wind_15', 
                'lwd_30', 'p_30', 'pres_30', 'relhum_30', 'swd_30', 'spechum_30', 'tmax_30', 'tmin_30', 'wind_30',

                'runoff_ratio', 'slope_fdc', 'Q10_50', 'high_q_freq', 'low_q_freq', 
                'zero_q_freq', 'cv', 'high_q_dur', 'low_q_dur', 'BFI', 'lagT', 'noResRatio', 'FI', 'p_mean', 
                'stream_elas', 'hfd_mean',

                'tmax_ave', 'tmax_std',

                'ImperviousSurface', 'crop', 'forest', 'grass', 'water', 'wetland',

                'res_darea_normalize', 'Year_ave', 'Main_Purpose_id',
            ]

X = df[predictors]
y = df['Q'].values
y = y / df['gritDarea'].values * 86.4
if (df.Q==0).any():
    y = y + 0.1
y = np.log(y)

X = cp.array(X)

In [52]:
for fname in glob.glob('../results/xgb*json'):
    ml = xgb.Booster()
    ml.load_model(fname)
    print(ml.get_params()['eta'])

AttributeError: 'Booster' object has no attribute 'get_params'

In [43]:
ml.set_param({'device': 'cpu'})
X = cp.asnumpy(X)
X = xgb.DMatrix(X)

y_pred = ml.predict(X)
y_pred = np.exp(y_pred)
if (df.Q==0).any():
    y_pred = y_pred - 0.1
y_pred = y_pred * df['gritDarea'].values / 86.4

In [66]:
X.agg(lambda x: 'q' if x.dtype=='float64' else 'c').to_dict()

{'BDTICM': 'q',
 'elevation': 'q',
 'slope': 'q',
 'aridity': 'q',
 'sedimentary': 'q',
 'plutonic': 'q',
 'volcanic': 'q',
 'metamorphic': 'q',
 'clay_layer1': 'q',
 'clay_layer6': 'q',
 'clay_layer3': 'q',
 'clay_layer4': 'q',
 'clay_layer2': 'q',
 'clay_layer5': 'q',
 'sand_layer1': 'q',
 'sand_layer6': 'q',
 'sand_layer3': 'q',
 'sand_layer4': 'q',
 'sand_layer2': 'q',
 'sand_layer5': 'q',
 'silt_layer1': 'q',
 'silt_layer6': 'q',
 'silt_layer3': 'q',
 'silt_layer4': 'q',
 'silt_layer2': 'q',
 'silt_layer5': 'q',
 'Porosity_x': 'q',
 'logK_Ice_x': 'q',
 'ohdb_latitude': 'q',
 'ohdb_longitude': 'q',
 'year': 'c',
 'month': 'c',
 'gauge_id': 'c',
 'country_id': 'c',
 'freeze': 'c',
 'climate': 'q',
 'lwd_3': 'q',
 'p_3': 'q',
 'pres_3': 'q',
 'relhum_3': 'q',
 'swd_3': 'q',
 'spechum_3': 'q',
 'tmax_3': 'q',
 'tmin_3': 'q',
 'wind_3': 'q',
 'lwd_7': 'q',
 'p_7': 'q',
 'pres_7': 'q',
 'relhum_7': 'q',
 'swd_7': 'q',
 'spechum_7': 'q',
 'tmax_7': 'q',
 'tmin_7': 'q',
 'wind_7': 'q',
 '

In [57]:
df.freeze.astype(str)

0         0
1         0
2         0
3         0
4         0
         ..
753273    0
753274    0
753275    0
753276    0
753277    0
Name: freeze, Length: 753278, dtype: object

In [50]:
ml.load_config()

TypeError: Booster.load_config() missing 1 required positional argument: 'config'