In [1]:
# builtins
import sys
import os
import time
import logging
from datetime import timedelta
from logging.config import dictConfig
import numpy as np
import datetime
import pathlib
import pandas as pd
import joblib

# externals
import xarray as xr

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# locals
from downscaleml.core.dataset import ERA5Dataset, NetCDFDataset, EoDataset

from downscaleml.main.config import (NET, ERA5_PLEVELS, ERA5_PREDICTORS, PREDICTAND,
                                     CALIB_PERIOD, VALID_PERIOD, DOY, NORM,
                                     OVERWRITE, DEM, DEM_FEATURES, STRATIFY,
                                     WET_DAY_THRESHOLD, VALID_SIZE, 
                                     start_year, end_year, CHUNKS)

from downscaleml.main.inputoutput import (ERA5_PATH, OBS_PATH, DEM_PATH, MODEL_PATH, TARGET_PATH)

from downscaleml.core.constants import (ERA5_P_VARIABLES, ERA5_P_VARIABLES_SHORTCUT, ERA5_P_VARIABLE_NAME,
                                        ERA5_S_VARIABLES, ERA5_S_VARIABLES_SHORTCUT, ERA5_S_VARIABLE_NAME,
                                        ERA5_VARIABLES, ERA5_VARIABLE_NAMES, ERA5_PRESSURE_LEVELS,
                                        PREDICTANDS, ERA5_P_VARIABLES, ERA5_S_VARIABLES)

from downscaleml.core.utils import NAMING_Model, normalize, search_files, LogConfig
from downscaleml.core.logging import log_conf
    
# module level logger
LOGGER = logging.getLogger(__name__)

def stacker(xarray_dataset):
    # stack along the lat and lon dimensions
    stacked = xarray_dataset.stack()
    dask_arr = stacked.to_array().data
    xarray_dataset = dask_arr.T
    LogConfig.init_log('Shape of the {} is in (spatial, time, variables):{}'.format(xarray_dataset, xarray_dataset.shape))
    return xarray_dataset

def doy_encoding(X, y=None, doy=False):

    # whether to include the day of the year as predictor variable
    if doy:
        # add doy to set of predictor variables
        LOGGER.info('Adding day of the year to predictor variables ...')
        X = X.assign(EoDataset.encode_doys(X, chunks=X.chunks))

    print(X)
    return X

if __name__ == '__main__':

    # initialize timing
    start_time = time.monotonic()
        
    # initialize network filename
    state_file = NAMING_Model.state_file(
        NET, PREDICTAND, ERA5_PREDICTORS, ERA5_PLEVELS, WET_DAY_THRESHOLD, dem=DEM,
        dem_features=DEM_FEATURES, doy=DOY, stratify=STRATIFY)
    
    state_file = MODEL_PATH.joinpath(PREDICTAND, state_file)
    target = TARGET_PATH.joinpath(PREDICTAND)

    # check if output path exists
    if not target.exists():
        target.mkdir(parents=True, exist_ok=True)
    # initialize logging
    log_file = state_file.with_name(state_file.name + "_log.txt")
    
    if log_file.exists():
        log_file.unlink()
    dictConfig(log_conf(log_file))

    # check if target dataset already exists
    target = target.joinpath(state_file.name + '.nc')
    if target.exists() and not OVERWRITE:
        LogConfig.init_log('{} already exists.'.format(target))
        sys.exit()

    LogConfig.init_log('Initializing downscaling for period: {}'.format(
        ' - '.join([str(CALIB_PERIOD[0]), str(CALIB_PERIOD[-1])])))

    # initialize ERA5 predictor dataset
    LogConfig.init_log('Initializing ERA5 predictors.')
    Era5 = ERA5Dataset(ERA5_PATH.joinpath('ERA5'), ERA5_PREDICTORS,
                       plevels=ERA5_PLEVELS)
    Era5_ds = Era5.merge(chunks=CHUNKS)
    Era5_ds = Era5_ds.rename({'lon': 'x','lat': 'y'})
    
    # initialize OBS predictand dataset
    LogConfig.init_log('Initializing observations for predictand: {}'
                       .format(PREDICTAND))

    # read in-situ gridded observations
    Obs_ds = search_files(OBS_PATH.joinpath(PREDICTAND), '.nc$').pop()
    Obs_ds = xr.open_dataset(Obs_ds)
    Obs_ds = Obs_ds.rename({'lon': 'x','lat': 'y'})

    # whether to use digital elevation model
    if DEM:
        # digital elevation model: Copernicus EU-Dem v1.1
        dem = search_files(DEM_PATH, '^interTwin_dem.nc$').pop()

        # read elevation and compute slope and aspect
        dem = ERA5Dataset.dem_features(
            dem, {'y': Era5_ds.y, 'x': Era5_ds.x},
            add_coord={'time': Era5_ds.time})

        # check whether to use slope and aspect
        if not DEM_FEATURES:
            dem = dem.drop_vars(['slope', 'aspect']).chunk(Era5_ds.chunks)

        # add dem to set of predictor variables
        dem = dem.chunk(Era5_ds.chunks)
        Era5_ds = xr.merge([Era5_ds, dem])

    # initialize training data
    LogConfig.init_log('Initializing training data.')

    # split calibration period into training and validation period
    if PREDICTAND == 'pr' and STRATIFY:
        # stratify training and validation dataset by number of
        # observed wet days for precipitation
        wet_days = (Obs_ds.sel(time=CALIB_PERIOD).mean(dim=('y', 'x'))
                    >= WET_DAY_THRESHOLD).to_array().values.squeeze()
        train, valid = train_test_split(
            CALIB_PERIOD, stratify=wet_days, test_size=VALID_SIZE)

        # sort chronologically
        train, valid = sorted(train), sorted(valid)
        Era5_train, Obs_train = Era5_ds.sel(time=train), Obs_ds.sel(time=train)
        Era5_valid, Obs_valid = Era5_ds.sel(time=valid), Obs_ds.sel(time=valid)
    else:
        LogConfig.init_log('We are not calculating Stratified Precipitation based on Wet Days here!')

    # training and validation dataset
    Era5_train, Obs_train = Era5_ds.sel(time=CALIB_PERIOD), Obs_ds.sel(time=CALIB_PERIOD)
    Era5_valid, Obs_valid = Era5_ds.sel(time=VALID_PERIOD), Obs_ds.sel(time=VALID_PERIOD)

    Era5_train = doy_encoding(Era5_train, Obs_train, doy=DOY)
    Era5_valid = doy_encoding(Era5_valid, Obs_valid, doy=DOY)

    predictors_train = Era5_train
    predictors_valid = Era5_valid
    predictand_train = Obs_train
    predictand_valid = Obs_valid
    
    predictors_train = stacker(predictors_train).compute()
    predictors_valid = stacker(predictors_valid).compute()
    predictand_train = stacker(predictand_train)
    predictand_valid = stacker(predictand_valid)
    
    LogConfig.init_log('Dask computations done!')
    # iterate over the grid points
    LogConfig.init_log('Downscaling by Random Forest Starts: iterating each grid cell over time dimension')
    
    Models = {
        'RandomForestRegressor' : RandomForestRegressor,
        'XGBRegressor' : XGBRegressor,
        'AdaBoostRegressor': AdaBoostRegressor,
        'LGBMRegressor': LGBMRegressor,
    }
    Model_name = NET

downscaleml.core.utils: --------------------------------------------------------------------------------
downscaleml.core.utils: 2024-02-14T12:07:26: Initializing downscaling for period: 2014-01-01 - 2015-12-31
downscaleml.core.utils: --------------------------------------------------------------------------------
downscaleml.core.utils: --------------------------------------------------------------------------------
downscaleml.core.utils: 2024-02-14T12:07:26: Initializing ERA5 predictors.
downscaleml.core.utils: --------------------------------------------------------------------------------
downscaleml.core.utils: Searching: /mnt/CEPH_PROJECTS/InterTwin/Climate_Downscaling/hydroModelDownscale/p_REANALYSIS/ERA5/geopotential, pattern: .nc$
downscaleml.core.utils: Searching: /mnt/CEPH_PROJECTS/InterTwin/Climate_Downscaling/hydroModelDownscale/p_REANALYSIS/ERA5/temperature, pattern: .nc$
downscaleml.core.utils: Searching: /mnt/CEPH_PROJECTS/InterTwin/Climate_Downscaling/hydroModelDownsc

<xarray.Dataset>
Dimensions:    (time: 730, x: 161, y: 96)
Coordinates:
  * time       (time) datetime64[ns] 2014-01-01 2014-01-02 ... 2015-12-31
  * x          (x) float64 5.084 5.151 5.218 5.285 ... 15.62 15.69 15.76 15.82
  * y          (y) float64 43.62 43.69 43.75 43.82 ... 49.8 49.86 49.93 50.0
Data variables: (12/17)
    z_500      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    z_850      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    t_500      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    t_850      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    u_500      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    u_850      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    ...         ...
    t2m        (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    elevation  (time, y,

downscaleml.core.utils: --------------------------------------------------------------------------------
downscaleml.core.utils: 2024-02-14T12:07:29: Shape of the dask.array<transpose, shape=(161, 96, 730, 17), dtype=float32, chunksize=(161, 96, 365, 1), chunktype=numpy.ndarray> is in (spatial, time, variables):(161, 96, 730, 17)
downscaleml.core.utils: --------------------------------------------------------------------------------


<xarray.Dataset>
Dimensions:    (time: 365, x: 161, y: 96)
Coordinates:
  * time       (time) datetime64[ns] 2016-01-01 2016-01-02 ... 2016-12-30
  * x          (x) float64 5.084 5.151 5.218 5.285 ... 15.62 15.69 15.76 15.82
  * y          (y) float64 43.62 43.69 43.75 43.82 ... 49.8 49.86 49.93 50.0
Data variables: (12/17)
    z_500      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    z_850      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    t_500      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    t_850      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    u_500      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    u_850      (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    ...         ...
    t2m        (time, y, x) float32 dask.array<chunksize=(357, 96, 161), meta=np.ndarray>
    elevation  (time, y,

downscaleml.core.utils: --------------------------------------------------------------------------------
downscaleml.core.utils: 2024-02-14T12:07:47: Shape of the dask.array<transpose, shape=(161, 96, 365, 17), dtype=float32, chunksize=(161, 96, 357, 1), chunktype=numpy.ndarray> is in (spatial, time, variables):(161, 96, 365, 17)
downscaleml.core.utils: --------------------------------------------------------------------------------
downscaleml.core.utils: --------------------------------------------------------------------------------
downscaleml.core.utils: 2024-02-14T12:07:56: Shape of the [[[[282.51562]
   [284.83865]
   [284.09058]
   ...
   [283.90338]
   [281.9838 ]
   [283.27625]]

  [[282.11453]
   [284.16345]
   [283.51794]
   ...
   [283.28622]
   [281.8241 ]
   [282.96323]]

  [[282.27887]
   [283.824  ]
   [283.46872]
   ...
   [283.4605 ]
   [281.72687]
   [283.02945]]

  ...

  [[278.4892 ]
   [280.05664]
   [280.6861 ]
   ...
   [278.6501 ]
   [277.42   ]
   [278.915  ]

CV Steps - 10 fold cross validation

In [13]:
import math
import random


combination = 100



In [26]:
def grid_creator(combinations, numpy_object):
    gridded = np.ones(shape=(int(math.sqrt(combinations)), int(math.sqrt(combinations)), numpy_object.shape[2], numpy_object.shape[3])) * np.nan
    return gridded

#predictors_train_grid = np.ones(shape=(int(math.sqrt(combinations)), int(math.sqrt(combinations)), predictors_train.shape[2], predictors_train.shape[3])) * np.nan
#predictand_train_grid = np.ones(shape=(int(math.sqrt(combinations)), int(math.sqrt(combinations)), predictand_train.shape[2], predictand_train.shape[3])) * np.nan
#predictors_valid_grid = np.ones(shape=(int(math.sqrt(combinations)), int(math.sqrt(combinations)), predictors_valid.shape[2], predictors_valid.shape[3])) * np.nan
#predictand_valid_grid = np.ones(shape=(int(math.sqrt(combinations)), int(math.sqrt(combinations)), predictand_valid.shape[2], predictand_valid.shape[3])) * np.nan

predictors_train_grid = grid_creator(combination, predictors_train)
predictand_train_grid = grid_creator(combination, predictand_train)
predictors_valid_grid = grid_creator(combination, predictors_valid)
predictand_valid_grid = grid_creator(combination, predictand_valid)

x_range = (0, 161)
y_range = (0, 96)
random.seed(42)

dummy_array = np.ones(shape=(int(math.sqrt(combinations)), int(math.sqrt(combinations))))



In [30]:
dummy_array[1]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [25]:
for _ in range(combination):
    x = random.randint(*x_range)
    y = random.randint(*y_range)
    list.append((x, y))

# Print the list of tuples
print(list)

NameError: name 'array' is not defined

In [None]:
for i in range(int(math.sqrt(combination))):
    for j in range(int(math.sqrt(combination))):
        predictors_train_grid = 
        predictand_train_grid = 
        predictors_valid_grid =  
        predictand_valid_grid = 
        