# tsai/data/preprocessing.py

# AUTOGENERATED! DO NOT EDIT! File to edit: ../../nbs/009_data.preprocessing.ipynb.

# %% ../../nbs/009_data.preprocessing.ipynb 3
from __future__ import annotations
from ..imports import *
import re
from joblib import dump, load
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from pandas._libs.tslibs.timestamps import Timestamp
from fastcore.transform import Transform, ItemTransform, Pipeline
from fastai.data.transforms import Categorize
from fastai.data.load import DataLoader
from fastai.tabular.core import df_shrink_dtypes, make_date
from ..utils import *
from .core import *
from .preparation import *

# %% auto 0
# Names exported by this module when it is star-imported (`from ... import *`).
__all__ = ['Nan2Value', 'TSRandomStandardize', 'default_date_attr', 'PD_TIME_UNITS', 'StandardScaler', 'RobustScaler',
           'Normalizer', 'BoxCox', 'YeoJohnshon', 'Quantile', 'ToNumpyCategory', 'OneHot', 'TSNan2Value',
           'TSStandardize', 'TSNormalize', 'TSStandardizeTuple', 'TSCatEncode', 'TSDropFeatByKey', 'TSClipOutliers',
           'TSClip', 'TSSelfMissingness', 'TSRobustScale', 'get_stats_with_uncertainty', 'get_random_stats',
           'TSGaussianStandardize', 'TSDiff', 'TSLog', 'TSCyclicalPosition', 'TSLinearPosition', 'TSMissingness',
           'TSPositionGaps', 'TSRollingMean', 'TSLogReturn', 'TSAdd', 'TSClipByVar', 'TSDropVars', 'TSOneHotEncode',
           'TSPosition', 'PatchEncoder', 'TSPatchEncoder', 'TSTuplePatchEncoder', 'TSShrinkDataFrame', 'object2date',
           'TSOneHotEncoder', 'TSCategoricalEncoder', 'TSTargetEncoder', 'TSDateTimeEncoder', 'TSDropIfTrueCols',
           'TSApplyFunction', 'TSMissingnessEncoder', 'TSSortByColumns', 'TSSelectColumns', 'TSStepsSinceStart',
           'TSStandardScaler', 'TSRobustScaler', 'TSAddMissingTimestamps', 'TSDropDuplicates', 'TSFillMissing',
           'Preprocessor', 'ReLabeler']

# %% ../../nbs/009_data.preprocessing.ipynb 6
class ToNumpyCategory(Transform):
    """Encode the items of a numpy batch as category indices.

    Every call to `encodes` fits a new `Categorize` on the incoming array and
    keeps it (`self.cat`) together with its vocabulary (`self.vocab`) so that
    `decodes` can later map indices back to the original labels.
    """
    order = 90

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _decode_items(self, o):
        # Decode item by item with the categorizer fitted in the last `encodes` call.
        decoded_items = [self.cat.decode(idx) for idx in o]
        return stack(decoded_items)

    def encodes(self, o: np.ndarray):
        self.type = type(o)
        self.cat = Categorize()
        self.cat.setup(o)
        self.vocab = self.cat.vocab
        encoded_items = [self.cat(item) for item in o]
        return np.asarray(stack(encoded_items))

    def decodes(self, o: np.ndarray):
        return self._decode_items(o)

    def decodes(self, o: torch.Tensor):
        return self._decode_items(o)

# %% ../../nbs/009_data.preprocessing.ipynb 9
class OneHot(Transform):
    """One-hot encode / decode a batch.

    Args:
        n_classes: number of classes (width of the one-hot vectors). When it is
            falsy it is inferred, on the first encoded batch, as the number of
            distinct values found in that batch and then reused afterwards.

    `encodes` indexes an identity matrix with the labels, so labels are
    expected to be integer class indices (numpy inputs are first categorized
    with `ToNumpyCategory`). `decodes` returns the argmax over the last axis.
    """
    order = 90

    def __init__(self, n_classes=None, **kwargs):
        self.n_classes = n_classes
        super().__init__(**kwargs)

    def encodes(self, o: torch.Tensor):
        # Stay in torch (no numpy round trip) and build the identity matrix on
        # the same device as the labels, so non-CPU batches can be encoded too.
        if not self.n_classes: self.n_classes = torch.unique(o).numel()
        return torch.eye(self.n_classes, device=o.device)[o]

    def encodes(self, o: np.ndarray):
        o = ToNumpyCategory()(o)
        if not self.n_classes: self.n_classes = len(np.unique(o))
        return np.eye(self.n_classes)[o]

    def decodes(self, o: torch.Tensor): return torch.argmax(o, dim=-1)

    def decodes(self, o: np.ndarray): return np.argmax(o, axis=-1)

# %% ../../nbs/009_data.preprocessing.ipynb 13
class TSNan2Value(Transform):
    """Replaces any nan values by a predefined value or median

    Args:
        value: value used to replace the nan values that remain after the
            (optional) median fill.
        median: if True, nan values are first replaced by a median.
        by_sample_and_var: with `median=True`, use the nan-median computed along
            the last axis (dim=2) of each sample/variable; otherwise a single
            nan-median of all the (selected) values is used.
        sel_vars: optional index (e.g. slice or list of ints) selecting, on
            axis 1, the variables to process. None processes every variable.
    """
    order = 90
    def __init__(self, value=0, median=False, by_sample_and_var=True, sel_vars=None):
        store_attr()
        if not ismin_torch("1.8"):
            raise ValueError('This function only works with Pytorch>=1.8.')

    def _fill_nans(self, x):
        "Return `x` with its nan values replaced (median first if requested, then `value`)."
        if self.median:
            mask = torch.isnan(x)
            if mask.any():
                if self.by_sample_and_var:
                    median = torch.nanmedian(x, dim=2, keepdim=True)[0].repeat(1, 1, x.shape[-1])
                    x[mask] = median[mask]
                else:
                    x = torch.nan_to_num(x, torch.nanmedian(x))
        # A median can itself be nan (all-nan series), so `value` is always applied last.
        return torch.nan_to_num(x, self.value)

    def encodes(self, o:TSTensor):
        if self.sel_vars is None:
            return self._fill_nans(o)
        # Indexing with a list returns a copy: fill the selected sub-tensor and
        # write it back explicitly so the median fill is not lost.
        o[:, self.sel_vars] = self._fill_nans(o[:, self.sel_vars])
        return o


# Alias: `Nan2Value` refers to the same class as `TSNan2Value`.
Nan2Value = TSNan2Value

# %% ../../nbs/009_data.preprocessing.ipynb 16
class TSStandardize(Transform):
    """Standardizes batch of type `TSTensor`

    Args:
        - mean: precalculated mean (converted to a tensor) to be used, or None, in which case it is estimated from data.
        - std: precalculated std (converted to a tensor) to be used, or None, in which case it is estimated from data.
            Statistics are only estimated in `setups` when mean or std is missing and `by_sample` is False.
        - by_sample: if True, mean and std are calculated for each individual sample every time a batch is encoded.
            Otherwise they are calculated across samples.
        - by_var:
            * False: the same mean and std are used for all variables.
            * True: a different mean and std are calculated for each variable.
            * a list of ints: (like [0,1,3]) a different mean and std are calculated for each variable on the list. Variables
            not included in the list keep mean 0 and std 1 (they are not standardized).
            * a list that contains a list/lists: (like [0, [1,3]]) a different mean and std are calculated for each element of
            the list. Variables grouped in a sublist share the same mean and std (in the example, one pair for variable 0 and
            another one shared by variables 1 & 3). Variables not included in the list are not standardized.
        - by_step: if True, mean and std are calculated separately for each time step (axis 2 is not reduced).
        - exc_vars: variables (indices on axis 1) whose mean is forced to 0 and std to 1, so they are not standardized.
        - eps: lower bound applied to std to avoid dividing by 0.
        - use_single_batch: if True a single batch from the dataloader is used to calculate mean & std. Else the whole
            dataset of the dataloader is used.
        - verbose: passed to `pv` to control the printing of the statistics.
    """

    parameters, order = L('mean', 'std'), 90
    _setup = True # indicates it requires set up
    def __init__(self, mean=None, std=None, by_sample=False, by_var=False, by_step=False, exc_vars=None, eps=1e-8, use_single_batch=True, verbose=False, **kwargs):
        super().__init__(**kwargs)
        self.mean = None if mean is None else tensor(mean)
        self.std = None if std is None else tensor(std)
        self._setup = (mean is None or std is None) and not by_sample
        self.eps, self.exc_vars = eps, exc_vars
        self.by_sample, self.by_var, self.by_step = by_sample, by_var, by_step
        # Axes to reduce: (sample, variable, step) minus those handled independently.
        self.axes = tuple(ax for ax, independent in enumerate((by_sample, by_var, by_step)) if not independent)
        if by_var and is_listy(by_var):
            # Grouped variables are additionally reduced across the variable axis.
            self.list_axes = self.axes + (1,)
        self.use_single_batch, self.verbose = use_single_batch, verbose
        if not (self.mean is None and self.std is None):
            pv(f'{self.__class__.__name__} mean={self.mean}, std={self.std}, by_sample={self.by_sample}, by_var={self.by_var}, by_step={self.by_step}\n', 
               self.verbose)

    @classmethod
    def from_stats(cls, mean, std): return cls(mean, std)

    def _get_stats(self, o):
        "Calculate (mean, std) of `o` according to by_sample / by_var / by_step / exc_vars."
        keepdim = self.axes != ()
        if self.by_var and is_listy(self.by_var):
            shape = torch.mean(o, dim=self.axes, keepdim=keepdim).shape
            # Variables that are not listed keep mean 0 / std 1.
            mean = torch.zeros(*shape, device=o.device)
            std = torch.ones(*shape, device=o.device)
            for v in self.by_var:
                if not is_listy(v): v = [v]
                dims = self.axes if len(v) == 1 else self.list_axes
                mean[:, v] = torch_nanmean(o[:, v], dim=dims, keepdim=True)
                std[:, v] = torch.clamp_min(torch_nanstd(o[:, v], dim=dims, keepdim=True), self.eps)
        else:
            mean = torch_nanmean(o, dim=self.axes, keepdim=keepdim)
            std = torch.clamp_min(torch_nanstd(o, dim=self.axes, keepdim=keepdim), self.eps)
        if self.exc_vars is not None:
            mean[:, self.exc_vars] = 0.
            std[:, self.exc_vars] = 1.
        return mean, std

    def setups(self, dl: DataLoader):
        if not self._setup:
            # Placeholders only: with by_sample the statistics are recalculated for every batch.
            if self.by_sample: self.mean, self.std = torch.zeros(1), torch.ones(1)
            return
        if self.use_single_batch:
            o, *_ = dl.one_batch()
        else:
            o = dl.dataset.__getitem__([slice(None)])[0]
        self.mean, self.std = self._get_stats(o)
        if len(self.mean.shape) == 0:
            pv(f'{self.__class__.__name__} mean={self.mean}, std={self.std}, by_sample={self.by_sample}, by_var={self.by_var}, by_step={self.by_step}\n',
               self.verbose)
        else:
            pv(f'{self.__class__.__name__} mean shape={self.mean.shape}, std shape={self.std.shape}, by_sample={self.by_sample}, by_var={self.by_var}, by_step={self.by_step}\n',
               self.verbose)
        self._setup = False

    def encodes(self, o:TSTensor):
        if self.by_sample:
            self.mean, self.std = self._get_stats(o)
        return (o - self.mean) / self.std

    def decodes(self, o:TSTensor):
        if self.mean is None or self.std is None: return o
        return o * self.std + self.mean

    def __repr__(self): return f'{self.__class__.__name__}(by_sample={self.by_sample}, by_var={self.by_var}, by_step={self.by_step})'

# %% ../../nbs/009_data.preprocessing.ipynb 24
@patch
def mul_min(x:torch.Tensor|TSTensor|NumpyTensor, axes=(), keepdim=False):
    "Minimum of `x` over one or several `axes` (global minimum when `axes` is `()`), keeping the type of `x`."
    if axes == (): return retain_type(x.min(), x)
    dims = axes if is_listy(axes) else [axes]
    result = x
    # Reduce the highest axis first so the remaining axis numbers stay valid when keepdim is False.
    for dim in sorted(dims, reverse=True):
        result = result.min(dim, keepdim)[0]
    return retain_type(result, x)


@patch
def mul_max(x:torch.Tensor|TSTensor|NumpyTensor, axes=(), keepdim=False):
    "Maximum of `x` over one or several `axes` (global maximum when `axes` is `()`), keeping the type of `x`."
    if axes == (): return retain_type(x.max(), x)
    dims = axes if is_listy(axes) else [axes]
    result = x
    # Reduce the highest axis first so the remaining axis numbers stay valid when keepdim is False.
    for dim in sorted(dims, reverse=True):
        result = result.max(dim, keepdim)[0]
    return retain_type(result, x)


class TSNormalize(Transform):
    """Normalizes batch of type `TSTensor`

    Values are linearly mapped so that the estimated (or given) min/max land on `range`.

    Args:
        - min, max: precalculated statistics (converted to tensors), or None to estimate them from data in `setups`.
        - range: (low, high) target range of the normalized values.
        - by_sample: if True, min and max are recalculated for each individual sample every time a batch is encoded.
        - by_var: False (shared statistics), True (one per variable), or a list of ints / sublists. Variables grouped in
            a sublist share statistics. Variables that are not listed are left unchanged.
        - by_step: if True, statistics are calculated separately for each time step (axis 2 is not reduced).
        - clip_values: if True, the output is clamped to `range` (only the listed variables when `by_var` is a list).
        - use_single_batch: if True a single batch from the dataloader is used to estimate min & max. Else the whole
            dataset of the dataloader is used.
        - verbose: passed to `pv` to control the printing of the statistics.
    """
    parameters, order = L('min', 'max'), 90
    _setup = True # indicates it requires set up
    def __init__(self, min=None, max=None, range=(-1, 1), by_sample=False, by_var=False, by_step=False, clip_values=True, 
                 use_single_batch=True, verbose=False, **kwargs):
        super().__init__(**kwargs)
        self.min = tensor(min) if min is not None else None
        self.max = tensor(max) if max is not None else None
        self._setup = (self.min is None and self.max is None) and not by_sample
        self.range_min, self.range_max = range
        self.by_sample, self.by_var, self.by_step = by_sample, by_var, by_step
        drop_axes = []
        if by_sample: drop_axes.append(0)
        if by_var: drop_axes.append(1)
        if by_step: drop_axes.append(2)
        # Axes to reduce: (sample, variable, step) minus those handled independently.
        self.axes = tuple([ax for ax in (0, 1, 2) if ax not in drop_axes])
        if by_var and is_listy(by_var):
            # Grouped variables are additionally reduced across the variable axis.
            self.list_axes = self.axes + (1,)
        self.clip_values = clip_values
        self.use_single_batch = use_single_batch
        self.verbose = verbose
        if self.min is not None or self.max is not None:
            pv(f'{self.__class__.__name__} min={self.min}, max={self.max}, by_sample={self.by_sample}, by_var={self.by_var}, by_step={self.by_step}\n', self.verbose)

    @classmethod
    def from_stats(cls, min, max, range_min=0, range_max=1):
        "Build the transform from precalculated `min`/`max` and a target range."
        # `__init__` takes the target range as a single `range` tuple.
        return cls(min, max, range=(range_min, range_max))

    def _get_min_max(self, o):
        "Calculate (min, max) of `o` according to by_sample / by_var / by_step."
        keepdim = self.axes != ()
        if self.by_var and is_listy(self.by_var):
            shape = torch.mean(o, dim=self.axes, keepdim=keepdim).shape
            # Defaults equal to the target range make the normalization an identity
            # for the variables that are not listed in `by_var`.
            _min = torch.zeros(*shape, device=o.device) + self.range_min
            _max = torch.zeros(*shape, device=o.device) + self.range_max
            for v in self.by_var:
                if not is_listy(v): v = [v]
                dims = self.axes if len(v) == 1 else self.list_axes
                _min[:, v] = o[:, v].mul_min(dims, keepdim=keepdim)
                _max[:, v] = o[:, v].mul_max(dims, keepdim=keepdim)
        else:
            _min, _max = o.mul_min(self.axes, keepdim=keepdim), o.mul_max(self.axes, keepdim=keepdim)
        return _min, _max

    def setups(self, dl: DataLoader):
        if self._setup:
            if not self.use_single_batch:
                o = dl.dataset.__getitem__([slice(None)])[0]
            else:
                o, *_ = dl.one_batch()
            self.min, self.max = self._get_min_max(o)
            if len(self.min.shape) == 0: 
                pv(f'{self.__class__.__name__} min={self.min}, max={self.max}, by_sample={self.by_sample}, by_var={self.by_var}, by_step={self.by_step}\n', 
                   self.verbose)
            else:
                pv(f'{self.__class__.__name__} min shape={self.min.shape}, max shape={self.max.shape}, by_sample={self.by_sample}, by_var={self.by_var}, by_step={self.by_step}\n', 
                   self.verbose)
            self._setup = False
        # Placeholders only: with by_sample the statistics are recalculated for every batch.
        elif self.by_sample: self.min, self.max = -torch.ones(1), torch.ones(1)

    def encodes(self, o:TSTensor): 
        if self.by_sample:
            # Same calculation as in `setups`, applied to the current batch.
            self.min, self.max = self._get_min_max(o)
        output = ((o - self.min) / (self.max - self.min)) * (self.range_max - self.range_min) + self.range_min
        if self.clip_values:
            if self.by_var and is_listy(self.by_var):
                # Only the normalized variables are clipped; the others are passed through untouched.
                for v in self.by_var:
                    if not is_listy(v): v = [v]
                    output[:, v] = torch.clamp(output[:, v], self.range_min, self.range_max)
            else:
                output = torch.clamp(output, self.range_min, self.range_max)
        return output

    def __repr__(self): return f'{self.__class__.__name__}(by_sample={self.by_sample}, by_var={self.by_var}, by_step={self.by_step})'

# %% ../../nbs/009_data.preprocessing.ipynb 28
class TSStandardizeTuple(ItemTransform):
    """Standardizes X (and y if provided)

    `eps` is added to the given stds. When `y_mean` / `y_std` are not passed,
    the corresponding X statistic is reused for y. Tuples whose length is
    neither 1 nor 2 are not handled (the methods return None for them).
    """
    parameters, order = L('x_mean', 'x_std', 'y_mean', 'y_std'), 90

    def __init__(self, x_mean, x_std, y_mean=None, y_std=None, eps=1e-5):
        self.x_mean = torch.as_tensor(x_mean).float()
        self.x_std = torch.as_tensor(x_std + eps).float()
        self.y_mean = torch.as_tensor(y_mean).float() if y_mean is not None else self.x_mean
        self.y_std = torch.as_tensor(y_std + eps).float() if y_std is not None else self.x_std

    def encodes(self, xy):
        n_items = len(xy)
        if n_items == 2:
            x, y = xy
            return ((x - self.x_mean) / self.x_std, (y - self.y_mean) / self.y_std)
        if n_items == 1:
            return ((xy[0] - self.x_mean) / self.x_std, )

    def decodes(self, xy):
        n_items = len(xy)
        if n_items == 2:
            x, y = xy
            return (x * self.x_std + self.x_mean, y * self.y_std + self.y_mean)
        if n_items == 1:
            return (xy[0] * self.x_std + self.x_mean, )

# %% ../../nbs/009_data.preprocessing.ipynb 30
class TSCatEncode(Transform):
    """Encodes a variable based on a categorical array

    The sorted unique values of `a` are mapped to the codes 1..n. Values of the
    selected variable that are not found in `a` are encoded as 0. The input
    tensor is modified in place and returned.
    """
    def __init__(self, a, sel_var):
        categories = np.unique(a)
        codes = np.arange(1, len(categories) + 1)
        self.o2i = dict(zip(categories, codes))
        self.a_key = torch.from_numpy(categories)
        self.sel_var = sel_var

    def encodes(self, o:TSTensor):
        selected = o[:, self.sel_var]
        known = torch.isin(selected, self.a_key.to(o.device))
        encoded = torch.zeros_like(selected)
        # `apply_` is not available for cuda tensors: map on the cpu and move the result back.
        encoded[known] = selected[known].cpu().apply_(self.o2i.get).to(o.device)
        o[:, self.sel_var] = encoded
        return o

# %% ../../nbs/009_data.preprocessing.ipynb 33
class TSDropFeatByKey(Transform):
    """Randomly drops selected features at selected steps based 
    with a given probability per feature, step and a key variable

    The (rounded, nan-as-0) values of the key variable are used as integer
    indices into the first axis of `p` to look up the drop probabilities.
    Dropped values are set to nan, and the input tensor is modified in place.
    """
    parameters, order = 'p', 90
    
    def __init__(self, 
    key_var, # int representing the variable that contains the key information
    p, # array of shape (n_keys, n_features, n_steps) representing the probabilities of dropping a feature at a given step for a given key
    sel_vars, # int or slice or list of ints or array of ints representing the variables to drop
    sel_steps=None, # int or slice or list of ints or array of ints representing the steps to drop
    **kwargs,
    ):
        super().__init__(**kwargs)
        if isinstance(p, np.ndarray):
            p = torch.from_numpy(p)
        if not isinstance(sel_vars, slice):
            if isinstance(sel_vars, Integral): sel_vars = [sel_vars]
            sel_vars = np.asarray(sel_vars)
            # When both selections are arrays they are reshaped to (n, 1) / (1, m)
            # so that indexing with them broadcasts to an (n, m) grid.
            if not isinstance(sel_steps, slice) and sel_steps is not None:
                sel_vars = sel_vars.reshape(-1, 1)
        if sel_steps is None:
            sel_steps = slice(None)
        elif not isinstance(sel_steps, slice):
            if isinstance(sel_steps, Integral): sel_steps = [sel_steps]
            sel_steps = np.asarray(sel_steps)
            if not isinstance(sel_vars, slice):
                sel_steps = sel_steps.reshape(1, -1)
        self.key_var, self.p = key_var, p
        self.sel_vars, self.sel_steps = sel_vars, sel_steps
        # Index applied to `p[key values]` in `encodes`. It is stored as a tuple because
        # indexing a tensor with a list that contains slices is deprecated in PyTorch.
        any_slice = isinstance(self.sel_vars, slice) or isinstance(self.sel_steps, slice)
        if p.shape[-1] == 1:
            # A single probability per (key, feature): always take step 0 of `p`.
            if any_slice:
                self._idxs = (slice(None), slice(None), slice(None), 0)
            else:
                self._idxs = (slice(None), 0, slice(None), slice(None), 0)
        else:
            # Pair each step position of the lookup with the same step position of `p`.
            steps = np.arange(p.shape[-1])
            if any_slice:
                self._idxs = (slice(None), steps, slice(None), steps)
            else:
                self._idxs = (slice(None), 0, steps, slice(None), steps)

    def encodes(self, o:TSTensor):
        o_slice = o[:, self.sel_vars, self.sel_steps]
        o_values = o[:, self.key_var, self.sel_steps]
        # Keys must be valid integer indices: nan -> 0, then round.
        o_values = torch.nan_to_num(o_values)
        o_values = torch.round(o_values).long()
        if self.p.shape[-1] == 1:
            p = self.p[o_values][self._idxs].permute(0, 2, 1)
        else:
            p = self.p[o_values][self._idxs].permute(1, 2, 0)
        mask = torch.rand_like(o_slice) < p
        o_slice[mask] = np.nan
        o[:, self.sel_vars, self.sel_steps] = o_slice
        return o

# %% ../../nbs/009_data.preprocessing.ipynb 35
class TSClipOutliers(Transform):
    """Clip outliers batch of type `TSTensor` based on the IQR

    The clipping bounds are either passed (`min`, `max`), estimated once in
    `setups` with `get_outliers_IQR`, or, with `by_sample=True`, recalculated
    for every encoded batch. `by_sample` / `by_var` select the axes used for
    the estimation.
    """
    parameters, order = L('min', 'max'), 90
    _setup = True # indicates it requires set up
    def __init__(self, min=None, max=None, by_sample=False, by_var=False, use_single_batch=False, verbose=False, **kwargs):
        super().__init__(**kwargs)
        # Missing bounds default to +-inf, i.e. no clipping on that side until they are estimated.
        self.min = tensor(-np.inf) if min is None else tensor(min)
        self.max = tensor(np.inf) if max is None else tensor(max)
        self.by_sample, self.by_var = by_sample, by_var
        self._setup = (min is None or max is None) and not by_sample 
        if by_sample:
            self.axis = 2 if by_var else (1, 2)
        elif by_var:
            self.axis = (0, 2)
        else:
            self.axis = None
        self.use_single_batch, self.verbose = use_single_batch, verbose
        if not (min is None and max is None):
            pv(f'{self.__class__.__name__} min={min}, max={max}\n', self.verbose)

    def setups(self, dl: DataLoader):
        if not self._setup: return
        if self.use_single_batch:
            o, *_ = dl.one_batch()
        else:
            o = dl.dataset.__getitem__([slice(None)])[0]
        lower, upper = get_outliers_IQR(o, self.axis)
        self.min, self.max = tensor(lower), tensor(upper)
        if self.axis is None:
            pv(f'{self.__class__.__name__} min={self.min}, max={self.max}, by_sample={self.by_sample}, by_var={self.by_var}\n', 
               self.verbose)
        else:
            pv(f'{self.__class__.__name__} min={self.min.shape}, max={self.max.shape}, by_sample={self.by_sample}, by_var={self.by_var}\n', 
               self.verbose)
        self._setup = False

    def encodes(self, o:TSTensor):
        if self.axis is None: return torch.clamp(o, self.min, self.max)
        if self.by_sample:
            # Per-sample bounds are re-estimated from the batch being encoded.
            lower, upper = get_outliers_IQR(o, axis=self.axis)
            self.min, self.max = o.new(lower), o.new(upper)
        return torch_clamp(o, self.min, self.max)

    def __repr__(self): return f'{self.__class__.__name__}(by_sample={self.by_sample}, by_var={self.by_var})'

# %% ../../nbs/009_data.preprocessing.ipynb 37
class TSClip(Transform):
    "Clip  batch of type `TSTensor` to the fixed interval [`min`, `max`]"
    parameters, order = L('min', 'max'), 90

    def __init__(self, min=-6, max=6, **kwargs):
        super().__init__(**kwargs)
        # Bounds are kept as tensors (they are listed in `parameters`).
        self.min, self.max = torch.tensor(min), torch.tensor(max)

    def encodes(self, o:TSTensor):
        lower, upper = self.min, self.max
        return torch.clamp(o, lower, upper)

    def __repr__(self):
        name = self.__class__.__name__
        return f'{name}(min={self.min}, max={self.max})'

# %% ../../nbs/009_data.preprocessing.ipynb 39
class TSSelfMissingness(Transform):
    """Applies missingness from samples in a batch to random samples in the batch for selected variables

    The nan mask of the batch is passed through `rotate_axis0` and the
    positions flagged by the resulting mask are set to nan. The input tensor is
    modified in place and returned. `sel_vars=None` processes every variable.
    """
    order = 90

    def __init__(self, sel_vars=None, **kwargs):
        self.sel_vars = sel_vars
        super().__init__(**kwargs)

    def encodes(self, o:TSTensor):
        if self.sel_vars is None:
            o.masked_fill_(rotate_axis0(torch.isnan(o)), np.nan)
            return o
        selected = o[:, self.sel_vars]
        nan_mask = rotate_axis0(torch.isnan(selected))
        o[:, self.sel_vars] = selected.masked_fill(nan_mask, np.nan)
        return o

# %% ../../nbs/009_data.preprocessing.ipynb 41
class TSRobustScale(Transform):
    r"""This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range)

    Statistics are calculated per variable in `setups` (unless both `median` and `iqr` are passed): the median
    comes from `get_percentile(..., 50)` and the scale is the difference between the two bounds returned by
    `get_outliers_IQR` for `quantile_range`, clamped from below by `eps`. Variables in `exc_vars` get median 0
    and scale 1, so they are left unchanged.
    """
    parameters, order = L('median', 'iqr'), 90
    _setup = True # indicates it requires set up
    def __init__(self, median=None, iqr=None, quantile_range=(25.0, 75.0), use_single_batch=True, exc_vars=None, eps=1e-8, verbose=False, **kwargs):
        super().__init__(**kwargs)
        self.median = None if median is None else tensor(median)
        self.iqr = None if iqr is None else tensor(iqr)
        self._setup = median is None or iqr is None
        self.quantile_range, self.use_single_batch = quantile_range, use_single_batch
        self.exc_vars, self.eps, self.verbose = exc_vars, eps, verbose

    def setups(self, dl: DataLoader):
        if not self._setup:
            # Neutral fallbacks (no shift / no scaling) for any statistic that is still missing.
            if self.median is None: self.median = torch.zeros(1, device=dl.device)
            if self.iqr is None: self.iqr = torch.ones(1, device=dl.device)
            return
        if self.use_single_batch:
            o, *_ = dl.one_batch()
        else:
            o = dl.dataset.__getitem__([slice(None)])[0]

        # One row per variable holding all its values across samples and steps.
        per_var = o.permute(1,0,2).flatten(1)
        median = get_percentile(per_var, 50, axis=1)
        lower, upper = get_outliers_IQR(per_var, axis=1, quantile_range=self.quantile_range)
        self.median = median.unsqueeze(0)
        self.iqr = torch.clamp_min((upper - lower).unsqueeze(0), self.eps)
        if self.exc_vars is not None:
            self.median[:, self.exc_vars] = 0
            self.iqr[:, self.exc_vars] = 1

        pv(f'{self.__class__.__name__} median={self.median.shape} iqr={self.iqr.shape}', self.verbose)
        self._setup = False

    def encodes(self, o:TSTensor):
        centered = o - self.median
        return centered / self.iqr

    def __repr__(self): return f'{self.__class__.__name__}(quantile_range={self.quantile_range}, use_single_batch={self.use_single_batch})'

# %% ../../nbs/009_data.preprocessing.ipynb 44
def get_stats_with_uncertainty(o, sel_vars=None, sel_vars_zero_mean_unit_var=False, bs=64, n_trials=None, axis=(0,2)):
    """Estimate the mean/std of `o` and their variability across random batches.

    `n_trials` random batches of `bs` samples are drawn from `o`. The nan-aware
    mean and std of every batch are calculated over `axis`, and then the mean
    and the standard deviation of those per-batch statistics are calculated.

    Args:
        o: array-like indexed along its first axis. Objects exposing `oindex`
            are indexed through it, objects exposing `compute` are materialized
            after indexing, anything else is indexed directly.
        sel_vars: optional variables (axis 1). The uncertainty of all the other
            variables is set to 0.
        sel_vars_zero_mean_unit_var: if True (and `sel_vars` is passed), the
            expected mean / std of the non-selected variables are set to 0 / 1.
        bs: number of samples per random batch.
        n_trials: number of random batches. Defaults to `len(o) // bs`, with a
            minimum of 1 so that inputs smaller than `bs` are still supported.
        axis: axes reduced when calculating the statistics of each batch.

    Returns:
        `np.stack([E_mean, S_mean, E_std, S_std])`, cast to the dtype of `o`.
    """
    o_dtype = o.dtype
    if n_trials is None: n_trials = max(1, len(o) // bs)
    n_samples = n_trials * bs
    # Sample with replacement only when more samples than available are needed.
    random_idxs = random_choice(len(o), n_samples, n_samples > len(o))
    oi_mean = []
    oi_std = []
    for i in progress_bar(range(n_trials)):
        idxs = random_idxs[i * bs:(i + 1) * bs]
        if hasattr(o, 'oindex'):
            oi = o.oindex[idxs]
        elif hasattr(o, 'compute'):
            oi = o[idxs].compute()
        else:
            oi = o[idxs]
        oi = oi.astype('float32')
        oi_mean.append(np.nanmean(oi, axis=axis, keepdims=True))
        oi_std.append(np.nanstd(oi, axis=axis, keepdims=True))
    oi_mean = np.concatenate(oi_mean)
    oi_std = np.concatenate(oi_std)
    E_mean = np.nanmean(oi_mean, axis=0, keepdims=True).astype(o_dtype)
    S_mean = np.nanstd(oi_mean, axis=0, keepdims=True).astype(o_dtype)
    E_std = np.nanmean(oi_std, axis=0, keepdims=True).astype(o_dtype)
    S_std = np.nanstd(oi_std, axis=0, keepdims=True).astype(o_dtype)
    if sel_vars is not None:
        non_sel_vars = np.isin(np.arange(o.shape[1]), sel_vars, invert=True)
        if sel_vars_zero_mean_unit_var:
            E_mean[:, non_sel_vars] = 0 # zero mean
            E_std[:, non_sel_vars] = 1  # unit var
        S_mean[:, non_sel_vars] = 0 # no uncertainty
        S_std[:, non_sel_vars] = 0  # no uncertainty
    return np.stack([E_mean, S_mean, E_std, S_std])


def get_random_stats(E_mean, S_mean, E_std, S_std):
    """Draw a random (mean, std) pair around the expected statistics.

    Two independent standard-normal samples are drawn; the first one scales the
    uncertainty of the mean and the second one the uncertainty of the std.
    """
    z_mean, z_std = np.random.normal(0, 1, 2)
    return E_mean + S_mean * z_mean, E_std + S_std * z_std


class TSGaussianStandardize(Transform):
    """Scales each batch using modeled mean and std based on UNCERTAINTY MODELING FOR OUT-OF-DISTRIBUTION GENERALIZATION https://arxiv.org/abs/2202.03958

    For every encoded batch a mean and a std are sampled around their expected values (`E_mean`, `E_std`) using
    two standard-normal draws scaled by the uncertainties (`S_mean`, `S_std`). The sampled std is clamped from
    below by `eps`.
    """

    parameters, order = L('E_mean', 'S_mean', 'E_std', 'S_std'), 90
    def __init__(self, 
        E_mean : np.ndarray, # Mean expected value
        S_mean : np.ndarray, # Uncertainty (standard deviation) of the mean
        E_std : np.ndarray,  # Standard deviation expected value
        S_std : np.ndarray,  # Uncertainty (standard deviation) of the standard deviation
        eps=1e-8, # (epsilon) lower bound applied to the sampled standard deviation to avoid dividing by zero
        split_idx=0, # Flag to indicate to which set this transform is applied. 0: training, 1:validation, None:both
        **kwargs,
        ):
        self.E_mean = torch.from_numpy(E_mean)
        self.S_mean = torch.from_numpy(S_mean)
        self.E_std = torch.from_numpy(E_std)
        self.S_std = torch.from_numpy(S_std)
        self.eps = eps
        super().__init__(split_idx=split_idx, **kwargs)

    def encodes(self, o:TSTensor):
        # One draw for the mean and one for the std, shared by the whole batch.
        noise = torch.normal(0, 1, (2,), device=o.device)
        sampled_mean = self.E_mean + self.S_mean * noise[0]
        sampled_std = torch.clamp(self.E_std + self.S_std * noise[1], self.eps)
        return (o - sampled_mean) / sampled_std
    
# Alias: `TSRandomStandardize` refers to the same class as `TSGaussianStandardize`.
TSRandomStandardize = TSGaussianStandardize

# %% ../../nbs/009_data.preprocessing.ipynb 47
class TSDiff(Transform):
    "Differences batch of type `TSTensor` by delegating to `torch_diff` with the configured `lag` and `pad`"
    order = 90

    def __init__(self, lag=1, pad=True, **kwargs):
        super().__init__(**kwargs)
        self.lag = lag
        self.pad = pad

    def encodes(self, o:TSTensor):
        lag, pad = self.lag, self.pad
        return torch_diff(o, lag=lag, pad=pad)

    def __repr__(self):
        name = self.__class__.__name__
        return f'{name}(lag={self.lag}, pad={self.pad})'

# %% ../../nbs/009_data.preprocessing.ipynb 49
class TSLog(Transform):
    """Log transforms batch of type `TSTensor` + 1. Accepts positive and negative numbers

    Positive values are mapped to log(1 + x) and negative values to -log(1 + |x|).
    Entries that are neither > 0 nor < 0 are left at 0 in the output. Variables
    listed in `ex` (indexed on the second to last axis) are passed through
    unchanged. `decodes` applies the inverse mapping.
    """
    order = 90

    def __init__(self, ex=None, **kwargs):
        self.ex = ex
        super().__init__(**kwargs)

    def encodes(self, o:TSTensor):
        pos, neg = o > 0, o < 0
        output = torch.zeros_like(o)
        output[pos] = torch.log1p(o[pos])
        output[neg] = -torch.log1p(torch.abs(o[neg]))
        if self.ex is not None:
            output[...,self.ex,:] = o[...,self.ex,:]
        return output

    def decodes(self, o:TSTensor):
        pos, neg = o > 0, o < 0
        output = torch.zeros_like(o)
        output[pos] = torch.exp(o[pos]) - 1
        output[neg] = -torch.exp(torch.abs(o[neg])) + 1
        if self.ex is not None:
            output[...,self.ex,:] = o[...,self.ex,:]
        return output

    def __repr__(self): return f'{self.__class__.__name__}()'

# %% ../../nbs/009_data.preprocessing.ipynb 51
class TSCyclicalPosition(Transform):
    """Concatenates the position along the sequence as 2 additional variables (sine and cosine)

    Without `cyclical_var` the encodings come from `sincos_encoding(seq_len)` and are repeated for every sample.
    With `cyclical_var` the sine / cosine of `value / seq_len * 2 * pi` of that variable are appended instead.
    """
    order = 90
    def __init__(self, 
        cyclical_var=None, # Optional variable to indicate the steps within the cycle (ie minute of the day)
        magnitude=None, # Added for compatibility. It's not used.
        drop_var=False, # Flag to indicate if the cyclical var is removed
        **kwargs
        ):
        super().__init__(**kwargs)
        self.cyclical_var = cyclical_var
        self.drop_var = drop_var

    def encodes(self, o: TSTensor):
        bs,nvars,seq_len = o.shape
        if self.cyclical_var is None:
            sin, cos = sincos_encoding(seq_len, device=o.device)
            # Same positional encoding for every sample in the batch.
            sin = sin.reshape(1,1,-1).repeat(bs,1,1)
            cos = cos.reshape(1,1,-1).repeat(bs,1,1)
            return torch.cat([o, sin, cos], 1)
        angle = o[:, [self.cyclical_var]]/seq_len * 2 * np.pi
        sin, cos = torch.sin(angle), torch.cos(angle)
        if self.drop_var:
            keep_vars = np.isin(np.arange(nvars), self.cyclical_var, invert=True)
            base = o[:, keep_vars]
        else:
            base = o
        return torch.cat([base, sin, cos], 1)

# %% ../../nbs/009_data.preprocessing.ipynb 54
class TSLinearPosition(Transform):
    "Concatenates the position along the sequence as 1 additional variable"

    order = 90
    def __init__(self, 
        linear_var:int=None, # Optional variable to indicate the steps within the cycle (ie minute of the day)
        var_range:tuple=None, # Optional range indicating min and max values of the linear variable
        magnitude=None, # Added for compatibility. It's not used.
        drop_var:bool=False, # Flag to indicate if the linear var is removed
        lin_range:tuple=(-1,1), # Range (min, max) of the generated linear variable
        **kwargs): 
        self.linear_var, self.var_range, self.drop_var, self.lin_range = linear_var, var_range, drop_var, lin_range
        super().__init__(**kwargs)

    def encodes(self, o: TSTensor): 
        "Append one linear-position channel along dim 1 (after optionally dropping the linear variable)"
        bs,nvars,seq_len = o.shape
        if self.linear_var is None:
            # no explicit variable: encode the step index and broadcast it over the batch
            lin = linear_encoding(seq_len, device=o.device, lin_range=self.lin_range)
            return torch.cat([o, lin.reshape(1,1,-1).repeat(bs,1,1)], 1)

        linear_var = o[:, [self.linear_var]]
        # 1) normalise to [0, 1] using the declared range, or the batch min/max when none is given
        if self.var_range is None:
            var_min, var_max = linear_var.min(), linear_var.max()
        else:
            var_min, var_max = self.var_range
        lin = (linear_var - var_min) / (var_max - var_min)
        # 2) rescale [0, 1] into `lin_range`. The previous code recomputed `lin` from the raw
        #    variable here, silently discarding the normalisation done in step 1.
        lin = lin * (self.lin_range[1] - self.lin_range[0]) + self.lin_range[0]
        if self.drop_var:
            exc_vars = np.isin(np.arange(nvars), self.linear_var, invert=True)
            return torch.cat([o[:, exc_vars], lin], 1)
        return torch.cat([o, lin], 1)

# %% ../../nbs/009_data.preprocessing.ipynb 57
class TSMissingness(Transform):
    "Concatenates data missingness for selected features along the sequence as additional variables"

    order = 90
    def __init__(self, 
        sel_vars=None, # Index or indices of the variables whose missingness is added. All variables if not set.
        feature_idxs=None, # Alternative way to pass the indices; only used when `sel_vars` is None.
        magnitude=None, # Added for compatibility. It's not used.
        **kwargs):
        # explicit None check: `sel_vars or feature_idxs` would discard a valid index 0
        if sel_vars is None: sel_vars = feature_idxs
        self.sel_vars = listify(sel_vars)
        super().__init__(**kwargs)

    def encodes(self, o: TSTensor):
        "Append `isnan` indicator channels along dim 1"
        # Truthiness (rather than `is not None`) so that an unset / empty selection
        # falls back to every variable instead of appending zero channels.
        if self.sel_vars:
            missingness = o[:, self.sel_vars].isnan()
        else:
            missingness = o.isnan()
        return torch.cat([o, missingness], 1)

# %% ../../nbs/009_data.preprocessing.ipynb 59
class TSPositionGaps(Transform):
    """Concatenates gaps for selected features along the sequence as additional variables"""

    order = 90
    def __init__(self, 
        sel_vars=None, # Index or indices of the variables used to compute gaps. All variables if not set.
        feature_idxs=None, # Alternative way to pass the indices; only used when `sel_vars` is None.
        magnitude=None, # Added for compatibility. It's not used.
        forward=True, # Forwarded to `get_gaps`
        backward=False, # Forwarded to `get_gaps`
        nearest=False, # Forwarded to `get_gaps`
        normalize=True, # Forwarded to `get_gaps`
        **kwargs):
        # explicit None check: `sel_vars or feature_idxs` would discard a valid index 0
        if sel_vars is None: sel_vars = feature_idxs
        self.sel_vars = listify(sel_vars)
        self.gap_fn = partial(get_gaps, forward=forward, backward=backward, nearest=nearest, normalize=normalize)
        super().__init__(**kwargs)

    def encodes(self, o: TSTensor):
        "Append the output of `get_gaps` (for the selected or all variables) along dim 1"
        selected = o[:, self.sel_vars] if self.sel_vars else o
        gaps = self.gap_fn(selected)
        return torch.cat([o, gaps], 1)

# %% ../../nbs/009_data.preprocessing.ipynb 61
class TSRollingMean(Transform):
    """Calculates the rolling mean for all/ selected features alongside the sequence
    
       It replaces the original values or adds additional variables (default)
       If nan values are found, they will be filled forward and backward"""

    order = 90
    def __init__(self, 
        sel_vars=None, # Index or indices of the variables to average. All variables if not set.
        feature_idxs=None, # Alternative way to pass the indices; only used when `sel_vars` is None.
        magnitude=None, # Added for compatibility. It's not used.
        window=2, # Window size forwarded to `rolling_moving_average`
        replace=False, # If True the rolling mean overwrites the original values instead of being appended
        **kwargs):
        # explicit None check: `sel_vars or feature_idxs` would discard a valid index 0
        if sel_vars is None: sel_vars = feature_idxs
        self.sel_vars = listify(sel_vars)
        self.rolling_mean_fn = partial(rolling_moving_average, window=window)
        self.replace = replace
        super().__init__(**kwargs)

    def encodes(self, o: TSTensor):
        "Return `o` with the rolling mean appended along dim 1, or written over the inputs when `replace=True`"
        if self.sel_vars:
            # NOTE: with selected variables the nan filling (and the replacement) are written
            # into `o` itself, so the caller's tensor is modified in place.
            if torch.isnan(o[:, self.sel_vars]).any():
                o[:, self.sel_vars] = fbfill_sequence(o[:, self.sel_vars])
            rolling_mean = self.rolling_mean_fn(o[:, self.sel_vars])
            if self.replace: 
                o[:, self.sel_vars] = rolling_mean
                return o
        else:
            if torch.isnan(o).any():
                o = fbfill_sequence(o)
            rolling_mean = self.rolling_mean_fn(o)
            if self.replace: return rolling_mean
        return torch.cat([o, rolling_mean], 1)

# %% ../../nbs/009_data.preprocessing.ipynb 63
class TSLogReturn(Transform):
    "Calculates log-return of batch of type `TSTensor`. For positive values only"
    order = 90
    def __init__(self, 
        lag=1, # Lag forwarded to `torch_diff`
        pad=True, # Padding flag forwarded to `torch_diff`
        **kwargs):
        super().__init__(**kwargs)
        self.lag = lag
        self.pad = pad

    def encodes(self, o:TSTensor):
        "Difference of the natural log of `o`, computed by `torch_diff`"
        log_values = torch.log(o)
        return torch_diff(log_values, lag=self.lag, pad=self.pad)

    def __repr__(self):
        return f'{type(self).__name__}(lag={self.lag}, pad={self.pad})'

# %% ../../nbs/009_data.preprocessing.ipynb 65
class TSAdd(Transform):
    "Add a defined amount to each batch of type `TSTensor`."
    order = 90
    def __init__(self, 
        add, # Amount added to the batch (anything `torch.add` accepts as second operand)
        **kwargs):
        super().__init__(**kwargs)
        self.add = add

    def encodes(self, o:TSTensor):
        "Return `o + add`"
        return torch.add(o, self.add)

    # The repr previously referenced `self.lag` / `self.pad`, which this class never defines,
    # so printing the transform raised AttributeError.
    def __repr__(self): return f'{self.__class__.__name__}(add={self.add})'

# %% ../../nbs/009_data.preprocessing.ipynb 67
class TSClipByVar(Transform):
    """Clip  batch of type `TSTensor` by variable
    
    Args:
        var_min_max: list of tuples containing variable index, min value (or None) and max value (or None)
    """
    order = 90
    def __init__(self, var_min_max, **kwargs):
        super().__init__(**kwargs)
        self.var_min_max = var_min_max

    def encodes(self, o:TSTensor):
        "Clamp each listed variable to its bounds; the result is written back into `o`, which is returned"
        for var_idx, lower, upper in self.var_min_max:
            clipped = torch.clamp(o[:, var_idx], lower, upper)
            o[:, var_idx] = clipped
        return o

# %% ../../nbs/009_data.preprocessing.ipynb 69
class TSDropVars(Transform):
    "Drops selected variable from the input"
    order = 90
    def __init__(self, 
        drop_vars, # Index or indices (along dim 1) of the variables to remove
        **kwargs):
        super().__init__(**kwargs)
        self.drop_vars = drop_vars

    def encodes(self, o:TSTensor):
        "Return `o` without the variables listed in `drop_vars`"
        var_idxs = np.arange(o.shape[1])
        # boolean mask that is True for every variable we keep
        keep_mask = np.isin(var_idxs, self.drop_vars, invert=True)
        return o[:, keep_mask]

# %% ../../nbs/009_data.preprocessing.ipynb 71
class TSOneHotEncode(Transform):
    "One-hot encodes a single variable of the batch and appends the resulting channels along dim 1"
    order = 90
    def __init__(self,
        sel_var:int, # Variable that is one-hot encoded
        unique_labels:list, # List containing all labels (excluding nan values)
        add_na:bool=False, # Flag to indicate if values not included in vocab should be set as 0
        drop_var:bool=True, # Flag to indicate if the selected var is removed
        magnitude=None, # Added for compatibility. It's not used.
        **kwargs
        ):
        labels = listify(unique_labels)
        self.sel_var = sel_var
        self.unique_labels = labels
        self.add_na = add_na
        self.drop_var = drop_var
        # one channel per label, plus a leading "unknown" channel when `add_na` is set
        self.n_classes = len(labels) + add_na
        super().__init__(**kwargs)
        
    def encodes(self, o: TSTensor):
        bs, n_vars, seq_len = o.shape
        target = o[:, [self.sel_var]]
        one_hot = torch.zeros(bs, self.n_classes, seq_len, device=o.device)
        if self.add_na:
            # channel 0 flags every value that is not one of the known labels
            unknown = torch.isin(target, target.new(list(self.unique_labels)), invert=True)
            one_hot[:, [0]] = unknown.to(one_hot.dtype)
        for channel, label in enumerate(self.unique_labels, start=int(self.add_na)):
            one_hot[:, [channel]] = (target == label).to(one_hot.dtype)
        kept = o
        if self.drop_var:
            keep_mask = torch.isin(torch.arange(o.shape[1], device=o.device), self.sel_var, invert=True)
            kept = o[:, keep_mask]
        return torch.cat([kept, one_hot], 1)

# %% ../../nbs/009_data.preprocessing.ipynb 77
class TSPosition(Transform):
    "Concatenates a fixed, user-provided vector of steps as 1 additional variable"
    order = 90
    def __init__(self,
        steps:list, # List containing the steps passed as an additional variable. They should be normalized.
        magnitude=None, # Added for compatibility. It's not used.
        **kwargs
        ):
        steps_array = np.asarray(steps)
        # stored as [1, 1, n_steps] so it can be expanded over the batch in `encodes`
        self.steps = torch.from_numpy(steps_array).reshape(1, 1, -1)
        super().__init__(**kwargs)

    def encodes(self, o: TSTensor):
        "Append the stored steps (cast to the device and dtype of `o`) along dim 1"
        batch_steps = self.steps.expand(o.shape[0], -1, -1)
        batch_steps = batch_steps.to(device=o.device, dtype=o.dtype)
        return torch.cat([o, batch_steps], 1)

# %% ../../nbs/009_data.preprocessing.ipynb 79
import torch
import torch.nn.functional as F

class PatchEncoder():
    "Creates a sequence of patches from a 3d input tensor."

    def __init__(self, 
        patch_len:int, # Number of time steps in each patch.
        patch_stride:int=None, # Stride of the patch. Defaults to `patch_len` (non-overlapping patches).
        pad_at_start:bool=True, # If True, pad the start of the time axis, otherwise the end, so that the patches cover every time step.
        value:float=0.0, # Value to pad the input tensor with.
        seq_len:int=None, # Number of time steps in the input tensor. If None no padding is applied: make sure seq_len >= patch_len and (seq_len - patch_len) is a multiple of stride
        merge_dims:bool=True, # If True, merge channels within the same patch.
        reduction:str='none', # type of reduction applied. Available: "none", "mean", "min", "max", "mode"
        reduction_dim:int=-1, # dimension where the reduction is applied
        swap_dims:tuple=None, # Optional pair of dimensions that are swapped in the output (passed to `swapaxes`).
        ):
        super().__init__()

        self.seq_len = seq_len
        self.patch_len = patch_len
        self.patch_stride = patch_stride or patch_len
        self.pad_at_start = pad_at_start
        self.value = value
        self.merge_dims = merge_dims

        assert reduction in ["none", "mean", "min", "max", "mode"]
        self.reduction = reduction
        self.reduction_dim = reduction_dim
        self.swap_dims = swap_dims
        
        if seq_len is None:
            self.pad_size = 0
        elif self.seq_len < self.patch_len:
            # too short for even one patch: pad up to exactly one patch
            self.pad_size = self.patch_len - self.seq_len
        else:
            # `unfold` yields floor((L - patch_len) / stride) + 1 windows and drops any remainder,
            # so every step is covered only when (L - patch_len) is a multiple of the stride.
            # Python's modulo of the negated difference gives the smallest padding achieving that.
            # (The previous formula used `seq_len % patch_len`, which is only equivalent when the
            # stride divides `patch_len`; otherwise time steps were silently lost.)
            self.pad_size = (self.patch_len - self.seq_len) % self.patch_stride

    def __call__(self, 
        x: torch.Tensor # 3d input tensor with shape [batch size, channels, sequence length] (2d inputs get a channel dim added)
        ) -> torch.Tensor: #  Transformed tensor of patches with shape [batch size, channels*patch length, number of patches] (before any reduction / swap)

        if x.ndim == 2:
            x = x[:, None]
            
        bs, c_in, *_ = x.size()
        if not bs: 
            # empty batch: nothing to patch
            return x
        
        if self.pad_size:
            padding = (self.pad_size, 0) if self.pad_at_start else (0, self.pad_size)
            x = F.pad(x, padding, value=self.value)
        
        # [bs, c_in, n_patches, patch_len] -> [bs, c_in, patch_len, n_patches]
        x = x.unfold(2, self.patch_len, self.patch_stride)
        x = x.permute(0, 1, 3, 2)
        if self.merge_dims:
            x = x.reshape(bs, c_in * self.patch_len, -1)

        if self.reduction == "mean":
            x = x.mean(self.reduction_dim)
        elif self.reduction == "min":
            x = x.min(self.reduction_dim).values
        elif self.reduction == "max":
            x = x.max(self.reduction_dim).values
        elif self.reduction == "mode":
            x = x.mode(self.reduction_dim).values

        if self.swap_dims:
            x = x.swapaxes(*self.swap_dims)

        return x

# %% ../../nbs/009_data.preprocessing.ipynb 81
class TSPatchEncoder(Transform):
    "Transforms a time series into a sequence of patches along the last dimension"
    order = 90
    
    def __init__(self, 
        patch_len:int, # Number of time steps in each patch.
        patch_stride:int=None, # Stride of the patch.
        pad_at_start:bool=True, # If True, pad the input tensor at the start to ensure that the input tensor is evenly divisible by the patch length.
        value:float=0.0, # Value to pad the input tensor with.
        seq_len:int=None, # Number of time steps in the input tensor. If None, make sure seq_len >= patch_len and a multiple of stride
        merge_dims:bool=True, # If True, merge channels within the same patch.
        reduction:str='none', # type of reduction applied. Available: "none", "mean", "min", "max", "mode"
        reduction_dim:int=-2, # dimension where the reduction is applied.
        swap_dims:tuple=None, # Optional pair of dimensions swapped in the output.
        ):
        super().__init__()

        # every argument is handed over unchanged to the wrapped `PatchEncoder`
        encoder_kwargs = dict(
            patch_len=patch_len,
            patch_stride=patch_stride,
            pad_at_start=pad_at_start,
            value=value,
            seq_len=seq_len,
            merge_dims=merge_dims,
            reduction=reduction,
            reduction_dim=reduction_dim,
            swap_dims=swap_dims,
        )
        self.patch_encoder = PatchEncoder(**encoder_kwargs)

    def encodes(self, o:TSTensor):
        "Delegate to the wrapped `PatchEncoder`"
        return self.patch_encoder(o)

# %% ../../nbs/009_data.preprocessing.ipynb 83
from fastcore.transform import ItemTransform

class TSTuplePatchEncoder(ItemTransform):
    "Transforms a time series with x and y into sequences of patches along the last dimension"
    order = 90
    
    def __init__(self, 
        patch_len:int, # Number of time steps in each patch.
        patch_stride:int=None, # Stride of the patch.
        pad_at_start:bool=True, # If True, pad the input tensor at the start to ensure that the input tensor is evenly divisible by the patch length.
        value:float=0.0, # Value to pad the input tensor with.
        seq_len:int=None, # Number of time steps in the input tensor. If None, make sure seq_len >= patch_len and a multiple of stride
        merge_dims:bool=True, # If True, merge y channels within the same patch.
        reduction:str='none', # type of reduction applied to y. Available: "none", "mean", "min", "max", "mode"
        reduction_dim:int=-2, # dimension where the y reduction is applied.
        swap_dims:tuple=None, # Optional pair of dimensions swapped in y.
        ):
        super().__init__()

        # settings shared by both encoders
        shared = dict(patch_len=patch_len, patch_stride=patch_stride, pad_at_start=pad_at_start,
                      value=value, seq_len=seq_len)

        # x only gets patched; the remaining options apply to y alone
        self.x_patch_encoder = PatchEncoder(**shared)
        self.y_patch_encoder = PatchEncoder(**shared, merge_dims=merge_dims, reduction=reduction,
                                            reduction_dim=reduction_dim, swap_dims=swap_dims)

    def encodes(self, xy):
        "Patch `(x, y)` or `(x,)`; any other tuple length yields None"
        n_items = len(xy)
        if n_items == 2:
            x, y = xy
            x = self.x_patch_encoder(x)
            y = self.y_patch_encoder(y)
            return (x, y)
        if n_items == 1:
            return (self.x_patch_encoder(xy[0]), )

# %% ../../nbs/009_data.preprocessing.ipynb 86
class TSShrinkDataFrame(BaseEstimator, TransformerMixin):
    """A transformer to shrink dataframe or series memory usage"""

    def __init__(self, 
        columns=None, # List[str], optional. Columns to shrink, all columns by default.
        skip=None, # List[str], optional. Columns to skip, None by default.
        obj2cat=True, # bool, optional. Convert object columns to category, True by default.
        int2uint=False, # bool, optional. Convert int columns to uint, False by default.
        verbose=True # bool, optional. Print memory usage info. True by default.
        ):
        self.columns, self.skip, self.obj2cat, self.int2uint, self.verbose = listify(columns), listify(skip), obj2cat, int2uint, verbose
        
    def fit(self, X, y=None, **fit_params):
        "Compute the target dtypes (stored in `self.dt`) with `df_shrink_dtypes`"
        if isinstance(X, pd.Series): 
            X = X.to_frame()
        assert isinstance(X, pd.DataFrame), "X must be a pd.DataFrame or pd.Series" 
        # object columns that parse as dates are converted first, so dtypes match those seen in `transform`
        X = X.apply(object2date)
        target = X[self.columns] if self.columns else X
        self.dt = df_shrink_dtypes(target, self.skip, obj2cat=self.obj2cat, int2uint=self.int2uint)
        return self
        
    def transform(self, X, **kwargs):
        "Cast `X` to the dtypes learned in `fit`; a Series input is returned as a Series"
        is_series = isinstance(X, pd.Series)
        if is_series: 
            col_name = X.name
            X = X.to_frame()
        assert isinstance(X, pd.DataFrame), "X must be a pd.DataFrame or pd.Series"
        if self.verbose:
            start_memory = X.memory_usage().sum()
            print(f"Initial memory usage: {bytes2str(start_memory):10}")
        # `apply` returns a new frame, so the assignments below never touch the caller's data
        X = X.apply(object2date)
        if self.columns:
            # Plain column assignment replaces the columns (and their dtypes). `X.loc[:, cols] = ...`
            # writes the values into the existing arrays on recent pandas, keeping the old, larger dtypes.
            X[self.columns] = X[self.columns].astype(self.dt)
        else:
            X = X.astype(self.dt)
        if self.verbose:
            end_memory = X.memory_usage().sum()
            print(f"Final memory usage  : {bytes2str(end_memory):10} ({(end_memory - start_memory) / start_memory:.1%})")
        if is_series:
            # select by position so that unnamed Series are handled too, then restore the name
            X = X.iloc[:, 0]
            X.name = col_name
        return X
         
    def inverse_transform(self, X, **kwargs):
        "No-op: the data is returned unchanged"
        return X


def object2date(x, format=None):
    """Best-effort conversion of an object-dtype pandas object to datetime.

    `x` is returned unchanged when its dtype is not `object` or when `pd.to_datetime`
    fails; `format` is forwarded to `pd.to_datetime`.
    """
    if not x.dtype == np.dtype('object'): return x
    try:
        return pd.to_datetime(x, format=format)
    except Exception:
        # Deliberately best-effort: values that do not parse as dates are left as they are.
        # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit still propagate.
        return x

# %% ../../nbs/009_data.preprocessing.ipynb 90
class TSOneHotEncoder(BaseEstimator, TransformerMixin):
    "Encode categorical variables using one-hot encoding"

    def __init__(
        self,
        columns=None,  # (str or List[str], optional): Column name(s) to encode. If None, all columns will be encoded. Defaults to None.
        drop=True,  # (bool, optional): Whether to drop the original columns after encoding. Defaults to True.
        add_na=True,  # (bool, optional): If True, categories unseen during fit are ignored (all-zero row) instead of raising. Defaults to True.
        dtype=np.int8,  # (type, optional): Data type of the encoded output. Defaults to np.int8.
    ):
        self.columns = listify(columns)
        self.drop = drop
        self.add_na = add_na
        self.dtype = dtype

    def fit(self, X, y=None, **fit_params):
        "Fit a sklearn `OneHotEncoder` on the selected columns and remember their dtypes"
        assert isinstance(X, pd.DataFrame)
        if not self.columns: self.columns = X.columns
        self.ohe_tfm = sklearn.preprocessing.OneHotEncoder(handle_unknown="ignore" if self.add_na else "error")
        self.dtypes = [X[c].dtype for c in self.columns]
        data = X[self.columns]
        if len(self.columns) == 1:
            data = data.to_numpy().reshape(-1, 1)
        self.ohe_tfm.fit(data)
        return self

    def transform(self, X, **kwargs):
        "Add one `<column>_<category>` column per fitted category (the new columns are written into `X`)"
        assert isinstance(X, pd.DataFrame)
        data = X[self.columns]
        if len(self.columns) == 1:
            data = data.to_numpy().reshape(-1, 1)
        output = self.ohe_tfm.transform(data).toarray().astype(self.dtype)
        new_cols = [
            f"{str(col)}_{str(cat)}"
            for col, col_cats in zip(self.columns, self.ohe_tfm.categories_)
            for cat in col_cats
        ]
        X[new_cols] = output
        # remembered for `inverse_transform`
        self.new_cols = new_cols
        if self.drop: X = X.drop(self.columns, axis=1)
        return X

    def inverse_transform(self, X, **kwargs):
        "Rebuild the original columns from the one-hot columns created by `transform`"
        encoded = X[self.new_cols]
        if len(self.new_cols) == 1:
            encoded = encoded.to_numpy().reshape(-1, 1)
        output = self.ohe_tfm.inverse_transform(encoded)
        for i, (col, original_dtype) in enumerate(zip(self.columns, self.dtypes)):
            X[col] = output[:, i]
            # only categorical dtypes expose `.categories`
            if hasattr(original_dtype, "categories"):
                X[col] = X[col].astype('category')
        if self.drop:
            X = X.drop(self.new_cols, axis=1)
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 92
class TSCategoricalEncoder(BaseEstimator, TransformerMixin):
    """A transformer to encode categorical columns"""

    def __init__(self,
        columns=None, # List[str], optional. Columns to encode, all columns by default.
        add_na=True, # bool, optional. Add a NaN category, True by default.
        sort=True, # bool, optional. Sort categories, True by default.
        categories='auto', # list or list of lists, optional. Custom categories (one list per column). 'auto' by default.
        inplace=True, # bool, optional. Modify input DataFrame, True by default.
        prefix=None, # str, optional. Prefix for created column names. None by default.
        suffix=None, # str, optional. Suffix for created column names. None by default.
        drop=False # bool, optional. Drop original columns, False by default.
        ):
        self.columns = listify(columns)
        self.add_na = add_na
        self.prefix = prefix
        self.suffix = suffix
        self.sort = sort
        self.inplace = inplace
        self.drop = drop
        if categories is None or categories == 'auto': 
            self.categories = None
        else:
            assert is_listy(categories) and len(categories) > 0, "you must pass a list or list of lists of categories"
            cat_dtypes = self.to_categorical(categories)
            # A flat list of categories yields a single dtype; wrap it so that the rest of the
            # class can always pair columns with dtypes (len / zip / [0] all require a list).
            self.categories = cat_dtypes if isinstance(cat_dtypes, list) else [cat_dtypes]

    def _new_col_name(self, column):
        "Name of the encoded column: optional prefix and suffix joined to `column` with '_'"
        name = []
        if self.prefix: name += [self.prefix]
        name += [column]
        if self.suffix: name += [self.suffix]
        return '_'.join(name)

    def fit(self, X, y=None, **fit_params):
        "Learn the categories of each column (unless they were passed explicitly or already learned)"
        assert isinstance(X, (pd.DataFrame, pd.Series))
        # length check instead of truthiness: `self.columns` may be a pandas Index after a previous fit
        if self.columns is None or len(self.columns) == 0:
            if isinstance(X, pd.DataFrame):
                self.columns = X.columns
            else:
                # keep the name as ONE column; a bare string would be iterated character by character
                self.columns = [X.name]
        
        # optional row selection used to learn the categories (e.g. training rows only)
        idxs = fit_params.get("idxs", slice(None))
        if self.categories is None:
            _categories = []
            for column in self.columns:
                if isinstance(X, pd.DataFrame) and hasattr(X[column], "cat"):
                    categories = X[column].cat.categories
                elif hasattr(X, "cat"):
                    categories = X.cat.categories
                else:
                    categories = X.loc[idxs, column].dropna().unique() if isinstance(X, pd.DataFrame) else X[idxs].dropna().unique()
                    if self.sort:
                        categories = np.sort(categories)
                categories = pd.CategoricalDtype(categories=categories, ordered=True)
                _categories.append(categories)
            self.categories = _categories
        assert len(self.categories) == len(self.columns)
        return self

    def transform(self, X, **kwargs):
        "Replace categories by integer codes (shifted by 1 when `add_na`, so unknown / missing values map to 0)"
        assert isinstance(X, (pd.DataFrame, pd.Series))
        if isinstance(X, pd.DataFrame):
            # honour `inplace=False`, which used to be silently ignored
            if not self.inplace: X = X.copy()
            columns = X.columns
        else:
            # list membership rather than substring matching against the name
            columns = [X.name]
        for column, categories in zip(self.columns, self.categories):
            if column not in columns:
                continue
            if isinstance(X, pd.DataFrame):
                new_col = self._new_col_name(column)
                codes = X.loc[:, column].astype(categories).cat.codes + self.add_na
                if self.drop:
                    X.loc[:, column] = codes
                    X.rename(columns={column: new_col}, inplace=True)
                else:
                    X.loc[:, new_col] = codes
            else:
                X = X.astype(categories).cat.codes + self.add_na
        return X

    def inverse_transform(self, X, **kwargs):
        "Map integer codes back to categories ('#na#' for code 0 when `add_na`)"
        assert isinstance(X, (pd.DataFrame, pd.Series))
        if isinstance(X, pd.DataFrame):
            if not self.inplace: X = X.copy()
            columns = X.columns
            for column, categories in zip(self.columns, self.categories):
                if column not in columns:
                    continue
                new_col = self._new_col_name(column)
                if self.add_na:
                    X.loc[:, new_col] = np.array(['#na#'] + list(categories.categories))[X.loc[:, new_col].astype(int)]
                else:
                    X.loc[:, new_col] = categories.categories[X.loc[:, new_col].astype(int)]
        else:
            if self.add_na:
                X = pd.Series(np.array(['#na#'] + list(self.categories[0].categories))[X], name=X.name, index=X.index)
            else:
                X = pd.Series(self.categories[0].categories[X], name=X.name, index=X.index)
        return X

    def to_categorical(self, categories):
        "Build ordered `CategoricalDtype`s: a list for a list of lists, a single dtype for a flat list"
        if is_listy(categories[0]):
            return [pd.CategoricalDtype(categories=np.sort(c) if self.sort else c, ordered=True) for c in categories]
        else:
            return pd.CategoricalDtype(categories=np.sort(categories) if self.sort else categories, ordered=True)

# %% ../../nbs/009_data.preprocessing.ipynb 98
class TSTargetEncoder(TransformerMixin, BaseEstimator):
    def __init__(self, 
        target_column, # column containing the target 
        columns=None, # List[str], optional. Columns to encode, all non-numerical columns by default.
        inplace=True, # bool, optional. Modify input DataFrame, True by default.
        prefix=None, # str, optional. Prefix for created column names. None by default.
        suffix=None, # str, optional. Suffix for created column names. None by default.
        drop=True, # bool, optional. Drop original columns, True by default.
        dtypes=["object", "category"], # List[str]. List with dtypes that will be used to identify columns to encode if not explicitly passed.
        ):
        "Transforms categorical columns into numerical by replacing categories with target means."
        
        self.target_column = target_column
        self.columns = listify(columns)
        self.dtypes = listify(dtypes)
        self.inplace, self.drop = inplace, drop
        self.prefix, self.suffix = prefix, suffix
        # filled in `fit`: column name -> Series of per-category target means
        self.target_means = {}

    def fit(self, X, y=None, **fit_params):
        "Compute, for every encoded column, the mean of the target per category"
        assert isinstance(X, pd.DataFrame)
        if not self.columns:
            self.columns = X.select_dtypes(include=self.dtypes).columns
                
        assert self.target_column in X.columns
        # optional row selection used to compute the means (e.g. training rows only)
        rows = fit_params.get("idxs", slice(None))
        X_fit = X.loc[rows]

        for column in self.columns:
            assert column in X.columns
            grouped_target = X_fit.groupby(column)[self.target_column]
            self.target_means[column] = grouped_target.mean()

        return self

    def transform(self, X, **kwargs):
        "Map each category to its target mean; columns missing from `X` are skipped"
        assert isinstance(X, pd.DataFrame)
        if not self.inplace:
            X = X.copy()
        for column in self.columns:
            if column not in X.columns:
                continue
            name_parts = [column]
            if self.prefix: name_parts = [self.prefix] + name_parts
            if self.suffix: name_parts = name_parts + [self.suffix]
            new_col = '_'.join(name_parts)
            encoded = X.loc[:, column].map(self.target_means[column])
            if self.drop:
                # overwrite the original column, then give it its new name
                X.loc[:, column] = encoded
                X.rename(columns={column: new_col}, inplace=True)
            else:
                X.loc[:, new_col] = encoded
        return X

    def inverse_transform(self, X, **kwargs):
        raise NotImplementedError("This method cannot be implemented because the original data cannot be reconstructed exactly.")

# %% ../../nbs/009_data.preprocessing.ipynb 100
# Default datetime attributes extracted by `TSDateTimeEncoder`. Each name is lower-cased and read from the
# pandas `.dt` accessor ('Week' is computed separately) and stored in a `<prefix>_<name>` column.
default_date_attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 
                     'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']

class TSDateTimeEncoder(BaseEstimator, TransformerMixin):
    "Expands datetime columns into one column per datetime attribute (year, month, ...)"

    def __init__(self, 
        datetime_columns=None, # str or List[str], optional. Datetime columns to expand, all columns by default.
        prefix=None, # str, optional. Prefix of the new columns. By default the column name without a trailing 'date'/'Date'.
        drop=True, # bool, optional. Drop the datetime columns once expanded, True by default.
        time=False, # bool, optional. Also extract 'Hour', 'Minute' and 'Second', False by default.
        attr=default_date_attr # List[str]. Attributes to extract.
        ):
        self.datetime_columns = listify(datetime_columns)
        self.prefix, self.drop, self.time, self.attr = prefix, drop, time, listify(attr)
        
    def fit(self, X, y=None, **fit_params):
        "Resolve the columns to expand and the prefix used for each of them"
        assert isinstance(X, pd.DataFrame)
        if self.time:
            # append only what is missing, so calling `fit` again does not duplicate the attributes
            self.attr = self.attr + [a for a in ['Hour', 'Minute', 'Second'] if a not in self.attr]
        # length check instead of truthiness: after a previous fit this may be a pandas Index
        if self.datetime_columns is None or len(self.datetime_columns) == 0:
            self.datetime_columns = X.columns
        self.prefixes = []
        for dt_column in self.datetime_columns: 
            self.prefixes.append(re.sub('[Dd]ate$', '', dt_column) if self.prefix is None else self.prefix)
        return self
        
    def transform(self, X, **kwargs):
        "Add the `<prefix>_<attribute>` columns to `X` and optionally drop the datetime columns"
        assert isinstance(X, pd.DataFrame)
        
        for dt_column,prefix in zip(self.datetime_columns,self.prefixes): 
            make_date(X, dt_column)
            field = X[dt_column]

            # Pandas removed `dt.week` in v1.1.10
            week = field.dt.isocalendar().week.astype(field.dt.day.dtype) if hasattr(field.dt, 'isocalendar') else field.dt.week
            for n in self.attr: X[prefix + "_" + n] = getattr(field.dt, n.lower()) if n != 'Week' else week
        # Drop only after ALL columns were expanded: dropping inside the loop removed every
        # datetime column on the first iteration, so the second one failed with a KeyError.
        if self.drop: X = X.drop(self.datetime_columns, axis=1)
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 103
class TSDropIfTrueCols(BaseEstimator, TransformerMixin):
    "Drops the rows where any of the selected columns is non-zero, then drops those columns"

    def __init__(self, 
        columns=None # str or List[str], optional. Flag columns, all columns by default.
        ):
        self.columns = listify(columns)

    def fit(self, X, y=None, **fit_params):
        assert isinstance(X, pd.DataFrame)
        if not self.columns: self.columns = X.columns
        return self

    def transform(self, X, **kwargs):
        assert isinstance(X, pd.DataFrame)
        # a row is kept only when its selected columns sum to zero
        keep_rows = X[self.columns].sum(axis=1) == 0
        kept = X[keep_rows].reset_index(drop=True)
        return kept.drop(columns=self.columns)

    def inverse_transform(self, X, **kwargs):
        raise NotImplementedError("Inverse transform is not implemented for TSDropIfTrueCols")

# %% ../../nbs/009_data.preprocessing.ipynb 105
class TSApplyFunction(BaseEstimator, TransformerMixin):
    "Applies `function` to a dataframe, optionally per group and/or restricted to selected columns"

    def __init__(self, 
        function, # Callable applied through pandas `apply`
        groups=None, # str or List[str], optional. Columns to group by before applying the function.
        group_keys=False, # Passed to `groupby`
        axis=1, # Axis used by `apply` when no groups are given
        columns=None, # str or List[str], optional. Columns the function is applied to, all by default.
        reset_index=False, # Reset the index of the result
        drop=True # Passed to `reset_index` (only used when `reset_index=True`)
        ):
        self.function, self.axis = function, axis
        self.groups, self.group_keys = listify(groups), group_keys
        self.columns = listify(columns)
        self.reset_index, self.drop = reset_index, drop

    def fit(self, X, y=None, **fit_params):
        assert isinstance(X, pd.DataFrame)
        if self.columns is None:
            self.columns = X.columns
        return self

    def transform(self, X, **kwargs):
        assert isinstance(X, pd.DataFrame)
        apply_fn = lambda x: self.function(x)
        if self.groups:
            grouped = X.groupby(self.groups, group_keys=self.group_keys)
            if self.columns:
                grouped = grouped[self.columns]
            X = grouped.apply(apply_fn)
        elif self.columns:
            # only the selected columns are overwritten; the rest of `X` is left as is
            X[self.columns] = X[self.columns].apply(apply_fn, axis=self.axis)
        else:
            X = X.apply(apply_fn, axis=self.axis)
        if self.reset_index: 
            X = X.reset_index(drop=self.drop)
        return X

    def inverse_transform(self, X, **kwargs):
        raise NotImplementedError("Inverse transform is not implemented for ApplyFunction")

# %% ../../nbs/009_data.preprocessing.ipynb 109
class TSMissingnessEncoder(BaseEstimator, TransformerMixin):
    "Add an integer `<column>_missing` indicator column (1 where the value is null) for each selected column."
    # NOTE(review): this module defines `TSMissingnessEncoder` a second time further down;
    # at import time the later definition replaces this one.

    def __init__(self, columns=None):
        # columns: column name(s) to encode; when empty, every column seen in `fit` is used
        self.columns = listify(columns)

    def fit(self, X, y=None, **fit_params):
        "Resolve the columns to encode and the names of the indicator columns."
        assert isinstance(X, pd.DataFrame)
        # Use an explicit emptiness test and store a plain list: a `pd.Index` has no truth
        # value, so keeping `X.columns` here made a second call to `fit` raise ValueError.
        if self.columns is None or len(self.columns) == 0:
            self.columns = list(X.columns)
        self.missing_columns = [f"{cn}_missing" for cn in self.columns]
        return self

    def transform(self, X, **kwargs):
        "Append the indicator columns to `X` (written in place) and return it."
        assert isinstance(X, pd.DataFrame)
        X[self.missing_columns] = X[self.columns].isnull().astype(int)
        return X

    def inverse_transform(self, X, **kwargs):
        "Remove the indicator columns from `X` (in place) and return it."
        assert isinstance(X, pd.DataFrame)
        X.drop(self.missing_columns, axis=1, inplace=True)
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 111
class TSSortByColumns(TransformerMixin, BaseEstimator):
    "Sort a dataframe by one or more columns; every option is forwarded to `sort_values`."

    def __init__(self, 
        columns, # Columns to sort by
        ascending=True, # Ascending or descending
        inplace=True, # Perform operation in place
        kind='stable', # Type of sort to use
        na_position='last', # Where to place NaNs
        ignore_index=False, # Do not preserve index
        key=None, # Function to apply to values before sorting
        ):
        self.columns = listify(columns)
        self.ascending, self.inplace, self.kind = ascending, inplace, kind
        self.na_position, self.ignore_index, self.key = na_position, ignore_index, key

    def fit(self, X, y=None, **fit_params):
        "Nothing is learned; only the input type is checked."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        return self

    def transform(self, X, **kwargs):
        "Return `X` sorted; when `inplace` is truthy the object passed in is sorted and returned."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        in_place = bool(self.inplace)
        options = dict(axis=0, ascending=self.ascending, inplace=in_place, kind=self.kind,
                       na_position=self.na_position, ignore_index=self.ignore_index, key=self.key)
        # With inplace=True `sort_values` returns None, so the original object is handed back.
        result = X.sort_values(self.columns, **options)
        return X if in_place else result

    def inverse_transform(self, X, **kwargs):
        "Return `X` unchanged; the original row order is not restored."
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 113
class TSSelectColumns(TransformerMixin, BaseEstimator):
    "Transform that keeps only the requested columns (and optionally only some rows)."

    def __init__(self, 
        columns # str or List[str]. Selected columns.
        ):
        self.columns = listify(columns)

    def fit(self, X, y=None, **fit_params):
        "Nothing is learned; only the input type is checked."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        return self

    def transform(self, X, idxs=None, **kwargs):
        "Return the selected columns; when `idxs` is given it is used as the row selector for `.loc`."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        if idxs is None:
            # Explicit copy so later edits to the selection are not made on an indexing result of `X`.
            return X[self.columns].copy()
        return X.loc[idxs, self.columns]

    def inverse_transform(self, X, **kwargs):
        "Return `X` unchanged; dropped columns are not restored."
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 115
# Maps a time-unit code to its singular English name (coarsest unit first).
PD_TIME_UNITS = {
    "Y": "year",
    "M": "month",
    "W": "week",
    "D": "day",
    "h": "hour",
    "m": "minute",
    "s": "second",
    "ms": "millisecond",
    "us": "microsecond",
    "ns": "nanosecond",
    "ps": "picosecond",
    "fs": "femtosecond",
    "as": "attosecond",
}

class TSStepsSinceStart(BaseEstimator, TransformerMixin):
    "Add a column with the number of time steps elapsed since a start datetime, one value per row"

    def __init__(self,
        datetime_col, # (str or List[str]): Column name(s) containing datetime values. Only the first one is used.
        datetime_unit="D", #(str, optional): Time unit of the datetime values. Defaults to 'D'.
        start_datetime=None, #(str or pd.Timestamp, optional): The start datetime value. If None, the minimum value of the datetime_col is used. Defaults to None.
        drop=False, # (bool, optional): Whether to drop the datetime_col column after computing time steps. Defaults to False.
        dtype=None, # (type, optional): Data type of the time steps. Defaults to None.
    ):
        self.datetime_col = listify(datetime_col)[0]
        self.datetime_unit = datetime_unit
        # Length of one step, used as the divisor when converting elapsed time to step counts.
        self.datetime_div = np.timedelta64(1, datetime_unit)
        # Name of the generated column, e.g. "days_since_start" for unit "D".
        self.new_col = f"{PD_TIME_UNITS[datetime_unit]}s_since_start"
        self.drop, self.dtype = drop, dtype
        if start_datetime is None or isinstance(start_datetime, Timestamp):
            self.start_datetime = start_datetime
        else:
            self.start_datetime = Timestamp(start_datetime)

    def fit(self, X, y=None, **fit_params):
        "Record the datetime column's dtype and, if no start was given, use its minimum as the start."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        datetimes = X[self.datetime_col]
        if self.start_datetime is None:
            self.start_datetime = datetimes.min()
        self.ori_dtype = datetimes.dtypes
        return self

    def transform(self, X, **kwargs):
        "Write the step counts into `X` (in place) and return it."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        elapsed = pd.to_timedelta(X[self.datetime_col] - self.start_datetime)
        steps = (elapsed / self.datetime_div).astype(dtype=self.dtype)
        if not self.drop:
            X[self.new_col] = steps
            return X
        # drop=True: the datetime column is overwritten and then renamed to the new column name.
        X[self.datetime_col] = steps
        X.rename(columns={self.datetime_col: self.new_col}, inplace=True)
        return X

    def inverse_transform(self, X, **kwargs):
        "Rebuild the datetime column from the step counts, cast back to the dtype seen in `fit`."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        offsets = X[self.new_col] * self.datetime_div
        restored = pd.to_datetime(offsets + self.start_datetime).astype(dtype=self.ori_dtype)
        if not self.drop:
            X[self.datetime_col] = restored
            return X
        # drop=True: mirror `transform` by overwriting the step column and renaming it back.
        X[self.new_col] = restored
        X.rename(columns={self.new_col: self.datetime_col}, inplace=True)
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 117
class TSStandardScaler(TransformerMixin, BaseEstimator):
    "Scale the values of specified columns in the input DataFrame to have a mean of 0 and standard deviation of 1."

    def __init__(self,
        columns=None, # Column name(s) to be transformed. If None, all columns are transformed. Defaults to None.
        mean=None, # Mean value for each column. If None, the mean value of each column is calculated during the fit method. Defaults to None.
        std=None, # Stdev value for each column. If None, the standard deviation value of each column is calculated during the fit method. Defaults to None.
        eps=1e-6, # A small value to avoid division by zero. Defaults to 1e-6.
    ):
        self.columns = listify(columns)
        self.mean = mean
        self.std = std
        self.eps = np.array(eps, dtype='float32')

    def fit(self, X, y=None, **fit_params):
        "Compute per-column mean/std (unless already set), optionally on the rows given by `fit_params['idxs']`."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        # Explicit emptiness test + plain list: a `pd.Index` has no truth value, so storing
        # `X.columns` directly made any later call to `fit` raise ValueError.
        if self.columns is None or len(self.columns) == 0:
            if isinstance(X, pd.DataFrame):
                self.columns = list(X.columns)
            else:
                # Wrapped in a list so the name is not iterated character by character.
                # NOTE(review): Series input passes the type check, but the two-axis
                # `.loc[idxs, c]` lookups below are written for dataframes - confirm
                # whether Series support is actually needed.
                self.columns = [X.name]
        idxs = fit_params.get("idxs", slice(None))  # default: use every row
        if self.mean is None:
            self.mean = [X.loc[idxs, c].mean() for c in self.columns]
        else:
            assert len(self.mean) == len(self.columns)
        if self.std is None:
            self.std = [X.loc[idxs, c].std() for c in self.columns]
        else:
            assert len(self.std) == len(self.columns)
        return self

    def transform(self, X, **kwargs):
        "Standardize the selected columns of `X` in place and return it."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        for c, m, s in zip(self.columns, self.mean, self.std):
            X[c] = (X[c] - m) / (s + self.eps)
        return X

    def inverse_transform(self, X, **kwargs):
        "Undo `transform` on the selected columns of `X` in place and return it."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        for c, m, s in zip(self.columns, self.mean, self.std):
            X[c] = X[c] * (s + self.eps) + m
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 120
class TSRobustScaler(TransformerMixin, BaseEstimator):
    """This Scaler removes the median and scales the data according to the quantile range (defaults to IQR: Interquartile Range)"""

    def __init__(self, columns=None, quantile_range=(25.0, 75.0), eps=1e-6):
        # columns: column name(s) to scale; when empty, every column seen in `fit` is used
        # quantile_range: (lower, upper) percentiles whose difference is used as the scale
        # eps: small value added to the scale to avoid division by zero
        self.columns = listify(columns)
        self.quantile_range = quantile_range
        self.eps = np.array(eps, dtype='float32')

    def fit(self, X, y=None, **fit_params):
        "Compute the NaN-ignoring median and quantile range per column, optionally on the rows in `fit_params['idxs']`."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        # Explicit emptiness test + plain list: a `pd.Index` has no truth value, so storing
        # `X.columns` directly made any later call to `fit` raise ValueError.
        if self.columns is None or len(self.columns) == 0:
            if isinstance(X, pd.DataFrame):
                self.columns = list(X.columns)
            else:
                # Wrapped in a list so the name is not iterated character by character.
                # NOTE(review): Series input passes the type check, but the two-axis
                # `.loc[idxs, c]` lookups below are written for dataframes - confirm
                # whether Series support is actually needed.
                self.columns = [X.name]
        idxs = fit_params.get("idxs", slice(None))  # default: use every row
        lower, upper = self.quantile_range[0], self.quantile_range[1]

        self.median, self.iqr = [], []
        for c in self.columns:
            values = X.loc[idxs, c]  # select the rows once per column
            self.median.append(np.nanpercentile(values, 50))
            q1 = np.nanpercentile(values, lower)
            q3 = np.nanpercentile(values, upper)
            self.iqr.append(q3 - q1)

        return self

    def transform(self, X, **kwargs):
        "Center on the median and divide by the quantile range, in place; returns `X`."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        for c, m, q in zip(self.columns, self.median, self.iqr):
            X[c] = (X[c] - m) / (q + self.eps)
        return X

    def inverse_transform(self, X, **kwargs):
        "Undo `transform` on the selected columns of `X` in place and return it."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        for c, m, q in zip(self.columns, self.median, self.iqr):
            X[c] = X[c] * (q + self.eps) + m
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 122
class TSAddMissingTimestamps(TransformerMixin, BaseEstimator):
    "Transformer wrapper around `add_missing_timestamps`, called with the options given at construction."

    def __init__(self, datetime_col=None, use_index=False, unique_id_cols=None, fill_value=np.nan, range_by_group=True, 
                 start_date=None, end_date=None, freq=None):
        # Either a datetime column or the index must identify the timestamps.
        assert datetime_col is not None or use_index
        store_attr()
        # Bind every option once so `transform` only has to supply the data.
        options = dict(datetime_col=datetime_col, use_index=use_index, unique_id_cols=unique_id_cols,
                       fill_value=fill_value, range_by_group=range_by_group, start_date=start_date,
                       end_date=end_date, freq=freq)
        self.func = partial(add_missing_timestamps, **options)

    def fit(self, X, y=None, **fit_params):
        "Nothing is learned; only the input type is checked."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        return self

    def transform(self, X, **kwargs):
        "Return the result of `add_missing_timestamps` applied to `X`."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        return self.func(X)

    def inverse_transform(self, X, **kwargs):
        "Return `X` unchanged."
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 126
class TSDropDuplicates(TransformerMixin, BaseEstimator):
    "Drop rows with duplicated values in a set of columns, optionally including a datetime column or index"

    def __init__(self,
        datetime_col=None, #(str or List[str], optional): Name(s) of column(s) containing datetime values. If None, the index is used if use_index=True.
        use_index=False, #(bool, optional): Whether to include the index in the set of columns for checking duplicates. Defaults to False.
        unique_id_cols=None, #(str or List[str], optional): Name(s) of column(s) to be included in the set of columns for checking duplicates. Defaults to None.
        keep='last', #(str, optional): Which duplicated values to keep. Choose from {'first', 'last', False}. Defaults to 'last'.
        reset_index=False, #(bool, optional): Whether to reset the index after dropping duplicates. Only applied when use_index=False. Defaults to False.
    ):
        assert datetime_col is not None or use_index, "you need to either pass a datetime_col or set use_index=True"

        self.datetime_col, self.use_index, self.unique_id_cols, self.keep, self.reset_index =  \
        listify(datetime_col), use_index, listify(unique_id_cols), keep, reset_index

    def fit(self, X, y=None, **fit_params):
        "Nothing is learned; only the input type is checked."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        return self

    def transform(self, X, **kwargs):
        "Remove duplicated rows from `X` in place and return it."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        if self.use_index:
            cols = [X.index.name or 'index'] + self.unique_id_cols
            duplicated = X.reset_index().duplicated(subset=cols, keep=self.keep)
        else:
            cols = self.datetime_col + self.unique_id_cols
            duplicated = X.duplicated(subset=cols, keep=self.keep)

        # Work with row positions rather than index labels. In the `use_index` branch the
        # mask is indexed 0..n-1 (it comes from `reset_index()`), so its labels do not
        # exist in `X`; and on a non-unique index a label-based drop would also remove
        # the rows that must be kept.
        to_drop = duplicated.to_numpy()
        if not to_drop.any():
            return X

        surviving_index = X.index[~to_drop]
        # Temporarily give `X` positional labels so the drop can stay in place.
        X.reset_index(drop=True, inplace=True)
        X.drop(index=np.flatnonzero(to_drop), inplace=True)
        if not self.use_index and self.reset_index:
            X.reset_index(drop=True, inplace=True)
        else:
            # Restore the original labels of the rows that were kept.
            X.index = surviving_index
        return X

    def inverse_transform(self, X, **kwargs):
        "Return `X` unchanged; dropped rows are not restored."
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 128
class TSFillMissing(TransformerMixin, BaseEstimator):
    "Fill missing values in specified columns using the specified method and/ or value."

    # Accepted `method` spellings mapped to the dedicated pandas fill methods.
    # `fillna(method=...)` is deprecated in recent pandas releases, so those are called instead.
    _FILL_METHODS = {'ffill': 'ffill', 'pad': 'ffill', 'bfill': 'bfill', 'backfill': 'bfill'}

    def __init__(self,
        columns=None, #(str or List[str], optional): Column name(s) to be transformed. If None, all columns are transformed. Defaults to None.
        unique_id_cols=None, #(str or List[str], optional): Col name(s) with unique ids for each row. If None, uses all rows at once. Defaults to None .
        method='ffill', #(str, optional): The method to use for filling missing values, e.g. 'ffill', 'bfill'. If None, only `value` is used. Defaults to 'ffill'.
        value=0, #(scalar or dict or Series, optional): The value used to fill what is still missing after `method`. If None, only `method` is used. Defaults to 0.
    ):

        self.columns = listify(columns)
        self.unique_id_cols = unique_id_cols
        self.method = method
        self.value = value

    def fit(self, X, y=None, **fit_params):
        "Resolve the columns to fill; nothing else is learned."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        # Explicit emptiness test + plain list: a `pd.Index` has no truth value, so storing
        # `X.columns` directly made any later call to `fit` raise ValueError.
        if self.columns is None or len(self.columns) == 0:
            if isinstance(X, pd.DataFrame):
                self.columns = list(X.columns)
            else:
                # Wrapped in a list so the name is not iterated character by character.
                # NOTE(review): Series input passes the type check, but `transform` indexes
                # `X[c]` as a column - confirm whether Series support is actually needed.
                self.columns = [X.name]
        return self

    def transform(self, X, **kwargs):
        "Fill the selected columns of `X` in place (first by `method`, then by `value`) and return it."
        assert isinstance(X, (pd.DataFrame, pd.Series))
        if self.method is not None:
            if self.method not in self._FILL_METHODS:
                raise ValueError(f"Invalid fill method. Expecting one of {sorted(self._FILL_METHODS)}. Got {self.method}")
            fill_name = self._FILL_METHODS[self.method]
            for c in self.columns:
                if self.unique_id_cols is not None:
                    # Fill within each id group so values never leak across groups.
                    source = X.groupby(self.unique_id_cols)[c]
                else:
                    source = X[c]
                X[c] = getattr(source, fill_name)()
        if self.value is not None:
            # Second pass: anything the directional fill could not reach gets `value`.
            for c in self.columns:
                X[c] = X[c].fillna(value=self.value)
        return X

    def inverse_transform(self, X, **kwargs):
        "Return `X` unchanged; filled values cannot be turned back into missing ones."
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 130
class TSMissingnessEncoder(BaseEstimator, TransformerMixin):
    "Add an int16 `<column>_missing` indicator column (1 where the value is null) for each selected column."

    def __init__(self, columns=None):
        # columns: column name(s) to encode; when empty, every column seen in `fit` is used
        self.columns = listify(columns)

    def fit(self, X, y=None, **fit_params):
        "Resolve the columns to encode and the names of the indicator columns."
        assert isinstance(X, pd.DataFrame)
        # Use an explicit emptiness test and store a plain list: a `pd.Index` has no truth
        # value, so keeping `X.columns` here made a second call to `fit` raise ValueError.
        if self.columns is None or len(self.columns) == 0:
            self.columns = list(X.columns)
        self.missing_columns = [f"{cn}_missing" for cn in self.columns]
        return self

    def transform(self, X:pd.DataFrame, y=None, **transform_params):
        "Append the indicator columns to `X` (written in place) and return it."
        assert isinstance(X, pd.DataFrame)
        X[self.missing_columns] = X[self.columns].isnull().astype(np.int16)
        return X

    def inverse_transform(self, X, **kwargs):
        "Return `X` unchanged (the indicator columns are not removed); extra keyword arguments are ignored."
        return X

# %% ../../nbs/009_data.preprocessing.ipynb 134
class Preprocessor():
    "Apply a scikit-learn style preprocessor to data of any shape by treating all values as a single column."

    def __init__(self, preprocessor, **kwargs): 
        # `preprocessor` is a factory (class or partial); it is instantiated here with `kwargs`.
        self.preprocessor = preprocessor(**kwargs)

    @staticmethod
    def _as_column(o):
        "Return `o` reshaped to (n, 1); a pandas Series is converted to its underlying values first."
        if isinstance(o, pd.Series): return o.values.reshape(-1,1)
        return o.reshape(-1,1)

    def _run(self, method, o):
        "Flatten `o` to one column, call `method` on it and give the result the shape `o` had."
        # Python scalars are promoted to a (1, 1) array, which is also the shape of the result.
        if isinstance(o, (int, float)): o = array([o]).reshape(-1,1)
        # Kept as a tuple so that 0-d inputs (empty shape) can be restored as well.
        o_shape = tuple(o.shape)
        column = self._as_column(o)
        output = method(column).reshape(o_shape)
        # Tensor input gets a tensor back, created from the input via `Tensor.new`.
        if isinstance(column, torch.Tensor): return column.new(output)
        return output

    def fit(self, o): 
        "Fit the wrapped preprocessor on all values of `o`; returns the fitted preprocessor (not `self`)."
        self.fit_preprocessor = self.preprocessor.fit(self._as_column(o))
        return self.fit_preprocessor

    def transform(self, o, copy=True):
        "Transform `o` (scalar, array, Series or tensor), preserving its shape. `copy` is currently unused."
        return self._run(self.fit_preprocessor.transform, o)

    def inverse_transform(self, o, copy=True):
        "Invert `transform` for `o`; accepts the same input kinds, including Python scalars. `copy` is currently unused."
        return self._run(self.fit_preprocessor.inverse_transform, o)


# Ready-made preprocessor factories for `Preprocessor`. Each one is a `partial` around a
# scikit-learn class; `__name__` is set by hand because `partial` objects do not define it.
StandardScaler = partial(sklearn.preprocessing.StandardScaler)
StandardScaler.__name__ = 'StandardScaler'

RobustScaler = partial(sklearn.preprocessing.RobustScaler)
RobustScaler.__name__ = 'RobustScaler'

# Min-max scaling to the range [-1, 1].
Normalizer = partial(sklearn.preprocessing.MinMaxScaler, feature_range=(-1, 1))
Normalizer.__name__ = 'Normalizer'

BoxCox = partial(sklearn.preprocessing.PowerTransformer, method='box-cox')
BoxCox.__name__ = 'BoxCox'

YeoJohnshon = partial(sklearn.preprocessing.PowerTransformer, method='yeo-johnson')
YeoJohnshon.__name__ = 'YeoJohnshon'

# Quantile mapping to a normal output distribution, with a fixed seed.
Quantile = partial(sklearn.preprocessing.QuantileTransformer, n_quantiles=1_000, output_distribution='normal', random_state=0)
Quantile.__name__ = 'Quantile'

# %% ../../nbs/009_data.preprocessing.ipynb 142
def ReLabeler(cm):
    r"""Build a function that replaces the labels of an array according to a class mapping.
        Args:
            cm = class mapping dictionary (old label -> new label or list of new labels)
        Returns:
            A function taking an array `y` and returning an array with the shape of `y`.
            Labels that are not keys of `cm` are kept.
    """
    def _relabel(y):
        # Every mapped value is passed through `listify` before it is used.
        listified = {old: listify(new) for old, new in cm.items()}
        # Mapped values of differing lengths cannot form a regular array: use dtype=object then.
        ragged = len({len(new) for new in listified.values()}) > 1
        relabeled = [listified[yi] if yi in listified else listify(yi) for yi in y]
        if ragged:
            result = np.array(relabeled, dtype=object)
        else:
            result = np.array(relabeled)
        return result.reshape(*y.shape)
    return _relabel