In [1]:
import scipy
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn import model_selection
from sklearn.base import is_classifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import get_scorer, check_scoring
from sklearn.preprocessing import LabelBinarizer
from scipy.special import xlogy
from scipy.stats import hmean, gmean

In [2]:
def gen_xy(
    model,
    iv_corr,
    n_obs,
    random_state = None):
    """Simulate predictors and a response whose population R^2 is known.

    Ten standard-normal predictors are drawn: the first seven share the
    pairwise correlation ``iv_corr``, the last three are independent of
    everything else.

    Parameters
    ----------
    model : str
        ``"linear"`` builds the signal from ``x[:, 0:4]`` with coefficients
        (.1, .2, .3, .4).  Any other value builds it from
        ``x0, x0**2 / sqrt(2), x1**2 / sqrt(2), x2 * x3 / sqrt(1 + iv_corr**2)``
        with coefficients (.3, .3, .3, .4).
    iv_corr : float
        Common correlation among the first seven predictors.
    n_obs : int
        Number of rows to simulate.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) keeps drawing from NumPy's global legacy state,
        exactly as before.  An int seeds a private ``RandomState`` (same
        stream as ``np.random.seed(int)`` would give); an existing
        ``RandomState`` is used as is.

    Returns
    -------
    x : ndarray of shape (n_obs, 10)
    y : ndarray of shape (n_obs,)
    r2 : float
        ``1 - error_var``, the share of variance attributed to the signal.
    """
    n_ivs = 10
    n_indep = 3
    n_corr = n_ivs - n_indep
    mean = np.zeros((n_ivs,))
    # Block-diagonal covariance: equicorrelated block + identity block.
    corr_block = (iv_corr * np.ones((n_corr, n_corr)) +
                  (1 - iv_corr) * np.eye(n_corr))
    cov = np.block([[corr_block, np.zeros((n_corr, n_indep))],
                    [np.zeros((n_indep, n_corr)), np.eye(n_indep)]])

    if random_state is None:
        # The numpy.random module exposes the global legacy generator.
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    x = rng.multivariate_normal(
      mean = mean,
      cov = cov,
      size = n_obs)
    if model == "linear":
        coef = np.array([.1, .2, .3, .4]).reshape(4, -1)
        cov_signal = cov[0:4, 0:4]
        error_var = 1 - (coef.T @ cov_signal @ coef).item()
        x_signal = x[:,0:4]
    else:
        coef = np.array([.3, .3, .3, .4]).reshape(4, -1)
        # Scale factors applied to the squared and product terms below.
        sd_quad = np.sqrt(2)
        sd_prod = np.sqrt(1 + iv_corr**2)
        a = (2 * (iv_corr**2)) / (sd_quad * sd_quad)
        b = (2 * (iv_corr**2)) / (sd_quad * sd_prod)
        # Covariance matrix used for the four (rescaled) signal terms.
        cov_signal = np.array(
            [[ 1.  ,  0.  , 0.  ,  0.  ],
             [ 0.  ,  1.  ,  a,  b],
             [0.  ,  a,  1.  ,  b],
             [ 0.  ,  b,  b,  1.  ]])
        error_var = 1 - (coef.T @ cov_signal @ coef).item()
        x_signal = np.concatenate(
            (x[:,0:1],
             (x[:,0:1]**2)  / sd_quad,
             (x[:,1:2]**2) / sd_quad,
             (x[:,2:3] * x[:,3:4]) / sd_prod),
            axis = 1)
    # Noise variance is whatever is left so that signal + noise sums to 1.
    error = rng.normal(
      loc = 0.0,
      scale = np.sqrt(error_var),
      size = (n_obs, ))
    y = (x_signal @ coef).reshape(-1,) + error
    r2 = 1 - error_var
    return x, y, r2




In [3]:
class Crosser():
    """Cross-fitting wrapper: one clone of ``estimator`` per split of ``cv``.

    ``fit`` trains a fresh clone on the training part of every split and
    stores the train/test indexes.  The prediction methods either use the
    model of one split (``split = k``) or aggregate over all fitted models
    (``split = None``).

    Parameters
    ----------
    estimator : estimator object
        Cloned (via ``sklearn.base.clone``) once per split.
    cv : cross-validator
        Must be one of the splitter types accepted by ``_check_cv``.
    """
    def __init__(
        self,
        estimator,
        cv
    ):
        cv, n_splits, n_repeats, n_folds = self._check_cv(cv)
        self.estimator = estimator
        self.cv = cv
        self.n_splits = n_splits
        self.n_repeats = n_repeats
        self.n_folds = n_folds

    @staticmethod
    def _take(data, index):
        """Select rows ``index`` from a pandas object (``.iloc``) or an array."""
        if hasattr(data, "iloc"):
            return data.iloc[index]
        return data[index]

    @staticmethod
    def _majority_vote(preds):
        """Most frequent label along axis 0 (ties go to the smallest label).

        Works for any label dtype that ``np.unique`` can sort, not only for
        non-negative integers.
        """
        labels, codes = np.unique(preds.ravel(), return_inverse = True)
        codes = codes.reshape(preds.shape)
        counts = np.apply_along_axis(
            np.bincount,
            0,
            codes,
            minlength = len(labels))
        return labels[counts.argmax(axis = 0)]

    def fit(
        self,
        X,
        y
    ):
        """Fit one clone of the estimator on the training rows of each split.

        Stores copies of ``X`` and ``y`` (``X_``, ``y_``), the fitted clones
        (``estimators_``) and the split indexes (``cv_indexes_``).  For
        classifiers a ``LabelBinarizer`` fitted on ``y`` is kept as
        ``label_binarizer_``.  Returns ``self``.
        """
        cv_indexes = []
        estimators = []
        # y is forwarded so that stratified splitters can use it; the
        # non-stratified splitters accept and ignore it.
        for train_index, test_index in self.cv.split(X, y):
            estimator = clone(self.estimator)
            _ = estimator.fit(
                self._take(X, train_index),
                self._take(y, train_index))
            estimators.append(estimator)
            cv_indexes.append((train_index, test_index))
        self.X_, self.y_ = X.copy(), y.copy()
        self.estimators_ = estimators
        self.cv_indexes_ = cv_indexes
        if is_classifier(self.estimator):
            label_binarizer = LabelBinarizer()
            _ = label_binarizer.fit(y)
            self.label_binarizer_ = label_binarizer
        return self

    def predict(
        self,
        X,
        *,
        split = None
    ):
        """Predict with one split's model, or aggregate over all of them.

        With ``split = None`` classifiers return the majority label over
        the fitted models and other estimators return the mean prediction.
        """
        if split is None:
            preds = np.array(
                [estimator.predict(X)
                 for estimator in self.estimators_])
            if is_classifier(self.estimator):
                pred = self._majority_vote(preds)
            else:
                pred = preds.mean(axis = 0)
        else:
            pred = self.estimators_[split].predict(X)
        return pred

    def predict_proba(
        self,
        X,
        *,
        split = None
    ):
        """``predict_proba`` of one split's model, or the mean over all models."""
        if split is None:
            preds = np.array(
                [estimator.predict_proba(X)
                 for estimator in self.estimators_])
            pred = preds.mean(axis = 0)
        else:
            pred = self.estimators_[split].predict_proba(X)
        return pred

    def predict_log_proba(
        self,
        X,
        *,
        split = None
    ):
        """``predict_log_proba`` of one split's model, or the mean over all models.

        NOTE(review): the aggregate is the mean of the log-probabilities,
        which is not the log of ``predict_proba``'s mean -- confirm intended.
        """
        if split is None:
            preds = np.array(
                [estimator.predict_log_proba(X)
                 for estimator in self.estimators_])
            pred = preds.mean(axis = 0)
        else:
            pred = self.estimators_[split].predict_log_proba(X)
        return pred

    def decision_function(
        self,
        X,
        *,
        split = None
    ):
        """``decision_function`` of one split's model, or the mean over all models."""
        if split is None:
            preds = np.array(
                [estimator.decision_function(X)
                 for estimator in self.estimators_])
            pred = preds.mean(axis = 0)
        else:
            pred = self.estimators_[split].decision_function(X)
        return pred

    def sample(
        self,
        X,
        *,
        split = None,
        n_samples = None,
        random_state = None
    ):
        """Draw random targets for the rows of ``X``.

        Classifiers: one multinomial draw per row from ``predict_proba``,
        mapped back to labels with ``label_binarizer_``.
        Other estimators: prediction plus a residual resampled from the
        held-out residuals (of ``split``, or of all splits when ``split`` is
        ``None``); residuals are drawn with replacement only when more
        draws are needed than residuals are available.

        ``n_samples = None`` returns one draw per row; otherwise the result
        has a leading axis of length ``n_samples``.  ``random_state`` is
        passed to ``np.random.default_rng``.
        """
        rng = np.random.default_rng(random_state)
        if is_classifier(self.estimator):
            pred = self.predict_proba(
                X,
                split = split)
            if n_samples is None:
                rv = rng.multinomial(1, pred)
                rv = self.label_binarizer_.inverse_transform(rv)
            else:
                rv = rng.multinomial(
                    1, pred, (n_samples, len(pred)))
                rv = np.array(
                    [self.label_binarizer_.inverse_transform(rv_i)
                     for rv_i in rv])
        else:
            if split is None:
                targets = self._targets()
                preds = self._preds()
                residual = np.concatenate(
                    [target - pred
                     for target, pred in zip(targets, preds)])
            else:
                train_index, test_index = self.cv_indexes_[split]
                feature = self._take(self.X_, test_index)
                target = self._take(self.y_, test_index)
                pred = self.predict(
                    feature,
                    split = split)
                residual = target - pred
            pred = self.predict(
                X,
                split = split)
            replace = len(pred) > len(residual)
            if n_samples is None:
                rv = pred + rng.choice(residual, len(pred), replace)
            else:
                rv = pred + np.array(
                    [rng.choice(residual, len(pred), replace)
                     for repeat in range(n_samples)])
        return rv


    def _features(
        self
    ):
        """List of held-out feature rows, one entry per split."""
        return [self._take(self.X_, test_index)
                for train_index, test_index in self.cv_indexes_]

    def _targets(
        self,
        binarize = None
    ):
        """List of held-out targets, one entry per split.

        For classifiers with ``binarize is True`` the targets are passed
        through ``label_binarizer_``; a single-column result is expanded to
        two columns ``[1 - t, t]``.
        """
        targets = []
        for train_index, test_index in self.cv_indexes_:
            target = self._take(self.y_, test_index)
            if is_classifier(self.estimator):
                if binarize is True:
                    target = self.label_binarizer_.transform(target)
                    if target.shape[1] == 1:
                        target = np.append(1 - target, target, axis=1)
            targets.append(target)
        return targets

    def _preds(
        self,
        response_method = "predict"
    ):
        """List of held-out predictions, each made by its own split's model.

        ``response_method`` names the method of this object to call
        (e.g. ``"predict"`` or ``"predict_proba"``).
        """
        predict_func = getattr(self, response_method)
        preds = []
        for split, (train_index, test_index) in enumerate(self.cv_indexes_):
            feature = self._take(self.X_, test_index)
            preds.append(predict_func(feature, split = split))
        return preds

    def _rvs(
        self,
        n_samples = None,
        random_state = None
    ):
        """List of ``sample`` draws for the held-out rows of every split.

        A single generator is created from ``random_state`` and shared by
        all splits, so a fixed seed is reproducible without every split
        replaying the same random stream.
        """
        rng = np.random.default_rng(random_state)
        rvs = []
        for split, (train_index, test_index) in enumerate(self.cv_indexes_):
            feature = self._take(self.X_, test_index)
            rv = self.sample(
                feature,
                split = split,
                n_samples = n_samples,
                random_state = rng)
            rvs.append(rv)
        return rvs


    def _check_cv(
        self,
        cv
    ):
        """Validate ``cv`` and derive split bookkeeping.

        Returns ``(cv, n_splits, n_repeats, n_folds)``.  K-fold style
        splitters use their ``n_repeats`` attribute (1 if absent); shuffle
        splitters count every split as a repeat, so ``n_folds`` is 1.

        Raises
        ------
        ValueError
            If ``cv`` is not an instance of a supported splitter class.
        """
        kf_cvs = {"KFold", "RepeatedKFold",
                  "StratifiedKFold", "RepeatedStratifiedKFold"}
        ss_cvs = {"ShuffleSplit", "StratifiedShuffleSplit"}
        allowed_cvs = set.union(ss_cvs, kf_cvs)
        allowed_types = tuple(
            getattr(model_selection, allowed_cv)
            for allowed_cv in allowed_cvs)
        if not isinstance(cv, allowed_types):
            raise ValueError("Support cross-validator types are {}".format(allowed_cvs))
        kf_types = tuple(
            getattr(model_selection, kf_cv)
            for kf_cv in kf_cvs)
        n_splits = cv.get_n_splits()
        if isinstance(cv, kf_types):
            if hasattr(cv, "n_repeats"):
                n_repeats = cv.n_repeats
            else:
                n_repeats = 1
        else:
            n_repeats = cv.get_n_splits()
        n_folds = n_splits // n_repeats
        return cv, n_splits, n_repeats, n_folds


In [4]:
class BaseInferer():
    """Shared configuration and reporting for the inference classes.

    Subclasses provide ``infer``, which must set ``learner_losses_``,
    ``removed_losses_`` (lists with one per-observation loss vector per
    split) and ``null_values_`` (list of simulated values per split, or
    ``None``).  ``summarize`` turns those into estimates, standard errors
    and p-values.

    Parameters
    ----------
    learner, remover : fitted cross-fitting objects
        Stored as given; ``learner.estimator`` is only inspected when
        ``loss_func`` is ``None``.
    algorithm : str
        Stored as given.
    loss_func : None, str or callable, optional
        ``"log_loss"``, ``"zero_one_loss"``, ``"mean_squared_error"`` or
        ``"mean_absolute_error"``.  ``None`` picks ``"log_loss"`` for
        classifiers and ``"mean_squared_error"`` otherwise.  A callable
        ``f(target, pred)`` returning per-observation losses is used as is,
        with raw targets and ``predict`` as response method.
    infer_type, n_samples, n_permutations, double_split, perturb_size,
    random_state, removed_column : optional
        Stored as given for use by the subclasses.

    Raises
    ------
    ValueError
        If ``loss_func`` is neither callable nor one of the known names.
    """
    def __init__(
        self,
        learner,
        remover,
        algorithm,
        *,
        loss_func = None,
        infer_type = None,
        n_samples = None,
        n_permutations = None,
        double_split = None,
        perturb_size = None,
        random_state = None,
        removed_column = None):
        if loss_func is None:
            if is_classifier(learner.estimator):
                loss_func = "log_loss"
            else:
                loss_func = "mean_squared_error"

        if callable(loss_func):
            # User-supplied loss: compare raw targets with predict() output.
            binarize = False
            response_method = "predict"

        elif loss_func == "log_loss":
            def log_loss(target, pred):
                # Clip so that log(0) never occurs.
                eps = np.finfo(pred.dtype).eps
                pred = np.clip(pred, eps, 1 - eps)
                loss = -xlogy(target, pred).sum(axis=1)
                return loss
            loss_func = log_loss
            binarize = True
            response_method = "predict_proba"

        elif loss_func == "zero_one_loss":
            def zero_one_loss(target, pred):
                # A loss: 1 for a misclassification, 0 for a hit.
                loss = 1 * (target != pred)
                return loss
            loss_func = zero_one_loss
            binarize = False
            response_method = "predict"

        elif loss_func == "mean_squared_error":
            def mean_squared_error(target, pred):
                loss = (target - pred)**2
                return loss
            loss_func = mean_squared_error
            binarize = False
            response_method = "predict"

        elif loss_func == "mean_absolute_error":
            def mean_absolute_error(target, pred):
                loss = np.abs(target - pred)
                return loss
            loss_func = mean_absolute_error
            binarize = False
            response_method = "predict"

        else:
            raise ValueError(
                "Unknown loss_func {!r}; expected a callable or one of "
                "'log_loss', 'zero_one_loss', 'mean_squared_error', "
                "'mean_absolute_error'".format(loss_func))

        self.learner = learner
        self.remover = remover
        self.algorithm = algorithm
        self.loss_func = loss_func
        self.infer_type = infer_type
        self.n_samples = n_samples
        self.n_permutations = n_permutations
        self.double_split = double_split
        self.perturb_size = perturb_size
        self.random_state = random_state
        self.removed_column = removed_column
        self.binarize = binarize
        self.response_method = response_method

    def summarize(
        self,
        *,
        agg_method = None,
        cross_fit = None
    ):
        """Tabulate the inference results.

        Parameters
        ----------
        agg_method : None or str, optional
            ``None`` returns one row per split (index named ``"split"``).
            Otherwise a single row is returned holding the mean estimate,
            the mean standard error and a combined p-value capped at 1:

            - ``"gmean"``:  ``e * geometric_mean(p)``
            - ``"median"``: ``2 * median(p)``
            - ``"q1"``:     ``(n / 2) * second_smallest(p)``
            - ``"min"``:    ``n * min(p)``
            - ``"hmean"``:  ``e * log(n) * harmonic_mean(p)``
            - ``"hommel"``: ``min_k(sorted(p)[k] * n / (k + 1)) * sum_j 1/j``
            - ``"cauchy"``: ``.5 - arctan(mean(tan((.5 - p) * pi))) / pi``

            with ``n`` the number of splits.
        cross_fit : optional
            Accepted but not used.

        Raises
        ------
        ValueError
            If ``agg_method`` is not one of the values listed above.
        """
        estimates = self._estimates()
        std_errors = self._std_errors()
        p_values = self._p_values()

        if agg_method is None:
            summary = pd.DataFrame(
                {"estimate": estimates,
                 "std_error": std_errors,
                 "p_value": p_values}
            )
            summary.index.name = "split"
            return summary

        n_p_values = len(p_values)
        if agg_method == "gmean":
            combined = np.e * gmean(p_values, 0)
        elif agg_method == "median":
            combined = 2*np.median(p_values, 0)
        elif agg_method == "q1":
            combined = n_p_values / 2.*np.partition(p_values, 1)[1]
        elif agg_method == "min":
            combined = n_p_values*np.min(p_values, 0)
        elif agg_method == "hmean":
            # NOTE(review): with a single split log(1) == 0 makes this 0 --
            # confirm that is acceptable.
            combined = np.e * np.log(n_p_values) * hmean(p_values, 0)
        elif agg_method == "hommel":
            const = np.sum(1. / (np.arange(n_p_values) + 1.))
            order_const = const * (n_p_values / (np.arange(n_p_values) + 1.))
            combined = np.min(np.sort(p_values) * order_const)
        elif agg_method == "cauchy":
            t0 = np.mean(np.tan((.5 - np.array(p_values))*np.pi))
            combined = .5 - np.arctan(t0) / np.pi
        else:
            raise ValueError(
                "Unknown agg_method {!r}; expected None, 'gmean', 'median', "
                "'q1', 'min', 'hmean', 'hommel' or 'cauchy'".format(agg_method))

        summary = pd.DataFrame(
            {"estimate": np.mean(estimates),
             "std_error": np.mean(std_errors),
             "p_value": np.minimum(combined, 1.)},
            index = [0]
        )
        return summary


    def _estimates(
        self
    ):
        """Per split: mean learner loss minus mean removed loss."""
        l_losses = self.learner_losses_
        r_losses = self.removed_losses_
        estimates = [l_loss.mean() - r_loss.mean()
               for l_loss, r_loss in zip(l_losses, r_losses)]
        return estimates

    def _std_errors(
        self
    ):
        """Per split standard error of the estimate.

        For ``"permutation"``/``"randomization"`` this is the standard
        deviation of the simulated values; otherwise it is the standard
        deviation of the paired loss differences divided by ``sqrt(n)``.
        """
        infer_type = self.infer_type
        if infer_type == "permutation" or infer_type == "randomization":
            null_values = self.null_values_
            std_errors = [null_value.std()
                          for null_value in null_values]
        else:
            l_losses = self.learner_losses_
            r_losses = self.removed_losses_
            std_errors = [(l_loss - r_loss).std() / np.sqrt(len(l_loss))
                          for l_loss, r_loss in zip(l_losses, r_losses)]
        return std_errors

    def _p_values(
        self
    ):
        """Per split p-value.

        For ``"permutation"``/``"randomization"`` it is the share of
        simulated values above zero; otherwise the standard normal CDF of
        ``estimate / std_error``.
        """
        infer_type = self.infer_type
        if infer_type == "permutation" or infer_type == "randomization":
            null_values = self.null_values_
            p_values = [(null_value > 0).mean()
                        for null_value in null_values]
        else:
            estimates = self._estimates()
            std_errors = self._std_errors()
            p_values = [scipy.stats.norm.cdf(estimate / std_error)
                        for estimate, std_error in zip(estimates, std_errors)]
        return p_values

    def _removed_column(
        self,
        learner,
        remover
    ):
        """Index of the first column of ``learner.X_`` equal to ``remover.y_``.

        Raises
        ------
        ValueError
            If no column matches, in which case ``removed_column`` has to
            be passed explicitly.
        """
        # np.asarray handles pandas and ndarray inputs alike.
        X = np.asarray(learner.X_)
        y = np.asarray(remover.y_)
        for col in np.arange(X.shape[1]):
            if np.array_equal(y, X[:,col]):
                return col
        raise ValueError(
            "No column of learner.X_ equals remover.y_; "
            "pass removed_column explicitly")

In [5]:
class CIT(BaseInferer):
    """Inference that replaces one feature column by draws from ``remover``.

    The learner's held-out loss on the original features is compared with
    its loss after column ``removed_column`` has been overwritten with
    values sampled by ``remover._rvs``.
    """
    def infer(
        self
    ):
        """Compute per-split losses and, if requested, simulated values.

        Sets ``learner_losses_`` (loss on the original held-out rows),
        ``removed_losses_`` (loss with the column replaced, averaged over
        the draws) and ``null_values_``:

        - ``infer_type == "randomization"``: one value per draw, the mean
          over observations of ``learner loss - replaced loss``;
        - ``infer_type == "permutation"``: ``n_permutations`` values, each
          the estimate minus the difference of column means after
          shuffling the two losses within every observation;
        - otherwise ``None``.
        """
        learner = self.learner
        remover = self.remover
        loss_func = self.loss_func
        infer_type = self.infer_type
        n_samples = self.n_samples
        n_permutations = self.n_permutations
        random_state = self.random_state
        removed_column = self.removed_column
        binarize = self.binarize
        response_method = self.response_method
        # Perturbed predictions must come from the same response method as
        # the unperturbed ones, otherwise the two losses are not comparable
        # (e.g. log_loss needs predict_proba output in both cases).
        predict_func = getattr(learner, response_method)

        l_features = learner._features()
        l_targets = learner._targets(binarize)
        l_preds = learner._preds(response_method)
        l_losses = [loss_func(l_target, l_pred)
                   for l_target, l_pred in zip(l_targets, l_preds)]

        r_rvs = remover._rvs(
            n_samples = n_samples,
            random_state = random_state)
        r_losses = []

        for split, (l_target, l_feature, r_rv) in enumerate(
            zip(l_targets, l_features, r_rvs)):
            # Work on a private copy; the column is overwritten per draw.
            r_feature = l_feature.copy()
            r_loss = []
            # atleast_2d: a single draw (n_samples is None) is one row.
            for r_rv_repeat in np.atleast_2d(r_rv):
                if hasattr(r_feature, "iloc"):
                    r_feature.iloc[:, removed_column] = r_rv_repeat
                else:
                    r_feature[:, removed_column] = r_rv_repeat
                r_pred_repeat = predict_func(
                    r_feature,
                    split = split)
                r_loss.append(loss_func(l_target, r_pred_repeat))
            # Rows index the draws, columns the held-out observations.
            r_losses.append(np.array(r_loss))

        if infer_type == "randomization":
            null_values = []
            for split, (l_loss, r_loss) in enumerate(
                zip(l_losses, r_losses)):
                null_value = (l_loss - r_loss).mean(axis = 1)
                null_values.append(null_value)
                r_loss = r_loss.mean(axis = 0)
                r_losses[split] = r_loss
        else:
            for split, (l_loss, r_loss) in enumerate(
                zip(l_losses, r_losses)):
                r_loss = r_loss.mean(axis = 0)
                r_losses[split] = r_loss

            if infer_type == "permutation":
                null_values = []
                rng = np.random.default_rng(random_state)
                for split, (l_loss, r_loss) in enumerate(
                    zip(l_losses, r_losses)):
                    estimate = l_loss.mean() - r_loss.mean()
                    paired_loss = np.column_stack([l_loss, r_loss])
                    null_value = np.array([
                        estimate - np.diff(
                            rng.permuted(
                                paired_loss,
                                axis = 1).mean(
                                axis = 0)).item()
                        for permutation in range(n_permutations)])
                    null_values.append(null_value)
            else:
                null_values = None


        self.learner_losses_ = l_losses
        self.removed_losses_ = r_losses
        self.null_values_ = null_values

In [6]:
class RIT(BaseInferer):
    """Inference that compares the learner with a separately fitted remover.

    Both objects are scored on their own held-out rows with the same loss;
    the per-observation losses are then compared split by split.
    """
    def infer(
        self
    ):
        """Compute per-split losses and, if requested, permutation values.

        Sets ``learner_losses_``, ``removed_losses_`` (optionally with
        normal noise of scale ``perturb_size`` added) and ``null_values_``
        (``n_permutations`` values per split when
        ``infer_type == "permutation"``, else ``None``).
        ``double_split`` is stored on the object but not used here.
        """
        loss_func = self.loss_func
        binarize = self.binarize
        response_method = self.response_method
        random_state = self.random_state
        perturb_size = self.perturb_size

        def held_out_losses(crosser):
            # One per-observation loss vector for every split of `crosser`.
            targets = crosser._targets(binarize)
            preds = crosser._preds(response_method)
            return [loss_func(target, pred)
                    for target, pred in zip(targets, preds)]

        l_losses = held_out_losses(self.learner)
        r_losses = held_out_losses(self.remover)

        if perturb_size is not None:
            noise_rng = np.random.default_rng(random_state)
            perturbed = []
            for r_loss in r_losses:
                noise = noise_rng.normal(
                    scale = perturb_size,
                    size = len(r_loss))
                perturbed.append(r_loss + noise)
            r_losses = perturbed

        null_values = None
        if self.infer_type == "permutation":
            perm_rng = np.random.default_rng(random_state)
            null_values = []
            for l_loss, r_loss in zip(l_losses, r_losses):
                estimate = l_loss.mean() - r_loss.mean()
                paired_loss = np.column_stack([l_loss, r_loss])
                draws = []
                for _ in range(self.n_permutations):
                    # Shuffle the two losses within each observation, then
                    # take the difference of the resulting column means.
                    shuffled_means = perm_rng.permuted(
                        paired_loss,
                        axis = 1).mean(axis = 0)
                    draws.append(estimate - np.diff(shuffled_means).item())
                null_values.append(np.array(draws))

        self.learner_losses_ = l_losses
        self.removed_losses_ = r_losses
        self.null_values_ = null_values

In [7]:
class Inferer(BaseInferer):
    """Factory-style entry point that turns itself into ``CIT`` or ``RIT``.

    ``algorithm`` selects the implementation and the defaults:

    - ``"CRT"``, ``"HRT"``, ``"RPT"`` -> ``CIT`` with
      ``infer_type = "randomization"`` and ``n_samples = 2000``;
    - ``"CPI"`` -> ``CIT`` with ``infer_type = "normality"`` and
      ``n_samples = 1``;
    - ``"LOCO"``, ``"BBT"``, ``"PIE"`` -> ``RIT`` with
      ``infer_type = "normality"`` (``double_split = True`` for
      ``"BBT"``/``"PIE"``).

    Whenever ``infer_type == "permutation"`` and ``n_permutations`` is not
    given, 2000 permutations are used.  For the ``CIT`` algorithms
    ``removed_column`` is looked up with ``_removed_column`` when omitted.

    Raises
    ------
    ValueError
        If ``algorithm`` is not one of the names listed above.
    """
    def __init__(
        self,
        learner,
        remover,
        algorithm,
        *,
        loss_func = None,
        infer_type = None,
        n_samples = None,
        n_permutations = None,
        double_split = None,
        perturb_size = None,
        random_state = None,
        removed_column = None
    ):
        cit_algorithms = ["CRT", "HRT","RPT", "CPI"]
        rit_algorithms = ["LOCO", "BBT", "PIE"]
        if algorithm in cit_algorithms:
            if algorithm == "CPI":
                if infer_type is None:
                    infer_type = "normality"
                if n_samples is None:
                    n_samples = 1
            else:
                if infer_type is None:
                    infer_type = "randomization"
                if n_samples is None:
                    n_samples = 2000
            # Applies to every CIT algorithm: CIT.infer loops over
            # range(n_permutations) whenever infer_type is "permutation".
            if (infer_type == "permutation") and (n_permutations is None):
                n_permutations = 2000
            if removed_column is None:
                removed_column = self._removed_column(learner, remover)
            # Re-class the instance so that it gains CIT.infer.
            self.__class__ = CIT
            CIT.__init__(
                self,
                learner,
                remover,
                algorithm,
                loss_func = loss_func,
                infer_type = infer_type,
                n_samples = n_samples,
                n_permutations = n_permutations,
                random_state = random_state,
                removed_column = removed_column)
        elif algorithm in rit_algorithms:
            if infer_type is None:
                infer_type = "normality"
            if (infer_type == "permutation") and (n_permutations is None):
                n_permutations = 2000
            if algorithm in ["BBT", "PIE"] and (double_split is None):
                double_split = True
            # Re-class the instance so that it gains RIT.infer.
            self.__class__ = RIT
            RIT.__init__(
                self,
                learner,
                remover,
                algorithm,
                loss_func = loss_func,
                infer_type = infer_type,
                double_split = double_split,
                n_permutations = n_permutations,
                perturb_size = perturb_size,
                random_state = random_state)
        else:
            # Previously this fell through and left an unusable object.
            raise ValueError(
                "Unknown algorithm {!r}; expected one of {}".format(
                    algorithm, cit_algorithms + rit_algorithms))

In [8]:
# gen_xy draws from NumPy's global generator; seed it so that the simulated
# data set is the same on every Restart & Run All.
np.random.seed(0)
X, y, r2 = gen_xy(
        model = "linear",
        iv_corr = .8,
        n_obs= 400)
# Column whose importance is tested.  With model = "linear" only columns
# 0-3 enter y, so column 4 carries no signal of its own.
removed_column = 4

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneGroupOut, LeavePGroupsOut
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
# 3 folds repeated 4 times, i.e. 12 train/test splits.  The same splitter
# object (with a fixed seed) is shared by every Crosser below.
splitter = RepeatedKFold(
    n_splits = 3, 
    n_repeats = 4, 
    random_state = 0)

In [10]:
def _make_crosser():
    """Build a Crosser around a grid-searched random forest on `splitter`."""
    forest_search = GridSearchCV(
        estimator = RandomForestRegressor(max_samples = .5),
        param_grid = {
            "max_features": [3, 6, 9]},
        cv = 4)
    return Crosser(forest_search, cv = splitter)

# learner:    predicts y from all columns
# sampler:    predicts the removed column from the remaining columns
# competitor: predicts y from the remaining columns
learner = _make_crosser()
sampler = _make_crosser()
competitor = _make_crosser()

X_reduced = np.delete(X, removed_column, axis = 1)
_ = learner.fit(X, y)
_ = sampler.fit(X_reduced, X[:,removed_column])
_ = competitor.fit(X_reduced, y)

In [11]:
# "CRT" with the Inferer defaults: infer_type = "randomization" and
# n_samples = 2000 draws of the removed column per split.
crt = Inferer(
    learner, 
    sampler,
    "CRT")
_ = crt.infer()
# One row per split.
crt.summarize()

Unnamed: 0_level_0,estimate,std_error,p_value
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.003124,0.003866,0.8015
1,-0.002177,0.001627,0.095
2,0.005256,0.002717,0.971
3,-0.001943,0.002945,0.258
4,0.005732,0.00255,0.987
5,-0.001291,0.002236,0.284
6,0.001342,0.002753,0.6875
7,-0.004345,0.003252,0.0935
8,-0.002294,0.001933,0.113
9,-0.000249,0.001971,0.4555


In [12]:
# "CPI" with a single draw of the removed column per split and p-values
# from the normal CDF of estimate / std_error.
cpi = Inferer(
    learner, 
    sampler,
    "CPI",
    infer_type = "normality",
    n_samples = 1)
_ = cpi.infer()
cpi.summarize()

Unnamed: 0_level_0,estimate,std_error,p_value
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-0.00591,0.003602,0.050414
1,-0.002534,0.001951,0.097009
2,0.004985,0.00389,0.900024
3,-0.002394,0.003777,0.263137
4,0.004518,0.003008,0.933435
5,-0.000451,0.003327,0.446092
6,-0.000488,0.002594,0.425376
7,-0.002817,0.0041,0.245987
8,0.002747,0.002728,0.843006
9,0.000848,0.002619,0.626897


In [13]:
# Same as above but with permutation-based inference; n_permutations is
# left unset, so Inferer falls back to 2000.
# Note: this rebinds `cpi`; the p-value combination cells further down
# read this (permutation) result.
cpi = Inferer(
    learner, 
    sampler,
    "CPI",
    infer_type = "permutation",
    n_samples = 1)
_ = cpi.infer()
cpi.summarize()

Unnamed: 0_level_0,estimate,std_error,p_value
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.007222,0.004592,0.936
1,0.001099,0.002152,0.6765
2,0.002354,0.003335,0.747
3,-0.003058,0.002985,0.153
4,0.008231,0.003825,0.9845
5,-0.002125,0.002908,0.243
6,0.000563,0.004096,0.5685
7,-0.000322,0.00404,0.465
8,-0.004318,0.00296,0.0775
9,0.00171,0.002504,0.761


In [14]:
# "LOCO": compare the learner (all columns) with the competitor fitted
# without the removed column, using normal-theory p-values.
loco = Inferer(
    learner, 
    competitor,
    "LOCO",
    infer_type = "normality")
_ = loco.infer()
loco.summarize()

Unnamed: 0_level_0,estimate,std_error,p_value
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-9.1e-05,0.006996,0.494809
1,-0.004583,0.008155,0.287053
2,0.005217,0.007356,0.760899
3,-0.00172,0.006318,0.392704
4,0.009275,0.006387,0.926767
5,-0.000466,0.007976,0.476704
6,-0.000415,0.007899,0.479056
7,-0.001451,0.00588,0.402575
8,-0.007818,0.007141,0.136804
9,3.8e-05,0.009155,0.501642


In [15]:
# "LOCO" again with permutation-based inference (n_permutations defaults
# to 2000 inside Inferer).  This rebinds `loco`.
loco = Inferer(
    learner, 
    competitor,
    "LOCO",
    infer_type = "permutation")
_ = loco.infer()
loco.summarize()

Unnamed: 0_level_0,estimate,std_error,p_value
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-9.1e-05,0.007094,0.486
1,-0.004583,0.008061,0.287
2,0.005217,0.00745,0.767
3,-0.00172,0.00621,0.395
4,0.009275,0.006342,0.928
5,-0.000466,0.0078,0.461
6,-0.000415,0.007778,0.489
7,-0.001451,0.005801,0.402
8,-0.007818,0.007046,0.1395
9,3.8e-05,0.009073,0.5055


# Test the p-value combination methods

In [16]:
# Per-split p-values of the most recent `cpi` run; they are the input for
# the manual checks of every agg_method below.
test_pvalues = cpi.summarize().loc[:, "p_value"].to_numpy()
print(f'test p-values: {test_pvalues}')
# Number of splits (= number of p-values being combined).
cv_n = len(test_pvalues)
print(f'count of p-values: {cv_n}')

test p-values: [0.936  0.6765 0.747  0.153  0.9845 0.243  0.5685 0.465  0.0775 0.761
 0.4335 0.764 ]
count of p-values: 12


## gmean

In [17]:
# Manual check: e times the geometric mean of the p-values, not capped at 1.
np.e * gmean(test_pvalues, 0)

1.2499864591216037

In [18]:
# Same rule through summarize(), which caps the combined p-value at 1.
cpi.summarize(agg_method = "gmean")

Unnamed: 0,estimate,std_error,p_value
0,0.001146,0.003306,1.0


## median

In [19]:
# Manual check: twice the median p-value, not capped at 1.
2*np.median(test_pvalues, 0)

1.245

In [20]:
# Same rule through summarize(), which caps the combined p-value at 1.
cpi.summarize(agg_method = "median")

Unnamed: 0,estimate,std_error,p_value
0,0.001146,0.003306,1.0


## Q1

In [21]:
# Manual check: (cv_n / 2) times the second-smallest p-value
# (np.partition(..., 1)[1] is the element of rank 1, counting from 0).
cv_n / 2.*np.partition(test_pvalues, 1)[1]

0.9179999999999999

In [22]:
# Same rule through summarize(), which caps the combined p-value at 1.
cpi.summarize(agg_method = "q1")

Unnamed: 0,estimate,std_error,p_value
0,0.001146,0.003306,0.918


## min 

In [23]:
# Manual check: number of p-values times the smallest p-value.
cv_n*np.min(test_pvalues, 0)

0.9299999999999999

In [24]:
# Same rule through summarize(), which caps the combined p-value at 1.
cpi.summarize(agg_method = "min")

Unnamed: 0,estimate,std_error,p_value
0,0.001146,0.003306,0.93


## hmean

In [25]:
# Manual check: e * log(cv_n) times the harmonic mean, not capped at 1.
np.e * np.log(cv_n) * hmean(test_pvalues, 0)

2.173393766783454

In [26]:
# Same rule through summarize(), which caps the combined p-value at 1.
cpi.summarize(agg_method = "hmean")

Unnamed: 0,estimate,std_error,p_value
0,0.001146,0.003306,1.0


## hommel

In [27]:
# Manual check of the "hommel" rule.
# const is the harmonic sum 1 + 1/2 + ... + 1/cv_n.
const = np.sum(1. / (np.arange(cv_n) + 1.))
# Multiplier cv_n / k (k = 1..cv_n) for the k-th smallest p-value, times const.
order_const = const * (cv_n / (np.arange(cv_n) + 1.))
# Smallest rescaled order statistic, not capped at 1.
np.min(np.sort(test_pvalues) * order_const)

2.8450235497835497

In [28]:
# Same rule through summarize(), which caps the combined p-value at 1.
cpi.summarize(agg_method = "hommel")

Unnamed: 0,estimate,std_error,p_value
0,0.001146,0.003306,1.0


## cauchy

In [29]:
# Manual check of the "cauchy" rule: average tan((.5 - p) * pi) over the
# p-values, then map the average back with .5 - arctan(t0) / pi.
t0 =np.mean(np.tan((.5 - test_pvalues)*np.pi))
.5 - np.arctan(t0) / np.pi

0.8416490608232399

In [30]:
# Same rule through summarize(), which caps the combined p-value at 1.
cpi.summarize(agg_method = "cauchy")

Unnamed: 0,estimate,std_error,p_value
0,0.001146,0.003306,0.841649
