In [60]:
import scipy
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import check_cv
from sklearn.base import is_classifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import get_scorer, check_scoring
from sklearn.preprocessing import LabelBinarizer
from scipy.special import xlogy
from scipy.stats import multinomial
from numpy.random import choice, permutation

In [61]:
class CrossEstimator():
    def __init__(
        self,
        estimator,
        cv = None
    ):
        self.estimator = estimator
        self.cv = cv

    def fit(
        self, 
        X, 
        y
    ):
        cv = check_cv(self.cv, y, classifier = is_classifier(self.estimator))
        cv_indexes = []
        estimators = []
        for train_index, test_index in cv.split(X):     
            estimator = clone(self.estimator)
            _ = estimator.fit(X[train_index,], y[train_index])
            if hasattr(estimator, "best_estimator_"):
                estimator = estimator.best_estimator_
            estimators.append(estimator)               
            cv_indexes.append((train_index, test_index))
        self.X, self.y = X.copy(), y.copy()
        self.estimators_ = estimators
        self.cv_indexes_ = cv_indexes
        if is_classifier(self.estimator):
            label_binarizer = LabelBinarizer()
            _ = label_binarizer.fit(y)
            self.label_binarizer_ = label_binarizer
        return self

    def predict(
        self, 
        X = None
    ):
        if X is None:
            X = self.X
            preds = []
            for (train_index, test_index), estimator in zip(
                self.cv_indexes_, self.estimators_):
                pred = estimator.predict(X[test_index,])
                preds.append(pred)
            return preds
        else:
            preds = np.array(
                [estimator.predict(X) 
                 for estimator in self.estimators_])
            if is_classifier(self.estimator):
                pred = np.apply_along_axis(
                    lambda x: np.argmax(np.bincount(x)),
                    axis = 0,
                    arr = preds)
            else:
                pred = preds.mean(axis = 0)
            return pred

    def predict_proba(
        self, 
        X = None
    ):
        if X is None:
            X = self.X
            preds = []
            for (train_index, test_index), estimator in zip(
                self.cv_indexes_, self.estimators_):
                pred = estimator.predict_proba(X[test_index,])
                preds.append(pred)
            return preds
        else:
            preds = np.array(
                [estimator.predict_proba(X) 
                 for estimator in self.estimators_])
            pred = preds.mean(axis = 0)
            return pred

    def predict_log_proba(
        self, 
        X = None
    ):
        if X is None:
            X = self.X
            preds = []
            for (train_index, test_index), estimator in zip(
                self.cv_indexes_, self.estimators_):
                pred = estimator.predict_log_proba(X[test_index,])
                preds.append(pred)
            return preds
        else:
            preds = np.array(
                [estimator.predict_log_proba(X) 
                 for estimator in self.estimators_])
            pred = preds.mean(axis = 0)
            return pred
            
    def decision_function(
        self, 
        X = None
    ):
        if X is None:
            X = self.X
            preds = []
            for (train_index, test_index), estimator in zip(
                self.cv_indexes_, self.estimators_):
                pred = estimator.decision_function(X[test_index,])
                preds.append(pred)
            return preds
        else:
            preds = np.array(
                [estimator.decision_function(X) 
                 for estimator in self.estimators_])
            pred = preds.mean(axis = 0)
            return pred
            
    def reply(
        self,
        y = None,
        binarize = False
    ):
        if y is None:
            y = self.y
            cv_indexes = self.cv_indexes_
            facts = []
            for train_index, test_index in cv_indexes:
                fact = y[test_index]
                if is_classifier(self.estimator):
                    if binarize is True:
                        fact = self.label_binarizer_.transform(fact)
                        if fact.shape[1] == 1:
                            fact = np.append(1 - fact, fact, axis=1)
                facts.append(fact)
            return facts
        else:
            fact = y
            if is_classifier(self.estimator):
                if binarize is True:
                    fact = self.label_binarizer_.transform(fact)
                    if fact.shape[1] == 1:
                        fact = np.append(1 - fact, fact, axis=1)
            return fact

    def quantify(
        self,
        fact = None,
        pred = None,
        loss_func = None,
        **kwargs
    ):
        if loss_func is None:
            if is_classifier(self.estimator):
                loss_func = "log_loss"
            else:
                loss_func == "mean_squared_error"
        
        if loss_func == "log_loss":
            def log_loss(fact, pred, **kwargs):
                eps = np.finfo(pred.dtype).eps
                pred = np.clip(pred, eps, 1 - eps)
                loss = -xlogy(fact, pred).sum(axis=1)
                return loss
            loss_func = log_loss
            
        if loss_func == "mean_squared_error":
            def mean_squared_error(fact, pred, **kwargs):
                loss = (fact - pred)**2
                return loss
            loss_func = mean_squared_error

        if fact is None and pred is None:
            if loss_func.__name__ in ["log_loss"]:
                facts = self.reply(binarize = True)
                preds = self.predict_proba()
            else:
                facts = self.reply()
                preds = self.predict()
            losses = []
            for fact, pred in zip(facts, preds):
                loss = loss_func(fact, pred, **kwargs)
                losses.append(loss)
            return losses
        else:
            loss = loss_func(fact, pred, **kwargs)
            return loss
    
    def sample(
        self,
        X = None):
        if X is None:
            rvs = []
            if is_classifier(self.estimator):
                preds = self.predict_proba()
                for pred in preds:
                    rv = np.apply_along_axis(
                        lambda pred_i: multinomial.rvs(
                            1, pred_i),
                        axis = 1,
                        arr = pred)
                    rv = self.label_binarizer_.inverse_transform(rv)
                    rvs.append(rv)
            else:
                preds = self.predict()
                facts = self.reply()
                for pred, fact in zip(preds, facts):
                    res = fact - pred
                    rv = pred + permutation(res)
                    rvs.append(rv)
            return rvs
        else:
            if is_classifier(self.estimator):
                pred = self.predict_proba(X)
                rv = np.apply_along_axis(
                    lambda pred_i: multinomial.rvs(
                        1, pred_i),
                    axis = 1,
                    arr = pred)
                rv = self.label_binarizer_.inverse_transform(rv)
            else:
                preds = self.predict()
                facts = self.reply()
                res = np.concatenate(
                    [fact - pred for fact, pred in zip(facts, preds)])
                pred = self.predict(X)
                rv = pred + choice(res, len(pred))
            return rv



In [68]:
from sklearn import datasets
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

X, y = datasets.load_diabetes(return_X_y=True)
X, y = X[1:150,], y[1:150]
lasso = Lasso(
    random_state=0, 
    max_iter=10000)
alphas = np.logspace(-4, -0.5, 30)
tuned_parameters = [{"alpha": alphas}]
n_folds = 5
rgs = GridSearchCV(lasso, tuned_parameters, cv=n_folds)
rkf = RepeatedKFold(n_splits = 5, n_repeats = 1)
learner = CrossEstimator(rgs, rkf)
_ = learner.fit(X, y)

In [70]:
learner.predict()

[array([ 81.79951213, 161.6800473 , 148.64169974, 134.95923814,
        110.52155513, 184.41442571,  94.2748343 , 202.97900428,
        158.91327041, 103.18708296, 127.08650977, 171.59914465,
        118.82741332,  92.43436574, 160.04057161,  87.86196425,
        215.55241897, 117.44655629, 181.81597251, 187.58215695,
        204.98778484, 193.61613087, 117.29315557, 192.62642813,
        217.65890634,  79.96491807, 214.75271249, 203.34423123,
        176.11130208, 109.66448643]),
 array([147.35470348, 101.59854541, 157.9727859 , 167.3328732 ,
        130.81452171, 207.30204143, 238.28949885,  88.67952429,
        135.66199525,  84.96584325, 223.82053542,  86.13172649,
        172.05258892,  75.81277023, 150.39327435,  43.34627297,
        195.78494798, 147.96725815, 164.71956196, 125.17577197,
        154.81333812, 156.55679726, 184.84453888,  93.90923675,
         74.11417455, 249.05918044, 237.29317963, 121.8149317 ,
        229.46646234, 188.27556924]),
 array([163.09279532, 135.13

In [64]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, RepeatedKFold
rkf = RepeatedKFold(n_splits = 5, n_repeats = 1)
iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC(probability = True)
clf = GridSearchCV(svc, parameters)
learner = CrossEstimator(clf, rkf)
_ = learner.fit(iris.data, iris.target)

In [67]:
learner.sample(iris.data)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [329]:
class CIT():
    def __init__(
        self, 
        learner,
        remover,
        loss_func = None
    ):
        self.learner = learner
        self.remover = remover
        self.loss_func = loss_func

    def infer(
        self
    ):
        learner_facts = learner.reply()
        learner_preds = learner.predict()
        

        

SyntaxError: incomplete input (2282326014.py, line 21)

In [16]:
learner.sample()

[array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
        2, 2, 2, 2, 1, 2, 2, 2]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 1, 2, 1,
        1, 2, 2, 2, 2, 2, 2, 2]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
        1, 0, 1, 2, 2, 2, 2, 2]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])]

In [390]:
learner.quantify()

[array([0.03047872, 0.03371168, 0.04622066, 0.04113181, 0.0199701 ,
        0.05687095, 0.04799187, 0.03460839, 0.05022425, 0.03405066,
        0.17229335, 0.25204089, 0.08572921, 0.34540535, 0.0699074 ,
        0.06635565, 0.03908081, 0.05198329, 0.05938119, 0.00350774,
        0.01521838, 0.04411749, 0.09087275, 0.01507744, 0.21357253,
        0.02099946, 0.09016782, 0.11047597, 0.02437719, 0.1494673 ]),
 array([0.03147789, 0.04726225, 0.03124575, 0.03744061, 0.08696101,
        0.0271521 , 0.02547536, 0.04119737, 0.03261583, 0.05335157,
        0.05107176, 0.15538328, 0.03916825, 0.05364395, 0.18464269,
        0.753597  , 0.02594374, 0.0583422 , 0.08509803, 0.0285936 ,
        0.05989299, 0.00355041, 0.01405706, 0.13744988, 0.02667314,
        0.5064949 , 0.01468881, 0.01553904, 0.00971281, 0.04256084]),
 array([0.0543839 , 0.02965617, 0.04939194, 0.02346165, 0.07150733,
        0.05849003, 0.05560787, 0.02689555, 0.06556507, 0.03648379,
        0.18724785, 0.11322675, 0.04016516, 

In [234]:
np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x)),
                axis=0,
                arr=learner.predict(iris.data),
            )

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [157]:
learner.get_losses(loss_func = "log_loss")

[array([0.05645564, 0.05238995, 0.03368536, 0.05384512, 0.05161804,
        0.04756854, 0.02234061, 0.02452637, 0.03387893, 0.07033476,
        0.07788535, 0.04866863, 0.1046384 , 0.06914687, 0.04841894,
        0.04495078, 0.07118952, 0.02451234, 0.06079586, 0.03506321,
        0.02836699, 0.08203701, 0.06065754, 0.04027421, 0.04471997,
        0.02299849, 0.00648436, 0.01445789, 0.00856427, 0.14839065,
        0.02044827, 0.02350801, 0.00827815, 0.01438462, 0.0348732 ,
        0.0050696 , 0.0394479 , 0.00631385, 0.02726235, 0.00327092,
        0.00959646, 0.08875323, 0.01586876, 0.00797441, 0.0354622 ,
        0.02767817, 0.01763332, 0.01074017, 0.12976665, 0.02277979]),
 array([0.02837371, 0.03534768, 0.03498755, 0.03756079, 0.03942477,
        0.06382299, 0.02598236, 0.01590016, 0.03017132, 0.02642951,
        0.01289047, 0.08525378, 0.07070279, 0.06377964, 0.06635004,
        0.01645138, 0.02134683, 0.04700615, 0.03857348, 0.02531727,
        0.09849957, 0.03774065, 0.0549704 , 0.