In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import get_scorer

In [2]:
from sial.crosser import Crosser
from sial.inferer import Inferer

In [3]:
def gen_xy(
    model,
    iv_corr,
    n_obs,
    random_state = None):
    """Simulate a Gaussian design matrix and a response with a known R^2.

    Ten predictors are drawn from a zero-mean multivariate normal. The first
    seven have unit variance and a common pairwise correlation ``iv_corr``;
    the last three are independent standard normals.

    Parameters
    ----------
    model : str
        ``"linear"`` builds the signal from the first four predictors with
        coefficients (.1, .2, .3, .4). Any other value selects the
        non-linear signal: ``x0``, ``x0**2 / sqrt(2)``, ``x1**2 / sqrt(2)``
        and ``x2 * x3 / sqrt(1 + iv_corr**2)`` with coefficients
        (.3, .3, .3, .4). The scaled terms are not mean-centred.
    iv_corr : float
        Common correlation among the first seven predictors.
    n_obs : int
        Number of rows to draw.
    random_state : optional
        Seed handed to ``np.random.RandomState``. When ``None`` (default) the
        global ``np.random`` state is used, exactly as before this parameter
        existed, so ``np.random.seed`` keeps working.

    Returns
    -------
    x : ndarray of shape (n_obs, 10)
    y : ndarray of shape (n_obs,)
    r2 : float
        ``1 - error_var``, the share of variance carried by the signal under
        the assumed signal covariance.

    Raises
    ------
    ValueError
        If the signal variance exceeds 1, which would require a negative
        error variance (this happens for the non-linear model when
        ``iv_corr`` is close to 1).
    """
    n_ivs = 10
    n_indep = 3                     # trailing, mutually independent predictors
    n_corr = n_ivs - n_indep        # leading, equicorrelated predictors
    # Fall back to the module-level generator so unseeded calls behave as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    mean = np.zeros((n_ivs,))
    corr_block = (iv_corr * np.ones((n_corr, n_corr)) +
                  (1 - iv_corr) * np.eye(n_corr))
    cov = np.block([[corr_block, np.zeros((n_corr, n_indep))],
                    [np.zeros((n_indep, n_corr)), np.eye(n_indep)]])
    x = rng.multivariate_normal(
      mean = mean,
      cov = cov,
      size = n_obs)

    if model == "linear":
        coef = np.array([.1, .2, .3, .4]).reshape(4, -1)
        cov_signal = cov[0:4, 0:4]
        x_signal = x[:,0:4]
    else:
        coef = np.array([.3, .3, .3, .4]).reshape(4, -1)
        # Scale factors applied to the squared and product terms.
        sd_quad = np.sqrt(2)
        sd_prod = np.sqrt(1 + iv_corr**2)
        # Off-diagonal entries of the assumed signal correlation matrix.
        a = (2 * (iv_corr**2)) / (sd_quad * sd_quad)
        b = (2 * (iv_corr**2)) / (sd_quad * sd_prod)
        cov_signal = np.array(
            [[ 1.  ,  0.  , 0.  ,  0.  ],
             [ 0.  ,  1.  ,  a,  b],
             [0.  ,  a,  1.  ,  b],
             [ 0.  ,  b,  b,  1.  ]])
        x_signal = np.concatenate(
            (x[:,0:1],
             (x[:,0:1]**2)  / sd_quad,
             (x[:,1:2]**2) / sd_quad,
             (x[:,2:3] * x[:,3:4]) / sd_prod),
            axis = 1)

    error_var = 1 - (coef.T @ cov_signal @ coef).item()
    if error_var < -1e-12:
        # Previously np.sqrt returned NaN here and y came back all-NaN.
        raise ValueError(
            "signal variance exceeds 1 for model=%r, iv_corr=%r; "
            "error variance would be negative (%g)" % (model, iv_corr, error_var))
    # Absorb floating-point round-off such as -2e-16.
    error_var = max(error_var, 0.0)

    error = rng.normal(
      loc = 0.0,
      scale = np.sqrt(error_var),
      size = (n_obs, ))
    y = (x_signal @ coef).reshape(-1,) + error
    r2 = 1 - error_var
    return x, y, r2

In [4]:
# Seed NumPy's global generator before simulating: gen_xy draws from
# np.random, so without this every Restart & Run All yields different data.
np.random.seed(0)
X, y, r2 = gen_xy(
        model = "linear",
        iv_corr = .3,
        n_obs= 200)
# Column index that is dropped from X when fitting the sampler and the
# competitor further down.
removed_column = 1

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, RepeatedKFold, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
# Resampling scheme handed to every Crosser below: 4 folds repeated twice,
# with a fixed random_state for the fold assignment.
splitter = RepeatedKFold(n_splits=4, n_repeats=2, random_state=0)

In [6]:
def build_forest_crosser(cv):
    """Return a new, unfitted Crosser around a grid-searched random forest.

    The wrapped estimator is a GridSearchCV (inner ``cv = 5``) over
    ``max_features`` in {3, 6, 9} for a pipeline that standardises the
    inputs and then fits a RandomForestRegressor with ``max_samples = .5``.
    Each call builds fresh estimator objects, so the returned Crossers do not
    share any fitted state.

    Parameters
    ----------
    cv : cross-validation splitter
        Passed through to ``Crosser(cv = ...)``.
    """
    pipeline = Pipeline(
        [('scaler', StandardScaler()),
         ('estimator', RandomForestRegressor(max_samples = .5))])
    search = GridSearchCV(
        estimator = pipeline,
        param_grid = {
            "estimator__max_features": [3, 6, 9]},
        cv = 5)
    return Crosser(
        search,
        cv = cv,
        scoring = "neg_mean_squared_error")


# Three identically configured models; they differ only in what they are
# fitted on below.
learner = build_forest_crosser(splitter)
sampler = build_forest_crosser(splitter)
competitor = build_forest_crosser(splitter)

# learner: y from all columns of X.
_ = learner.fit(X, y)
# sampler: the removed column from the remaining columns.
_ = sampler.fit(
    np.delete(X, removed_column, axis = 1), X[:,removed_column])
# competitor: y from the remaining columns only.
_ = competitor.fit(
    np.delete(X, removed_column, axis = 1), y)

In [13]:
# Show the learner's score table without combining across splits; the
# recorded output lists val/train/test scores for each (repeat, fold).
learner.summarize(cross_fit=True, combine=False)

Estimator: GridSearchCV
Cross-Validator: RepeatedKFold (n_repeats=2, n_folds=4)
Scoring Function: Neg Mean Squared Error (reverse=False)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,val_score,train_score,test_score
split,repeat,fold,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0.48215,-0.219112,-0.625548
1,0,1,0.477243,-0.208878,-0.614662
2,0,2,0.33343,-0.232024,-0.396775
3,0,3,0.412214,-0.211172,-0.672896
4,1,0,0.452686,-0.228587,-0.531586
5,1,1,0.457539,-0.225104,-0.578624
6,1,2,0.436766,-0.213757,-0.78569
7,1,3,0.415411,-0.211571,-0.530636


In [8]:
# "CRT" algorithm on the learner paired with the sampler (the model fitted to
# the removed column). Other Inferer options are left at their defaults.
# NOTE(review): "CRT" presumably stands for conditional randomization test —
# confirm against the sial documentation.
crt = Inferer(learner, sampler, "CRT")
_ = crt.infer()
crt.summarize(cross_fit=True)

Algorithm: CRT (double_split=None, perturb_size=None)
Inference Type: Randomization (n_copies=2000, n_permutations=None)
Loss Function: Mean Squared Error (reverse=False)


Unnamed: 0_level_0,size,estimate,std_error,p_value
repeat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,200,-0.045319,0.014152,0.0005
1,200,-0.047031,0.014868,0.0005


In [9]:
# "CPI" algorithm on learner + sampler, using normality-based inference with
# a single copy (n_copies = 1).
cpi = Inferer(learner, sampler, "CPI", infer_type="normality", n_copies=1)
_ = cpi.infer()
cpi.summarize(cross_fit=True)

Algorithm: CPI (double_split=None, perturb_size=None)
Inference Type: Normality (n_copies=1, n_permutations=None)
Loss Function: Mean Squared Error (reverse=False)


Unnamed: 0_level_0,size,estimate,std_error,p_value
repeat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,200,-0.035907,0.01988,0.035445
1,200,-0.026301,0.019657,0.090442


In [10]:
# "CPI" again, this time with permutation-based inference and 100 copies.
# This rebinds the name `cpi` used by the previous cell.
cpi = Inferer(learner, sampler, "CPI", infer_type="permutation", n_copies=100)
_ = cpi.infer()
cpi.summarize(cross_fit=True)

Algorithm: CPI (double_split=None, perturb_size=None)
Inference Type: Permutation (n_copies=100, n_permutations=2000)
Loss Function: Mean Squared Error (reverse=False)


Unnamed: 0_level_0,size,estimate,std_error,p_value
repeat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,200,-0.047754,0.014962,0.001
1,200,-0.046491,0.014992,0.0005


In [11]:
# "PIE" algorithm comparing the learner with the competitor (the model fitted
# without the removed column), using normality-based inference.
# summarize() is called with its defaults; the recorded output has one row
# per (repeat, fold).
pie = Inferer(learner, competitor, "PIE", infer_type="normality")
_ = pie.infer()
pie.summarize()

Algorithm: PIE (double_split=True, perturb_size=None)
Inference Type: Normality (n_copies=None, n_permutations=None)
Loss Function: Mean Squared Error (reverse=False)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,size,estimate,std_error,p_value
split,repeat,fold,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,50,-0.303159,0.27295,0.133355
1,0,1,50,-0.479834,0.389755,0.10914
2,0,2,50,0.06911,0.168031,0.659571
3,0,3,50,-0.728386,0.291135,0.006177
4,1,0,50,-0.140964,0.203148,0.243874
5,1,1,50,0.016049,0.328007,0.519512
6,1,2,50,-0.309677,0.399886,0.219343
7,1,3,50,0.04228,0.190848,0.587662


In [12]:
# "PIE" with permutation-based inference; rebinds `pie` from the previous
# cell. With combine=True the recorded output shows one row per combining
# method (gmean, median, q1, min, hmean, hommel, cauchy).
pie = Inferer(learner, competitor, "PIE", infer_type="permutation")
_ = pie.infer()
pie.summarize(combine=True, cross_fit=True)

Algorithm: PIE (double_split=True, perturb_size=None)
Inference Type: Permutation (n_copies=None, n_permutations=2000)
Loss Function: Mean Squared Error (reverse=False)


Unnamed: 0_level_0,size,estimate,std_error,p_value
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gmean,200.0,-0.229322,0.143954,0.107878
median,200.0,-0.229322,0.143954,0.2685
q1,200.0,-0.229322,0.143954,0.2625
min,200.0,-0.229322,0.143954,0.012
hmean,200.0,-0.229322,0.143954,0.022105
hommel,200.0,-0.229322,0.143954,0.018
cauchy,200.0,-0.229322,0.143954,0.01179
