In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.metrics import get_scorer

In [3]:
from sial.crosser import Crosser
from sial.inferer import Inferer

In [3]:
def gen_xy(
    model,
    iv_corr,
    n_obs):
    """Simulate ten Gaussian predictors and a response with a known R^2.

    The first seven predictors are equicorrelated with pairwise correlation
    ``iv_corr``; the last three are independent standard normals. The error
    variance is set to ``1 - signal variance`` so that, under the assumed
    signal covariance, the response has unit variance.

    Parameters
    ----------
    model : str
        ``"linear"`` uses the first four predictors directly. Any other
        value selects the nonlinear signal built from ``x0``, ``x0**2``,
        ``x1**2`` and ``x2 * x3`` (the last three rescaled to unit variance).
    iv_corr : float
        Pairwise correlation among the first seven predictors.
    n_obs : int
        Number of observations to draw.

    Returns
    -------
    x : numpy.ndarray of shape (n_obs, 10)
        The simulated predictors.
    y : numpy.ndarray of shape (n_obs,)
        The simulated response.
    r2 : float
        Population R^2 implied by the design (``1 - error variance``).

    Raises
    ------
    ValueError
        If the signal variance implied by ``model`` and ``iv_corr`` exceeds
        1, which would require a negative error variance.
    """
    n_ivs = 10
    n_indep = 3
    n_corr = n_ivs - n_indep
    mean = np.zeros((n_ivs,))
    # Block-diagonal covariance: equicorrelated block + identity block.
    equicorr = (iv_corr * np.ones((n_corr, n_corr)) +
                (1 - iv_corr) * np.eye(n_corr))
    cov = np.block([[equicorr, np.zeros((n_corr, n_indep))],
                    [np.zeros((n_indep, n_corr)), np.eye(n_indep)]])
    x = np.random.multivariate_normal(
      mean = mean,
      cov = cov,
      size = n_obs)
    if model == "linear":
        coef = np.array([.1, .2, .3, .4]).reshape(4, -1)
        cov_signal = cov[0:4, 0:4]
        x_signal = x[:,0:4]
    else:
        coef = np.array([.3, .3, .3, .4]).reshape(4, -1)
        # Standard deviations of a squared standard normal and of the
        # product of two standard normals with correlation iv_corr.
        sd_quad = np.sqrt(2)
        sd_prod = np.sqrt(1 + iv_corr**2)
        # Correlations between the rescaled nonlinear terms.
        a = (2 * (iv_corr**2)) / (sd_quad * sd_quad)
        b = (2 * (iv_corr**2)) / (sd_quad * sd_prod)
        cov_signal = np.array(
            [[ 1.  ,  0.  , 0.  ,  0.  ],
             [ 0.  ,  1.  ,  a,  b],
             [0.  ,  a,  1.  ,  b],
             [ 0.  ,  b,  b,  1.  ]])
        # The squared terms are not centered, so y has a nonzero mean here.
        x_signal = np.concatenate(
            (x[:,0:1],
             (x[:,0:1]**2)  / sd_quad,
             (x[:,1:2]**2) / sd_quad,
             (x[:,2:3] * x[:,3:4]) / sd_prod),
            axis = 1)
    error_var = 1 - (coef.T @ cov_signal @ coef).item()
    # A negative error variance used to propagate silently as NaN noise
    # (np.sqrt of a negative number); fail loudly instead.
    if error_var < -1e-12:
        raise ValueError(
            "signal variance exceeds 1 for model=%r with iv_corr=%r; "
            "error variance would be %g" % (model, iv_corr, error_var))
    # Absorb floating-point round-off at the boundary (signal variance == 1).
    error_var = max(error_var, 0.0)
    error = np.random.normal(
      loc = 0.0,
      scale = np.sqrt(error_var),
      size = (n_obs, ))
    y = (x_signal @ coef).reshape(-1,) + error
    r2 = 1 - error_var
    return x, y, r2

In [4]:
# Draw a 200-observation sample from the linear design with pairwise
# predictor correlation 0.3; r2 is the design's population R^2.
X, y, r2 = gen_xy(model="linear", iv_corr=0.3, n_obs=200)
# Positional index of the predictor column that is later dropped from X
# (via np.delete) when fitting the reduced models.
removed_column = 1

In [142]:
# Location of the adult CSV file. The default is the original author's local
# path; set the ADULT_CSV_PATH environment variable to run the notebook on
# another machine without editing this cell.
adult_csv_path = os.environ.get(
    "ADULT_CSV_PATH", "C:/Users/tommy/OneDrive/桌面/adult.csv")
# Column names are supplied explicitly and only four columns are kept.
# NOTE(review): skiprows=1 discards the first line of the file; presumably
# that line is a header row -- if the file has no header this silently drops
# one observation. Confirm against the actual file.
df = pd.read_csv(
    adult_csv_path,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
        'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
        'hours-per-week', 'native-country', 'income'
    ],
    skiprows=1,
    usecols=['age', 'sex', 'race','income'])
print(df.head())

   age    race     sex income
0   50   White    male  <=50K
1   38   White    male  <=50K
2   53   Black    male  <=50K
3   28   Black  female  <=50K
4   37   White  female  <=50K


In [143]:
# Encode the income bracket as a 0/1 target. The dtype guard keeps this cell
# idempotent: re-applying the string->int map to an already-encoded column
# would replace every value with NaN.
if not pd.api.types.is_numeric_dtype(df['income']):
    df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})
# Positional index of the column later removed from X with np.delete. With the
# feature columns in the order age, race, sex, index 2 is 'sex'.
removed_column = 2

In [144]:
# Inspect the encoded target column.
print(df.loc[:, 'income'])

0        0
1        0
2        0
3        0
4        0
        ..
32555    0
32556    1
32557    0
32558    0
32559    1
Name: income, Length: 32560, dtype: int64


In [145]:
# Separate the predictors from the target. Column positions are looked up in
# the feature frame itself (not in df, which still contains 'income'), so the
# indices stay valid for X regardless of where the target column sits.
_feature_frame = df.drop('income', axis=1)
_feature_names = list(_feature_frame.columns)
X = _feature_frame.to_numpy()
y = df['income'].to_numpy()
numeric_features = [_feature_names.index('age')]
# Full model: both categorical predictors.
categorical_features_1 = [_feature_names.index('race'), _feature_names.index('sex')]
# Reduced model: 'race' only. NOTE(review): this index is applied to X after
# one column has been deleted, so it is only correct while the deleted column
# comes after 'race' -- confirm if removed_column changes.
categorical_features_2 = [_feature_names.index('race')]

In [147]:
# 0/1 encoding of the 'sex' column (female -> 1, male -> 0); used below as
# the target when fitting the sampler.
X_new = (
    df['sex']
    .map({'female': 1, 'male': 0})
    .to_numpy()
)
print(X_new)

[0 0 0 ... 1 0 1]


In [139]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold
# Two column transformers that differ only in which categorical columns are
# one-hot encoded: preprocessor_1 uses categorical_features_1 and
# preprocessor_2 uses categorical_features_2. Numeric columns are
# standardized in both. Each transformer gets its own scaler/encoder objects.
preprocessor_1, preprocessor_2 = (
    ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(), categorical_columns),
        ])
    for categorical_columns in (categorical_features_1, categorical_features_2)
)

In [131]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, RepeatedKFold, ShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
# Outer resampling scheme shared by every Crosser below: 4-fold CV repeated
# twice, with a fixed seed so the splits are reproducible.
splitter = RepeatedKFold(n_splits=4, n_repeats=2, random_state=0)

In [148]:
def _make_forest_crosser(preprocessor):
    """Build an unfitted Crosser around a grid-searched random-forest pipeline.

    The pipeline applies ``preprocessor`` and then a RandomForestRegressor
    (each tree fit on half of the samples). An inner 5-fold GridSearchCV
    tunes ``max_features``; the Crosser wraps it with the notebook-level
    ``splitter`` and scores with negative mean squared error.

    Parameters
    ----------
    preprocessor : transformer
        First pipeline step (one of the ColumnTransformers defined above).

    Returns
    -------
    Crosser
        A new, unfitted Crosser instance.
    """
    pipeline = Pipeline(
        [('preprocessor', preprocessor),
         ('estimator', RandomForestRegressor(max_samples = .5))])
    # NOTE(review): the candidate values 3/6/9 may exceed the number of
    # columns produced by the preprocessor -- confirm they are still
    # meaningful for this data.
    search = GridSearchCV(
        estimator = pipeline,
        param_grid = {
            "estimator__max_features": [3, 6, 9]},
        cv = 5)
    return Crosser(
        search,
        cv = splitter,
        scoring = "neg_mean_squared_error")


# learner: predicts y from the full X.
learner = _make_forest_crosser(preprocessor_1)
# sampler: predicts the removed column's values (X_new) from the reduced X.
sampler = _make_forest_crosser(preprocessor_2)
# competitor: predicts y from the reduced X.
competitor = _make_forest_crosser(preprocessor_2)

_ = learner.fit(X, y)
_ = sampler.fit(
    np.delete(X, removed_column, axis = 1), X_new)
_ = competitor.fit(
    np.delete(X, removed_column, axis = 1), y)

In [149]:
# Show the learner's scores without combining them.
learner.summarize(cross_fit=True, combine=False)

Estimator: GridSearchCV
Cross-Validator: RepeatedKFold (n_repeats=2, n_folds=4)
Scoring Function: Neg Mean Squared Error (reverse=False)


Unnamed: 0_level_0,val_score,train_score,test_score
repeat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.146317,-0.151055,-0.155429
1,0.146511,-0.15103,-0.155679


In [150]:
# Run the "CRT" algorithm using the learner and the sampler.
# NOTE(review): the recorded output of this cell is an UnboundLocalError on
# 'removed_column' -- confirm that the sampler's inputs are
# what Inferer expects before relying on this cell.
crt = Inferer(learner, sampler, "CRT")
_ = crt.infer()
crt.summarize(cross_fit=True)

UnboundLocalError: cannot access local variable 'removed_column' where it is not associated with a value

In [105]:
# Run the "CPI" algorithm with normality-based inference and a single copy.
# NOTE(review): the recorded output of this cell is an UnboundLocalError on
# 'removed_column' -- confirm the sampler setup before relying on this cell.
cpi = Inferer(learner, sampler, "CPI", infer_type="normality", n_copies=1)
_ = cpi.infer()
cpi.summarize(cross_fit=True)

UnboundLocalError: cannot access local variable 'removed_column' where it is not associated with a value

In [106]:
# Run the "CPI" algorithm with permutation-based inference and 100 copies.
# NOTE(review): the recorded output of this cell is an UnboundLocalError on
# 'removed_column' -- confirm the sampler setup before relying on this cell.
cpi = Inferer(learner, sampler, "CPI", infer_type="permutation", n_copies=100)
_ = cpi.infer()
cpi.summarize(cross_fit=True)

UnboundLocalError: cannot access local variable 'removed_column' where it is not associated with a value

In [11]:
# Run the "PIE" algorithm (learner vs. competitor) with normality-based
# inference and show the per-split summary.
# NOTE(review): the recorded output reports 50 observations per fold, which
# matches the 200-row simulated data rather than the CSV data -- it looks
# stale; re-run to refresh.
pie = Inferer(learner, competitor, "PIE", infer_type="normality")
_ = pie.infer()
pie.summarize()

Algorithm: PIE (double_split=True, perturb_size=None)
Inference Type: Normality (n_copies=None, n_permutations=None)
Loss Function: Mean Squared Error (reverse=False)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,size,estimate,std_error,p_value
split,repeat,fold,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0,50,-0.303159,0.27295,0.133355
1,0,1,50,-0.479834,0.389755,0.10914
2,0,2,50,0.06911,0.168031,0.659571
3,0,3,50,-0.728386,0.291135,0.006177
4,1,0,50,-0.140964,0.203148,0.243874
5,1,1,50,0.016049,0.328007,0.519512
6,1,2,50,-0.309677,0.399886,0.219343
7,1,3,50,0.04228,0.190848,0.587662


In [15]:
# Run the "PIE" algorithm (learner vs. competitor) with permutation-based
# inference and show the combined summary.
# NOTE(review): the recorded output reports a total size of 200, which matches
# the simulated data rather than the CSV data -- it looks stale; re-run to
# refresh.
pie = Inferer(learner, competitor, "PIE", infer_type="permutation")
_ = pie.infer()
pie.summarize(cross_fit=True, combine=True)

Algorithm: PIE (double_split=True, perturb_size=None)
Inference Type: Permutation (n_copies=None, n_permutations=2000)
Loss Function: Mean Squared Error (reverse=False)


Unnamed: 0_level_0,size,estimate,std_error,p_value
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
gmean,200.0,-0.229322,0.143393,0.127615
median,200.0,-0.229322,0.143393,0.2835
q1,200.0,-0.229322,0.143393,0.2755
min,200.0,-0.229322,0.143393,0.016
hmean,200.0,-0.229322,0.143393,0.029296
hommel,200.0,-0.229322,0.143393,0.024
cauchy,200.0,-0.229322,0.143393,0.015655
