In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
import numbers
from numpy import random 
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.metrics import get_scorer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import clone
from joblib import Parallel, delayed
import itertools

In [2]:
def gen_xy(
    model,
    iv_corr,
    n_obs):
    """Simulate predictors `x` and a response `y` with a known signal share.

    Ten predictors are drawn from a zero-mean multivariate normal with unit
    variances: the first seven are equicorrelated with correlation `iv_corr`,
    the last three are independent of everything else.

    Parameters
    ----------
    model : str
        "linear" builds the signal from x[:, 0:4] with coefficients
        (.1, .2, .3, .4). Any other value selects the nonlinear signal
        x0, x0**2 / sqrt(2), x1**2 / sqrt(2), x2 * x3 / sqrt(1 + iv_corr**2)
        with coefficients (.3, .3, .3, .4).
    iv_corr : float
        Common correlation among the first seven predictors.
    n_obs : int
        Number of observations to draw.

    Returns
    -------
    x : ndarray of shape (n_obs, 10)
    y : ndarray of shape (n_obs,)
    r2 : float
        1 - error_var, where error_var = 1 - coef' cov_signal coef.

    Raises
    ------
    ValueError
        If the implied error variance is negative (the signal variance
        exceeds 1), which would otherwise yield an all-NaN `y`.

    Notes
    -----
    Draws from NumPy's global random state: first `x`, then the errors.
    """
    n_ivs = 10
    n_indep = 3
    n_corr = n_ivs - n_indep
    mean = np.zeros((n_ivs,))
    # Block-diagonal covariance: equicorrelated block + identity block.
    cov = np.block(
        [[(iv_corr * np.ones((n_corr, n_corr)) +
           (1 - iv_corr) * np.eye(n_corr)), np.zeros((n_corr, n_indep))],
         [np.zeros((n_indep, n_corr)), np.eye(n_indep)]])
    x = random.multivariate_normal(
      mean = mean, 
      cov = cov, 
      size = n_obs)
    if model == "linear":
        coef = np.array([.1, .2, .3, .4]).reshape(4, -1)
        cov_signal = cov[0:4, 0:4]
        x_signal = x[:,0:4]
    else:
        coef = np.array([.3, .3, .3, .4]).reshape(4, -1)
        # Scaling constants that give the squared and product terms unit
        # variance under the normal model above.
        sd_quad = np.sqrt(2)
        sd_prod = np.sqrt(1 + iv_corr**2)
        # Correlations between the scaled terms (quad-quad and quad-product).
        a = (2 * (iv_corr**2)) / (sd_quad * sd_quad)
        b = (2 * (iv_corr**2)) / (sd_quad * sd_prod)
        cov_signal = np.array(
            [[1., 0., 0., 0.],
             [0., 1., a, b],
             [0., a, 1., b],
             [0., b, b, 1.]])
        x_signal = np.concatenate(
            (x[:,0:1], 
             (x[:,0:1]**2)  / sd_quad, 
             (x[:,1:2]**2) / sd_quad,
             (x[:,2:3] * x[:,3:4]) / sd_prod), 
            axis = 1)
    error_var = 1 - (coef.T @ cov_signal @ coef).item()
    if error_var < 0:
        # np.sqrt of a negative variance would silently produce NaN errors.
        raise ValueError(
            "Signal variance exceeds 1 for iv_corr = " + str(iv_corr) +
            "; the implied error variance is negative.")
    error = random.normal(
      loc = 0.0, 
      scale = np.sqrt(error_var), 
      size = (n_obs, ))
    y = (x_signal @ coef).reshape(-1,) + error
    r2 = 1 - error_var
    return x, y, r2


In [61]:
class RPT():
    """Permutation test of feature relevance built on permuted residuals.

    For every feature listed in `samplers`, a sampler predicts a dependent
    variable from the remaining columns of `x`; the test-split residuals of
    that prediction are permuted `n_samples` times and added back to the
    prediction to create "null" data. The metric on the null data is then
    compared with the metric on the observed data.

    Parameters
    ----------
    learner : estimator
        Object with `fit` and `predict` methods.
    samplers : dict
        Maps a column index of `x` to an estimator, or to None. With None the
        sampler prediction is taken to be zero, so the raw values are permuted.
    metric : callable
        Called as `metric(y_true, y_pred)` and must return a scalar score.
    mode : {"x", "y"}
        "x": the dependent variable of the sampler is the feature column; the
        permuted column replaces it in `x` and the learner predicts again.
        "y": the dependent variable is `y`; the permuted response is scored
        against the learner's unchanged predictions.
    splitter : None, int, float or object with `split`
        None uses all rows for both training and testing; an integer builds a
        shuffled `KFold`; another real number builds a single `ShuffleSplit`
        with that `test_size`; anything else is used as given.
    n_samples : int
        Number of permutations per feature and split.
    n_jobs : int or None
        Forwarded to `joblib.Parallel`.
    random_state : int or None
        Forwarded to the built splitter and used to seed the permutations.
    greater_is_better : bool
        Whether larger metric values indicate better predictions.
    fit_learner, fit_samplers : bool
        When True the estimator is cloned and fitted on each training split.
        When False the supplied estimator is used as given and is therefore
        expected to be fitted already.

    Raises
    ------
    TypeError
        If `learner` lacks `fit` or `predict`.
    ValueError
        If `mode` is not "x" or "y".
    """

    _MODES = ("x", "y")

    def __init__(
        self,
        learner,
        samplers,
        metric,
        mode = "x",
        splitter = 0.5,
        n_samples = 1000,
        n_jobs = None,
        random_state = None,
        greater_is_better = False,
        fit_learner = True,
        fit_samplers = True):

        if not (hasattr(learner, "fit") and hasattr(learner, "predict")):
            raise TypeError("`learner` must have `fit` and `predict` methods.")

        # Fail early: an unknown mode would otherwise only surface inside
        # `infer` as an unbound local variable.
        if mode not in self._MODES:
            raise ValueError("`mode` must be either \"x\" or \"y\".")

        if splitter is not None:
            if isinstance(splitter, numbers.Integral):
                splitter = KFold(
                    n_splits = splitter, 
                    shuffle = True, 
                    random_state = random_state)
            elif isinstance(splitter, numbers.Real):
                splitter = ShuffleSplit(
                    n_splits = 1, 
                    test_size = splitter, 
                    random_state = random_state)
        
        self.learner = learner
        self.samplers = samplers
        self.metric = metric
        self.greater_is_better = greater_is_better
        self.mode = mode
        self.splitter = splitter
        self.n_samples = n_samples
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.fit_learner = fit_learner
        self.fit_samplers = fit_samplers
    
    def infer(
        self, x, y):
        """Compute target and null scores for every split and feature.

        Parameters
        ----------
        x : array-like of shape (n_obs, n_features)
        y : array-like of shape (n_obs,)

        The results are stored on the instance (`_learner_dict`,
        `_samplers_dict`, `_target_score_dict`, `_null_scores_dict`), keyed by
        split number and, where applicable, by feature.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if self.splitter is None:
            all_index = np.arange(len(x))
            # A list holding ONE (train, test) pair; a bare tuple would be
            # iterated as two separate "splits".
            split_indices = [(all_index, all_index)]
        else:
            split_indices = self.splitter.split(x)
                
        def cal_null_score_x(
            learner, 
            metric, 
            x_test, 
            y_test, 
            feature,
            dv_pred_test, 
            dv_res_test,
            seed):
            # Replace one column by prediction + permuted residual, re-predict.
            rng = np.random.RandomState(seed)
            x_perm = np.copy(x_test)
            x_perm[:, feature] = dv_pred_test + rng.permutation(dv_res_test)
            y_perm = learner.predict(x_perm)
            null_score = metric(y_test, y_perm)
            return null_score

        def cal_null_score_y(
            metric, 
            y_pred_test, 
            dv_pred_test, 
            dv_res_test,
            seed):
            # Score a permuted response against the unchanged predictions.
            rng = np.random.RandomState(seed)
            y_perm = dv_pred_test + rng.permutation(dv_res_test)
            null_score = metric(y_perm, y_pred_test)
            return null_score
        
        learner_dict = {}
        samplers_dict = {}
        target_score_dict = {}
        null_scores_dict = {}
        for i, (train_index, test_index) in enumerate(split_indices):
            x_train, y_train = x[train_index, :], y[train_index]
            x_test, y_test = x[test_index, :], y[test_index]
            if self.fit_learner:
                learner_dict[i] = clone(self.learner)
                _ = learner_dict[i].fit(x_train, y_train)
            else:
                # `clone` would return an unfitted copy, so keep the
                # caller's (pre-fitted) estimator.
                learner_dict[i] = self.learner
            y_pred_test = learner_dict[i].predict(x_test)
            target_score_dict[i] = self.metric(
                y_test, y_pred_test)

            samplers_dict[i] = {}
            null_scores_dict[i] = {}
            # Re-created for every split, so a fixed `random_state` yields the
            # same permutation seeds in each split.
            rng = np.random.RandomState(self.random_state)
            for feature, sampler in self.samplers.items():
                iv_train = np.delete(x_train, feature, 1)
                iv_test = np.delete(x_test, feature, 1)
                if self.mode == "x":
                    dv_train = x_train[:, feature]
                    dv_test = x_test[:, feature]
                elif self.mode == "y":
                    dv_train = y_train
                    dv_test = y_test

                if sampler is None:
                    samplers_dict[i][feature] = None
                    dv_pred_test = np.zeros_like(dv_test)
                else:
                    if self.fit_samplers:
                        samplers_dict[i][feature] = clone(sampler)
                        _ = samplers_dict[i][feature].fit(iv_train, dv_train)
                    else:
                        # Same reasoning as for the learner above.
                        samplers_dict[i][feature] = sampler
                    dv_pred_test = samplers_dict[i][feature].predict(iv_test)
                dv_res_test = dv_test - dv_pred_test
                
                # Explicit 64-bit dtype: the bound does not fit a 32-bit
                # default integer.
                seeds = rng.randint(
                    2**32 - 1, 
                    size = self.n_samples,
                    dtype = np.int64)
                if self.mode == "x":
                    null_scores_dict[i][feature] = np.array(
                        Parallel(n_jobs = self.n_jobs)(
                            delayed(cal_null_score_x)(
                                learner_dict[i], 
                                self.metric,
                                x_test, 
                                y_test, 
                                feature,
                                dv_pred_test, 
                                dv_res_test,
                                seed) for seed in seeds))
                elif self.mode == "y":
                    null_scores_dict[i][feature] = np.array(
                        Parallel(n_jobs = self.n_jobs)(
                            delayed(cal_null_score_y)(
                                self.metric,
                                y_pred_test,
                                dv_pred_test, 
                                dv_res_test,
                                seed) for seed in seeds))
        self._learner_dict = learner_dict
        self._samplers_dict = samplers_dict
        self._target_score_dict = target_score_dict
        self._null_scores_dict = null_scores_dict

    def all_scores(
        self):
        """Return all scores from the last `infer` call in long format.

        Returns
        -------
        DataFrame with columns "split", "feature", "target_score", "sample"
        and "null_score"; one row per split, feature and permutation.
        """
        def tidy(
            split, 
            target_score, 
            null_scores):
            # Wide (feature x sample) table -> one row per permutation.
            df = pd.DataFrame.from_dict(
                null_scores, 
                orient = "index")
            df["split"] = split
            df["target_score"] = target_score
            df.index.name = "feature"
            df = df.reset_index()
            df = df.melt(
                id_vars = ["split", "feature", "target_score"], 
                var_name='sample', 
                value_name='null_score')
            return df

        all_scores = pd.concat(
            [tidy(split, target_score, null_scores) 
             for (split, target_score), null_scores in zip(
                 self._target_score_dict.items(), 
                 self._null_scores_dict.values())],
            ignore_index = True)
        return all_scores

    def _diff_scores(
        self):
        """Return `all_scores()` plus "diff_score", oriented so that positive
        values mean the null data scored worse than the observed data."""
        scores = self.all_scores()
        if self.greater_is_better:
            scores["diff_score"] = scores["target_score"] - scores["null_score"]
        else:
            scores["diff_score"] = scores["null_score"] - scores["target_score"]
        return scores
        
    def pfi(
        self):
        """Return the mean diff score per feature (over splits and
        permutations) as a Series indexed by feature."""
        scores = self._diff_scores()
        pfi = scores.groupby("feature")["diff_score"].mean()
        pfi.name = None
        return pfi

    def pvalue(
        self, 
        aggregate = None):
        """Return permutation p-values from the last `infer` call.

        A p-value is the share of permutations whose null score is at least
        as good as the target score. Ties count towards the p-value, so a
        feature whose permutation never changes the score gets p = 1.

        Parameters
        ----------
        aggregate : None, "bonferroni", "average" or "mimic"
            None: one p-value per feature if there is a single split,
            otherwise one per (split, feature).
            "bonferroni": min(1, n_splits * smallest per-split p-value).
            "average": min(1, 2 * mean of the per-split p-values).
            "mimic": diff scores are summed over splits for each permutation
            number before the share is computed.

        Returns
        -------
        Series indexed by feature, or by (split, feature).

        Raises
        ------
        ValueError
            If `aggregate` is not one of the values above.
        """
        if aggregate not in (None, "bonferroni", "average", "mimic"):
            raise ValueError("Value of `aggregate` is unrecognized.")
        n_splits = len(self._learner_dict)
        scores = self._diff_scores()

        if aggregate == "mimic":
            summed = scores.groupby(
                ["feature", "sample"])["diff_score"].sum()
            pvalue = (summed <= 0).groupby(level = "feature").mean()
        else:
            scores["not_worse"] = scores["diff_score"] <= 0
            if aggregate is None and n_splits == 1:
                pvalue = scores.groupby("feature")["not_worse"].mean()
            else:
                per_split = scores.groupby(
                    ["split", "feature"])["not_worse"].mean()
                if aggregate is None:
                    pvalue = per_split
                elif aggregate == "bonferroni":
                    pvalue = per_split.groupby(
                        level = "feature").min().mul(n_splits).clip(upper = 1)
                else:
                    pvalue = per_split.groupby(
                        level = "feature").mean().mul(2).clip(upper = 1)
        pvalue.name = None
        return pvalue
        

In [62]:
def do_one(
    model, 
    iv_corr, 
    n_obs, 
    rep):
    """Run one simulation replication and return its p-values as a DataFrame.

    One data set is generated with `gen_xy`; for each algorithm
    (LR, KNN, MLP, DT, RF) and each mode label ("x", "y", "vanilla") an `RPT`
    is run on features 0, 1, 2, 3, 4 and 7. "vanilla" runs RPT in mode "x"
    with all samplers set to None. The returned frame has one row per
    (algorithm, mode label) with the condition columns followed by one
    p-value column per feature, named by the 1-based feature number.

    Reads the module-level names `n_splits` and `n_jobs`.
    """
    features = [0, 1, 2, 3, 4, 7]

    def tuned(estimator, param_grid):
        # Grid search sharing the notebook-wide CV and parallelism settings.
        return GridSearchCV(
            estimator = estimator,
            param_grid = param_grid,
            cv = n_splits,
            n_jobs = n_jobs)

    searchers = {
        "LR": LinearRegression(),
        "KNN": tuned(
            KNeighborsRegressor(),
            {"n_neighbors": [5, 10, 15, 20]}),
        "MLP": tuned(
            MLPRegressor(max_iter = 5000),
            {"hidden_layer_sizes": [(8,), (16,), (32,), (64,)]}),
        "DT": tuned(
            DecisionTreeRegressor(),
            {"min_samples_leaf": [1, 2, 4, 8]}),
        "RF": tuned(
            RandomForestRegressor(),
            {"max_features": [3, 6, 9]})}

    x, y, _ = gen_xy(model, iv_corr, n_obs)

    columns = (
        ["Model", "Corr", "Sample Size", "Rep", "Algorithm", "Mode"] +
        [str(feature + 1) for feature in features])

    rows = []
    for algorithm, searcher in searchers.items():
        for label in ("x", "y", "vanilla"):
            is_vanilla = (label == "vanilla")
            learner = clone(searcher)
            if is_vanilla:
                samplers = {feature: None for feature in features}
            else:
                samplers = {feature: clone(searcher) for feature in features}
            rpt = RPT(
                learner,
                samplers,
                metric = mean_squared_error,
                # "vanilla" is reported under its own label but runs as "x".
                mode = "x" if is_vanilla else label,
                n_jobs = n_jobs)
            rpt.infer(x, y)
            rows.append(
                [model, iv_corr, n_obs, rep, algorithm, label] +
                list(rpt.pvalue()))
    return pd.DataFrame(rows, columns = columns)

In [None]:
# Simulation design: every (model, correlation, sample size) condition is
# replicated `n_reps` times and written to its own CSV file.
models = ["linear", "nonlinear"]
iv_corrs = [.3]
n_obss = [100, 200, 400, 800]
n_splits = 5  # read as a global by do_one (cv folds of the grid searches)
n_jobs = 8    # read as a global by do_one
n_reps = 1000
output_dir = "../results_i/"
# Seeds NumPy's global generator only. do_one builds RPT without a
# `random_state`, so RPT's permutation seeds are not controlled by this.
np.random.seed(46)
# Create the target directory up front so a missing folder cannot discard
# a finished condition when it is written out.
os.makedirs(output_dir, exist_ok = True)
start_time = datetime.now()
for model in models:
    for iv_corr in iv_corrs:
        for n_obs in n_obss:
            print("Model:", model, "/", "Corr:", iv_corr, "/", "Sample Size:", n_obs)
            now = datetime.now()
            start_time_i = now.strftime("%Y-%m-%d %H:%M:%S")
            print("Starting Time =", start_time_i)
            result = [do_one(
                model, iv_corr, n_obs, rep) for rep in range(n_reps)]
            result = pd.concat(result, ignore_index = True)
            file_name = ("result_" + str(model) + 
                          "_" + str(iv_corr) +
                         "_" + str(n_obs) + ".csv")
            result.to_csv(os.path.join(output_dir, file_name), index = False)
            now = datetime.now()
            end_time_i = now.strftime("%Y-%m-%d %H:%M:%S")
            print("Ending Time =", end_time_i)
            print("----------------------------------------------")
end_time = datetime.now()
delta_time = end_time - start_time
print("Simulation Finished!!!!!")
# total_seconds() includes whole days; `.seconds` alone wraps every 24 hours.
print("Total Time (in Seconds) =", int(delta_time.total_seconds()))

Model: linear / Corr: 0.4 / Sample Size: 100
Starting Time = 2023-12-15 08:39:56
