In [1]:
# column names of the synthetic feature table, in column order:
# three single-feature terms, three interaction features, then ten noise features
feature_names = [
    'linear',
    'nonlinear_square',
    'nonlinear_sin',
    *(f'interaction_{k}' for k in range(1, 4)),   # interaction_1 .. interaction_3
    *(f'noise_{k}' for k in range(1, 11)),        # noise_1 .. noise_10
]

In [2]:
def X2y(X, with_error = True, seed = 0):
    """Compute the target y from the feature table X.

    The noiseless target is
    linear + nonlinear_square ** 2 + sin(3 * nonlinear_sin)
    + interaction_1 * interaction_2 * interaction_3.

    Parameters
    ----------
    X : column-indexable table (e.g. a pandas DataFrame)
        Must provide the columns 'linear', 'nonlinear_square', 'nonlinear_sin',
        'interaction_1', 'interaction_2' and 'interaction_3'.
    with_error : bool, default True
        If True, add Gaussian noise (mean 0, standard deviation 0.1) to the
        noiseless target (used for creating y). If False, return the noiseless
        target (used for prediction).
    seed : int, default 0
        Seed of the private generator the noise is drawn from.

    Returns
    -------
    The target values, one per row of X.

    Notes
    -----
    NOTE(review): the noise comes from the legacy Mersenne-Twister stream for
    `seed`. If the caller generated X from the global generator seeded with the
    same value, the noise reuses the start of that same stream; pass a different
    seed when independent noise is required.
    """

    # functional form of the dependence between y and X
    y_star = X['linear'] + X['nonlinear_square'] ** 2 + np.sin(3 * X['nonlinear_sin']) + (X['interaction_1'] * X['interaction_2'] * X['interaction_3'])

    # do not add error (this will be used for prediction)
    if not with_error:
        return y_star

    # add random error called epsilon (this will be used for creating y);
    # a private RandomState gives the same draws as seeding the global generator
    # would, but leaves the caller's global NumPy random state untouched
    rng = np.random.RandomState(seed)
    epsilon = rng.normal(0, .1, len(y_star))
    return y_star + epsilon

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# build the feature table X from seeded normal draws, then derive the noisy target y from it
np.random.seed(0)
n_rows, n_cols = 20_000, len(feature_names)
X = pd.DataFrame(
    np.random.normal(size = (n_rows, n_cols)),
    columns = feature_names,
)
y = X2y(X, with_error = True)

# split features and target in half: X_trn, X_tst, y_trn, y_tst
X_trn, X_tst, y_trn, y_tst = train_test_split(
    X, y,
    test_size = .5,
    random_state = 0,
)

In [4]:
# define a sklearn compatible wrapper for our data generating function
class UnbeatableRegressor():
    """Regressor that predicts with the noiseless data generating function.

    Nothing is learned from the data: predictions are X2y(X, with_error = False).
    """

    def __init__(self):
        pass

    def fit(self, X, y):
        """No-op fit: both arguments are ignored.

        Returns self, following the scikit-learn estimator convention, so that
        calls such as `UnbeatableRegressor().fit(X, y).predict(X)` can be chained.
        """
        return self

    def predict(self, X):
        """Return the noiseless target for the rows of X as a NumPy array."""
        return np.array(X2y(X, with_error = False))

    def score(self, X, y):
        """Return the mean absolute error between y and the predictions for X.

        NOTE(review): this is an error (lower is better), whereas scikit-learn
        estimators' `score` is greater-is-better; confirm that every consumer of
        `score` expects an error before relying on its sign.
        """
        return mean_absolute_error(y, self.predict(X))

In [5]:
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# dictionary of models that will be used for comparison (display name -> unfitted estimator)
models = {
    'DummyRegressor': DummyRegressor(),
    'LinearRegression': LinearRegression(),
    # number of neighbours = square root of the number of training rows
    'KNeighborsRegressor': KNeighborsRegressor(n_neighbors = int(np.sqrt(len(X_trn)))),
    'SupportVectorRegressor': SVR(C = .1),
    'RandomForestRegressor': RandomForestRegressor(max_depth = 5),
    # the keyword was previously misspelled 'max_depht', so the depth limit was never applied
    'XGBRegressor': XGBRegressor(max_depth = 5),
    'LGBMRegressor': LGBMRegressor(num_leaves = 10),
    'UnbeatableRegressor': UnbeatableRegressor()
}

In [6]:
from sklearn.metrics import mean_absolute_error
from eli5.sklearn import PermutationImportance

# one row per model: mean absolute error on train/test, and normalized feature importances
mae = pd.DataFrame(columns = ['train', 'test'])
fi = pd.DataFrame(columns = feature_names)

for model_name, model in models.items():

    # fit model
    model.fit(X_trn, y_trn)

    # compute mean absolute error of model in train and test set
    mae.loc[model_name, :] = [
        mean_absolute_error(y_trn, model.predict(X_trn)),
        mean_absolute_error(y_tst, model.predict(X_tst)),
    ]

    # compute feature importances of model: use the model's own attribute when it has one
    try:
        feature_importances_ = model.feature_importances_
    except AttributeError:
        # only a missing attribute triggers the permutation-importance fallback;
        # any other error (and KeyboardInterrupt) propagates instead of being swallowed
        feature_importances_ = PermutationImportance(model, cv = 'prefit', n_iter = 3).fit(X_trn, y_trn).feature_importances_

    # rescale so that each model's importances sum to 1
    fi.loc[model_name, :] = feature_importances_ / feature_importances_.sum()

# replace any NaN (e.g. from importances that sum to zero) with 0
fi.fillna(0, inplace = True)

Using TensorFlow backend.


KeyboardInterrupt: 