In [2]:
from boruta import BorutaPy
from sklearn.linear_model import Ridge
from gplearn.genetic import SymbolicTransformer 
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.utils import check_random_state

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns

# Read the train and test CSV archives, discarding the 'id' and 'timecc'
# columns from each frame as soon as it is loaded.
train_df = pd.read_csv('train.csv.zip').drop(columns=['id', 'timecc'])
test_df = pd.read_csv('test.csv.zip').drop(columns=['id', 'timecc'])


In [4]:
# Built on sklearn's BaseEstimator: alternates SymbolicTransformer (feature
# generation) with Boruta (feature selection) behind an sklearn-compatible API.
class gpbo_transformer(BaseEstimator,TransformerMixin):
    """Alternate symbolic feature generation and Boruta feature selection.

    Each round fits a gplearn ``SymbolicTransformer`` on the current feature
    matrix, stacks its output next to the existing columns, then fits a
    ``BorutaPy`` selector on the stacked matrix and keeps only what the
    selector's ``transform`` returns. The result feeds the next round.

    Parameters
    ----------
    estimater : estimator
        Estimator handed to ``BorutaPy`` (spelling kept for compatibility).
    iter : int
        Number of generate-then-select rounds.
    n_jobs : int, default=-1
        Forwarded to ``SymbolicTransformer``.
    random_state : int, default=1
        Forwarded to both ``SymbolicTransformer`` and ``BorutaPy``.
    verbose : int, default=1
        Forwarded to both ``SymbolicTransformer`` and ``BorutaPy``.

    Attributes
    ----------
    gp_list : list
        Fitted ``SymbolicTransformer`` objects, one per round.
    fs_list : list
        Fitted ``BorutaPy`` objects, one per round.
    """

    def __init__(self, estimater, iter, n_jobs=-1, random_state=1, verbose=1):
        self.estimater = estimater  # estimator used by Boruta
        self.iter = iter  # number of rounds
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose
        # Fitted stages; replaced wholesale by every successful call to fit().
        self.gp_list = []
        self.fs_list = []

    def fit(self, X, y):
        """Fit ``self.iter`` rounds of generation + selection on ``(X, y)``.

        Any stages left over from a previous fit are discarded, so refitting
        the same instance behaves like fitting a fresh one.

        Returns
        -------
        self
        """
        # Work on plain arrays so the first round sees the same container type
        # as later rounds, which receive the np.hstack output built below.
        X = np.asarray(X)
        y = np.asarray(y)

        hall_of_fame = 100
        gp_list = []
        fs_list = []
        for _ in range(self.iter):
            # Generate roughly half as many new features as currently exist,
            # clamped to [1, hall_of_fame]: gplearn raises ValueError for
            # n_components outside that range.
            n_components = min(max(1, X.shape[-1] // 2), hall_of_fame)

            # gplearn: build new features
            gp = SymbolicTransformer(
                generations=20,  # number of generations to evolve
                population_size=1000,  # programs per generation
                hall_of_fame=hall_of_fame,
                n_components=n_components,  # number of new features kept
                function_set=["add","sub","mul","div","sqrt","log","abs","neg","inv","max","min","sin","cos","tan",],
                parsimony_coefficient=0.0005,  # penalty coefficient
                max_samples=0.9,
                verbose=self.verbose,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
            )
            gp.fit(X, y)
            X = np.hstack((X, gp.transform(X)))
            gp_list.append(gp)

            # Boruta: prune the stacked (original + generated) feature matrix
            fs = BorutaPy(self.estimater, n_estimators='auto', verbose=self.verbose, random_state=self.random_state)
            fs.fit(X, y)
            X = fs.transform(X)
            fs_list.append(fs)

        # Publish the new stages only once every round has succeeded, so a
        # failed refit does not leave a half-built pipeline behind.
        self.gp_list = gp_list
        self.fs_list = fs_list
        return self

    def transform(self, X):
        """Replay the fitted rounds on ``X`` and return the final matrix.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If the number of fitted stages does not match ``self.iter``
            (``fit`` was never called, or ``iter`` was changed afterwards).
        """
        # Local import: the file's import cell does not bring this name in.
        from sklearn.exceptions import NotFittedError

        if len(self.gp_list) != self.iter or len(self.fs_list) != self.iter:
            raise NotFittedError(
                "gpbo_transformer has %d fitted round(s) but iter=%r; "
                "call fit() before transform()." % (len(self.gp_list), self.iter)
            )

        X = np.asarray(X)
        for gp, fs in zip(self.gp_list, self.fs_list):
            X = np.hstack((X, gp.transform(X)))
            X = fs.transform(X)
        return X

In [5]:
# Three generate-then-select rounds, using a random forest as the estimator
# passed to the transformer; all cores, no progress output.
gpbo = gpbo_transformer(
    estimater=RandomForestRegressor(n_jobs=-1),
    iter=3,
    n_jobs=-1,
    verbose=0,
)