In [1]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning' : 0})
import seaborn as sns


### 多模型融合Stacking方法

In [2]:
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
from scipy import sparse
import xgboost
import lightgbm

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

def _fit_predict_sklearn(clf, tr_x, tr_y, te_x, te_y, test_x):
    """Fit a fit/predict style estimator on one fold.

    Returns the predictions for the held-out rows and for ``test_x``.
    ``te_y`` is unused; it is accepted so all fold helpers share a signature.
    """
    clf.fit(tr_x, tr_y)
    return clf.predict(te_x), clf.predict(test_x)


def _fit_predict_xgb(xgb, tr_x, tr_y, te_x, te_y, test_x):
    """Train one early-stopped booster through the xgboost-style module ``xgb``.

    Returns the predictions for the held-out rows and for ``test_x``.
    """
    train_matrix = xgb.DMatrix(tr_x, label=tr_y, missing=-1)
    valid_matrix = xgb.DMatrix(te_x, label=te_y, missing=-1)
    # test_x has no labels of its own; the fold labels (te_y) have a
    # different row count and must not be attached to it.
    test_matrix = xgb.DMatrix(test_x, missing=-1)
    params = {
        'booster': 'gbtree',
        'eval_metric': 'rmse',
        'gamma': 1,
        'min_child_weight': 1.5,
        'max_depth': 5,
        'lambda': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'eta': 0.03,
        'tree_method': 'exact',
        'seed': 2021,
        'nthread': 12
    }
    model = xgb.train(
        params,
        train_matrix,
        num_boost_round=10000,
        evals=[(train_matrix, 'train'), (valid_matrix, 'eval')],
        early_stopping_rounds=100
    )
    # NOTE(review): ntree_limit / best_ntree_limit are kept from the original
    # code; newer xgboost releases use iteration_range instead -- confirm
    # against the installed version.
    limit = model.best_ntree_limit
    return (model.predict(valid_matrix, ntree_limit=limit),
            model.predict(test_matrix, ntree_limit=limit))


def _fit_predict_lgb(lgb, tr_x, tr_y, te_x, te_y, test_x):
    """Train one early-stopped booster through the lightgbm-style module ``lgb``.

    Returns the predictions for the held-out rows and for ``test_x``.
    """
    train_matrix = lgb.Dataset(tr_x, label=tr_y)
    valid_matrix = lgb.Dataset(te_x, label=te_y)
    # NOTE(review): 'tree_method' and 'colsample_bylevel' look like xgboost
    # parameter names; kept unchanged -- confirm LightGBM actually uses them.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression_l2',
        'metric': 'mse',
        'min_child_weight': 1.5,
        'num_leaves': 2**5,
        'lambda_l2': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'learning_rate': 0.03,
        'tree_method': 'exact',
        'seed': 2021,
        'nthread': 12,
        'silent': True
    }
    model = lgb.train(
        params,
        train_matrix,
        10000,
        valid_sets=valid_matrix,
        early_stopping_rounds=100
    )
    best = model.best_iteration
    return (model.predict(te_x, num_iteration=best),
            model.predict(test_x, num_iteration=best))


def stacking_reg(clf, train_x, train_y, test_x, clf_name, kf,
                 label_split=None):
    """Produce out-of-fold and averaged test predictions for one base model.

    For every split yielded by ``kf.split(train_x, label_split)`` the model
    is fitted on the training part, the held-out part is predicted (these
    become the out-of-fold predictions) and ``test_x`` is predicted. The
    per-fold predictions for ``test_x`` are averaged over all folds.

    Args:
        clf: For the names 'rf', 'ada', 'gb', 'et', 'lr', 'lsvc', 'knn' an
            estimator with ``fit``/``predict``. For 'xgb' a module-like
            object exposing ``DMatrix`` and ``train``. For 'lgb' a
            module-like object exposing ``Dataset`` and ``train``.
        train_x: Training features; indexed with the index arrays from ``kf``.
        train_y: Training targets; indexed with the index arrays from ``kf``.
        test_x: Features to predict with every fold model.
        clf_name: Short name selecting the training routine (see ``clf``).
        kf: Splitter object with a ``split(X, y)`` method.
        label_split: Passed as the second argument to ``kf.split``.

    Returns:
        Tuple ``(train, test)``: out-of-fold predictions of shape
        ``(train_x.shape[0], 1)`` and fold-averaged predictions of shape
        ``(test_x.shape[0], 1)``.

    Raises:
        IOError: If ``clf_name`` is not one of the supported names.
    """
    if clf_name in ['rf', 'ada', 'gb', 'et', 'lr', 'lsvc', 'knn']:
        fit_predict = _fit_predict_sklearn
    elif clf_name in ['xgb']:
        fit_predict = _fit_predict_xgb
    elif clf_name in ['lgb']:
        fit_predict = _fit_predict_lgb
    else:
        raise IOError('Please add new clf.')

    # Materialise the splits so the fold count comes from the splitter itself
    # rather than from a notebook-level global defined in another cell.
    splits = list(kf.split(train_x, label_split))

    train = np.zeros((train_x.shape[0], 1))  # n_train * 1
    test_pre = np.empty((len(splits), test_x.shape[0], 1))
    cv_scores = []
    for i, (train_index, test_index) in enumerate(splits):
        tr_x, tr_y = train_x[train_index], train_y[train_index]
        te_x, te_y = train_x[test_index], train_y[test_index]

        fold_pre, fold_test = fit_predict(clf, tr_x, tr_y, te_x, te_y, test_x)
        pre = np.asarray(fold_pre).reshape(-1, 1)
        train[test_index] = pre
        test_pre[i, :] = np.asarray(fold_test).reshape(-1, 1)
        cv_scores.append(mean_squared_error(te_y, pre))
        print('%s now score is:' % clf_name, cv_scores)

    test = test_pre.mean(axis=0)  # n_test * 1
    print('%s_score_list:' % clf_name, cv_scores)
    print('%s_score_mean:' % clf_name, np.mean(cv_scores))
    return train.reshape(-1, 1), test.reshape(-1, 1)

In [3]:
def lr_reg(x_train, y_train, x_valid, kf, label_split=None):
    """Run the stacking routine with a ``LinearRegression(n_jobs=-1)`` model.

    Returns the two arrays produced by ``stacking_reg`` (called with the
    model name 'lr') followed by the identifier string 'lr_reg'.
    """
    estimator = LinearRegression(n_jobs=-1)
    oof_pred, test_pred = stacking_reg(
        estimator, x_train, y_train, x_valid, 'lr', kf,
        label_split=label_split,
    )
    return oof_pred, test_pred, 'lr_reg'

def lgb_reg(x_train, y_train, x_valid, kf, label_split=None):
    """Run the stacking routine with the ``lightgbm`` module as the model.

    Returns the two arrays produced by ``stacking_reg`` (called with the
    model name 'lgb') followed by the identifier string 'lgb_reg'.
    """
    oof_pred, test_pred = stacking_reg(
        lightgbm, x_train, y_train, x_valid, 'lgb', kf,
        label_split=label_split,
    )
    return oof_pred, test_pred, 'lgb_reg'

In [9]:
def stacking_pred(
        x_train, y_train,
        x_valid, kf,
        clf_list, label_split=None,
        clf_fin='lgb',
        if_concat_origin=True):
    """Train a LightGBM meta-model on stacked base-model predictions.

    Every callable in ``clf_list`` is invoked as
    ``clf(x_train, y_train, x_valid, kf, label_split=label_split)`` and must
    return ``(train_pred, test_pred, name)``. The prediction columns of ALL
    base models are concatenated side by side (optionally together with the
    original features) and used to fit the final model.

    Early stopping of the final model is evaluated on the held-out part of
    the first split yielded by ``kf.split(train, label_split)``; the final
    model is therefore fitted on the remaining rows of that split.

    Args:
        x_train: Training feature matrix.
        y_train: Training targets.
        x_valid: Feature matrix to predict.
        kf: Splitter object with a ``split(X, y)`` method.
        clf_list: Iterable of base-model callables (see above).
        label_split: Passed as the second argument to ``kf.split``.
        clf_fin: Name of the final model; only 'lgb' is implemented.
        if_concat_origin: When true, the original features are prepended to
            the stacked prediction columns.

    Returns:
        Predictions for ``x_valid`` with shape ``(n_rows, 1)``.

    Raises:
        ValueError: If ``clf_fin`` is not supported or ``clf_list`` is empty.
    """
    # Fail before the (expensive) base-model training instead of silently
    # returning None afterwards.
    if clf_fin not in ['lgb']:
        raise ValueError(
            "Unsupported clf_fin %r; only 'lgb' is implemented." % (clf_fin,))
    base_models = list(clf_list)
    if not base_models:
        raise ValueError('clf_list must contain at least one base model.')

    # Collect one prediction column per base model. The collectors live
    # outside the loop so earlier models are not discarded.
    train_data_list = []
    test_data_list = []
    for base_model in base_models:
        train_data, test_data, _ = base_model(
            x_train,
            y_train,
            x_valid,
            kf,
            label_split=label_split
        )
        train_data_list.append(train_data)
        test_data_list.append(test_data)
    train = np.concatenate(train_data_list, axis=1)
    test = np.concatenate(test_data_list, axis=1)

    if if_concat_origin:
        train = np.concatenate([x_train, train], axis=1)
        test = np.concatenate([x_valid, test], axis=1)

    print(x_train.shape)
    print(train.shape)

    # Hold out one fold for early stopping; validating on the training rows
    # themselves can never signal over-fitting.
    fit_index, hold_index = next(iter(kf.split(train, label_split)))
    targets = np.asarray(y_train)
    train_matrix = lightgbm.Dataset(train[fit_index], label=targets[fit_index])
    valid_matrix = lightgbm.Dataset(train[hold_index],
                                    label=targets[hold_index])
    # NOTE(review): 'tree_method' and 'colsample_bylevel' look like xgboost
    # parameter names; kept unchanged -- confirm LightGBM actually uses them.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression_l2',
        'metric': 'mse',
        'min_child_weight': 1.5,
        'num_leaves': 2**5,
        'lambda_l2': 10,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'colsample_bylevel': 0.7,
        'learning_rate': 0.03,
        'tree_method': 'exact',
        'seed': 2021,
        'nthread': 12,
        'silent': True
    }
    num_round = 10000
    early_stopping_rounds = 100
    model = lightgbm.train(
        params,
        train_matrix,
        num_round,
        valid_sets=valid_matrix,
        early_stopping_rounds=early_stopping_rounds
    )
    pre = model.predict(
        test,
        num_iteration=model.best_iteration
    ).reshape(-1, 1)
    return pre

In [10]:
# Model validation: load the tab-separated training and test tables.
with open('./dataset/zhengqi_train.txt') as fr:
    data_train = pd.read_table(fr, sep='\t')
with open('./dataset/zhengqi_test.txt') as fr_test:
    data_test = pd.read_table(fr_test, sep='\t')

# K-fold cross-validation setup.
from sklearn.model_selection import StratifiedKFold, KFold
folds = 5
seed = 1
# Build the splitter from `folds` so the declared fold count and the splitter
# cannot drift apart. random_state deliberately stays 0 (not `seed`) so the
# fold assignment is unchanged.
kf = KFold(n_splits=folds, shuffle=True, random_state=0)

In [11]:
# Features are the columns of the test table, selected from both tables.
x_train = data_train[data_test.columns].values
x_valid = data_test[data_test.columns].values
y_train = data_train['target'].values

clf_list = [lr_reg, lgb_reg]

## Prone to overfitting
pred = stacking_pred(
    x_train,
    y_train,
    x_valid,
    kf,
    clf_list,
    label_split=None,
    clf_fin='lgb',
    if_concat_origin=True,
)

lr now score is: [0.11573216950871248]
lr now score is: [0.11573216950871248, 0.09417486426618935]
lr now score is: [0.11573216950871248, 0.09417486426618935, 0.10805046561851063]
lr now score is: [0.11573216950871248, 0.09417486426618935, 0.10805046561851063, 0.1242088706560155]
lr now score is: [0.11573216950871248, 0.09417486426618935, 0.10805046561851063, 0.1242088706560155, 0.11940113841914012]
lr_score_list: [0.11573216950871248, 0.09417486426618935, 0.10805046561851063, 0.1242088706560155, 0.11940113841914012]
lr_score_mean: 0.11231350169371361
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8846
[LightGBM] [Info] Number of data points in the train set: 2310, number of used features: 38
[LightGBM] [Info] Start training from score 0.126200
[1]	valid_0's l2: 0.993093
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 0.949112
[3]	valid_0's l2: 0.906972
[4]	valid_0's l2: 0.866834
[5]	valid_0's l2: 0.829243
[6]	vali