## 赛题特征优化

In [6]:
import pandas as pd
# 导入数据
train_data_file = './dataset/zhengqi_train.txt'
test_data_file = './dataset/zhengqi_test.txt'
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')

In [9]:
# 特征构造方法
epsilon = 1e-5
# 组合交叉特征，可以自行定义，如增加x*x/y. log(x)/y 等
func_dict = {
    'add': lambda x, y: x + y,
    'mins': lambda x, y: x - y,
    'div': lambda x, y: x / (y + epsilon),
    'multi': lambda x, y: x * y
}

In [12]:
# 特征构造函数
def auto_features_make(train_data, test_data, func_dict, col_list):
    train_data, test_data = train_data.copy(), test_data.copy()
    for col_i in col_list:
        for col_j in col_list:
            for func_name, func in func_dict.items():
                for data in [train_data, test_data]:
                    func_features = func(data[col_i], data[col_j])
                    col_func_features = '-'.join([col_i, func_name, col_j])
                    data[col_func_features] = func_features
    return train_data, test_data

In [13]:
# 特征降维处理
train_data2, test_data2 = auto_features_make(
    train_data, test_data, func_dict, col_list=test_data.columns
)
from sklearn.decomposition import PCA

# PCA 降维
pca = PCA(n_components=500)
train_data2_pca = pca.fit_transform(train_data2.iloc[:, 0:-1])
test_data2_pca = pca.transform(test_data2)
train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
train_data2_pca['target'] = train_data2['target']
X_train2 = train_data2[test_data2.columns].values
y_train = train_data2['target']

In [17]:
train_data2

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V37-div-V35,V37-multi-V35,V37-add-V36,V37-mins-V36,V37-div-V36,V37-multi-V36,V37-add-V37,V37-mins-V37,V37-div-V37,V37-multi-V37
0,0.566,0.016,-0.143,0.407,0.452,-0.901,-1.812,-2.360,-0.436,-2.114,...,0.687710,17.894308,-6.116,-0.900,1.345097,9.148864,-7.016,0.0,1.000003,12.306064
1,0.968,0.437,0.066,0.566,0.194,-0.893,-1.566,-2.360,0.332,-2.114,...,-2.005439,-0.265720,-1.065,-0.395,2.179170,0.244550,-1.460,0.0,1.000014,0.532900
2,1.013,0.568,0.235,0.370,0.112,-0.797,-1.367,-2.360,0.396,-2.114,...,-1.618087,-0.214396,0.176,-1.354,-0.769925,-0.450585,-1.178,0.0,1.000017,0.346921
3,0.733,0.368,0.283,0.165,0.599,-0.679,-1.200,-2.086,0.403,-2.114,...,-0.307684,-0.040768,0.221,-0.445,-0.336326,-0.037296,-0.224,0.0,1.000089,0.012544
4,0.684,0.638,0.260,0.209,0.337,-0.454,-1.073,-2.086,0.314,-2.114,...,-0.076921,-0.010192,-0.308,0.252,0.100004,0.007840,-0.056,0.0,1.000357,0.000784
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2883,0.190,-0.025,-0.138,0.161,0.600,-0.212,0.757,0.584,-0.026,0.904,...,-1.965672,-0.239414,1.262,0.110,1.190952,0.395136,1.372,0.0,0.999985,0.470596
2884,0.507,0.557,0.296,0.183,0.530,-0.237,0.749,0.584,0.537,0.904,...,1.088856,0.132620,-0.995,0.235,0.617896,0.233700,-0.760,0.0,1.000026,0.144400
2885,-0.394,-0.721,-0.485,0.084,0.136,0.034,0.655,0.614,-0.818,0.904,...,-2.143328,-0.261052,1.699,-0.203,0.786532,0.711348,1.496,0.0,0.999987,0.559504
2886,-0.219,-0.282,-0.344,-0.049,0.449,-0.140,0.560,0.583,-0.596,0.904,...,-2.413148,-0.127650,0.254,0.856,-1.843915,-0.167055,1.110,0.0,0.999982,0.308025


In [18]:
train_data2_pca

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,-12256.984223,1.062884e+06,3384.526732,492.181659,755.058519,176.172654,52.889313,223.894855,243.994114,194.737438,...,0.008695,0.008276,-0.001023,-0.005114,-0.004074,0.000874,0.006211,-0.000636,0.001124,0.175
1,-3700.404138,2.904250e+05,435.268484,-182.127640,-326.943890,-99.553956,-63.187717,-292.848583,-346.912710,-330.297786,...,-0.015003,-0.008769,-0.010266,0.047813,-0.011946,0.037371,0.051753,-0.031315,0.018139,0.676
2,-400.794599,-4.804747e+02,-520.505918,-282.465779,-472.893128,-121.841076,-52.377128,-216.458004,-242.163986,-193.419677,...,2.534576,54.307973,-26.881556,-4.846426,27.445718,-2.193562,-33.622737,8.346143,-11.339905,0.633
3,-374.839243,-4.833002e+02,-520.289854,-281.931934,-472.155638,-120.970863,47.355738,-207.326774,-242.160490,-195.489113,...,8.899746,-24.686528,22.826315,23.363596,11.467001,8.036323,33.623845,2.638719,-13.928653,0.206
4,-379.456705,-4.848447e+02,-520.206871,-281.568939,-468.852123,-121.804713,-57.559685,-151.724643,-243.033735,-198.276163,...,3.899612,-5.902992,-0.828168,6.660238,-1.554214,-5.866778,3.043772,-4.839015,-6.361681,0.384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2883,-385.042290,-4.799649e+02,-518.225731,-281.815435,-469.190313,-122.918416,-54.932413,-218.033043,-245.482561,-196.303135,...,3.148206,-10.080571,-10.675553,12.172861,3.124670,6.251132,0.870390,-1.847565,16.498988,0.235
2884,-368.915696,-4.775296e+02,-518.382450,-282.313167,-469.878579,-122.916922,-46.787552,-219.807216,-246.036461,-197.207481,...,1.708060,-1.445543,-0.965368,2.754528,4.190212,-2.141675,6.191564,0.055531,-2.900506,1.042
2885,-379.566154,-4.750647e+02,-519.837822,-280.352795,-471.180622,-118.069445,-58.031928,-217.147063,-245.325604,-197.118545,...,1.348365,-6.279074,3.650062,-3.235274,0.818062,-1.003041,0.537602,-2.545108,0.803532,0.005
2886,-378.755063,-4.759571e+02,-521.005429,-281.151985,-471.086527,-121.019342,-49.876102,-216.386182,-243.514385,-200.485135,...,-18.363891,-19.401707,22.620676,-3.273006,1.056308,-2.375548,7.979227,23.375771,-5.043796,0.350


In [20]:
# 模型训练和评估
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np

# 5折交叉验证
Folds = 5
kf = KFold(n_splits=Folds, shuffle=True, random_state=2021)

# 记录训练和预测MSE
MSE_DICT = {'train_mse':[], 'test_mse':[]}

# 线下训练模型
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    # LGB树模型
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        # n_estimators=5000,
        n_estimators=1000,
        boosting_type='gbdt',
        random_state=2021,
        objective='regression'
    )

    # 切分训练集和测试集
    X_train_KFold = X_train2[train_index]
    X_test_KFold = X_train2[test_index]
    y_train_KFold = y_train[train_index]
    y_test_KFold = y_train[test_index]

    # 训练模型
    lgb_reg.fit(X=X_train_KFold,
                y=y_train_KFold,
                eval_set=[(X_train_KFold, y_train_KFold),
                          (X_test_KFold, y_test_KFold)],
                eval_names=['Train', 'Test'],
                early_stopping_rounds=100,
                eval_metric='MSE',
                verbose=50)

    # 训练集和测试集预测
    y_train_KFold_predict = lgb_reg.predict(
        X_train_KFold, num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(
        X_test_KFold, num_iteration=lgb_reg.best_iteration_)

    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))

    train_mse = mean_squared_error(y_train_KFold, y_train_KFold_predict)
    print('------\n', '训练MSE\n', train_mse, '\n------')

    test_mse = mean_squared_error(y_test_KFold, y_test_KFold_predict)
    print('------\n', '预测MSE\n', test_mse, '\n------')

    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)

print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')

print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')

Training until validation scores don't improve for 100 rounds
[50]	Train's l2: 0.419263	Test's l2: 0.428263
[100]	Train's l2: 0.199641	Test's l2: 0.235593
[150]	Train's l2: 0.108679	Test's l2: 0.161073
[200]	Train's l2: 0.0679627	Test's l2: 0.13033
[250]	Train's l2: 0.047303	Test's l2: 0.117391
[300]	Train's l2: 0.0353126	Test's l2: 0.111158
[350]	Train's l2: 0.0275163	Test's l2: 0.107831
[400]	Train's l2: 0.0220216	Test's l2: 0.106154
[450]	Train's l2: 0.0179068	Test's l2: 0.105175
[500]	Train's l2: 0.0147582	Test's l2: 0.104496
[550]	Train's l2: 0.0122418	Test's l2: 0.104084
[600]	Train's l2: 0.0102208	Test's l2: 0.103906
[650]	Train's l2: 0.00858774	Test's l2: 0.103492
[700]	Train's l2: 0.00725132	Test's l2: 0.103369
[750]	Train's l2: 0.00615626	Test's l2: 0.103201
[800]	Train's l2: 0.00524795	Test's l2: 0.10323
[850]	Train's l2: 0.00449195	Test's l2: 0.103197
[900]	Train's l2: 0.00386922	Test's l2: 0.103197
[950]	Train's l2: 0.00333871	Test's l2: 0.1032
[1000]	Train's l2: 0.0028942

KeyboardInterrupt: 