In [1]:
# 下载需要用到的数据集
!wget http://tianchi-media.oss-cn-beijing.aliyuncs.com/DSW/Industrial_Steam_Forecast/zhengqi_test.txt
!wget http://tianchi-media.oss-cn-beijing.aliyuncs.com/DSW/Industrial_Steam_Forecast/zhengqi_train.txt

--2021-03-16 15:19:12--  http://tianchi-media.oss-cn-beijing.aliyuncs.com/DSW/Industrial_Steam_Forecast/zhengqi_test.txt
Resolving tianchi-media.oss-cn-beijing.aliyuncs.com (tianchi-media.oss-cn-beijing.aliyuncs.com)... 47.95.85.21
Connecting to tianchi-media.oss-cn-beijing.aliyuncs.com (tianchi-media.oss-cn-beijing.aliyuncs.com)|47.95.85.21|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 466959 (456K) [text/plain]
Saving to: ‘zhengqi_test.txt.2’


2021-03-16 15:19:13 (3.31 MB/s) - ‘zhengqi_test.txt.2’ saved [466959/466959]

--2021-03-16 15:19:13--  http://tianchi-media.oss-cn-beijing.aliyuncs.com/DSW/Industrial_Steam_Forecast/zhengqi_train.txt
Resolving tianchi-media.oss-cn-beijing.aliyuncs.com (tianchi-media.oss-cn-beijing.aliyuncs.com)... 47.95.85.21
Connecting to tianchi-media.oss-cn-beijing.aliyuncs.com (tianchi-media.oss-cn-beijing.aliyuncs.com)|47.95.85.21|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 714370 (698K) [text/plain]

In [2]:
import pandas as pd

# Paths of the tab-separated training / test tables in the working directory.
train_data_file, test_data_file = "./zhengqi_train.txt", "./zhengqi_test.txt"

# Both files are parsed with identical options (tab separator, UTF-8).
train_data, test_data = (
    pd.read_csv(path, sep='\t', encoding='utf-8')
    for path in (train_data_file, test_data_file)
)

In [3]:
# Offset added to the divisor of the 'div' operator below, so that a column
# value of exactly zero does not trigger a division by zero.
epsilon = 1e-5

# Pairwise feature-crossing operators.
# Each entry maps an operation name to a two-argument function that combines
# two columns into a new feature. More operators can be added freely,
# e.g. x*x/y or log(x)/y.
#
# Background note on cross-validation (a separate technique from feature
# crossing): it is commonly used on small and medium datasets to observe model
# stability. It estimates how a trained model performs on unseen data, helps
# to reduce overfitting to some extent, extracts as much useful information
# as possible from limited data, and makes it easier to find suitable model
# parameters when data is scarce.
#
# A few lambda examples:
#   lambda x, y: x*y        -- takes x and y, returns their product
#   lambda *args: sum(args) -- takes any number of positional arguments and
#                              returns their sum (they must support addition)
#   lambda **kwargs: 1      -- takes any keyword arguments and returns 1
#
# The operator table: add, subtract ('mins'), divide and multiply.
# The key names are used verbatim in the generated column names.
func_dict = dict(
    add=lambda x, y: x + y,
    mins=lambda x, y: x - y,
    div=lambda x, y: x / (y + epsilon),
    multi=lambda x, y: x * y,
)

In [4]:
def auto_features_make(train_data,test_data,func_dict,col_list):
    """Append pairwise cross features to copies of the train and test frames.

    For every ordered pair ``(col_i, col_j)`` drawn from ``col_list`` (a column
    paired with itself included) and every entry of ``func_dict``, a column
    named ``"<col_i>-<func_name>-<col_j>"`` holding
    ``func(data[col_i], data[col_j])`` is added to both frames.

    Args:
        train_data: Training DataFrame; must contain every column in ``col_list``.
        test_data: Test DataFrame; must contain every column in ``col_list``.
        func_dict: Mapping of operation name (str) to a two-argument callable.
        col_list: Iterable of column labels (strings) to cross.

    Returns:
        Tuple ``(train_data, test_data)`` of new DataFrames. The inputs are not
        modified. Generated columns follow the original columns, ordered by
        ``col_i``, then ``col_j``, then the iteration order of ``func_dict``.
    """
    # Materialise once: the labels are walked repeatedly in the nested loops,
    # which would silently exhaust a one-shot iterable.
    columns = list(col_list)

    def _with_cross_features(frame):
        # Build every new column first and attach them with a single concat.
        # Inserting thousands of columns one at a time fragments the frame
        # (recent pandas versions emit a PerformanceWarning) and is much slower.
        generated = {}
        for col_i in columns:
            for col_j in columns:
                for func_name, func in func_dict.items():
                    feature_name = '-'.join([col_i, func_name, col_j])
                    generated[feature_name] = func(frame[col_i], frame[col_j])

        if not generated:
            return frame.copy()

        # If a generated name matches an existing column, the generated column
        # replaces it so that the result never carries duplicate labels.
        clashes = [name for name in generated if name in frame.columns]
        base = frame.drop(columns=clashes)
        new_columns = pd.DataFrame(generated, index=frame.index)
        return pd.concat([base, new_columns], axis=1)

    return _with_cross_features(train_data), _with_cross_features(test_data)

In [5]:
# Build the crossed feature sets. Only the columns present in the test table
# are crossed; the results are new frames and the loaded data stays untouched.
train_data2, test_data2 = auto_features_make(
    train_data,
    test_data,
    func_dict,
    col_list=test_data.columns,
)

In [6]:
from sklearn.decomposition import PCA   #主成分分析法

# Dimensionality reduction with PCA.
# n_components is the number of principal components (i.e. output features)
# to keep. random_state is pinned (same seed as the cross-validation cell)
# because PCA may pick a randomized SVD solver for inputs of this size, which
# would otherwise make the components differ between runs.
pca = PCA(n_components=500, random_state=2019)

# Fit the PCA model and return the reduced training data in one step.
# PCA is unsupervised, so no target values are passed.
# The fit uses exactly the columns of test_data2, in the same order. After
# auto_features_make the generated columns come AFTER 'target' in train_data2,
# so slicing off the last column (iloc[:, 0:-1]) would keep 'target' in the
# fit and shift every later column by one relative to test_data2.
train_data2_pca = pca.fit_transform(train_data2[test_data2.columns])

# Project the test data with the already-fitted model; any new data can be
# reduced the same way through transform().
test_data2_pca = pca.transform(test_data2)

train_data2_pca = pd.DataFrame(train_data2_pca)
test_data2_pca = pd.DataFrame(test_data2_pca)
# Assign the target by position so the result does not depend on the two
# frames sharing the same index labels.
train_data2_pca['target'] = train_data2['target'].values

In [7]:
# Feature matrix as a NumPy array: the training rows restricted to the
# columns that also exist in the test frame.
X_train2 = train_data2.loc[:, test_data2.columns].values
# Regression target, kept as a pandas Series.
y_train = train_data2.loc[:, 'target']

In [None]:
# Offline validation: 5-fold cross-validated LightGBM regression.
from sklearn.model_selection import KFold  # sklearn.cross_validation was removed from scikit-learn
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np

# 5-fold cross-validation.
# sklearn.model_selection.KFold(n_splits, shuffle, random_state) partitions the
# samples into n_splits disjoint subsets; each subset is used once as the
# validation fold while the remaining n_splits - 1 subsets form the training
# fold, giving n_splits train/evaluate rounds and n_splits results.
#   n_splits     -- number of folds
#   shuffle      -- whether to shuffle the samples before splitting
#   random_state -- seed of that shuffle

Folds=5
kf = KFold(n_splits=Folds, shuffle=True, random_state=2019)

# Per-fold MSE on the training part and on the held-out part.
MSE_DICT = {
    'train_mse':[],
    'test_mse':[]
}

# Offline train / predict loop.
# enumerate() pairs every (train_index, test_index) split with its fold number.
for i, (train_index, test_index) in enumerate(kf.split(X_train2)):
    # LightGBM tree model: a fresh regressor for every fold.
    lgb_reg = lgb.LGBMRegressor(
        learning_rate=0.01,
        max_depth=-1,
        n_estimators=5000,
        boosting_type='gbdt',
        random_state=2019,
        objective='regression',
    )

    # Split features and target into the training and held-out parts.
    X_train_KFold, X_test_KFold = X_train2[train_index], X_train2[test_index]
    # KFold yields positional indices, so the target Series is sliced with
    # .iloc instead of label-based [] lookup.
    y_train_KFold, y_test_KFold = y_train.iloc[train_index], y_train.iloc[test_index]

    # Train the model, monitoring both parts and stopping early when the
    # validation score has not improved for 100 rounds.
    # NOTE(review): early_stopping_rounds / verbose are fit() keyword arguments
    # of older LightGBM releases; LightGBM 4.x expects
    # callbacks=[lgb.early_stopping(100), lgb.log_evaluation(50)] instead --
    # confirm against the installed version.
    lgb_reg.fit(
            X=X_train_KFold,y=y_train_KFold,
            eval_set=[(X_train_KFold, y_train_KFold),(X_test_KFold, y_test_KFold)],
            eval_names=['Train','Test'],
            early_stopping_rounds=100,
            eval_metric='MSE',
            verbose=50
        )

    # Predict on the training part and on the held-out part using the best
    # iteration found by early stopping.
    y_train_KFold_predict = lgb_reg.predict(X_train_KFold,num_iteration=lgb_reg.best_iteration_)
    y_test_KFold_predict = lgb_reg.predict(X_test_KFold,num_iteration=lgb_reg.best_iteration_)

    print('第{}折 训练和预测 训练MSE 预测MSE'.format(i))
    # scikit-learn metrics take (y_true, y_pred) in that order.
    train_mse = mean_squared_error(y_train_KFold, y_train_KFold_predict)
    print('------\n', '训练MSE\n', train_mse, '\n------')
    test_mse = mean_squared_error(y_test_KFold, y_test_KFold_predict)
    print('------\n', '预测MSE\n', test_mse, '\n------\n')

    MSE_DICT['train_mse'].append(train_mse)
    MSE_DICT['test_mse'].append(test_mse)

# Summary: per-fold values followed by their mean.
print('------\n', '训练MSE\n', MSE_DICT['train_mse'], '\n', np.mean(MSE_DICT['train_mse']), '\n------')
print('------\n', '预测MSE\n', MSE_DICT['test_mse'], '\n', np.mean(MSE_DICT['test_mse']), '\n------')



Training until validation scores don't improve for 100 rounds
