### 1 导入数据分析相关的库

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression  #线性回归
from sklearn.ensemble import RandomForestRegressor  #随机森林回归
from sklearn.ensemble import GradientBoostingRegressor  #梯度提升回归
from sklearn.svm import SVR  #支持向量机回归
import lightgbm as lgb  #LightGBM
from xgboost import XGBRegressor  #XGBoost
from sklearn.model_selection import learning_curve  # 用于学习曲线绘制
from sklearn.model_selection import ShuffleSplit  # 用于数据集划分
from sklearn.model_selection import train_test_split  # 用于数据集划分
from sklearn.metrics import mean_squared_error  #均方误差
import warnings

# Suppress every Python warning for the rest of the session. This keeps cell
# output short, but it also hides any warning the libraries below would emit.
warnings.filterwarnings('ignore')

### 2 数据加载

##### 2.1 未降维数据

In [None]:
# Load the pre-processed table; the 'label' column marks each row as 'train' or 'test'.
all_data = pd.read_csv('./processed_zhengqi_data.csv')

# Training rows. drop() returns a new frame here instead of mutating the
# filtered selection in place, which is the pattern that triggers pandas'
# SettingWithCopyWarning.
cond1 = all_data['label'] == 'train'
train_data = all_data[cond1].drop(columns=['label'])

# Hold out 20% of the training rows for validation. The seed is fixed so the
# split (and every score derived from it) is reproducible between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data.drop(columns=['target']),
    train_data['target'],
    test_size=0.2,
    random_state=0,
)

# Test rows: features only, so both the marker column and 'target' are removed.
cond2 = all_data['label'] == 'test'
test_data = all_data[cond2].drop(columns=['label', 'target'])

##### 2.2 降维数据

In [None]:
# PCA-reduced training features and targets. The archive is opened once and
# closed as soon as both arrays have been read (np.load on an .npz keeps a
# file handle open until the returned object is closed).
with np.load('./train_data_pca.npz') as train_npz:
    train_data_pca = train_npz['X_train']
    target_data_pca = train_npz['y_train']

# Hold out 20% for validation; the fixed seed makes the split reproducible.
X_train_pca, X_valid_pca, y_train_pca, y_valid_pca = train_test_split(
    train_data_pca,
    target_data_pca,
    test_size=0.2,
    random_state=0,
)

# PCA-reduced test features.
with np.load('./test_data_pca.npz') as test_npz:
    test_data_pca = test_npz['X_test']

### 3 定义绘制模型学习曲线的函数

In [None]:
def plot_learn_curve(model, title, X, y, cv=None):
    """
    Draw the learning curve of ``model`` with matplotlib's pyplot interface.

    Scores come from ``learning_curve(model, X, y, cv=cv)``; for each training
    size the mean over the CV splits is plotted as a line and one standard
    deviation either side as a shaded band. Nothing is returned.

    :param model: estimator handed to ``learning_curve``
    :param title: text used as the plot title
    :param X: feature matrix
    :param y: target values
    :param cv: cross-validation splitter forwarded to ``learning_curve``
    """
    sizes, fit_scores, cv_scores = learning_curve(model, X, y, cv=cv)

    # (legend label, colour, per-split scores); the training curve is drawn first.
    curves = (
        ("Training score", "r", fit_scores),
        ("Cross-validation score", "g", cv_scores),
    )
    for legend_label, colour, scores in curves:
        # Rows are training sizes, columns are CV splits, so reduce along axis 1.
        centre = np.mean(scores, axis=1)
        spread = np.std(scores, axis=1)
        plt.plot(sizes, centre, 'o-', label=legend_label, color=colour)
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)

    # Figure decoration.
    plt.grid()
    plt.legend(loc="best")
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

### 4 多元线性回归

##### 4.1 模型训练

In [None]:
# Fit and validate linear regression on the PCA-reduced data.
clf = LinearRegression()
clf.fit(X_train_pca, y_train_pca)
# MSE on the rows the model was fitted on (optimistic by construction).
score = mean_squared_error(y_train_pca, clf.predict(X_train_pca))
print("LinearRegression train score: ", score)
# MSE on the held-out split, so the figure is comparable with the validation
# MSE reported for the other models in this file.
valid_score = mean_squared_error(y_valid_pca, clf.predict(X_valid_pca))
print("LinearRegression valid score: ", valid_score)

In [None]:
# Fit and validate linear regression on the full-feature (non-reduced) data.
clf = LinearRegression()
clf.fit(X_train, y_train)
# MSE on the rows the model was fitted on (optimistic by construction).
score = mean_squared_error(y_train, clf.predict(X_train))
print("LinearRegression train score: ", score)
# MSE on the held-out split, so the figure is comparable with the validation
# MSE reported for the other models in this file.
valid_score = mean_squared_error(y_valid, clf.predict(X_valid))
print("LinearRegression valid score: ", valid_score)

##### 4.2 模型学习曲线

In [None]:
# Learning curve of linear regression on the PCA-reduced training split.
title = "Learning Curves (Linear Regression)"
X, y = X_train_pca, y_train_pca
estimator = LinearRegression()
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("./多元线性回归降维数据学习曲线.png", bbox_inches='tight', dpi=300)

In [None]:
# Learning curve of linear regression on the full-feature training split.
title = "Learning Curves (Linear Regression)"
X, y = X_train, y_train
estimator = LinearRegression()
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("./多元线性回归未降维数据学习曲线.png", bbox_inches='tight', dpi=300)

##### 4.3 模型预测

In [None]:
# Refit linear regression on the complete PCA-reduced training set and save
# its predictions for the PCA-reduced test set.
model = LinearRegression()
model.fit(train_data_pca, target_data_pca)
# Predict on the test features; predicting on train_data_pca here would write
# training-set predictions into the output file.
y_pred = model.predict(test_data_pca)
np.savetxt("./多元线性回归模型预测（降维数据）.csv", y_pred, delimiter=",")

In [None]:
# Refit linear regression on every labelled row of the full-feature data and
# save its predictions for the test rows.
model = LinearRegression().fit(train_data.drop(columns='target'), train_data['target'])
y_pred = model.predict(test_data)
np.savetxt(fname="./多元线性回归模型预测（未降维数据）.csv", X=y_pred, delimiter=",")

### 5 随机森林回归

##### 5.1 模型训练

In [None]:
# Fit a random forest on the PCA-reduced training split and report its MSE on
# the held-out split (the printed label says "train score", but the value is
# the validation MSE).
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    criterion='squared_error',
).fit(X_train_pca, y_train_pca)
score = mean_squared_error(y_true=y_valid_pca, y_pred=model.predict(X_valid_pca))
print("RandomForestRegressor train score: ", score)

In [None]:
# Fit a random forest on the full-feature training split and report its MSE on
# the held-out split (the printed label says "train score", but the value is
# the validation MSE).
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    criterion='squared_error',
).fit(X_train, y_train)
score = mean_squared_error(y_true=y_valid, y_pred=model.predict(X_valid))
print("RandomForestRegressor train score: ", score)

##### 5.2 模型学习曲线

In [None]:
# Learning curve of the random forest on the PCA-reduced training split.
title = "Learning Curves (Random Forest)"
X, y = X_train_pca, y_train_pca
estimator = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    criterion='squared_error',
)
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("./随机森林回归降维数据学习曲线.png", bbox_inches='tight', dpi=300)

In [None]:
# Learning curve of the random forest on the full-feature training split.
title = "Learning Curves (Random Forest)"
X, y = X_train, y_train
estimator = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    criterion='squared_error',
)
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("./随机森林回归未降维数据学习曲线.png", bbox_inches='tight', dpi=300)

##### 5.3 模型预测

In [None]:
# Refit the random forest on the complete PCA-reduced training set and save
# its predictions for the PCA-reduced test set.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    criterion='squared_error',
).fit(train_data_pca, target_data_pca)
y_pred = model.predict(test_data_pca)
np.savetxt(fname="./随机森林回归模型预测（降维数据）.csv", X=y_pred, delimiter=",")

In [None]:
# Refit the random forest on every labelled row of the full-feature data and
# save its predictions for the test rows.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    criterion='squared_error',
).fit(train_data.drop(columns='target'), train_data['target'])
y_pred = model.predict(test_data)
np.savetxt(fname="./随机森林回归模型预测（未降维数据）.csv", X=y_pred, delimiter=",")

### 6 SVR支持向量机回归

##### 6.1 模型训练

In [None]:
# Fit an RBF-kernel SVR on the PCA-reduced training split and report its MSE
# on the held-out split (the printed label says "train score", but the value
# is the validation MSE).
model = SVR(
    kernel='rbf',
    C=1.0,
    gamma=0.01,
    tol=0.0001,
    epsilon=0.3,
).fit(X_train_pca, y_train_pca)
score = mean_squared_error(y_true=y_valid_pca, y_pred=model.predict(X_valid_pca))
print("SVR train score: ", score)

In [None]:
# Fit an RBF-kernel SVR (otherwise default settings) on the full-feature
# training split and report its MSE on the held-out split (the printed label
# says "train score", but the value is the validation MSE).
model = SVR(kernel='rbf').fit(X_train, y_train)
score = mean_squared_error(y_true=y_valid, y_pred=model.predict(X_valid))
print("SVR train score: ", score)

##### 6.2 模型学习曲线

In [None]:
# Learning curve of the SVR on the PCA-reduced training split.
title = "Learning Curves (SVR)"
X, y = X_train_pca, y_train_pca
estimator = SVR(
    kernel='rbf',
    C=1.0,
    gamma=0.01,
    tol=0.0001,
    epsilon=0.3,
)
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("./SVR降维数据学习曲线.png", bbox_inches='tight', dpi=300)

In [None]:
# Learning curve of the SVR on the full-feature training split.
title = "Learning Curves (SVR)"
X, y = X_train, y_train
estimator = SVR(kernel='rbf')
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("./SVR未降维数据学习曲线.png", bbox_inches='tight', dpi=300)

##### 6.3 模型预测

In [None]:
# Refit the SVR on the complete PCA-reduced training set and save its
# predictions for the PCA-reduced test set.
model = SVR(
    kernel='rbf',
    C=1.0,
    gamma=0.01,
    tol=0.0001,
    epsilon=0.3,
).fit(train_data_pca, target_data_pca)
y_pred = model.predict(test_data_pca)
np.savetxt(fname="./SVR模型预测（降维数据）.csv", X=y_pred, delimiter=",")

In [None]:
# Refit the SVR on every labelled row of the full-feature data and save its
# predictions for the test rows.
model = SVR(kernel='rbf').fit(train_data.drop(columns='target'), train_data['target'])
y_pred = model.predict(test_data)
np.savetxt(fname="./SVR模型预测（未降维数据）.csv", X=y_pred, delimiter=",")

### 7 梯度提升回归

##### 7.1 模型训练

In [None]:
# Fit gradient boosting on the PCA-reduced training split and report its MSE
# on the held-out split (the printed label says "train score", but the value
# is the validation MSE).
model = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=14,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    learning_rate=0.03,
    loss='huber',
    subsample=0.8,
).fit(X_train_pca, y_train_pca)
score = mean_squared_error(y_true=y_valid_pca, y_pred=model.predict(X_valid_pca))
print("GradientBoostingRegressor train score: ", score)

In [None]:
# Fit gradient boosting on the full-feature training split and report its MSE
# on the held-out split (the printed label says "train score", but the value
# is the validation MSE).
model = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=14,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    learning_rate=0.03,
    loss='huber',
    subsample=0.8,
).fit(X_train, y_train)
score = mean_squared_error(y_true=y_valid, y_pred=model.predict(X_valid))
print("GradientBoostingRegressor train score: ", score)

##### 7.2 模型学习曲线

In [None]:
# Learning curve of gradient boosting on the PCA-reduced training split.
title = "Learning Curves (Gradient Boosting)"
X, y = X_train_pca, y_train_pca
estimator = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=14,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    learning_rate=0.03,
    loss='huber',
    subsample=0.8,
)
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("./梯度提升回归降维数据学习曲线.png", bbox_inches='tight', dpi=300)

In [None]:
# Learning curve of gradient boosting on the full-feature training split.
title = "Learning Curves (Gradient Boosting)"
X, y = X_train, y_train
estimator = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=14,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    learning_rate=0.03,
    loss='huber',
    subsample=0.8,
)
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("./梯度提升回归未降维数据学习曲线.png", bbox_inches='tight', dpi=300)

##### 7.3 模型预测

In [None]:
# Refit gradient boosting on the complete PCA-reduced training set and save
# its predictions for the PCA-reduced test set.
model = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=14,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    learning_rate=0.03,
    loss='huber',
    subsample=0.8,
).fit(train_data_pca, target_data_pca)
y_pred = model.predict(test_data_pca)
np.savetxt(fname="./梯度提升回归模型预测（降维数据）.csv", X=y_pred, delimiter=",")

In [None]:
# Refit gradient boosting on every labelled row of the full-feature data and
# save its predictions for the test rows.
model = GradientBoostingRegressor(
    n_estimators=300,
    max_depth=14,
    max_features='sqrt',
    min_samples_leaf=10,
    min_samples_split=40,
    learning_rate=0.03,
    loss='huber',
    subsample=0.8,
).fit(train_data.drop(columns='target'), train_data['target'])
y_pred = model.predict(test_data)
np.savetxt(fname="./梯度提升回归模型预测（未降维数据）.csv", X=y_pred, delimiter=",")

### 8 LightGBM

##### 8.1 模型训练

In [None]:
# Fit LightGBM on the PCA-reduced training split and report its MSE on the
# held-out split (the printed label says "train score", but the value is the
# validation MSE).
model = lgb.LGBMRegressor(n_estimators=300, max_depth=5, max_bin=100, min_child_samples=10,
                          min_child_weight=0.1, learning_rate=0.05, subsample=0.8, num_leaves=25, colsample_bytree=0.8,
                          reg_alpha=0.5, reg_lambda=0.1)
# Train on the training split only. Fitting on the full train_data_pca would
# include the validation rows and make the score below meaningless (leakage).
model.fit(X_train_pca, y_train_pca)
score = mean_squared_error(y_valid_pca, model.predict(X_valid_pca))
print("LightGBM train score: ", score)

In [None]:
# Fit LightGBM on the full-feature training split and report its MSE on the
# held-out split (the printed label says "train score", but the value is the
# validation MSE).
model = lgb.LGBMRegressor(n_estimators=300, max_depth=5, max_bin=100, min_child_samples=10,
                          min_child_weight=0.1, learning_rate=0.05, subsample=0.8, num_leaves=25, colsample_bytree=0.8,
                          reg_alpha=0.5, reg_lambda=0.1)
# Train on the training split only. Fitting on all of train_data would include
# the validation rows and make the score below meaningless (leakage).
model.fit(X_train, y_train)
score = mean_squared_error(y_valid, model.predict(X_valid))
print("LightGBM train score: ", score)

##### 8.2 模型学习曲线

In [None]:
# Learning curve of LightGBM on the PCA-reduced training split.
title = "Learning Curves (LightGBM)"
X, y = X_train_pca, y_train_pca
estimator = lgb.LGBMRegressor(
    n_estimators=300,
    max_depth=5,
    max_bin=100,
    min_child_samples=10,
    min_child_weight=0.1,
    learning_rate=0.05,
    subsample=0.8,
    num_leaves=25,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=0.1,
)
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("./LightGBM降维数据学习曲线.png", bbox_inches='tight', dpi=300)

In [None]:
# Learning curve of LightGBM on the full-feature training split.
title = "Learning Curves (LightGBM)"
X, y = X_train, y_train
estimator = lgb.LGBMRegressor(
    n_estimators=300,
    max_depth=5,
    max_bin=100,
    min_child_samples=10,
    min_child_weight=0.1,
    learning_rate=0.05,
    subsample=0.8,
    num_leaves=25,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=0.1,
)
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
plot_learn_curve(model=estimator, title=title, X=X, y=y, cv=cv)
plt.savefig("LightGBM未降维数据学习曲线.png", bbox_inches='tight', dpi=300)

##### 8.3 模型预测

In [None]:
# Refit LightGBM on the complete PCA-reduced training set and save its
# predictions for the PCA-reduced test set.
model = lgb.LGBMRegressor(
    n_estimators=300,
    max_depth=5,
    max_bin=100,
    min_child_samples=10,
    min_child_weight=0.1,
    learning_rate=0.05,
    subsample=0.8,
    num_leaves=25,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=0.1,
).fit(train_data_pca, target_data_pca)
y_pred = model.predict(test_data_pca)
np.savetxt(fname="./lightGBM回归模型预测（降维数据）.csv", X=y_pred, delimiter=",")

In [None]:
# Refit LightGBM on every labelled row of the full-feature data and save its
# predictions for the test rows.
model = lgb.LGBMRegressor(
    n_estimators=300,
    max_depth=5,
    max_bin=100,
    min_child_samples=10,
    min_child_weight=0.1,
    learning_rate=0.05,
    subsample=0.8,
    num_leaves=25,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=0.1,
).fit(train_data.drop(columns='target'), train_data['target'])
y_pred = model.predict(test_data)
np.savetxt(fname="./lightGBM回归模型预测（未降维数据）.csv", X=y_pred, delimiter=",")

### 9 XGBoost

##### 9.1 模型训练

In [9]:
# Fit XGBoost on the PCA-reduced training split and report its MSE on the
# held-out split (the printed label says "train score", but the value is the
# validation MSE).
model = XGBRegressor(
    n_estimators=300,
    max_depth=15,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.05,
    gamma=0,
    reg_alpha=0,
    reg_lambda=0,
    verbosity=1,
).fit(X_train_pca, y_train_pca)
score = mean_squared_error(y_true=y_valid_pca, y_pred=model.predict(X_valid_pca))
print("XGB train score: ", score)

XGB train score:  0.13416351341874852


In [10]:
# Fit XGBoost on the full-feature training split and report its MSE on the
# held-out split (the printed label says "train score", but the value is the
# validation MSE).
model = XGBRegressor(
    n_estimators=300,
    max_depth=15,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.05,
    gamma=0,
    reg_alpha=0,
    reg_lambda=0,
    verbosity=1,
).fit(X_train, y_train)
score = mean_squared_error(y_true=y_valid, y_pred=model.predict(X_valid))
print("XGB train score: ", score)

XGB train score:  0.0931772228420013


##### 9.2 模型学习曲线

In [None]:
# Learning curve of XGBoost on the PCA-reduced training split.
X = X_train_pca
y = y_train_pca
title = "Learning Curves (XGB)"
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
estimator = XGBRegressor(n_estimators=300, max_depth=15, subsample=0.8, colsample_bytree=0.8, learning_rate=0.05,
                         gamma=0,
                         reg_alpha=0, reg_lambda=0, verbosity=1)
plot_learn_curve(estimator, title, X, y, cv=cv)
# Explicit ".png" like every other figure in this file, so the output format
# does not depend on matplotlib's `savefig.format` setting.
plt.savefig("./Xgboost降维数据学习曲线.png", dpi=300, bbox_inches='tight')

In [None]:
# Learning curve of XGBoost on the full-feature training split.
X = X_train
y = y_train
title = "Learning Curves (XGB)"
# 100 random shuffles, 25% held out in each, fixed seed.
cv = ShuffleSplit(n_splits=100, test_size=0.25, random_state=0)
estimator = XGBRegressor(n_estimators=300, max_depth=15, subsample=0.8, colsample_bytree=0.8, learning_rate=0.05,
                         gamma=0,
                         reg_alpha=0, reg_lambda=0, verbosity=1)
plot_learn_curve(estimator, title, X, y, cv=cv)
# Explicit ".png" like every other figure in this file, so the output format
# does not depend on matplotlib's `savefig.format` setting.
plt.savefig("./Xgboost未降维数据学习曲线.png", dpi=300, bbox_inches='tight')

##### 9.3 模型预测

In [11]:
# Refit XGBoost on the complete PCA-reduced training set and save its
# predictions for the PCA-reduced test set.
model = XGBRegressor(
    n_estimators=300,
    max_depth=15,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.05,
    gamma=0,
    reg_alpha=0,
    reg_lambda=0,
    verbosity=1,
).fit(train_data_pca, target_data_pca)
y_pred = model.predict(test_data_pca)
np.savetxt(fname="./Xgboost回归模型预测（降维数据）.csv", X=y_pred, delimiter=",")

In [12]:
# Refit XGBoost on every labelled row of the full-feature data and save its
# predictions for the test rows.
model = XGBRegressor(
    n_estimators=300,
    max_depth=15,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.05,
    gamma=0,
    reg_alpha=0,
    reg_lambda=0,
    verbosity=1,
).fit(train_data.drop(columns='target'), train_data['target'])
y_pred = model.predict(test_data)
np.savetxt(fname="./Xgboost回归模型预测（未降维数据）.csv", X=y_pred, delimiter=",")