In [153]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import norm,skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Comment this out if the data visualisations don't work on your side
%matplotlib inline


In [154]:
"""数据集加载"""
# Locations of the cleaned training / test CSV files.
train_file_path = "../数据/清理后的数据/clean_train.csv"
test_file_path = "../数据/清理后的数据/clean_test.csv"

# Read both splits (training set first), then report the shape of each one.
train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)
for message, frame in (("完整训练集大小是 {}", train), ("完整测试集大小是 {}", test)):
    print(message.format(frame.shape))


完整训练集大小是 (3253, 7)
完整测试集大小是 (120, 6)


In [155]:
# Remember how many rows belong to each split so the combined frame can be
# cut apart again later.
ntrain, ntest = train.shape[0], test.shape[0]

# Keep the regression target (SoH) aside as a plain array.
y_train = train['SoH'].values

# Stack train on top of test with a fresh 0..n-1 index and remove the target
# column from the combined feature frame.
dataset_df = pd.concat([train, test], ignore_index=True).drop(columns=['SoH'])
# dataset_df.drop(['CS_Name'], axis=1, inplace=True)
print("all_data size is : {}".format(dataset_df.shape))
dataset_df.head(1000)

all_data size is : (3373, 6)


Unnamed: 0,cycle,capacity,resistance,CCCT,CVCT,CS_Name
0,1.0,1.126385,0.094009,6613.059052,2251.498033,CS2_35
1,2.0,1.126160,0.091661,6612.402800,2231.967052,CS2_35
2,3.0,1.125966,0.094649,6608.560673,2228.216959,CS2_35
3,4.0,1.118508,0.091413,6604.732222,2247.561061,CS2_35
4,5.0,1.117210,0.091413,6629.211049,2077.692393,CS2_35
...,...,...,...,...,...,...
995,385.0,0.969525,0.093080,5500.193208,2503.694372,CS2_36
996,386.0,0.978220,0.088546,5475.849651,2541.521603,CS2_36
997,387.0,0.978875,0.087654,5547.057277,2366.262200,CS2_36
998,388.0,0.969841,0.090171,5607.496941,2322.231989,CS2_36


In [156]:

"""新增特征"""

# Candidate feature (currently disabled): total charging time, i.e. the sum of
# the constant-current (CCCT) and constant-voltage (CVCT) charging times.
# dataset_df['Total_time'] = (dataset_df['CCCT'] + dataset_df['CVCT'])


'新增特征'

In [157]:
"""对CS_name进行独热编码"""
# One-hot encode the categorical columns of the combined frame (CS_Name here),
# then preview the first three rows of the encoded result.
dataset_df = pd.get_dummies(data=dataset_df)
dataset_df.head(n=3)

Unnamed: 0,cycle,capacity,resistance,CCCT,CVCT,CS_Name_CS2_35,CS_Name_CS2_36,CS_Name_CS2_37,CS_Name_CS2_38
0,1.0,1.126385,0.094009,6613.059052,2251.498033,True,False,False,False
1,2.0,1.12616,0.091661,6612.4028,2231.967052,True,False,False,False
2,3.0,1.125966,0.094649,6608.560673,2228.216959,True,False,False,False


In [158]:
# """归一化"""
# def min_max_normalization(data):
#     min_val = np.min(data)
#     max_val = np.max(data)
#     return (data - min_val) / (max_val - min_val)
#
# # 示例
# dataset_df['CCCT'] = min_max_normalization(dataset_df['CCCT'])
# dataset_df['CVCT'] = min_max_normalization(dataset_df['CVCT'])
# dataset_df['Total_time'] = min_max_normalization(dataset_df['Total_time'])
# print(dataset_df)

In [159]:
"""模型构建与评估"""

"""训练测试集分隔"""
# Cut the combined, encoded frame back into its two partitions by position:
# the first `ntrain` rows are the training rows, the remainder the test rows.
clean_test = dataset_df.iloc[ntrain:]

# Re-attach the target as the last column of the training partition.
clean_train = dataset_df.iloc[:ntrain].assign(SoH=y_train)
clean_train.head(1000)

Unnamed: 0,cycle,capacity,resistance,CCCT,CVCT,CS_Name_CS2_35,CS_Name_CS2_36,CS_Name_CS2_37,CS_Name_CS2_38,SoH
0,1.0,1.126385,0.094009,6613.059052,2251.498033,True,False,False,False,0.825175
1,2.0,1.126160,0.091661,6612.402800,2231.967052,True,False,False,False,0.815965
2,3.0,1.125966,0.094649,6608.560673,2228.216959,True,False,False,False,0.815977
3,4.0,1.118508,0.091413,6604.732222,2247.561061,True,False,False,False,0.825194
4,5.0,1.117210,0.091413,6629.211049,2077.692393,True,False,False,False,0.806900
...,...,...,...,...,...,...,...,...,...,...
995,385.0,0.969525,0.093080,5500.193208,2503.694372,False,True,False,False,0.733665
996,386.0,0.978220,0.088546,5475.849651,2541.521603,False,True,False,False,0.715319
997,387.0,0.978875,0.087654,5547.057277,2366.262200,False,True,False,False,0.724505
998,388.0,0.969841,0.090171,5607.496941,2322.231989,False,True,False,False,0.715312


In [160]:
"""模型训练与预测"""
# 定义评价指标
def rmse_cv(model):
    """Return the per-fold cross-validated RMSE of ``model``.

    The estimator is scored on the module-level ``X`` / ``y`` with
    ``cross_val_score`` using ``cv=5`` and the ``neg_mean_squared_error``
    scorer; the negated MSE values are flipped back and square-rooted.

    NOTE: this uses ``cv=5`` directly, not the shuffled ``kf`` splitter
    defined elsewhere in the notebook.

    Parameters
    ----------
    model : estimator
        Any object accepted by ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold.
    """
    neg_mse_per_fold = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
    return np.sqrt(-neg_mse_per_fold)

# Separate target and features, then carve out a 30% hold-out split.
y = clean_train['SoH']
X = clean_train.drop(columns='SoH')
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=10)

# Shuffled 8-fold splitter shared by RidgeCV and the stacking step.
kf = KFold(n_splits=8, shuffle=True, random_state=50)

warnings.filterwarnings('ignore')

# Baseline estimators (linear / kernel models get a robust scaler in front).
lgb = LGBMRegressor(objective='regression', random_state=50)
xgb = XGBRegressor(objective='reg:squarederror', random_state=50)
ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kf))
svr = make_pipeline(RobustScaler(), SVR())
gbr = GradientBoostingRegressor(random_state=50)
rf = RandomForestRegressor(random_state=50)

# Only these four baselines are evaluated below; names and models are
# parallel lists that are also reused by the stacking step.
model_names = ['lgb','xgb','ridge','svr']
models = [lgb, xgb, ridge, svr]

# Cross-validate every baseline and collect (mean, std) of its fold RMSEs.
scores = {}
for name, model in zip(model_names, models):
    score = rmse_cv(model)
    mean_rmse, std_rmse = score.mean(), score.std()
    print('{} rmse score: {:.4f}, rmse std: {:.4f}'.format(name, mean_rmse, std_rmse))
    scores[name] = (mean_rmse, std_rmse)

# One column per model, ordered left-to-right by ascending mean RMSE.
rmse_df = pd.DataFrame(scores, index=['rmse_score','rmse_std']).sort_values('rmse_score', axis=1)
rmse_df



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 2602, number of used features: 9
[LightGBM] [Info] Start training from score 0.655701
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 2602, number of used features: 9
[LightGBM] [Info] Start training from score 0.659202
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 2602, number of used features: 9
[LightGBM] [Info] Start training 

Unnamed: 0,ridge,xgb,lgb,svr
rmse_score,0.025468,0.038279,0.039293,0.097092
rmse_std,0.020327,0.051887,0.054035,0.082041


In [161]:
"""模型Stacking"""
class StackingRegressor(object):
    """Two-level stacking ensemble.

    Each first-level model is fitted once per cross-validation fold. Its
    predictions on the held-out part of every fold form one out-of-fold
    feature column for the second-level model, and its predictions on the
    test set are averaged over the folds to form the matching test feature.
    """

    def __init__(self, fir_models, fir_model_names, sec_model, cv):
        """Store the ensemble configuration.

        Parameters
        ----------
        fir_models : list
            First-level estimators exposing ``fit(X, y)`` and ``predict(X)``.
        fir_model_names : list of str
            One name per first-level model; used as stacked column names.
        sec_model : estimator
            Second-level estimator trained on the stacked predictions.
        cv : splitter
            Object whose ``split(X)`` yields ``(train_index, valid_index)``
            pairs of positional indices (e.g. a ``KFold`` instance).
        """
        # First-level base models
        self.fir_models = fir_models
        self.fir_model_names = fir_model_names
        # Second-level model that produces the final prediction
        self.sec_model = sec_model
        # Cross-validation splitter
        self.cv = cv

    def fit_predict(self, X, y, test):
        """Fit both levels and return the stacked prediction for ``test``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features (rows are selected positionally via ``iloc``).
        y : pandas.Series
            Training target aligned row-by-row with ``X``; it is not modified.
        test : pandas.DataFrame
            Features to predict on.

        Returns
        -------
        array-like
            Output of ``sec_model.predict`` on the fold-averaged first-level
            test predictions.

        Raises
        ------
        ValueError
            If the model and name lists differ in length, or if ``cv.split``
            yields no folds.
        """
        if len(self.fir_models) != len(self.fir_model_names):
            raise ValueError('fir_models and fir_model_names must have the same length.')

        n_train = X.shape[0]
        oof_columns = {}    # model name -> out-of-fold predictions on X
        test_columns = {}   # model name -> fold-averaged predictions on test

        for name, model in zip(self.fir_model_names, self.fir_models):
            oof_pred = np.zeros(shape=(n_train, ))
            fold_test_preds = []

            for train_index, valid_index in self.cv.split(X):
                # Positional split of the training data for this fold
                X_train, y_train = X.iloc[train_index, :], y.iloc[train_index]
                X_valid = X.iloc[valid_index, :]

                # Fit on the fold's training part, predict its validation part
                # and the full test set
                model.fit(X_train, y_train)
                oof_pred[valid_index] = np.ravel(model.predict(X_valid))
                fold_test_preds.append(np.ravel(model.predict(test)).astype(float))

            if not fold_test_preds:
                raise ValueError('cv.split(X) produced no folds.')

            oof_columns[name] = oof_pred
            # Average over however many folds the splitter produced, so the
            # result does not depend on a hard-coded fold count.
            test_columns[name] = np.mean(fold_test_preds, axis=0)
            print('{} is done.'.format(name))

        stacked_train = pd.DataFrame(oof_columns)
        stacked_test = pd.DataFrame(test_columns)

        # Attach the true target; use a re-indexed copy so that the caller's
        # Series keeps its original index.
        stacked_train['y_true'] = y.reset_index(drop=True)

        # Show the stacked feature frames
        print('----stacked_train----\n', stacked_train)
        print('----stacked_test----\n', stacked_test)

        # Train the second-level model on the out-of-fold predictions and use
        # it to predict from the averaged test predictions
        self.sec_model.fit(stacked_train.drop(columns='y_true'), stacked_train['y_true'])
        y_pred = self.sec_model.predict(stacked_test)
        return y_pred

# Give the second level its own estimator instance. `ridge` is already one of
# the first-level models in `models`; reusing the same object as the meta
# model would leave that shared pipeline fitted on the stacked prediction
# columns after this call instead of on the original features.
meta_ridge = make_pipeline(RobustScaler(), RidgeCV(cv=kf))
sr = StackingRegressor(models, model_names, meta_ridge, kf)
stacking_pred = sr.fit_predict(Xtrain, ytrain, Xtest)

def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``.

    Computed as the square root of ``mean_squared_error(y, y_pred)``.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

# RMSE of the stacked predictions against the held-out targets (ytest).
stacking_score = rmse(y=ytest, y_pred=stacking_pred)
print(stacking_score)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000084 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 1992, number of used features: 9
[LightGBM] [Info] Start training from score 0.669763
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000089 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 1992, number of used features: 9
[LightGBM] [Info] Start training from score 0.671486
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1283
[LightGBM] [Info] Number of data points in the train set: 1992, number of used features: 9
[LightGBM] [Info] Start training 

In [162]:
# """结果注册"""
# sr = StackingRegressor(models, model_names, ridge, kf)
# sample_submission_df = pd.read_csv('../数据/清理后的数据/submission_example.csv')
# sample_submission_df['result'] = sr.fit_predict(X, y, clean_test)
# sample_submission_df.to_csv('../结果/submission_3.csv', index=False)
# sample_submission_df.head()