# 1.问题分析

## 1.1.问题类别

## 1.2.特征分析 

## 1.3.数据挖掘

# 2.建立基线

## 2.1.准备工具包

In [1]:
import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import xgboost as xgb
from lightgbm import LGBMRegressor
import math


from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import StratifiedKFold


%matplotlib inline

  from numpy.core.umath_tests import inner1d


## 2.2.数据准备(不进行清洗)

In [23]:
train_raw = pd.read_csv('../data/public_raw.train.csv')
test_raw = pd.read_csv('../data/public_raw.test.csv')

train_raw['is_train']=1
test_raw['is_train']=0

df = pd.concat([train_raw, test_raw],sort=False)

rep_cols = {'ID':'ID', 
 '板温':'board_t', 
 '现场温度':'env_t', 
 '光照强度':'light_strength', 
 '转换效率':'efficiency', 
 '转换效率A':'efficiency_A', 
 '转换效率B':'efficiency_B', 
 '转换效率C':'efficiency_C', 
 '电压A':'V_A',
 '电压B':'V_B', 
 '电压C':'V_C', 
 '电流A':'I_A', 
 '电流B':'I_B', 
 '电流C':'I_C', 
 '功率A':'P_A', 
 '功率B':'P_B', 
 '功率C':'P_C', 
 '平均功率':'P_avg', 
 '风速':'wind_speed',
 '风向':'wind_direction', 
 '发电量':'y'
}

df.rename(index=str, columns=rep_cols, inplace=True)

df.sort_values(by=['ID'],ascending=True, inplace=True)

df.reset_index(drop=True, inplace=True)

## 2.3.特征工程(完全使用原始特征)

## 2.4.训练

### 2.4.1.准备工作

In [24]:
# CV时不方便shuffle，因此准备训练集和测试集时shuffle
df = df.sample(frac=1).reset_index(drop=True)

# 准备训练集和测试集
train = df[df['is_train']==1]
test = df[df['is_train']==0]
print('训练集数量:'+str(len(train)))
print('测试集数量:'+str(len(test)))

# 准备训练集合输入矩阵和输出矩阵
train_X = train.drop(['y','is_train','I_B','I_C'],axis=1)
train_Y = train['y']

# 准备测试集合输入矩阵
test_X = test.drop(['y','is_train','I_B','I_C'],axis=1)

# 准备测试集合输出矩阵容器
ans=pd.DataFrame()
ans['ID']=test_X['ID']


训练集数量:9000
测试集数量:8409


In [25]:
# 定义打分函数,  SCORE = 1/(1+RMSE)
def cal_score(mse):
    if isinstance(mse, float):
        return 1 / (1 + math.sqrt(mse))
    else:
        return np.divide(1, 1 + np.sqrt(mse))

# def cal_score(mse):
#     return np.divide(1, 1 + np.sqrt(mse))


# 定义交叉验证函数  
def cross_validate(models, X, Y, cv=5):
    model_name, mse_avg, score_avg = [], [], []
    for i, model in enumerate(models):
        #获取模型名
        name = str(i + 1) + '.' + str(model) 
#         print(i + 1,'- Model:', str(model).split('(')[0])
        print(name)
#         model_name.append(str(i + 1) + '.' + str(model).split('(')[0])
        model_name.append(name.split('(')[0])
        #计算metric
#         strat_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
#         folds=strat_k_fold.split(X,Y)
        #apply shuffling to cross_val_score
#         strat_k_fold = StratifiedKFold(n_splits=cv, shuffle=True, random_state=0)
        nmse = cross_val_score(model, X, Y, cv=cv, scoring='neg_mean_squared_error')
    
#         nmse = cross_val_score(model, X, Y, cv=cv, scoring='neg_mean_squared_error')
        avg_mse = np.average(-nmse)
        mse_avg.append(avg_mse)
        #计算分数
        scores = cal_score(-nmse)
        avg_score = np.average(scores)    
        score_avg.append(avg_score)
        print('MSE:', -nmse)
        print('Score:', scores)
        print('Average MSE:', avg_mse, ' - Score:', avg_score, '\n')
    res = pd.DataFrame()
    res['Model'] = model_name
    res['Avg MSE'] = mse_avg
    res['Avg Score'] = score_avg
    return res

### 2.4.2.Cross Validation（LightGBM/XGBoost/RF/GBM）

In [26]:
#基学习器

xgbt1 = xgb.XGBRegressor(n_estimators=950, max_depth=3, max_features='sqrt', random_state=321, n_jobs=8)
xgbt2 = xgb.XGBRegressor(n_estimators=1000, max_depth=3, max_features='sqrt', random_state=456, n_jobs=8)
xgbt3 = xgb.XGBRegressor(n_estimators=1100, max_depth=3, max_features='sqrt', random_state=789, n_jobs=8)
# n_estimators=1000  max_depth=5  'sqrt'  GradientBoostingRegressor 最佳参数 ,learning_rate=0.08
gbdt1 = GradientBoostingRegressor(n_estimators=800, max_depth=4, max_features='log2', random_state=123,learning_rate=0.08)
gbdt2 = GradientBoostingRegressor(n_estimators=900, max_depth=4, max_features='log2', random_state=456,learning_rate=0.08)
gbdt3 = GradientBoostingRegressor(n_estimators=1000, max_depth=5, max_features='log2', random_state=789,learning_rate=0.08)
# n_estimators=700, max_features='auto', random_state=2, n_jobs=8,max_depth=10
forest1 = RandomForestRegressor(n_estimators=800, max_features='sqrt', random_state=7, n_jobs=8)
forest2 = RandomForestRegressor(n_estimators=900, max_features='log2', random_state=9, n_jobs=8)
forest3 = RandomForestRegressor(n_estimators=900, max_features='sqrt', random_state=11, n_jobs=8) 

lgb1 = LGBMRegressor(n_estimators=900, max_depth=5, random_state=5, n_jobs=8) 
lgb2 = LGBMRegressor(n_estimators=850, max_depth=4, random_state=7, n_jobs=8)
lgb3 = LGBMRegressor(n_estimators=720, max_depth=4, random_state=9, n_jobs=8)


In [28]:
regrs = [
    xgbt1, gbdt1, forest1, lgb1,
    xgbt2, gbdt2, forest2, lgb2,
    xgbt3, gbdt3, forest3, lgb3
]

In [27]:
cross_validate(
    models=[    
        xgbt3, gbdt3, forest3, lgb3
    ],
    X = train_X, 
    Y = train_Y
)

1.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, max_features='sqrt', min_child_weight=1, missing=None,
       n_estimators=1100, n_jobs=8, nthread=None, objective='reg:linear',
       random_state=789, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
MSE: [0.02068348 0.02547938 0.08352001 0.03153837 0.01621974]
Score: [0.87426533 0.86234952 0.77579622 0.84919166 0.88703067]
Average MSE: 0.035488194758418856  - Score: 0.8497266809807673 

2.GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.08, loss='ls', max_depth=5,
             max_features='log2', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=1000,
             presort='auto', ran

Unnamed: 0,Model,Avg MSE,Avg Score
0,1.XGBRegressor,0.035488,0.849727
1,2.GradientBoostingRegressor,0.029746,0.862041
2,3.RandomForestRegressor,0.034705,0.852303
3,4.LGBMRegressor,0.031874,0.85756


### 2.4.3.使用LightGBM/XGBoost/RF/GBM多种模型训练后融合

In [4]:
class Stacker(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
    
    # Train_X: 原始训练集输入矩阵, Train_Y: 原始训练集输出矩阵, Test_X: 原始测试集输入矩阵
    def fit_predict(self, Train_X, Train_Y, Test_X):
        Train_X = np.array(Train_X)
        Train_Y = np.array(Train_Y)
        Test_X = np.array(Test_X)

        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2018).split(Train_X, Train_Y))       
        
        # 以基学习器预测结果为特征的 stacker训练数据 与 stacker预测数据
        # 原始训练集预测结果容器
        S_train = np.zeros((Train_X.shape[0], len(self.base_models)))
        # 原始测试集预测结果容器
        S_predict = np.zeros((Test_X.shape[0], len(self.base_models)))
        
        for n_model, regr in enumerate(self.base_models):
            print(n_model + 1, 'Base model:', str(regr).split('(')[0])
            S_predict_i = np.zeros((Test_X.shape[0], self.n_splits))
            
            for n_fold, (train_idx, test_idx) in enumerate(folds):
                # 将X分为训练集与测试集
                X_train_fold, Y_train_fold, X_test_fold, Y_test_fold = Train_X[train_idx], Train_Y[train_idx], Train_X[test_idx], Train_Y[test_idx]
                print ('Fit fold', (n_fold+1), '...')
                regr.fit(X_train_fold, Y_train_fold)
                Y_pred = regr.predict(X_test_fold)
                # 每折训练得到的模型根据原始训练集中的测试折的输入矩阵预测
                S_train[test_idx, n_model] = Y_pred
                # 每折训练得到的模型根据原始测试集输入矩阵预测
                S_predict_i[:, n_fold] = regr.predict(Test_X)
            
            S_predict[:, n_model] = S_predict_i.mean(axis=1)

        nmse_score = cross_val_score(self.stacker, S_train, Train_Y, cv=5, scoring='neg_mean_squared_error')
        print('CV MSE:', -nmse_score)
        print('Stacker AVG MSE:', -nmse_score.mean(), 'Stacker AVG Score:', np.mean(np.divide(1, 1 + np.sqrt(-nmse_score))))

        self.stacker.fit(S_train, Train_Y)
        res = self.stacker.predict(S_predict)
        return res, S_train, S_predict

In [130]:
stacking_model = SVR(C=100, gamma=0.01, epsilon=0.01)
stacker = Stacker(5, stacking_model, regrs)
pred_stack, S_train_data, S_predict_data = stacker.fit_predict(train_X, train_Y, test_X)

1 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
2 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
3 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
4 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
5 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
6 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
7 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
8 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
9 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
10 Base model: GradientBoostingRegre

In [None]:
#local CV: 0.7909689115097466

ans['y_n_shuffle'] = pred_stack
ans['y_n_shuffle'].describe()

In [115]:
#local CV: 0.8697213790417854

ans['y_shuffle'] = pred_stack
ans['y_shuffle'].describe()

count    8409.000000
mean        5.695812
std         3.459761
min        -0.131349
25%         2.505766
50%         5.704603
75%         8.887224
max        12.348426
Name: y_shuffle, dtype: float64

# 3.增加数据清洗工序

## 3.1.去重

In [5]:
# 准备训练集和测试集
train = df[df['is_train']==1]
test = df[df['is_train']==0]
print('训练集数量:'+str(len(train)))
print('测试集数量:'+str(len(test)))

# 对训练集进行去重
train.drop_duplicates(train.columns.drop('ID'), keep='first', inplace=True)
print('去重后训练集数量:'+str(len(train)))

# 准备训练集合输入矩阵和输出矩阵
train_X = train.drop(['y','is_train','I_B','I_C'],axis=1)
train_Y = train['y']

# 准备测试集合输入矩阵
test_X = test.drop(['y','is_train','I_B','I_C'],axis=1)

# 准备测试集合输出矩阵容器
ans=pd.DataFrame()
ans['ID']=test_X['ID']

训练集数量:9000
测试集数量:8409
去重后训练集数量:8918


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [10]:
stacking_model = SVR(C=100, gamma=0.01, epsilon=0.01)
stacker = Stacker(5, stacking_model, regrs)
pred_stack, S_train_data, S_predict_data = stacker.fit_predict(train_X, train_Y, test_X)

1 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
2 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
3 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
4 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
5 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
6 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
7 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
8 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
9 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
10 Base model: GradientBoostingRegre

In [11]:
#local CV: 0.8710764504503057

ans['y_drop_duplicate'] = pred_stack
ans['y_drop_duplicate'].describe()

count    8409.000000
mean        5.696612
std         3.457480
min        -0.117571
25%         2.510122
50%         5.704925
75%         8.886383
max        12.132167
Name: y_drop_duplicate, dtype: float64

## 3.2.异常值处理:增加异常修正值特征(异常通过离群程度定义)

In [5]:
rolling_mask_two = [-i for i in range(1,2)]+[i for i in range(1,2)]
rolling_mask_four = [-i for i in range(1,3)]+[i for i in range(1,3)]
rolling_mask_six = [-i for i in range(1,4)]+[i for i in range(1,4)]
rolling_mask_eight = [-i for i in range(1,5)]+[i for i in range(1,5)]
rolling_mask_ten = [-i for i in range(1,6)]+[i for i in range(1,6)]

In [6]:
# 由于训练集中存在重复数据:1)连续重复数据 2)电学测量值为0的重复数据

# 对于训练集,重复项全部去除
train_raw = pd.read_csv('../data/public_raw.train.csv')
print('去重前训练集条数:' +str(train_raw.shape[0]))
train_raw.drop_duplicates(train_raw.columns.drop('ID'), keep='first', inplace=True)
print('去重后训练集条数:' +str(train_raw.shape[0]))

# 对于测试集,仅仅通过ID去除电学测量值为0的重复数据
test_raw = pd.read_csv('../data/public_raw.test.csv')

# 在去重前准备完整测试集ID
ans=pd.DataFrame()
ans['ID']=test_raw['ID']

# 电学测量量全为0的异常点个数
zero_sample_ID = test_raw[test_raw[(test_raw == 0) | (test_raw == 0.)].count(axis=1) > 13]['ID'].tolist()
print('测试集中电学测量量全为0的异常点个数为: '+str(len(zero_sample_ID)))

# 根据ID全部去除
print('去重前测试集条数:' +str(test_raw.shape[0]))
test_raw = test_raw[~test_raw['ID'].isin(zero_sample_ID)].reset_index(drop=True)
print('去重后测试集条数:' +str(test_raw.shape[0]))




train_raw['is_train']=1
test_raw['is_train']=0

df = pd.concat([train_raw, test_raw],sort=False)

rep_cols = {'ID':'ID', 
 '板温':'board_t', 
 '现场温度':'env_t', 
 '光照强度':'light_strength', 
 '转换效率':'efficiency', 
 '转换效率A':'efficiency_A', 
 '转换效率B':'efficiency_B', 
 '转换效率C':'efficiency_C', 
 '电压A':'V_A',
 '电压B':'V_B', 
 '电压C':'V_C', 
 '电流A':'I_A', 
 '电流B':'I_B', 
 '电流C':'I_C', 
 '功率A':'P_A', 
 '功率B':'P_B', 
 '功率C':'P_C', 
 '平均功率':'P_avg', 
 '风速':'wind_speed',
 '风向':'wind_direction', 
 '发电量':'y'
}

df.rename(index=str, columns=rep_cols, inplace=True)

df.sort_values(by=['ID'],ascending=True, inplace=True)

df.reset_index(drop=True, inplace=True)









去重前训练集条数:9000
去重后训练集条数:8918
测试集中电学测量量全为0的异常点个数为: 46
去重前测试集条数:8409
去重后测试集条数:8363


In [59]:
# #非零连续重复记录
# df_fd = df.copy()
# df_fd['is_duplicate']=df_fd.duplicated(df_fd.columns.drop('ID'))

# df_fd[(df_fd['is_duplicate']==True) & (df_fd['V_A']!=0)]


In [7]:
#计算偏差率的辅助列
# for c in ['I_A','I_B','I_C','V_A','V_B','V_C']:
for c in ['I_A','I_B','I_C']:
    df[c+'_avg_sequence'] = np.nanmean([df[c].shift(i) for i in rolling_mask_eight],axis=0)
    df[c+'_exception_ratio'] = np.abs(df[c]-df[c+'_avg_sequence'])/df[c+'_avg_sequence']
    df[c+'_cor'] = df[c]
    
    #out of range
    oor_index = df[df[c]>20].index
    print(oor_index)
    
    outlier_index = df[df[c+'_exception_ratio']>1.6].index
    print(outlier_index)
    
    ab_index = pd.Int64Index(set(list(oor_index)+list(outlier_index)))
    print(ab_index)
    
    ab_data = df.loc[ab_index].sort_values(by='ID', ascending=True)
    
    # 上下记录均值替代异常值
    for idx, line in ab_data.iterrows():
        ID = line['ID']
        value = line[c]
        
        index = df[df['ID'] == ID].index
            
        before_offset = 1
        while (idx - before_offset)in ab_index:
            before_offset += 1

        after_offset = 1
        while (idx + after_offset) in ab_index:
            after_offset += 1
    
        print('ID :' + str(ID))
        print(value, 'is abnormal as value of ',c)
        replace_value = (df.loc[index - before_offset, c].values + df.loc[index + after_offset, c].values) / 2
        df.loc[index, c+'_cor'] = replace_value[0]
        print('Has been replaced by '+str(replace_value[0]))

        
        
        
        
        
for c in ['V_A','V_B','V_C']:
    df[c+'_avg_sequence'] = np.nanmean([df[c].shift(i) for i in rolling_mask_eight],axis=0)
    df[c+'_exception_ratio'] = np.abs(df[c]-df[c+'_avg_sequence'])/df[c+'_avg_sequence']
    df[c+'_cor'] = df[c]
    
    #out of range
    oor_index = df[(df[c]>800)|(df[c]<500)].index
    print(oor_index)
    
    outlier_index = df[df[c+'_exception_ratio']>1.6].index
    print(outlier_index)
    
    ab_index = pd.Int64Index(set(list(oor_index)+list(outlier_index)))
    print(ab_index)
    
    ab_data = df.loc[ab_index].sort_values(by='ID', ascending=True)
    
    # 上下记录均值替代异常值
    for idx, line in ab_data.iterrows():
        ID = line['ID']
        value = line[c]
        
        index = df[df['ID'] == ID].index
            
        before_offset = 1
        while (idx - before_offset)in ab_index:
            before_offset += 1

        after_offset = 1
        while (idx + after_offset) in ab_index:
            after_offset += 1
    
        print('ID :' + str(ID))
        print(value, 'is abnormal as value of ',c)
        replace_value = (df.loc[index - before_offset, c].values + df.loc[index + after_offset, c].values) / 2
        df.loc[index, c+'_cor'] = replace_value[0]
        print('Has been replaced by '+str(replace_value[0]))

        
        

        
df['P_A_cor']=df['I_A_cor']*df['V_A_cor']
df['P_B_cor']=df['I_B_cor']*df['V_B_cor']
df['P_C_cor']=df['I_C_cor']*df['V_C_cor']
df['P_avg_cor']=1/3*(df['P_A_cor']+df['P_B_cor']+df['P_C_cor'])


df.drop(columns=['I_A','I_B','I_C','V_A','V_B','V_C','P_A','P_B','P_C','P_avg'],axis=1,inplace=True)
# df.drop(columns=['I_A_avg_sequence','I_A_exception_ratio','I_B_avg_sequence','I_B_exception_ratio','I_C_avg_sequence','I_C_exception_ratio','V_B_avg_sequence','V_B_exception_ratio','V_A_avg_sequence','V_A_exception_ratio','V_C_avg_sequence','V_C_exception_ratio',],axis=1,inplace=True)
# 0.85661

df.head(20)
    
    


    
    
    
    
    
    
    
    
    

# #异常修正值由滚动平均值替代（前后各4个点的平均值）
# for c in ['I_A','I_B','I_C','V_A','V_B','V_C']:

    
# for idx, line in df.iterrows():
#     for c in ['I_A','I_B','I_C','V_A','V_B','V_C']:
#         if line[c+'_exception_ratio']>1.6:
#             print(str(line[c]) + ' is abnormal as value of ' + c)
#             line.loc[c+'_corrected'] = line[c+'_avg_sequence']
#             print('Has been replaced by '+str(line[c+'_avg_sequence'])) 

            

Int64Index([], dtype='int64')
Int64Index([  522,   524,   526,   668,   852,  1362,  1549,  1709,  2056,
             3085,  3230,  3232,  3279,  3281,  3287,  3292,  3294,  3405,
             5037, 16292],
           dtype='int64')
Int64Index([ 3232, 16292,  2056,   522,   524,  1549,   526,  1709,  3085,
             3279,  1362,  3281,   852,  3292,  3405,  3287,   668,  3294,
             3230,  5037],
           dtype='int64')
ID :591.0
6.68 is abnormal as value of  I_A
Has been replaced by 0.9400000000000001
ID :593.0
6.68 is abnormal as value of  I_A
Has been replaced by 1.56
ID :595.0
6.77 is abnormal as value of  I_A
Has been replaced by 0.9650000000000001
ID :737.0
6.53 is abnormal as value of  I_A
Has been replaced by 0.345
ID :948.0
6.87 is abnormal as value of  I_A
Has been replaced by 0.33
ID :1519.0
7.04 is abnormal as value of  I_A
Has been replaced by 0.77
ID :1717.0
6.81 is abnormal as value of  I_A
Has been replaced by 0.33999999999999997
ID :1894.0
7.1 is abnormal a

Has been replaced by 694.0
ID :3607.0
65420.0 is abnormal as value of  V_A
Has been replaced by 695.0
ID :3610.0
36.0 is abnormal as value of  V_A
Has been replaced by 708.5
ID :3612.0
37.0 is abnormal as value of  V_A
Has been replaced by 704.0
ID :3723.0
39.0 is abnormal as value of  V_A
Has been replaced by 718.5
ID :5521.0
36.0 is abnormal as value of  V_A
Has been replaced by 689.5
ID :7437.0
807.0 is abnormal as value of  V_A
Has been replaced by 646.5
Int64Index([   13,   169,   495,   522,   523,   524,   525,   526,   668,
              852,   974,  1057,  1058,  1059,  1170,  1228,  1317,  1362,
             1408,  1509,  1549,  1709,  1936,  2022,  2056,  2199,  2348,
             2437,  2550,  2611,  2652,  2722,  2870,  3085,  3113,  3230,
             3231,  3232,  3279,  3281,  3285,  3287,  3289,  3292,  3293,
             3294,  3405,  4040,  5037,  5437,  5514,  6912,  7214, 14107,
            15715, 15852],
           dtype='int64')
Int64Index([ 522,  523,  524,  525

Has been replaced by 671.0
ID :13189.0
10.0 is abnormal as value of  V_C
Has been replaced by 619.0
ID :14682.0
316.0 is abnormal as value of  V_C
Has been replaced by 655.0
ID :16437.0
26.0 is abnormal as value of  V_C
Has been replaced by 594.5


Unnamed: 0,ID,board_t,env_t,light_strength,efficiency,efficiency_A,efficiency_B,efficiency_C,wind_speed,wind_direction,...,V_B_avg_sequence,V_B_exception_ratio,V_B_cor,V_C_avg_sequence,V_C_exception_ratio,V_C_cor,P_A_cor,P_B_cor,P_C_cor,P_avg_cor
0,9,-19.33,-17.5,13,198.32,259.11,42.17,293.66,0.3,273,...,715.75,0.015019,705.0,725.0,0.005517,721.0,909.72,148.05,1031.03,696.266667
1,10,-19.14,-17.4,34,80.55,106.32,16.98,118.36,0.6,272,...,717.2,0.011433,709.0,724.4,0.000828,725.0,976.86,155.98,1087.5,740.113333
2,11,-18.73,-17.3,30,99.9,139.0,21.2,139.51,0.8,275,...,717.833333,0.001161,717.0,724.833333,0.00161,726.0,1128.4,172.08,1132.56,811.013333
3,12,-17.54,-17.0,41,82.48,114.86,14.91,117.66,1.1,283,...,718.571429,0.004771,722.0,725.571429,0.007679,720.0,1279.25,166.06,1310.4,918.57
4,13,-16.68,-16.6,50,73.59,97.95,14.7,108.12,0.9,277,...,720.0,0.006944,715.0,724.25,0.006559,729.0,1334.07,200.2,1472.58,1002.283333
5,14,-15.43,-16.6,53,73.98,101.72,15.55,104.67,0.9,280,...,721.5,0.007623,727.0,725.375,0.000862,726.0,1474.6,225.37,1517.34,1072.436667
6,15,-14.6,-16.3,65,64.62,86.86,13.09,93.92,1.1,280,...,723.375,0.007776,729.0,725.125,0.003965,728.0,1548.51,233.28,1674.4,1152.063333
7,16,-14.1,-16.2,76,61.97,77.59,25.8,82.53,0.9,278,...,723.875,0.005698,728.0,725.0,0.001379,724.0,1619.93,538.72,1723.12,1293.923333
8,17,-13.27,-16.2,83,75.36,73.55,73.36,79.16,0.7,280,...,724.625,0.002243,723.0,725.125,0.001551,724.0,1681.68,1677.36,1810.0,1723.013333
9,18,-12.41,-16.2,86,76.06,75.89,73.95,78.34,1.0,279,...,635.375,0.147354,729.0,647.0,0.123648,727.0,1802.96,1756.89,1861.12,1806.99


In [8]:
# CV时不方便shuffle，因此准备训练集和测试集时shuffle
# df = df.sample(frac=1).reset_index(drop=True)


# 准备训练集和测试集
train = df[df['is_train']==1]
test = df[df['is_train']==0]
print('训练集数量:'+str(len(train)))
print('测试集数量:'+str(len(test)))


# 准备训练集合输入矩阵和输出矩阵
train_X = train.drop(['y','is_train','I_B_cor','I_C_cor'],axis=1)
train_Y = train['y']

# 准备测试集合输入矩阵
test_X = test.drop(['y','is_train','I_B_cor','I_C_cor'],axis=1)

# 准备测试集合输出矩阵容器
ans=pd.DataFrame()
ans['ID']=test_X['ID']

训练集数量:8918
测试集数量:8363


In [9]:
regrs_light = [
    xgbt3, gbdt3, forest3, lgb3
]

In [64]:
# 训练集未shuflle时
stacking_model = SVR(C=100, gamma=0.01, epsilon=0.01)
stacker = Stacker(5, stacking_model, regrs_light)
pred_stack, S_train_data, S_predict_data = stacker.fit_predict(train_X, train_Y, test_X)

1 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
2 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
3 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
4 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
CV MSE: [0.03820138 0.01411471 0.01500888 0.01062797 0.08564431]
Stacker AVG MSE: 0.03271945123206558 Stacker AVG Score: 0.860264287075261


In [10]:
stacking_model = SVR(C=100, gamma=0.01, epsilon=0.01)
stacker = Stacker(5, stacking_model, regrs_light)
pred_stack, S_train_data, S_predict_data = stacker.fit_predict(train_X, train_Y, test_X)

1 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
2 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
3 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
4 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
CV MSE: [0.02937148 0.03176862 0.01071031 0.03045894 0.0620819 ]
Stacker AVG MSE: 0.032878251749318445 Stacker AVG Score: 0.8521156276701483


In [56]:
 df.head(30)

Unnamed: 0,ID,board_t,env_t,light_strength,efficiency,efficiency_A,efficiency_B,efficiency_C,wind_speed,wind_direction,...,V_B_avg_sequence,V_B_exception_ratio,V_B_cor,V_C_avg_sequence,V_C_exception_ratio,V_C_cor,P_A_cor,P_B_cor,P_C_cor,P_avg_cor
0,9,-19.33,-17.5,13,198.32,259.11,42.17,293.66,0.3,273,...,715.75,0.015019,705.0,725.0,0.005517,721.0,909.72,148.05,1031.03,696.266667
1,10,-19.14,-17.4,34,80.55,106.32,16.98,118.36,0.6,272,...,717.2,0.011433,709.0,724.4,0.000828,725.0,976.86,155.98,1087.5,740.113333
2,11,-18.73,-17.3,30,99.9,139.0,21.2,139.51,0.8,275,...,717.833333,0.001161,717.0,724.833333,0.00161,726.0,1128.4,172.08,1132.56,811.013333
3,12,-17.54,-17.0,41,82.48,114.86,14.91,117.66,1.1,283,...,718.571429,0.004771,722.0,725.571429,0.007679,720.0,1279.25,166.06,1310.4,918.57
4,13,-16.68,-16.6,50,73.59,97.95,14.7,108.12,0.9,277,...,720.0,0.006944,715.0,724.25,0.006559,729.0,1334.07,200.2,1472.58,1002.283333
5,14,-15.43,-16.6,53,73.98,101.72,15.55,104.67,0.9,280,...,721.5,0.007623,727.0,725.375,0.000862,726.0,1474.6,225.37,1517.34,1072.436667
6,15,-14.6,-16.3,65,64.62,86.86,13.09,93.92,1.1,280,...,723.375,0.007776,729.0,725.125,0.003965,728.0,1548.51,233.28,1674.4,1152.063333
7,16,-14.1,-16.2,76,61.97,77.59,25.8,82.53,0.9,278,...,723.875,0.005698,728.0,725.0,0.001379,724.0,1619.93,538.72,1723.12,1293.923333
8,17,-13.27,-16.2,83,75.36,73.55,73.36,79.16,0.7,280,...,724.625,0.002243,723.0,725.125,0.001551,724.0,1681.68,1677.36,1810.0,1723.013333
9,18,-12.41,-16.2,86,76.06,75.89,73.95,78.34,1.0,279,...,635.375,0.147354,729.0,647.0,0.123648,727.0,1802.96,1756.89,1861.12,1806.99
