In [1]:
import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import xgboost as xgb
from lightgbm import LGBMRegressor
import math
from catboost import Pool, CatBoostRegressor
%matplotlib inline




In [6]:
# 数据预处理
def clean_data(train_data_X):
    """Smooth 3-sigma outliers in every column except ``ID``, in place.

    For each column the rows are visited in order.  The bounds are
    ``mean +/- 3 * std`` of the column *as it currently stands*, so every
    replacement influences the bounds used for later rows.

    On every step:
      * the first and last values are replaced by the current mean when they
        fall outside the bounds;
      * an interior value outside the bounds is replaced using its direct
        neighbours (average of both, or a copy of one of them).

    Parameters
    ----------
    train_data_X : pandas.DataFrame
        Frame whose non-``ID`` columns are numeric.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; modified columns are written back
        as float.
    """
    for j, column_name in enumerate(train_data_X.columns):
        if column_name == "ID":
            continue
        print("正在处理第%d列数据"%j)

        # Float copy: replacement values (means / averages) are floats, and a
        # private copy avoids writing through a possible view of the frame.
        column = train_data_X[column_name].astype(float)
        n_rows = len(column)

        modified = False   # did we change anything in this column?
        stale = True       # do mean / bounds need to be recomputed?
        mean_ = lower = upper = None

        for i in range(n_rows):
            # The statistics only change when a value was replaced, so they
            # are refreshed lazily instead of on every row.
            if stale:
                mean_ = column.mean()
                std_ = column.std()
                upper = float(mean_ + 3 * std_)
                lower = float(mean_ - 3 * std_)
                stale = False

            # Edge rows have only one neighbour: fall back to the mean.
            first_value = column.iloc[0]
            if first_value > upper or first_value < lower:
                column.iloc[0] = mean_
                stale = modified = True

            last_value = column.iloc[n_rows - 1]
            if last_value > upper or last_value < lower:
                column.iloc[n_rows - 1] = mean_
                stale = modified = True

            if 0 < i < n_rows - 1:
                current = column.iloc[i]
                if current > upper or current < lower:
                    prev_value = column.iloc[i - 1]
                    next_value = column.iloc[i + 1]
                    prev_ok = (prev_value < upper) and (prev_value > lower)
                    next_ok = (next_value < upper) and (next_value > lower)

                    if prev_ok and next_ok:
                        # Both neighbours are trustworthy: interpolate.
                        column.iloc[i] = float(prev_value + next_value) / 2
                        stale = modified = True
                    elif prev_ok or next_value > upper or next_value < lower:
                        # Previous is in range, or next is itself an outlier:
                        # carry the previous value forward.
                        column.iloc[i] = prev_value
                        stale = modified = True
                    elif prev_value < upper or prev_value > lower or next_ok:
                        # Remaining case (previous out of range, next in
                        # range).  The test is true for any non-NaN previous
                        # value; a NaN previous value leaves the row as is.
                        column.iloc[i] = next_value
                        stale = modified = True

        # Untouched columns keep their original dtype.
        if modified:
            train_data_X[column_name] = column

    return train_data_X
def drop_all_outlier(df):
    """Remove duplicate rows and rows with out-of-range readings, in place.

    Duplicates are detected on every column except ``ID`` (first occurrence
    is kept).  Afterwards rows are dropped when a reading lies outside the
    limits listed below.  The frame is mutated and also returned.
    """
    feature_columns = df.columns.drop('ID')
    df.drop_duplicates(subset=feature_columns, keep='first', inplace=True)

    # (column, lowest accepted value, highest accepted value)
    two_sided_limits = (
        ('电压A', 500, 800),
        ('电压B', 500, 800),
        ('电压C', 500, 800),
        ('现场温度', -30, 30),
    )
    # (column, highest accepted value)
    upper_limits = (
        ('转换效率A', 100),
        ('转换效率B', 100),
        ('转换效率C', 100),
        ('风向', 360),
        ('风速', 20),
    )

    for column, lowest, highest in two_sided_limits:
        readings = getattr(df, column)
        rejected = (readings > highest) | (readings < lowest)
        df.drop(df.index[rejected.values], inplace=True)

    for column, highest in upper_limits:
        rejected = getattr(df, column) > highest
        df.drop(df.index[rejected.values], inplace=True)

    return df
# 生成数据
def generate_train_data(train_data, test_data, poly=False, select=False):
    """Build train / hold-out / submission matrices from the raw frames.

    Parameters
    ----------
    train_data : DataFrame containing the target ``发电量`` and an ``ID`` column.
    test_data : DataFrame containing an ``ID`` column (no target).
    poly : when true, expand features with degree-2 interaction-only terms.
    select : when true, keep only the features chosen by ``SelectFromModel``
        wrapped around a ``GradientBoostingRegressor`` fitted on the training
        split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, sub_data, sm, polynm)`` where
        ``sm`` / ``polynm`` are the fitted selector / polynomial transformer,
        or ``None`` when the corresponding option is off.
    """
    target = train_data['发电量']
    features = train_data.drop(columns=['发电量', 'ID'])
    submission_features = test_data.drop(columns=['ID'])

    poly_transformer = None
    if poly:
        from sklearn.preprocessing import PolynomialFeatures
        poly_transformer = PolynomialFeatures(degree=2, interaction_only=True)
        features = poly_transformer.fit_transform(features)
        submission_features = poly_transformer.transform(submission_features)

    # Fixed 80 / 20 split with a fixed seed.
    split_parts = train_test_split(features, target, test_size=0.2, random_state=123)
    X_train, X_test, y_train, y_test = split_parts

    selector = None
    if select:
        from sklearn.feature_selection import SelectFromModel
        selector = SelectFromModel(GradientBoostingRegressor(random_state=2))
        # The selector is fitted on the training split only, then applied to
        # the hold-out split and the submission features.
        X_train = selector.fit_transform(X_train, y_train)
        X_test = selector.transform(X_test)
        submission_features = selector.transform(submission_features)

    return (X_train, X_test, y_train, y_test,
            submission_features, selector, poly_transformer)

def cal_score(mse):
    """Map a mean squared error to the score ``1 / (1 + sqrt(mse))``.

    A Python ``float`` is handled with ``math``; anything else (for example
    an array of per-fold errors) goes through NumPy element-wise.
    """
    if not isinstance(mse, float):
        return np.divide(1, 1 + np.sqrt(mse))
    rmse = math.sqrt(mse)
    return 1 / (1 + rmse)
#  定义交叉验证函数  
def cross_validation_test(models, train_X_data, train_y_data, cv=5):
    """Cross-validate several models and summarise MSE and score.

    Parameters
    ----------
    models : sequence of estimators.
    train_X_data, train_y_data : sequences indexed in parallel with
        ``models``; model ``i`` is evaluated on ``train_X_data[i]`` /
        ``train_y_data[i]``.
    cv : value forwarded to ``cross_val_score`` (default 5).

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Avg MSE`` and
        ``Avg Score``.
    """
    model_name, mse_avg, score_avg = [], [], []
    for i, model in enumerate(models):
        # Estimator class name, taken from the text before the first "(" of
        # the estimator's repr.
        short_name = str(model).split('(')[0]
        print(i + 1,'- Model:', short_name)
        model_name.append(str(i + 1) + '.' + short_name)

        nmse = cross_val_score(model, train_X_data[i], train_y_data[i], cv=cv, scoring='neg_mean_squared_error')
        fold_mse = -nmse  # the scorer returns negated MSE
        scores = cal_score(fold_mse)
        avg_mse = np.average(fold_mse)
        avg_score = np.average(scores)
        mse_avg.append(avg_mse)
        score_avg.append(avg_score)

        print('MSE:', fold_mse)
        print('Score:', scores)
        # Label the summary with the model actually evaluated; the label used
        # to read "XGB" for every model.
        print('Average %s - MSE:' % short_name, avg_mse, ' - Score:', avg_score, '\n')

    return pd.DataFrame(
        {'Model': model_name, 'Avg MSE': mse_avg, 'Avg Score': score_avg},
        columns=['Model', 'Avg MSE', 'Avg Score'],
    )

def add_avg(df):
    """Add the lagged-average feature ``old平均功率`` derived from ``平均功率``.

    Rows 0-9 all receive the same value: the sum of the first nine entries of
    ``平均功率`` divided by 10.  From row 10 onwards the value is the mean of
    the nine preceding entries (rows ``i-9`` .. ``i-1``).

    NOTE(review): the constant used for the first ten rows (nine terms
    divided by ten, identical for every row, including values from later
    rows) looks unintentional.  It is kept so that features already generated
    with this function do not change.

    The head value is computed once instead of once per row, and frames with
    fewer than nine rows no longer raise ``IndexError`` (the head sum then
    uses the rows that exist).

    The frame is modified in place and returned.
    """
    array = np.array(df["平均功率"])

    # Plain left-to-right accumulation keeps floating-point results identical
    # to a term-by-term sum.
    head_sum = 0
    for value in array[:9]:
        head_sum += value
    head_value = head_sum / 10

    newarray = []
    for i in range(len(array)):
        if i < 10:
            newarray.append(head_value)
        else:
            # Sum rows i-1, i-2, ..., i-9 in that order.
            window_sum = array[i - 1]
            for back in range(2, 10):
                window_sum = window_sum + array[i - back]
            newarray.append(window_sum / 9)

    df["old平均功率"] = newarray
    return df
def add_avgs(df,alpha,str_1,str_2):
    """Add an exponentially smoothed copy of column ``str_1`` as ``str_2``.

    The recurrence is ``s[i] = alpha * s[i-1] + (1 - alpha) * x[i]`` with
    ``s[-1]`` taken as 0, so the first smoothed value is
    ``(1 - alpha) * x[0]``.

    Parameters
    ----------
    df : pandas.DataFrame, modified in place and returned.
    alpha : weight given to the previous smoothed value.
    str_1 : name of the source column (left unchanged).
    str_2 : name of the column that receives the smoothed series.
    """
    array = np.array(df[str_1])
    # Integer / bool arrays would truncate every smoothed value when it is
    # stored back, so promote them to float first.  Floating-point inputs
    # keep their dtype.
    if array.dtype.kind in "iub":
        array = array.astype(float)

    previous = 0
    for i in range(len(array)):
        array[i] = alpha * previous + (1 - alpha) * array[i]
        # Read the stored value back so any dtype rounding carries forward.
        previous = array[i]

    df[str_2] = array
    return df

In [46]:
# Scratch expression from interactive use; no visible cell reads its result.
max(1,2)

2

### 读取数据

In [3]:
train_data = pd.read_csv('../data/public.train.csv')
test_data = pd.read_csv('../data/public.test.csv')

# Result frame: one row per test ID, in file order.
df_result = pd.DataFrame({'ID': list(test_data['ID'])})

# Test rows in which more than 13 fields are exactly zero.
zero_count_per_row = (test_data == 0).sum(axis=1)
special_missing_ID = test_data.loc[zero_count_per_row > 13, 'ID']

### 异常值处理

In [4]:
# Threshold-filtered copies of the raw frames (the raw frames stay untouched).
cleaned_train_data = train_data.copy()
cleaned_train_data = drop_all_outlier(cleaned_train_data)

cleaned_sub_data = test_data.copy()
cleaned_sub_data = drop_all_outlier(cleaned_sub_data)
cleaned_sub_data_ID = cleaned_sub_data['ID']

# Train and test rows interleaved by ID, re-labelled 0..n-1.
# sort=True keeps the column ordering that pandas previously applied
# implicitly (with a FutureWarning) because test_data lacks the target column.
all_data = (
    pd.concat([train_data, test_data], axis=0, sort=True)
    .sort_values(by='ID')
    .reset_index(drop=True)
)
bad_feature = ['ID','功率A', '功率B', '功率C', '平均功率', '现场温度', '电压A', '电压B', '电压C', '电流B', '电流C', '转换效率', '转换效率A', '转换效率B', '转换效率C']

# Rows where at least one monitored column is more than 2 std from its mean.
feature_values = all_data[bad_feature]
feature_mean = feature_values.mean()
feature_std = feature_values.std()
is_extreme = (
    (feature_values > feature_mean + 2 * feature_std) |
    (feature_values < feature_mean - 2 * feature_std)
)
bad_index1 = all_data.index[is_extreme.any(axis=1).values]

# Rows with a non-zero voltage below 500 on any phase.
bad_index2 = all_data[
    ((all_data['电压A']<500)&(all_data['电压A']!=0))|
    ((all_data['电压B']<500)&(all_data['电压B']!=0))|
    ((all_data['电压C']<500)&(all_data['电压C']!=0))].index

# pd.Int64Index no longer exists in pandas 2.x; a typed pd.Index is equivalent.
bad_index = pd.Index(list(bad_index1) + list(bad_index2), dtype='int64')

# Bad rows together with their direct neighbours.  Labels that fall outside
# the frame (neighbour of the first / last row) are discarded, because .loc
# rejects missing labels.
neighbour_labels = np.concatenate([bad_index - 1, bad_index, bad_index + 1])
neighbour_labels = neighbour_labels[
    (neighbour_labels >= 0) & (neighbour_labels < len(all_data))
]
nn_bad_data = all_data.loc[neighbour_labels].sort_values(by='ID', ascending=True).drop_duplicates()
bad_data = all_data.loc[bad_index].sort_values(by='ID', ascending=True)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  from ipykernel import kernelapp as app


In [5]:
# Replace outlying values with the mean of the nearest non-flagged rows
# before and after the flagged row.
last_label = len(all_data) - 1
for idx, line in bad_data.iterrows():
    ID = line['ID']

    # Columns of this row lying more than 3 std from the current column mean.
    feature_mean = all_data[bad_feature].mean()
    feature_std = all_data[bad_feature].std()
    row_features = line[bad_feature]
    col_index = row_features[
        (row_features > feature_mean + 3 * feature_std) |
        (row_features < feature_mean - 3 * feature_std)
    ].index
    # The identifier must never be overwritten by an interpolated value.
    col_index = col_index.drop('ID', errors='ignore')
    if len(col_index) == 0:
        continue

    index = all_data[all_data['ID'] == ID].index

    # Walk backwards to the closest row that is not flagged.  (This search
    # previously stepped forward, duplicating the "after" search.)
    before_label = idx - 1
    while before_label in bad_index:
        before_label -= 1

    # Walk forwards to the closest row that is not flagged.
    after_label = idx + 1
    while after_label in bad_index:
        after_label += 1

    # Keep only neighbours that exist; at the edges of the frame a single
    # neighbour is used on its own.
    neighbour_labels = [
        label for label in (before_label, after_label)
        if 0 <= label <= last_label
    ]
    if not neighbour_labels:
        continue

    replace_value = all_data.loc[neighbour_labels, col_index].astype(float).values.mean(axis=0)
    all_data.loc[index, col_index] = replace_value

### 拆分数据

In [6]:
# Split the combined frame back into train and test rows.
is_test_row = all_data['ID'].isin(df_result['ID'])
train_data = all_data.drop(all_data[is_test_row].index).reset_index(drop=True)
test_data = all_data[is_test_row].drop(columns=['发电量']).reset_index(drop=True)
# Mid-cell expression: evaluated but not displayed.
len(train_data), len(test_data)
# Remove rows that are identical in everything except ID (keep the first).
train_data = train_data.drop_duplicates(subset=train_data.columns.drop('ID'), keep='first')


In [7]:
# Apply the same feature steps to all four frames, one step at a time:
# first the lagged average of 平均功率, then smoothed 功率B, then smoothed 功率C.
# Smoothed variants for 平均功率 and 功率A exist in this notebook only as
# disabled (commented-out) calls and are not applied.
feature_frames = [train_data, test_data, cleaned_train_data, cleaned_sub_data]

feature_frames = [add_avg(frame) for frame in feature_frames]

for source_column in ("功率B", "功率C"):
    feature_frames = [
        add_avgs(frame, 0.9, source_column, source_column + "_历史数据")
        for frame in feature_frames
    ]

train_data, test_data, cleaned_train_data, cleaned_sub_data = feature_frames



In [3]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import Series
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures

# Load the pre-computed feature matrices; train_x, train_y and test_x are
# ready to use as they are.
train_x_frame = pd.read_csv('../feature/train_x.csv')
train_y_frame = pd.read_csv('../feature/train_y.csv')
test_x_frame = pd.read_csv('../feature/test_x.csv')

train_x = train_x_frame.values            # training features
train_y = train_y_frame.values.ravel()    # training target, flattened to 1-D
test_x = test_x_frame.values              # features of the online test set

# ID tables, used only when the submission file is assembled.
test_ID = pd.read_csv('../feature/test_ID.csv')
all_ID = pd.read_csv('../feature/all_ID.csv')

# Fixed 80 / 20 hold-out split.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=123,
)

In [8]:
# X_train, X_test, y_train, y_test, sub_data, sm, polynm = generate_train_data(train_x, test_x, poly=True, select=True)

# # clean_X_train, clean_X_test, clean_y_train, clean_y_test, clean_sub_data, _, _ = generate_train_data(cleaned_train_data, cleaned_sub_data, poly=False, select=False)

# clean_X = np.concatenate([clean_X_train, clean_X_test])
# clean_y = np.concatenate([clean_y_train, clean_y_test])
# clean_X = polynm.transform(clean_X)
# clean_X = sm.transform(clean_X)

# clean_sub_data = polynm.transform(clean_sub_data)
# clean_sub_data = sm.transform(clean_sub_data)

## Stacking Model

### Tree Models

In [4]:
# Stack the hold-out split back onto the training split (train rows first).
all_X_train = np.concatenate((X_train, X_test), axis=0)
all_y_train = np.concatenate((y_train, y_test), axis=0)

In [7]:
# --- XGBoost -----------------------------------------------------------------
# NOTE(review): max_leaf_nodes, max_features and scoring are not documented
# XGBRegressor parameters, and both seed and random_state are supplied with
# different values - confirm against the installed xgboost version which of
# these actually take effect.
xgb_shared = dict(
    silent=1, max_leaf_nodes=255, eta=0.03, scoring='neg_mean_squared_error',
    max_depth=5, max_features='sqrt', n_jobs=12,
)
xgbt1 = xgb.XGBRegressor(seed=789, n_estimators=351, random_state=777, **xgb_shared)
xgbt2 = xgb.XGBRegressor(seed=1000, n_estimators=361, random_state=999, **xgb_shared)
xgbt3 = xgb.XGBRegressor(seed=1515, n_estimators=371, random_state=367, **xgb_shared)

# --- Gradient boosting (sklearn) ----------------------------------------------
# Tuning note: best parameters found so far were n_estimators=1000,
# max_depth=5, max_features='sqrt', learning_rate=0.08.
gbdt_shared = dict(max_depth=5, max_features='log2', random_state=789, learning_rate=0.08)
gbdt1 = GradientBoostingRegressor(n_estimators=1060, **gbdt_shared)
gbdt2 = GradientBoostingRegressor(n_estimators=1100, **gbdt_shared)
gbdt3 = GradientBoostingRegressor(n_estimators=1090, **gbdt_shared)

# --- Random forest -----------------------------------------------------------
# Tuning note: n_estimators=700, max_features='auto', random_state=2,
# n_jobs=8, max_depth=10.
forest_shared = dict(max_features='sqrt', n_jobs=12)
forest1 = RandomForestRegressor(n_estimators=740, random_state=7, **forest_shared)
forest2 = RandomForestRegressor(n_estimators=730, random_state=9, **forest_shared)
forest3 = RandomForestRegressor(n_estimators=745, random_state=11, **forest_shared)

# --- LightGBM ----------------------------------------------------------------
# Tuning note: n_estimators=850, num_leaves=14, learning_rate=0.1, max_depth=5.
# NOTE(review): random_state and seed are both supplied with different
# values - confirm which one LightGBM honours.
lgbm_shared = dict(max_depth=4, n_jobs=12)
lgb1 = LGBMRegressor(n_estimators=840, random_state=789, num_leaves=14, learning_rate=0.08, seed=666, **lgbm_shared)
lgb2 = LGBMRegressor(n_estimators=845, random_state=798, num_leaves=13, learning_rate=0.1, seed=777, **lgbm_shared)
lgb3 = LGBMRegressor(n_estimators=850, random_state=777, num_leaves=10, learning_rate=0.1, seed=999, **lgbm_shared)

# Every model is cross-validated on the same full training set.
cv_models = [
    xgbt1, xgbt2, xgbt3,
    gbdt1, gbdt2, gbdt3,
    forest1, forest2, forest3,
    lgb1, lgb2, lgb3,
]
cross_validation_test(
    models=cv_models,
    train_X_data=[all_X_train] * len(cv_models),
    train_y_data=[all_y_train] * len(cv_models),
)

1 - Model: XGBRegressor
MSE: [0.05529833 0.01467334 0.01883639 0.01771896 0.01761158]
Score: [0.80961436 0.8919544  0.87931743 0.88252484 0.88283958]
Average XGB - MSE: 0.024827718685883536  - Score: 0.8692501202201539 

2 - Model: XGBRegressor
MSE: [0.05528646 0.01465626 0.01880667 0.01769189 0.01758168]
Score: [0.8096309  0.89201051 0.87940121 0.88260408 0.88292743]
Average XGB - MSE: 0.02480458925543899  - Score: 0.8693148237567208 

3 - Model: XGBRegressor
MSE: [0.05525404 0.01463408 0.01878894 0.01766022 0.01756507]
Score: [0.8096761  0.89208341 0.8794512  0.88269686 0.88297625]
Average XGB - MSE: 0.024780471001577674  - Score: 0.869376763260887 

4 - Model: GradientBoostingRegressor
MSE: [0.05682425 0.0148558  0.02004622 0.02109739 0.01719202]
Score: [0.80750764 0.89135744 0.87597533 0.87317225 0.88408078]
Average XGB - MSE: 0.026003137160532774  - Score: 0.866418687861405 

5 - Model: GradientBoostingRegressor
MSE: [0.0568036  0.01482577 0.02000118 0.02108444 0.01717382]
Score: 

Unnamed: 0,Model,Avg MSE,Avg Score
0,1.XGBRegressor,0.024828,0.86925
1,2.XGBRegressor,0.024805,0.869315
2,3.XGBRegressor,0.02478,0.869377
3,4.GradientBoostingRegressor,0.026003,0.866419
4,5.GradientBoostingRegressor,0.025978,0.866486
5,6.GradientBoostingRegressor,0.025986,0.866465
6,7.RandomForestRegressor,0.030189,0.856574
7,8.RandomForestRegressor,0.030147,0.85665
8,9.RandomForestRegressor,0.030056,0.856792
9,10.LGBMRegressor,0.025695,0.866945


In [None]:
xgbt1 = xgb.XGBRegressor(silent=1,max_leaf_nodes=255,seed=3000,eta=0.03,scoring='neg_mean_squared_error',n_estimators=351, max_depth=5, max_features='sqrt', random_state=2, n_jobs=12)
xgbt2 = xgb.XGBRegressor(silent=1,max_leaf_nodes=255,seed=3000,eta=0.03,scoring='neg_mean_squared_error',n_estimators=361, max_depth=5, max_features='sqrt', random_state=3, n_jobs=12)
xgbt3 = xgb.XGBRegressor(silent=1,max_leaf_nodes=255,seed=3000,eta=0.03,scoring='neg_mean_squared_error',n_estimators=371, max_depth=5, max_features='sqrt', random_state=4, n_jobs=12)
# n_estimators=1000  max_depth=5  'sqrt'  GradientBoostingRegressor 最佳参数 ,learning_rate=0.08
gbdt1 = GradientBoostingRegressor(n_estimators=1010, max_depth=5, max_features='log2', random_state=790,learning_rate=0.08)
gbdt2 = GradientBoostingRegressor(n_estimators=1000, max_depth=5, max_features='log2', random_state=791,learning_rate=0.08)
gbdt3 = GradientBoostingRegressor(n_estimators=1020, max_depth=5, max_features='log2', random_state=789,learning_rate=0.08)
# n_estimators=700, max_features='auto', random_state=2, n_jobs=8,max_depth=10
forest1 = RandomForestRegressor(n_estimators=720, max_features='sqrt', random_state=7, n_jobs=12)
forest2 = RandomForestRegressor(n_estimators=750, max_features='log2', random_state=9, n_jobs=12)
forest3 = RandomForestRegressor(n_estimators=800, max_features='sqrt', random_state=11, n_jobs=12) 

lgb1 = LGBMRegressor(n_estimators=800, max_depth=4, random_state=5, n_jobs=12) 
lgb2 = LGBMRegressor(n_estimators=850, max_depth=4, random_state=7, n_jobs=12)
lgb3 = LGBMRegressor(n_estimators=900, max_depth=4, random_state=9, n_jobs=12)

Model	Avg MSE	Avg Score
0	1.XGBRegressor	0.028357	0.871174
1	2.XGBRegressor	0.028332	0.871251
2	3.XGBRegressor	0.028333	0.871279
3	4.GradientBoostingRegressor	0.027985	0.871623
4	5.GradientBoostingRegressor	0.027273	0.872185
5	6.GradientBoostingRegressor	0.026847	0.874907
6	7.RandomForestRegressor	0.030499	0.865014
7	8.RandomForestRegressor	0.030333	0.864953
8	9.RandomForestRegressor	0.030495	0.864881
9	10.LGBMRegressor	0.029263	0.868385
10	11.LGBMRegressor	0.029263	0.868362
11	12.LGBMRegressor	0.029246	0.868430

```
xgbt1 = xgb.XGBRegressor(n_estimators=950, max_depth=3, max_features='sqrt', random_state=2, n_jobs=8)
xgbt2 = xgb.XGBRegressor(n_estimators=1000, max_depth=3, max_features='sqrt', random_state=3, n_jobs=8)
xgbt3 = xgb.XGBRegressor(n_estimators=1100, max_depth=3, max_features='sqrt', random_state=4, n_jobs=8)
# n_estimators=1000  max_depth=5  'sqrt'  GradientBoostingRegressor 最佳参数 ,learning_rate=0.08
gbdt1 = GradientBoostingRegressor(n_estimators=800, max_depth=4, max_features='log2', random_state=2,learning_rate=0.08)
gbdt2 = GradientBoostingRegressor(n_estimators=900, max_depth=4, max_features='log2', random_state=3,learning_rate=0.08)
gbdt3 = GradientBoostingRegressor(n_estimators=1000, max_depth=5, max_features='log2', random_state=4,learning_rate=0.08)
# n_estimators=700, max_features='auto', random_state=2, n_jobs=8,max_depth=10
forest1 = RandomForestRegressor(n_estimators=300, max_features='sqrt', random_state=2, n_jobs=8)
forest2 = RandomForestRegressor(n_estimators=400, max_features='log2', random_state=3, n_jobs=8)
forest3 = RandomForestRegressor(n_estimators=600, max_features='sqrt', random_state=4, n_jobs=8) 

lgb1 = LGBMRegressor(n_estimators=900, max_depth=5, random_state=2, n_jobs=8) 
lgb2 = LGBMRegressor(n_estimators=850, max_depth=4, random_state=3, n_jobs=8)
lgb3 = LGBMRegressor(n_estimators=720, max_depth=4, random_state=4, n_jobs=8)

Model	Avg MSE	Avg Score
0	1.XGBRegressor	0.029047	0.869022
1	2.XGBRegressor	0.029007	0.869151
2	3.XGBRegressor	0.028923	0.869379
3	4.GradientBoostingRegressor	0.028264	0.870069
4	5.GradientBoostingRegressor	0.028909	0.869134
5	6.GradientBoostingRegressor	0.027992	0.872356
6	7.RandomForestRegressor	0.030729	0.864344
7	8.RandomForestRegressor	0.030402	0.865025
8	9.RandomForestRegressor	0.030547	0.864977
9	10.LGBMRegressor	0.028749	0.868748
10	11.LGBMRegressor	0.029332	0.867618
11	12.LGBMRegressor	0.029250	0.867823
```

### Stacking

In [8]:
# Base learners for stacking, interleaved by family:
# xgb, gbdt, forest, lgb for round 1, then round 2, then round 3.
regrs = [
    model
    for round_models in zip(
        (xgbt1, xgbt2, xgbt3),
        (gbdt1, gbdt2, gbdt3),
        (forest1, forest2, forest3),
        (lgb1, lgb2, lgb3),
    )
    for model in round_models
]

In [9]:
class Stacker(object):
    """Two-level stacking ensemble.

    Each base model is fitted once per fold; its out-of-fold predictions
    become one feature column for the second-level model (``stacker``).

    Parameters
    ----------
    n_splits : number of folds used to build the out-of-fold predictions.
    stacker : second-level estimator.
    base_models : sequence of first-level estimators.
    """

    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, predict_data):
        """Fit the ensemble and predict ``predict_data``.

        Parameters
        ----------
        X : training features.
        y : training targets.
        predict_data : features of the rows to predict.

        Returns
        -------
        tuple
            ``(res, S_train, S_predict)``: the final predictions, the
            out-of-fold base-model predictions for ``X`` (one column per base
            model), and the base-model predictions for ``predict_data``
            averaged over the folds.
        """
        X = np.array(X)
        y = np.array(y)
        T = np.array(predict_data)

        # Contiguous, unshuffled folds.  random_state is deliberately not
        # passed: it has no effect when shuffle=False and recent scikit-learn
        # releases raise ValueError for that combination.
        folds = list(KFold(n_splits=self.n_splits, shuffle=False).split(X, y))

        # Second-level training matrix and second-level prediction matrix.
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_predict = np.zeros((T.shape[0], len(self.base_models)))

        for i, regr in enumerate(self.base_models):
            print(i + 1, 'Base model:', str(regr).split('(')[0])
            S_predict_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                # Fit on the in-fold rows, predict the held-out rows.
                X_train, y_train = X[train_idx], y[train_idx]
                X_test = X[test_idx]
                print ('Fit fold', (j+1), '...')
                regr.fit(X_train, y_train)
                S_train[test_idx, i] = regr.predict(X_test)
                # The same fold model also predicts the target rows.
                S_predict_i[:, j] = regr.predict(T)

            # Average the per-fold predictions for the target rows.
            S_predict[:, i] = S_predict_i.mean(axis=1)

        # Cross-validated estimate of the second-level model (always 5 folds,
        # independent of n_splits).
        nmse_score = cross_val_score(self.stacker, S_train, y, cv=5, scoring='neg_mean_squared_error')
        print('CV MSE:', -nmse_score)
        print('Stacker AVG MSE:', -nmse_score.mean(), 'Stacker AVG Score:', np.mean(np.divide(1, 1 + np.sqrt(-nmse_score))))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict(S_predict)
        return res, S_train, S_predict

In [11]:
# Second-level model: an SVR.  (A Ridge meta-model with alpha=0.008 and no
# intercept was tried earlier and is currently disabled.)
stacking_model = SVR(C=100, gamma=0.01, epsilon=0.01)
stacker = Stacker(n_splits=5, stacker=stacking_model, base_models=regrs)
stacking_outputs = stacker.fit_predict(X=all_X_train, y=all_y_train, predict_data=test_x)
pred_stack, S_train_data, S_predict_data = stacking_outputs

1 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
2 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
3 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
4 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
5 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
6 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
7 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
8 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
9 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
10 Base model: GradientBoostingRegre

In [None]:
CV MSE: [0.00834157 0.01945687 0.0095695  0.08376848 0.01302208]
Stacker AVG MSE: 0.02683170221309707 Stacker AVG Score: 0.8755806306487525

In [14]:
# Same stacking setup, fitted on the threshold-cleaned data.
# NOTE(review): clean_X, clean_y and clean_sub_data are assigned only inside
# a cell that is entirely commented out, so this cell raises NameError on a
# fresh kernel - confirm how these inputs are meant to be produced.
stacking_model2 = SVR(C=100, gamma=0.01, epsilon=0.01)
stacker2 = Stacker(n_splits=5, stacker=stacking_model2, base_models=regrs)
clean_outputs = stacker2.fit_predict(X=clean_X, y=clean_y, predict_data=clean_sub_data)
pred_clean_stack, S_clean_train_data, S_clean_predict_data = clean_outputs

1 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
2 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
3 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
4 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
5 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
6 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
7 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
8 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
9 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
10 Base model: GradientBoostingRegre

In [None]:
CV MSE: [0.01659384 0.00626714 0.01261921 0.05345451 0.01616147]
Stacker AVG MSE: 0.021019233812974467 Stacker AVG Score: 0.8821919685074249

In [None]:
CV MSE: [0.00902831 0.01733493 0.05746076 0.01078939 0.00961003]
Stacker AVG MSE: 0.020844682710295333 Stacker AVG Score: 0.8840294650706134

In [12]:
# The stacked predictions go into the 'Detection' column; the remaining steps
# only align them with the required ID list.
submission = all_ID.assign(Detection=pred_stack)
# Left-join onto the full test ID list; IDs without a prediction receive the
# constant fallback value.
submission = test_ID.merge(submission, on='ID', how='left')
submission = submission.fillna(3.799931e-01)
submission.to_csv('000000000.csv', index=None, sep=',')

### Output

In [15]:
# Attach the stacked predictions to the result frame.
# NOTE(review): df_result holds the IDs of ../data/public.test.csv while
# pred_stack was predicted from ../feature/test_x.csv - confirm that both
# have the same length and row order.
df_result['score'] = pred_stack

In [16]:
# Rows flagged as "mostly zero" at load time receive a fixed score.
is_special_missing = df_result['ID'].isin(special_missing_ID)
df_result.loc[is_special_missing, 'score'] = 0.379993053

In [17]:
# Test rows that survived the threshold cleaning take the prediction of the
# model trained on cleaned data.
has_clean_prediction = df_result['ID'].isin(cleaned_sub_data_ID)
df_result.loc[has_clean_prediction, 'score'] = pred_clean_stack

In [18]:
# Write the final result file with neither an index column nor a header row.
df_result.to_csv('8_16_time_16.19.version4.0.csv', index=False, header=False)