In [1]:
import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import xgboost as xgb
from lightgbm import LGBMRegressor
import math
%matplotlib inline


  from numpy.core.umath_tests import inner1d


In [2]:
rolling_mask_two = [-i for i in range(1,2)]+[i for i in range(1,2)]
rolling_mask_four = [-i for i in range(1,3)]+[i for i in range(1,3)]
rolling_mask_six = [-i for i in range(1,4)]+[i for i in range(1,4)]
rolling_mask_eight = [-i for i in range(1,5)]+[i for i in range(1,5)]
rolling_mask_ten = [-i for i in range(1,6)]+[i for i in range(1,6)]

In [3]:
# 数据预处理
# def drop_all_outlier(df):
#     df.drop_duplicates(df.columns.drop('ID'), keep='first', inplace=True)
#     df.drop(df[(df.V_A > 800) | (df.V_A < 500)].index,inplace=True)
#     df.drop(df[(df.V_B > 800) | (df.V_B < 500)].index,inplace=True)
#     df.drop(df[(df.V_C > 800) | (df.V_C < 500)].index,inplace=True)
#     df.drop(df[(df.env_t > 30) | (df.env_t < -30)].index,inplace=True)
# #     df.drop(df[(df.转换效率A > 100)].index,inplace=True)
# #     df.drop(df[(df.转换效率B > 100)].index,inplace=True)
# #     df.drop(df[(df.转换效率C > 100)].index,inplace=True)
# #     df.drop(df[(df.wind_direction > 360)].index,inplace=True)
#     df.drop(df[(df.wind_speed > 20)].index,inplace=True)
#     return df

# 生成数据
def generate_train_data(train_data, test_data, poly=False, select=False):
    y = train_data['y']
    X = train_data.drop(['y','ID'], axis=1)
    sub_data = test_data.drop(['ID'], axis=1)
    
    polynm = None
    if poly:
        from sklearn.preprocessing import PolynomialFeatures
        polynm = PolynomialFeatures(degree=2, interaction_only=True)
        X = polynm.fit_transform(X)
        sub_data = polynm.transform(sub_data)
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
    
    sm = None
    if select:
        from sklearn.feature_selection import SelectFromModel
        sm = SelectFromModel(GradientBoostingRegressor(random_state=2))
        X_train = sm.fit_transform(X_train, y_train)
        X_test = sm.transform(X_test)
        sub_data = sm.transform(sub_data)
        
    return X_train, X_test, y_train, y_test, sub_data, sm, polynm

def cal_score(mse):
    if isinstance(mse, float):
        return 1 / (1 + math.sqrt(mse))
    else:
        return np.divide(1, 1 + np.sqrt(mse))
#  定义交叉验证函数  
def cross_validation_test(models, train_X_data, train_y_data, cv=5):
    model_name, mse_avg, score_avg = [], [], []
    for i, model in enumerate(models):
        print(i + 1,'- Model:', str(model).split('(')[0])
        model_name.append(str(i + 1) + '.' + str(model).split('(')[0])
        nmse = cross_val_score(model, train_X_data[i], train_y_data[i], cv=cv, scoring='neg_mean_squared_error')
        avg_mse = np.average(-nmse)
        scores = cal_score(-nmse)
        avg_score = np.average(scores)
        mse_avg.append(avg_mse)
        score_avg.append(avg_score)
        print('MSE:', -nmse)
        print('Score:', scores)
        print('Average XGB - MSE:', avg_mse, ' - Score:', avg_score, '\n')
    res = pd.DataFrame()
    res['Model'] = model_name
    res['Avg MSE'] = mse_avg
    res['Avg Score'] = score_avg
    return res

# def add_newid(df):
#     ID = df["ID"]
#     df["new_id"]=(np.mod(ID,205))
#     return df
def add_avg(df):
    array = np.array(df["P_avg"])
    newarray=[]
    num = 0
    for i in np.arange(len(array)):
        for j in np.arange(10):
            if i<10:
                num = (array[j-1]+array[j-2]+array[j-3])/3
            if i>=10:
                num = (array[i-1]+array[i-2]+array[i-3]+array[i-5]+array[i-6]+array[i-7]+array[i-8]+array[i-9])/9
        newarray.append(num)
    df["old_SoCalledSF_P_avg"] = newarray
    return df

# 原始数据导入

In [4]:
train = pd.read_csv('../data/public_raw.train.csv')
test = pd.read_csv('../data/public_raw.test.csv')

train['is_train']=1
test['is_train']=0

df = pd.concat([train, test],sort=False)

rep_cols = {'ID':'ID', 
 '板温':'board_t', 
 '现场温度':'env_t', 
 '光照强度':'light_strength', 
 '转换效率':'efficiency', 
 '转换效率A':'efficiency_A', 
 '转换效率B':'efficiency_B', 
 '转换效率C':'efficiency_C', 
 '电压A':'V_A',
 '电压B':'V_B', 
 '电压C':'V_C', 
 '电流A':'I_A', 
 '电流B':'I_B', 
 '电流C':'I_C', 
 '功率A':'P_A', 
 '功率B':'P_B', 
 '功率C':'P_C', 
 '平均功率':'P_avg', 
 '风速':'wind_speed',
 '风向':'wind_direction', 
 '发电量':'y'
}

df.rename(index=str, columns=rep_cols, inplace=True)

df.sort_values(by=['ID'],ascending=True, inplace=True)

df.reset_index(drop=True, inplace=True)

# train_data.rename(index=str, columns=rep_cols, inplace=True)
# test_data.rename(index=str, columns=rep_cols, inplace=True)

# 数据清洗

In [9]:
#清洗原因一：人工发现训练集和测试集合中均有很多样本，测量值均为零，发电量为0.379993053。
#于是有两个方向，方向一：将此类样本是为异常样本，认为不存在学习价值，使用最粗暴的方式，从训练集中剔除这些样本。在测试集中人工赋值0.379993053。
#方向二：认为这些样本不是异常样本，存在学习价值。同时将测试集合中此类样本增加到训练样本中，让分布更逼近真实分布。

#首先尝试方向一：
special_missing_ID = test_data[test_data[(test_data == 0) | (test_data == 0.)].count(axis=1) > 13]['ID']

In [None]:
#清洗原因二：电压、电流、温度、风速这些连续测量值中存在明显的异常值，违背物理常识
#如何定义异常？
#情况一：在一个合理的时间段内，一个时刻的测量值与其他时刻测量值的均值的差异很大，差异如何刻画？

# 通过试验确定定义异常测量量的时长和偏差率

In [28]:
#通过试验确定时间段长度和偏差率
df = pd.concat([train_data, test_data], axis=0).sort_values(by='ID').reset_index().drop(['index'], axis=1)

#电流试验，时间段包括前后各3个时刻，偏差率大于1
df['I_A_avg_sequence'] = np.nanmean([df['I_A'].shift(i) for i in rolling_mask_eight],axis=0)
df['I_A_exception_ratio'] = np.abs(df['I_A']-df['I_A_avg_sequence'])/df['I_A_avg_sequence']

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [29]:
df['I_A_exception_ratio'].describe()

count    17409.000000
mean         0.086667
std          0.351945
min          0.000000
25%          0.006467
50%          0.020145
75%          0.081967
max         20.051095
Name: I_A_exception_ratio, dtype: float64

In [32]:
df[df['I_A_exception_ratio']>2][['ID','I_A','I_A_avg_sequence','I_A_exception_ratio','V_A','V_B','V_C','y']]


Unnamed: 0,ID,I_A,I_A_avg_sequence,I_A_exception_ratio,V_A,V_B,V_C,y
527,591,6.68,2.22125,2.007316,36,65402,0,
531,595,6.77,2.25375,2.003882,37,39,65406,
673,737,6.53,0.3125,19.896,37,37,65514,
859,948,6.87,0.515,12.339806,65394,4,14,
1373,1519,7.04,0.9175,6.673025,77,65387,14,0.839478
1561,1717,6.81,0.415,15.409639,36,40,65396,
1722,1894,7.1,0.73,8.726027,74,65350,22,0.780917
2071,2271,7.22,1.14375,5.312568,114,65353,14,
3108,3393,6.64,0.36375,17.254296,34,38,65470,
3253,3538,6.56,1.2575,4.2167,36,37,65463,0.344703


In [33]:
df['V_A_avg_sequence'] = np.nanmean([df['V_A'].shift(i) for i in rolling_mask_eight],axis=0)
df['V_A_exception_ratio'] = np.abs(df['V_A']-df['V_A_avg_sequence'])/df['V_A_avg_sequence']

In [34]:
df['V_A_exception_ratio'].describe()

count    17409.000000
mean         0.062689
std          1.904155
min          0.000000
25%          0.001121
50%          0.002548
75%          0.005303
max        106.977709
Name: V_A_exception_ratio, dtype: float64

In [39]:
#1.6是比较合理的，电压值大于800的均被筛选出
df[df['V_A_exception_ratio']>1.6][['ID','V_A','V_A_avg_sequence','V_A_exception_ratio','V_B','V_C','y']]

Unnamed: 0,ID,V_A,V_A_avg_sequence,V_A_exception_ratio,V_B,V_C,y
14,22,65382,722.25,89.525441,7,107,5.440741
859,948,65394,605.625,106.977709,4,14,
981,1070,65477,696.5,93.008615,41,692,
1065,1173,65408,688.375,94.017977,22,250,7.753474
1237,1362,65386,702.375,92.092721,23,244,7.806384
1419,1565,65420,696.125,92.977375,8,260,
2740,2986,65515,690.625,93.863348,0,89,4.196051
3303,3597,65419,8789.0,6.443281,10,63,
3305,3599,65419,16880.625,2.87539,5,64,0.268685
3309,3603,65420,24970.125,1.619931,3,67,0.286363


In [38]:
df[df['V_A']>800][['ID','V_A','V_A_avg_sequence','V_A_exception_ratio','V_B','V_C','y']]

Unnamed: 0,ID,V_A,V_A_avg_sequence,V_A_exception_ratio,V_B,V_C,y
14,22,65382,722.25,89.525441,7,107,5.440741
859,948,65394,605.625,106.977709,4,14,
981,1070,65477,696.5,93.008615,41,692,
1065,1173,65408,688.375,94.017977,22,250,7.753474
1237,1362,65386,702.375,92.092721,23,244,7.806384
1419,1565,65420,696.125,92.977375,8,260,
2740,2986,65515,690.625,93.863348,0,89,4.196051
3303,3597,65419,8789.0,6.443281,10,63,
3305,3599,65419,16880.625,2.87539,5,64,0.268685
3309,3603,65420,24970.125,1.619931,3,67,0.286363


In [42]:
df['V_B_avg_sequence'] = np.nanmean([df['V_B'].shift(i) for i in rolling_mask_eight],axis=0)
df['V_B_exception_ratio'] = np.abs(df['V_B']-df['V_B_avg_sequence'])/df['V_B_avg_sequence']

In [46]:
df[df['V_B']>800][['ID','V_B','V_B_avg_sequence','V_B_exception_ratio','V_A','V_C','y']].shape

(23, 7)

In [50]:
#同样，对于VB而言，1.6也是比较合理的，电压值大于800的均被筛选出
df[df['V_B_exception_ratio']>1.6][['ID','V_B','V_B_avg_sequence','V_B_exception_ratio','V_A','V_C','y']]

Unnamed: 0,ID,V_B,V_B_avg_sequence,V_B_exception_ratio,V_A,V_C,y
527,591,65402,24870.625,1.629689,36,0,
528,592,65402,24871.75,1.62957,36,0,0.426489
529,593,65403,24871.75,1.62961,37,3,
530,594,65403,24870.875,1.629702,37,3,
1067,1175,65406,534.25,121.425831,553,27,
1326,1451,65454,681.875,94.991201,560,27,7.99687
1373,1519,65387,712.25,90.80344,77,14,0.839478
1722,1894,65350,710.875,90.928961,74,22,0.780917
1950,2137,65428,695.625,93.056424,563,30,
2036,2223,65446,706.125,91.683307,297,18,


In [51]:
df['V_C_avg_sequence'] = np.nanmean([df['V_C'].shift(i) for i in rolling_mask_eight],axis=0)
df['V_C_exception_ratio'] = np.abs(df['V_C']-df['V_C_avg_sequence'])/df['V_C_avg_sequence']

In [54]:
df[df['V_C']>800][['ID','V_C','V_C_avg_sequence','V_C_exception_ratio','V_A','V_B','y']].shape

(13, 7)

In [57]:
#同样，对于VC而言，1.6也是比较合理的，电压值大于800的均被筛选出
df[df['V_C_exception_ratio']>1.6][['ID','V_C','V_C_avg_sequence','V_C_exception_ratio','V_A','V_B','y']]

Unnamed: 0,ID,V_C,V_C_avg_sequence,V_C_exception_ratio,V_A,V_B,y
127,135,65498,677.125,95.729555,555,559,
531,595,65406,341.25,190.665934,37,39,
673,737,65514,641.125,101.186001,37,37,
1178,1286,65438,713.75,90.681961,310,307,4.694385
1520,1666,65460,684.875,94.579485,293,255,4.521973
1561,1717,65396,685.625,94.381586,36,40,
2214,2414,65470,700.625,92.445138,292,296,4.299538
3108,3393,65470,654.625,99.011457,34,38,
3136,3421,65475,700.375,92.485633,290,300,
3253,3538,65463,8679.125,6.542581,36,37,0.344703


# 试验休了，时长为前后各4个时刻，偏差率为1.6，以此作为初值

In [5]:
for c in ['I_A','I_B','I_C','V_A','V_B','V_C']:
    df[c+'_avg_sequence'] = np.nanmean([df[c].shift(i) for i in rolling_mask_eight],axis=0)
    df[c+'_exception_ratio'] = np.abs(df[c]-df[c+'_avg_sequence'])/df[c+'_avg_sequence']

In [6]:
def drop_all_outlier(df):
    for c in ['I_A','I_B','I_C','V_A','V_B','V_C']:
        df.drop(df[(df[c+'_exception_ratio'] > 1.6)].index,inplace=True)
    df.drop(df[(df.env_t > 30) | (df.env_t < -30)].index,inplace=True)
    df.drop(df[(df.wind_speed > 20)].index,inplace=True)

    return df

In [7]:
#train和test拆分
train_data = df[df['is_train']==1]
test_data = df[df['is_train']==0]

In [8]:
#准备测试结果
df_result = pd.DataFrame()
df_result['ID'] = list(test_data['ID'])


In [9]:
special_missing_ID = test_data[test_data[(test_data == 0) | (test_data == 0.)].count(axis=1) > 13]['ID']

In [10]:
cleaned_train_data = train_data.copy()
cleaned_train_data = drop_all_outlier(cleaned_train_data)

cleaned_sub_data = test_data.copy()
cleaned_sub_data = drop_all_outlier(cleaned_sub_data)
cleaned_sub_data_ID = cleaned_sub_data['ID']

In [35]:
all_data  = pd.concat([train_data, test_data], axis=0).sort_values(by='ID').reset_index().drop(['index'], axis=1)
bad_feature = ['ID','P_A', 'P_B', 'P_C', 'P_avg', 'env_t', 'V_A', 'V_B', 'V_C', 'I_B', 'I_C', 'efficiency', 'efficiency_A', 'efficiency_B', 'efficiency_C']
bad_index1 = all_data[bad_feature][
    (all_data[bad_feature] > all_data[bad_feature].mean() + 2 * all_data[bad_feature].std()) | 
    (all_data[bad_feature] < all_data[bad_feature].mean() - 2 * all_data[bad_feature].std())
].dropna(how='all').index
bad_index2 = all_data[
    ((all_data['V_A']<500)&(all_data['V_A']!=0))|
    ((all_data['V_B']<500)&(all_data['V_B']!=0))|
    ((all_data['V_C']<500)&(all_data['V_C']!=0))].index
bad_index = pd.Int64Index(list(bad_index1)+list(bad_index2))
# all_data.loc[np.concatenate([bad_index -1,bad_index,bad_index+1])].sort_values(by='ID', ascending=True)


nn_bad_data = all_data.loc[np.concatenate([bad_index - 1, bad_index, bad_index + 1])].sort_values(by='ID', ascending=True).drop_duplicates()
bad_data = all_data.loc[bad_index].sort_values(by='ID', ascending=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [36]:
# 上下记录均值替代异常值
for idx, line in bad_data.iterrows():
    ID = line['ID']
    col_index = line[bad_feature][ 
        (line[bad_feature] > all_data[bad_feature].mean() + 3 * all_data[bad_feature].std())| 
        (line[bad_feature] < all_data[bad_feature].mean() - 3 * all_data[bad_feature].std())
    ].index
    index = all_data[all_data['ID'] == ID].index
    
    before_offset = 1
    while (idx + before_offset)in bad_index:
        before_offset += 1

    after_offset = 1
    while (idx + after_offset) in bad_index:
        after_offset += 1
    
    replace_value = (all_data.loc[index - before_offset, col_index].values + all_data.loc[index + after_offset, col_index].values) / 2
    all_data.loc[index, col_index] = replace_value[0]

In [11]:
#异常值被前后各4个点的平均值替代
for idx, line in df.iterrows():
    for c in ['I_A','I_B','I_C','V_A','V_B','V_C']:
        if line[c+'_exception_ratio']>1.6:
            line.loc[c] = line[c+'_avg_sequence']
            print(str(line[c]) + 'is replaced by '+str(line[c+'_avg_sequence'])) 

2.85625is replaced by 2.85625
722.25is replaced by 722.25
677.125is replaced by 677.125
2.56is replaced by 2.56
2.22125is replaced by 2.22125
241.31999999999994is replaced by 241.31999999999994
24870.625is replaced by 24870.625
241.33249999999998is replaced by 241.33249999999998
24871.75is replaced by 24871.75
2.2449999999999997is replaced by 2.2449999999999997
241.33249999999998is replaced by 241.33249999999998
24871.75is replaced by 24871.75
241.3425is replaced by 241.3425
24870.875is replaced by 24870.875
2.25375is replaced by 2.25375
341.25is replaced by 341.25
0.3125is replaced by 0.3125
0.31875is replaced by 0.31875
0.32375is replaced by 0.32375
641.125is replaced by 641.125
0.515is replaced by 0.515
0.2875is replaced by 0.2875
0.7137500000000001is replaced by 0.7137500000000001
605.625is replaced by 605.625
5.369999999999999is replaced by 5.369999999999999
696.5is replaced by 696.5
5.754999999999999is replaced by 5.754999999999999
688.375is replaced by 688.375
4.890000000000001i

In [12]:
#拆分数据
train_data = df[df['is_train']==1].reset_index().drop(['index'], axis=1)
test_data = df[df['is_train']==0].drop(['y'], axis=1).reset_index().drop(['index'], axis=1)

In [38]:
#拆分数据
train_data = all_data.drop(all_data[all_data['ID'].isin(df_result['ID'])].index).reset_index().drop(['index'], axis=1)
test_data = all_data[all_data['ID'].isin(df_result['ID'])].drop(['y'], axis=1).reset_index().drop(['index'], axis=1)
len(train_data), len(test_data)
# 去除重复值
train_data = train_data.drop_duplicates(train_data.columns.drop('ID'), keep='first')

In [13]:
train_data = add_avg(train_data)
test_data = add_avg(test_data)
cleaned_train_data = add_avg(cleaned_train_data)
cleaned_sub_data = add_avg(cleaned_sub_data)

In [14]:
cleaned_sub_data = cleaned_sub_data.drop(['y'], axis=1).reset_index().drop(['index'], axis=1)

In [21]:
cleaned_train_data.head()

Unnamed: 0,ID,board_t,env_t,light_strength,efficiency,efficiency_A,efficiency_B,efficiency_C,V_A,V_B,...,I_B_exception_ratio,I_C_avg_sequence,I_C_exception_ratio,V_A_avg_sequence,V_A_exception_ratio,V_B_avg_sequence,V_B_exception_ratio,V_C_avg_sequence,V_C_exception_ratio,old_SoCalledSF_P_avg
2,10,-19.14,-17.4,34,80.55,106.32,16.98,118.36,729,709,...,0.03937,1.486667,0.008969,606.666667,0.201648,597.666667,0.18628,603.666667,0.200994,2011.693333
3,11,-18.73,-17.3,30,99.9,139.0,21.2,139.51,728,717,...,0.070064,1.594286,0.021505,624.0,0.166667,615.285714,0.165312,621.285714,0.168544,2011.693333
4,12,-17.54,-17.0,41,82.48,114.86,14.91,117.66,731,722,...,0.206897,1.66,0.096386,637.25,0.147117,628.75,0.14831,634.875,0.134082,2011.693333
6,14,-15.43,-16.6,53,73.98,101.72,15.55,104.67,730,727,...,0.633136,2.08,0.004808,729.0,0.001372,721.5,0.007623,725.375,0.000862,2011.693333
7,15,-14.6,-16.3,65,64.62,86.86,13.09,93.92,727,729,...,0.718681,2.2075,0.041903,729.0,0.002743,723.375,0.007776,725.125,0.003965,2011.693333


In [15]:
def drop_cols(df):
    col_list=[] 
    for c in ['I_A','I_B','I_C','V_A','V_B','V_C']:
        col_list.append(c+'_exception_ratio')
        col_list.append(c+'_avg_sequence')
    df.drop(col_list,axis=1,inplace=True)
    return df


In [16]:
train_data=drop_cols(train_data)
test_data = drop_cols(test_data)
cleaned_train_data = drop_cols(cleaned_train_data)
cleaned_sub_data = drop_cols(cleaned_sub_data)

In [17]:
cleaned_sub_data.head()

Unnamed: 0,ID,board_t,env_t,light_strength,efficiency,efficiency_A,efficiency_B,efficiency_C,V_A,V_B,...,I_C,P_A,P_B,P_C,P_avg,wind_speed,wind_direction,y,is_train,old_SoCalledSF_P_avg
0,1,0.01,0.1,1,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.1,1,,0,2277.953333
1,9,-19.33,-17.5,13,198.32,259.11,42.17,293.66,722,705,...,1.43,909.72,148.05,1031.03,696.27,0.3,273,,0,2277.953333
5,13,-16.68,-16.6,50,73.59,97.95,14.7,108.12,729,715,...,2.02,1334.07,200.2,1472.58,1002.28,0.9,277,,0,2277.953333
9,17,-13.27,-16.2,83,75.36,73.55,73.36,79.16,728,723,...,2.5,1681.68,1677.36,1810.0,1723.01,0.7,280,,0,2277.953333
10,18,-12.41,-16.2,86,76.06,75.89,73.95,78.34,727,729,...,2.56,1802.96,1756.89,1861.12,1806.99,1.0,279,,0,2277.953333


In [17]:
X_train, X_test, y_train, y_test, sub_data, sm, polynm = generate_train_data(train_data, test_data, poly=True, select=True)

clean_X_train, clean_X_test, clean_y_train, clean_y_test, clean_sub_data, _, _ = generate_train_data(cleaned_train_data, cleaned_sub_data, poly=False, select=False)

clean_X = np.concatenate([clean_X_train, clean_X_test])
clean_y = np.concatenate([clean_y_train, clean_y_test])
clean_X = polynm.transform(clean_X)
clean_X = sm.transform(clean_X)

clean_sub_data = polynm.transform(clean_sub_data)
clean_sub_data = sm.transform(clean_sub_data)

In [18]:
all_X_train = np.concatenate([X_train, X_test])
all_y_train = np.concatenate([y_train, y_test])

In [19]:
xgbt1 = xgb.XGBRegressor(n_estimators=950, max_depth=3, max_features='sqrt', random_state=321, n_jobs=8)
xgbt2 = xgb.XGBRegressor(n_estimators=1000, max_depth=3, max_features='sqrt', random_state=456, n_jobs=8)
xgbt3 = xgb.XGBRegressor(n_estimators=1100, max_depth=3, max_features='sqrt', random_state=789, n_jobs=8)
# n_estimators=1000  max_depth=5  'sqrt'  GradientBoostingRegressor 最佳参数 ,learning_rate=0.08
gbdt1 = GradientBoostingRegressor(n_estimators=800, max_depth=4, max_features='log2', random_state=123,learning_rate=0.08)
gbdt2 = GradientBoostingRegressor(n_estimators=900, max_depth=4, max_features='log2', random_state=456,learning_rate=0.08)
gbdt3 = GradientBoostingRegressor(n_estimators=1000, max_depth=5, max_features='log2', random_state=789,learning_rate=0.08)
# n_estimators=700, max_features='auto', random_state=2, n_jobs=8,max_depth=10
forest1 = RandomForestRegressor(n_estimators=800, max_features='sqrt', random_state=7, n_jobs=8)
forest2 = RandomForestRegressor(n_estimators=900, max_features='log2', random_state=9, n_jobs=8)
forest3 = RandomForestRegressor(n_estimators=900, max_features='sqrt', random_state=11, n_jobs=8) 

lgb1 = LGBMRegressor(n_estimators=900, max_depth=5, random_state=5, n_jobs=8) 
lgb2 = LGBMRegressor(n_estimators=850, max_depth=4, random_state=7, n_jobs=8)
lgb3 = LGBMRegressor(n_estimators=720, max_depth=4, random_state=9, n_jobs=8)

# xgbt1 = xgb.XGBRegressor(n_estimators=950, max_depth=3, max_features='sqrt', random_state=2, n_jobs=8)
# xgbt2 = xgb.XGBRegressor(n_estimators=1000, max_depth=3, max_features='sqrt', random_state=3, n_jobs=8)
# xgbt3 = xgb.XGBRegressor(n_estimators=1100, max_depth=3, max_features='sqrt', random_state=4, n_jobs=8)

# gbdt1 = GradientBoostingRegressor(n_estimators=500, max_depth=3, max_features='sqrt', random_state=2)
# gbdt2 = GradientBoostingRegressor(n_estimators=400, max_depth=3, max_features='sqrt', random_state=3)
# gbdt3 = GradientBoostingRegressor(n_estimators=500, max_depth=4, max_features='log2', random_state=4)

# forest1 = RandomForestRegressor(n_estimators=300, max_features='sqrt', random_state=2, n_jobs=8)
# forest2 = RandomForestRegressor(n_estimators=300, max_features='log2', random_state=3, n_jobs=8)
# forest3 = RandomForestRegressor(n_estimators=600, max_features='sqrt', random_state=4, n_jobs=8) 

# lgb1 = LGBMRegressor(n_estimators=900, max_depth=5, random_state=2, n_jobs=8) 
# lgb2 = LGBMRegressor(n_estimators=850, max_depth=4, random_state=3, n_jobs=8)
# lgb3 = LGBMRegressor(n_estimators=720, max_depth=4, random_state=4, n_jobs=8)

cross_validation_test(
    models=[    
        xgbt1, xgbt2, xgbt3,
        gbdt1, gbdt2, gbdt3,
        forest1, forest2, forest3,
        lgb1, lgb2, lgb3
    ],
    train_X_data=[
        all_X_train, all_X_train, all_X_train, all_X_train,
        all_X_train, all_X_train, all_X_train, all_X_train,
        all_X_train, all_X_train, all_X_train, all_X_train
    ],
    train_y_data=[
        all_y_train, all_y_train, all_y_train, all_y_train,
        all_y_train, all_y_train, all_y_train, all_y_train,
        all_y_train, all_y_train, all_y_train, all_y_train
    ]
)

1 - Model: XGBRegressor
MSE: [0.02099711 0.02525734 0.03396981 0.08523078 0.01639701]
Score: [0.87343582 0.86286818 0.84437422 0.77402789 0.88648491]
Average XGB - MSE: 0.03637040987573491  - Score: 0.8482382037012346 

2 - Model: XGBRegressor
MSE: [0.02098197 0.02521585 0.0338727  0.08516197 0.01636402]
Score: [0.8734757  0.86296541 0.84456223 0.77409852 0.88658619]
Average XGB - MSE: 0.036319300409360995  - Score: 0.8483376104007381 

3 - Model: XGBRegressor
MSE: [0.02088441 0.02522819 0.03378339 0.08503715 0.0162451 ]
Score: [0.87373302 0.86293649 0.84473544 0.77422673 0.88695235]
Average XGB - MSE: 0.036235648314689624  - Score: 0.8485168053055088 

4 - Model: GradientBoostingRegressor
MSE: [0.01474687 0.01843338 0.03630021 0.0843154  0.02030152]
Score: [0.8917133  0.88046027 0.83996476 0.77497083 0.87528624]
Average XGB - MSE: 0.034819475637591524  - Score: 0.852479080722752 

5 - Model: GradientBoostingRegressor
MSE: [0.01623187 0.01980781 0.03758554 0.07981223 0.0182266 ]
Score:

Unnamed: 0,Model,Avg MSE,Avg Score
0,1.XGBRegressor,0.03637,0.848238
1,2.XGBRegressor,0.036319,0.848338
2,3.XGBRegressor,0.036236,0.848517
3,4.GradientBoostingRegressor,0.034819,0.852479
4,5.GradientBoostingRegressor,0.034333,0.8524
5,6.GradientBoostingRegressor,0.033817,0.852557
6,7.RandomForestRegressor,0.038812,0.841891
7,8.RandomForestRegressor,0.039294,0.841324
8,9.RandomForestRegressor,0.039438,0.841175
9,10.LGBMRegressor,0.037472,0.84456


In [20]:
regrs = [
    xgbt1, gbdt1, forest1, lgb1,
    xgbt2, gbdt2, forest2, lgb2,
    xgbt3, gbdt3, forest3, lgb3
]

In [21]:
class Stacker(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
    
    # X: 原始训练集, y: 原始训练集真实值, predict_data: 原始待预测数据
    def fit_predict(self, X, y, predict_data):
        X = np.array(X)
        y = np.array(y)
        T = np.array(predict_data)

        folds = list(KFold(n_splits=self.n_splits, shuffle=False, random_state=2018).split(X, y))
        
        # 以基学习器预测结果为特征的 stacker的训练数据 与 stacker预测数据
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_predict = np.zeros((T.shape[0], len(self.base_models)))
        
        for i, regr in enumerate(self.base_models):
            print(i + 1, 'Base model:', str(regr).split('(')[0])
            S_predict_i = np.zeros((T.shape[0], self.n_splits))
            
            for j, (train_idx, test_idx) in enumerate(folds):
                # 将X分为训练集与测试集
                X_train, y_train, X_test, y_test = X[train_idx], y[train_idx], X[test_idx], y[test_idx]
                print ('Fit fold', (j+1), '...')
                regr.fit(X_train, y_train)
                y_pred = regr.predict(X_test)                
                S_train[test_idx, i] = y_pred
                S_predict_i[:, j] = regr.predict(T)
            
            S_predict[:, i] = S_predict_i.mean(axis=1)

        nmse_score = cross_val_score(self.stacker, S_train, y, cv=5, scoring='neg_mean_squared_error')
        print('CV MSE:', -nmse_score)
        print('Stacker AVG MSE:', -nmse_score.mean(), 'Stacker AVG Score:', np.mean(np.divide(1, 1 + np.sqrt(-nmse_score))))

        self.stacker.fit(S_train, y)
        res = self.stacker.predict(S_predict)
        return res, S_train, S_predict

In [22]:

# stacking_mode1 = Ridge(alpha=0.008, copy_X=True, fit_intercept=False, solver='auto', random_state=2)# stacki 
stacking_model = SVR(C=100, gamma=0.01, epsilon=0.01)
stacker = Stacker(5, stacking_model, regrs)
pred_stack, S_train_data, S_predict_data = stacker.fit_predict(all_X_train, all_y_train, sub_data)

1 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
2 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
3 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
4 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
5 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
6 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
7 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
8 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
9 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
10 Base model: GradientBoostingRegre

In [23]:
stacking_model2 = SVR(C=100, gamma=0.01, epsilon=0.01)
stacker2 = Stacker(5, stacking_model2, regrs)
pred_clean_stack, S_clean_train_data, S_clean_predict_data = stacker2.fit_predict(clean_X, clean_y, clean_sub_data)

1 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
2 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
3 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
4 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
5 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
6 Base model: GradientBoostingRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
7 Base model: RandomForestRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
8 Base model: LGBMRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
9 Base model: XGBRegressor
Fit fold 1 ...
Fit fold 2 ...
Fit fold 3 ...
Fit fold 4 ...
Fit fold 5 ...
10 Base model: GradientBoostingRegre

In [24]:
df_result['score'] = pred_stack

In [25]:
index = df_result[df_result['ID'].isin(special_missing_ID)].index
df_result.loc[index, 'score'] = 0.379993053

In [26]:
c_index = df_result[df_result['ID'].isin(cleaned_sub_data_ID)].index
df_result.loc[c_index, 'score'] = pred_clean_stack

In [54]:
df_result.to_csv('../result/081002_08816.csv', index=False, header=False)