# 1.问题分析

## 1.1.问题类别

## 1.2.特征分析 

## 1.3.数据挖掘

# 2.准备工作

## 2.1.准备工具包

In [1]:
import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import xgboost as xgb
from lightgbm import LGBMRegressor
import math


from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import StratifiedKFold


%matplotlib inline

  from numpy.core.umath_tests import inner1d


## 2.2.原始数据准备(不进行任何清洗)

In [2]:
# Load the published train/test CSV files without any cleaning.
train_raw = pd.read_csv('../data/public_raw.train.csv')
test_raw = pd.read_csv('../data/public_raw.test.csv')

# Tag each row with its origin so the two sets can be separated again later.
train_raw['is_train'] = 1
test_raw['is_train'] = 0

# Mapping from the original Chinese column headers to short ASCII names.
rep_cols = {
    'ID': 'ID',
    '板温': 'board_t',
    '现场温度': 'env_t',
    '光照强度': 'light_strength',
    '转换效率': 'efficiency',
    '转换效率A': 'efficiency_A',
    '转换效率B': 'efficiency_B',
    '转换效率C': 'efficiency_C',
    '电压A': 'V_A',
    '电压B': 'V_B',
    '电压C': 'V_C',
    '电流A': 'I_A',
    '电流B': 'I_B',
    '电流C': 'I_C',
    '功率A': 'P_A',
    '功率B': 'P_B',
    '功率C': 'P_C',
    '平均功率': 'P_avg',
    '风速': 'wind_speed',
    '风向': 'wind_direction',
    '发电量': 'y',
}

# Stack train on top of test, rename the columns, order the rows by ID and
# finally replace the index with a fresh 0..n-1 range.
df = (
    pd.concat([train_raw, test_raw], sort=False)
    .rename(index=str, columns=rep_cols)
    .sort_values(by=['ID'], ascending=True)
    .reset_index(drop=True)
)

# 3.数据清洗

In [3]:
# Original cleaning route: work on a copy of df so the merged frame keeps its raw values.
all_data = df.copy()
# Columns that are screened for outliers.
bad_feature = ['P_A', 'P_B', 'P_C', 'P_avg', 'env_t', 'V_A', 'V_B', 'V_C', 'I_B', 'I_C', 'efficiency', 'efficiency_A', 'efficiency_B', 'efficiency_C']
# Rule 1: rows where at least one screened column lies more than 2 standard
# deviations away from that column's mean.
bad_index1 = all_data[bad_feature][
    (all_data[bad_feature] > all_data[bad_feature].mean() + 2 * all_data[bad_feature].std()) | 
    (all_data[bad_feature] < all_data[bad_feature].mean() - 2 * all_data[bad_feature].std())
].dropna(how='all').index
# Rule 2: rows where any of the three voltages is non-zero but below 500.
bad_index2 = all_data[
    ((all_data['V_A']<500)&(all_data['V_A']!=0))|
    ((all_data['V_B']<500)&(all_data['V_B']!=0))|
    ((all_data['V_C']<500)&(all_data['V_C']!=0))].index
# The two label lists are concatenated as-is, so a row flagged by both rules
# appears twice in bad_index.
# NOTE(review): pd.Int64Index is no longer available in pandas >= 2.0 — confirm
# the pinned pandas version before running this cell.
bad_index = pd.Int64Index(list(bad_index1)+list(bad_index2))

# Disabled alternative: flag rows with the 2-sigma rule only.
# bad_index = all_data[bad_feature][
#     (all_data[bad_feature] > all_data[bad_feature].mean() + 2 * all_data[bad_feature].std()) | 
#     (all_data[bad_feature] < all_data[bad_feature].mean() - 2 * all_data[bad_feature].std())
# ].dropna(how='all').index



# Snapshot of the flagged rows, ordered by ID; the loop below iterates over this
# snapshot while writing repaired values into all_data.
bad_data = all_data.loc[bad_index].sort_values(by='ID', ascending=True)





# Replace outliers with the mean of the neighbouring records above and below.
for idx, line in bad_data.iterrows():
    ID = line['ID']
    # Columns of this row that are beyond 3 standard deviations (a wider band
    # than the 2-sigma rule used for flagging). mean()/std() are recomputed on
    # all_data in every iteration, i.e. on the partially repaired frame.
    col_index = line[bad_feature][ 
        (line[bad_feature] > all_data[bad_feature].mean() + 3 * all_data[bad_feature].std())| 
        (line[bad_feature] < all_data[bad_feature].mean() - 3 * all_data[bad_feature].std())
    ].index
    # Row label(s) in all_data carrying this ID.
    index = all_data[all_data['ID'] == ID].index
    
    # Author's note: using idx - before_offset gave CV 0.8684.
    # Walk backwards until the neighbouring label is not itself flagged.
    before_offset = 1
    while (idx - before_offset)in bad_index:
        before_offset += 1

    # Walk forwards until the neighbouring label is not itself flagged.
    after_offset = 1
    while (idx + after_offset) in bad_index:
        after_offset += 1
    
    # Label arithmetic relies on the 0..n-1 integer index created by reset_index.
    # NOTE(review): there is no bounds check — a flagged row at either end of the
    # frame would look up a label that does not exist; confirm this cannot occur.
    replace_value = (all_data.loc[index - before_offset, col_index].values + all_data.loc[index + after_offset, col_index].values) / 2
    all_data.loc[index, col_index] = replace_value[0]

# 4.特征工程

## 4.1.增加前后有效发电量均值

In [4]:
# Continue from a copy of the cleaned data.
df = all_data.copy()

# Row count and positional index of the target column, used by the helpers below.
df_len = len(df)
i_y = df.columns.get_loc("y")

# Two before, two after: containers for the nearest and second-nearest
# labelled neighbours of every row (values and row positions).
next_one, prev_one = [], []
next_id, prev_id = [], []
second_next_one, second_prev_one = [], []

def get_prev_nn_index(cur_i):
    """Return the position of the closest earlier row whose ``y`` is not null.

    The scan starts at ``cur_i - 1`` and moves towards position 0, reading the
    module-level ``df`` at column position ``i_y``. A negative return value
    means no such row exists.
    """
    candidate = cur_i - 1
    while candidate >= 0:
        if not pd.isnull(df.iat[candidate, i_y]):
            break
        candidate -= 1
    return candidate

def get_next_nn_index(cur_i):
    """Return the position of the closest later row whose ``y`` is not null.

    The scan starts at ``cur_i + 1`` and moves forward, reading the
    module-level ``df`` at column position ``i_y``. A return value greater
    than or equal to the module-level ``df_len`` means no such row exists.
    """
    candidate = cur_i + 1
    while candidate < df_len:
        if not pd.isnull(df.iat[candidate, i_y]):
            break
        candidate += 1
    return candidate

# For every row, collect the nearest and second-nearest rows on each side whose
# y is known (non-null). A row never counts as its own neighbour because both
# helpers start scanning one position away from the row.
for i in range(df_len):
    # Nearest labelled row before i; negative means there is none.
    f_pre_i = get_prev_nn_index(i)
    if f_pre_i < 0:
        prev_one.append(np.nan)
        prev_id.append(0)
    else:
        prev_one.append(df.iat[f_pre_i, i_y])
        prev_id.append(f_pre_i)

    # Second-nearest labelled row before i (stays negative when f_pre_i is).
    s_pre_i = get_prev_nn_index(f_pre_i)
    if s_pre_i < 0:
        second_prev_one.append(np.nan)
    else:
        second_prev_one.append(df.iat[s_pre_i, i_y])

    # Nearest labelled row after i; a position >= df_len means there is none.
    f_next_i = get_next_nn_index(i)
    if f_next_i < df_len:
        next_one.append(df.iat[f_next_i, i_y])
        next_id.append(f_next_i)
    else:
        next_one.append(np.nan)
        next_id.append(df_len)

    # Second-nearest labelled row after i.
    s_next_i = get_next_nn_index(f_next_i)
    if s_next_i < df_len:
        second_next_one.append(df.iat[s_next_i, i_y])
    else:
        second_next_one.append(np.nan)


# avg_value is the NaN-ignoring mean of the nearest labelled y before and after
# each row; the two helper columns are only needed to compute it.
df['next_value'] = next_one
df['prev_value'] = prev_one
df['avg_value'] = np.nanmean([df['next_value'], df['prev_value']], axis=0)

# axis is passed by keyword: the positional form (`drop(labels, 1, ...)`) is
# rejected by pandas >= 2.0, while the keyword form works on every version.
df.drop(['next_value', 'prev_value'], axis=1, inplace=True)



## 4.2.增加前后功率均值

In [5]:
def add_avg(df):
    """Add the ``old_SoCalledSF_P_avg`` column derived from ``P_avg``.

    The values are reproduced exactly as the original implementation computed
    them; only the redundant inner loop (which recomputed the same number ten
    times and kept the last result) has been removed.

    * Rows at positions 0-9 all receive the same constant: the mean of the
      ``P_avg`` values at positions 6, 7 and 8.
    * Every later row ``i`` receives the sum of the values at ``i-1``, ``i-2``,
      ``i-3``, ``i-5``, ``i-6``, ``i-7``, ``i-8`` and ``i-9`` divided by 9.

    NOTE(review): the second window skips ``i-4`` and divides eight terms by
    nine. That is kept unchanged so the feature values stay the same — confirm
    whether it is intentional.

    Args:
        df: DataFrame with a ``P_avg`` column. It is modified in place.

    Returns:
        The same ``df`` object with the new column attached.

    Raises:
        IndexError: if ``df`` has between 1 and 8 rows, because position 8 is
            read for the first rows.
    """
    array = np.array(df["P_avg"])
    newarray = []
    for i in range(len(array)):
        if i < 10:
            # Fixed window over positions 6..8, independent of i.
            num = (array[8] + array[7] + array[6]) / 3
        else:
            # Trailing window over i-9..i-1 without i-4 (see docstring note).
            num = (array[i-1] + array[i-2] + array[i-3] + array[i-5]
                   + array[i-6] + array[i-7] + array[i-8] + array[i-9]) / 9
        newarray.append(num)
    df["old_SoCalledSF_P_avg"] = newarray
    return df

# Attach the P_avg-derived column to the working frame (add_avg also modifies df in place).
df = add_avg(df)

# 5.训练集测试集数据准备

## 5.1.去除训练集的重复样本

In [6]:
# Split the combined frame back into its training and test rows.
train_data = df.loc[df['is_train'] == 1]
test_data = df.loc[df['is_train'] == 0]
(train_data.shape[0], test_data.shape[0])

(9000, 8409)

In [7]:
# Prepare the submission frame: one row per test ID.
df_result = pd.DataFrame()
df_result['ID'] = test_data['ID'].tolist()

# IDs of test rows in which more than 13 cells are exactly zero. The count runs
# over every column of test_data, including is_train.
zero_cells = (test_data == 0) | (test_data == 0.)
zero_count_per_row = test_data[zero_cells].count(axis=1)
special_missing_ID = test_data[zero_count_per_row > 13]['ID']

In [8]:
# Display the number of rows in the submission frame.
df_result.shape[0]

8409

In [9]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,ID,board_t,env_t,light_strength,efficiency,efficiency_A,efficiency_B,efficiency_C,V_A,V_B,...,P_A,P_B,P_C,P_avg,wind_speed,wind_direction,y,is_train,avg_value,old_SoCalledSF_P_avg
2,10,-19.14,-17.4,34,80.55,106.32,16.98,118.36,729.0,709.0,...,976.86,155.98,1087.5,740.11,0.6,272,1.437752,1,1.692575,1172.806667
3,11,-18.73,-17.3,30,99.9,139.0,21.2,139.51,728.0,717.0,...,1128.4,172.08,1132.56,811.01,0.8,275,1.692575,1,1.70677,1172.806667
4,12,-17.54,-17.0,41,82.48,114.86,14.91,117.66,731.0,722.0,...,1279.25,166.06,1310.4,918.57,1.1,283,1.975787,1,2.031615,1172.806667
6,14,-15.43,-16.6,53,73.98,101.72,15.55,104.67,730.0,727.0,...,1474.6,225.37,1517.34,1072.44,0.9,280,2.370656,1,2.253939,1172.806667
7,15,-14.6,-16.3,65,64.62,86.86,13.09,93.92,727.0,729.0,...,1548.51,233.28,1674.4,1152.06,1.1,280,2.532091,1,2.575187,1172.806667


In [10]:
# Two training rows are duplicates when they agree on every column except ID and
# the two engineered columns; the first occurrence is kept.
dedup_columns = train_data.columns.drop(['ID', 'avg_value', 'old_SoCalledSF_P_avg'])

rows_before = train_data.shape[0]
print('去重前训练集条数:' + str(rows_before))

train_data = train_data.drop_duplicates(subset=dedup_columns, keep='first')

rows_after = train_data.shape[0]
print('去重后训练集条数:' + str(rows_after))

去重前训练集条数:9000
去重后训练集条数:8918


## 5.2.使训练集样本分布更合理

In [11]:
def improve_train_test_data(train_data, test_data, poly=False, select=False):
    """Build the model inputs from the train and test frames.

    The ``y``, ``ID`` and ``is_train`` columns are removed from the features.
    The training rows are split 80/20 with a fixed seed and then concatenated
    again (train part first, validation part second), so the returned rows are
    in the shuffled order produced by ``train_test_split``.

    Args:
        train_data: training frame containing ``y``, ``ID`` and ``is_train``.
        test_data: test frame with the same columns.
        poly: when true, expand the features with degree-2 interaction-only
            ``PolynomialFeatures`` fitted on the training features.
        select: when true, keep only the features chosen by a
            ``SelectFromModel`` wrapped around a ``GradientBoostingRegressor``
            that is fitted on the 80% part of the split.

    Returns:
        Tuple ``(train_X, train_Y, test_X)``.
    """
    dropped = ['y', 'ID', 'is_train']
    Y = train_data['y']
    X = train_data.drop(dropped, axis=1)
    test_data = test_data.drop(dropped, axis=1)

    if poly:
        from sklearn.preprocessing import PolynomialFeatures
        expander = PolynomialFeatures(degree=2, interaction_only=True)
        X = expander.fit_transform(X)
        test_data = expander.transform(test_data)

    X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=123)

    if select:
        from sklearn.feature_selection import SelectFromModel
        selector = SelectFromModel(GradientBoostingRegressor(random_state=2))
        X_train = selector.fit_transform(X_train, Y_train)
        X_val = selector.transform(X_val)
        test_data = selector.transform(test_data)

    # Re-join the two parts of the split: train part first, validation part second.
    features = np.concatenate([X_train, X_val])
    targets = np.concatenate([Y_train, Y_val])
    return features, targets, test_data

In [12]:
# # X_train, X_test, y_train, y_test, sub_data, sm, polynm = generate_train_data(train_data, test_data, poly=True, select=True)
# X_train, X_test, y_train, y_test, sub_data, sm, polynm = generate_train_data(train_data, test_data)

# train_X = np.concatenate([X_train, X_test])
# train_Y = np.concatenate([y_train, y_test])



# test_X = sub_data

In [None]:
# Build the final inputs with interaction features and feature selection both enabled.
train_X, train_Y, test_X = improve_train_test_data(train_data, test_data, poly=True, select=True)

# 6.Cross Validation

In [None]:
# Scoring function: SCORE = 1 / (1 + RMSE)
def cal_score(mse):
    """Convert a mean squared error into the score ``1 / (1 + sqrt(mse))``.

    A ``float`` argument is handled with the ``math`` module; anything else
    (for example an array of per-fold errors) is handled element-wise with
    NumPy.
    """
    if not isinstance(mse, float):
        return np.divide(1, 1 + np.sqrt(mse))
    rmse = math.sqrt(mse)
    return 1 / (1 + rmse)

# def cal_score(mse):
#     return np.divide(1, 1 + np.sqrt(mse))

# Cross-validation helper
def cross_validate(models, X, Y, cv=5):
    """Cross-validate each model and summarise its MSE and score.

    For every model the per-fold negative MSE is obtained from
    ``cross_val_score`` (``cv`` is passed through unchanged), turned into
    per-fold scores with ``cal_score`` and printed together with the averages.

    Args:
        models: iterable of estimators to evaluate.
        X: feature matrix.
        Y: target values.
        cv: cross-validation setting forwarded to ``cross_val_score``.

    Returns:
        DataFrame with one row per model and the columns ``Model``,
        ``Avg MSE`` and ``Avg Score``.
    """
    labels, mean_mses, mean_scores = [], [], []
    for position, model in enumerate(models, start=1):
        # Label = running number + estimator repr; the table keeps only the
        # part before the first '('.
        name = str(position) + '.' + str(model)
        print(name)
        labels.append(name.split('(')[0])

        # Per-fold MSE (the scorer reports the negated value).
        fold_mse = -cross_val_score(model, X, Y, cv=cv, scoring='neg_mean_squared_error')
        avg_mse = np.average(fold_mse)
        mean_mses.append(avg_mse)

        # Per-fold scores and their mean.
        scores = cal_score(fold_mse)
        avg_score = np.average(scores)
        mean_scores.append(avg_score)

        print('MSE:', fold_mse)
        print('Score:', scores)
        print('Average MSE:', avg_mse, ' - Score:', avg_score, '\n')

    res = pd.DataFrame()
    for column, values in (('Model', labels), ('Avg MSE', mean_mses), ('Avg Score', mean_scores)):
        res[column] = values
    return res

In [None]:
# Base learners: three variants each of XGBoost, gradient boosting, random
# forest and LightGBM, differing in size and random seed.

# NOTE(review): max_features is not a documented XGBRegressor parameter — it is
# presumably ignored by xgboost; verify against the installed version.
xgbt1 = xgb.XGBRegressor(n_estimators=950, max_depth=3, max_features='sqrt', random_state=321, n_jobs=8)
xgbt2 = xgb.XGBRegressor(n_estimators=1000, max_depth=3, max_features='sqrt', random_state=456, n_jobs=8)
xgbt3 = xgb.XGBRegressor(n_estimators=1100, max_depth=3, max_features='sqrt', random_state=789, n_jobs=8)
# Author's note — best parameters found for GradientBoostingRegressor:
# n_estimators=1000, max_depth=5, 'sqrt', learning_rate=0.08
gbdt1 = GradientBoostingRegressor(n_estimators=800, max_depth=4, max_features='log2', random_state=123,learning_rate=0.08)
gbdt2 = GradientBoostingRegressor(n_estimators=900, max_depth=4, max_features='log2', random_state=456,learning_rate=0.08)
gbdt3 = GradientBoostingRegressor(n_estimators=1000, max_depth=5, max_features='log2', random_state=789,learning_rate=0.08)
# Author's note — earlier random forest setting:
# n_estimators=700, max_features='auto', random_state=2, n_jobs=8,max_depth=10
forest1 = RandomForestRegressor(n_estimators=800, max_features='sqrt', random_state=7, n_jobs=8)
forest2 = RandomForestRegressor(n_estimators=900, max_features='log2', random_state=9, n_jobs=8)
forest3 = RandomForestRegressor(n_estimators=900, max_features='sqrt', random_state=11, n_jobs=8) 

lgb1 = LGBMRegressor(n_estimators=900, max_depth=5, random_state=5, n_jobs=8) 
lgb2 = LGBMRegressor(n_estimators=850, max_depth=4, random_state=7, n_jobs=8)
lgb3 = LGBMRegressor(n_estimators=720, max_depth=4, random_state=9, n_jobs=8)

In [None]:
# Full set: all twelve base learners, grouped by variant number.
regrs = [
    xgbt1, gbdt1, forest1, lgb1,
    xgbt2, gbdt2, forest2, lgb2,
    xgbt3, gbdt3, forest3, lgb3
]

# Light set: only the third variant of each model family.
regrs_light = [
    lgb3, xgbt3, gbdt3, forest3
]

In [None]:
# Evaluate the light model set with the function's default cv=5.
cross_validate(models=regrs_light, X = train_X, Y = train_Y)

# 7.Stack

In [None]:
class Stacker(object):
    """Two-level stacking: out-of-fold base predictions feed a meta-regressor.

    Args:
        n_splits: number of KFold splits used to produce the out-of-fold
            predictions of the base models.
        stacker: meta-regressor trained on the base-model predictions.
        base_models: sequence of regressors; each is refitted once per fold,
            so after ``fit_predict`` it holds the fit of the last fold.
        random_state: seed of the shuffled KFold split. Defaults to 2018,
            the value that was previously hard-coded.
    """

    def __init__(self, n_splits, stacker, base_models, random_state=2018):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models
        self.random_state = random_state

    def fit_predict(self, Train_X, Train_Y, Test_X):
        """Fit the base models and the meta-regressor, then predict the test set.

        Args:
            Train_X: original training feature matrix.
            Train_Y: original training targets.
            Test_X: original test feature matrix.

        Returns:
            Tuple ``(res, S_train, S_predict)``: the meta-regressor's test
            predictions, the out-of-fold base predictions for the training
            rows (one column per base model), and the fold-averaged base
            predictions for the test rows (one column per base model).
        """
        Train_X = np.array(Train_X)
        Train_Y = np.array(Train_Y)
        Test_X = np.array(Test_X)

        # The same shuffled folds are reused for every base model.
        splitter = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        folds = list(splitter.split(Train_X, Train_Y))

        # Meta-features: base-model predictions become the inputs of the
        # meta-regressor, for the training rows and for the test rows.
        S_train = np.zeros((Train_X.shape[0], len(self.base_models)))
        S_predict = np.zeros((Test_X.shape[0], len(self.base_models)))

        for n_model, regr in enumerate(self.base_models):
            print(n_model + 1, 'Base model:', str(regr).split('(')[0])
            # One column of test predictions per fold, averaged afterwards.
            S_predict_i = np.zeros((Test_X.shape[0], self.n_splits))

            for n_fold, (train_idx, test_idx) in enumerate(folds):
                print('Fit fold', (n_fold + 1), '...')
                regr.fit(Train_X[train_idx], Train_Y[train_idx])
                # Out-of-fold prediction for the held-out training rows.
                S_train[test_idx, n_model] = regr.predict(Train_X[test_idx])
                # Prediction of this fold's model on the full test set.
                S_predict_i[:, n_fold] = regr.predict(Test_X)

            S_predict[:, n_model] = S_predict_i.mean(axis=1)

        # The meta-regressor is evaluated with a fixed cv=5, independent of n_splits.
        nmse_score = cross_val_score(self.stacker, S_train, Train_Y, cv=5, scoring='neg_mean_squared_error')
        print('CV MSE:', -nmse_score)
        print('Stacker AVG MSE:', -nmse_score.mean(), 'Stacker AVG Score:', np.mean(np.divide(1, 1 + np.sqrt(-nmse_score))))

        self.stacker.fit(S_train, Train_Y)
        res = self.stacker.predict(S_predict)
        return res, S_train, S_predict

In [None]:
# Stack the light base-model set under an SVR meta-regressor, using 5 folds for
# the out-of-fold predictions.
stacking_model = SVR(C=100, gamma=0.01, epsilon=0.01)
stacker = Stacker(5, stacking_model, regrs_light)
pred_stack, S_train_data, S_predict_data = stacker.fit_predict(train_X, train_Y, test_X)

In [None]:
# stacking_model = SVR(C=100, gamma=0.01, epsilon=0.01)
# stacker = Stacker(5, stacking_model, regrs)
# pred_stack, S_train_data, S_predict_data = stacker.fit_predict(train_X, train_Y, test_X)

In [None]:
# df_result['score'] = pred_stack

# index = df_result[df_result['ID'].isin(special_missing_ID)].index
# df_result.loc[index, 'score'] = 0.379993053

# df_result.to_csv('../result/081701_08789.csv', index=False, header=False)