In [1]:
import numpy as np
import pandas as pd
 
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.externals import joblib 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.decomposition import PCA
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVR
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import xgboost as xgb
from lightgbm import LGBMRegressor
import math
%matplotlib inline

pd.set_option('display.max_colwidth',1000)
pd.set_option('display.height',1000)
pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',500)
pd.set_option('display.width',1000)

height has been deprecated.





In [2]:
train_data = pd.read_csv('./public.train.csv')
test_data = pd.read_csv('./public.test.csv')

In [3]:
df_result = pd.DataFrame()
df_result['ID'] = list(test_data['ID'])
special_missing_ID = test_data[test_data[(test_data == 0) | (test_data == 0.)].count(axis=1) > 13]['ID']

### 异常值处理

In [4]:
# 负值 -> 正值
# power_negative_index = train_data[train_data['发电量'] < 0].index
# train_data.loc[power_negative_index, '发电量'] = - train_data.loc[power_negative_index, '发电量']
# train_data.loc[power_negative_index, :]

In [5]:
all_data = pd.concat([train_data, test_data], axis=0).sort_values(by='ID').reset_index().drop(['index'], axis=1)
bad_feature = ['ID', '功率A', '功率B', '功率C', '平均功率', '现场温度', '电压A', '电压B', '电压C', '电流B', '电流C', '转换效率', '转换效率A', '转换效率B', '转换效率C']
bad_index = all_data[bad_feature][
    (all_data[bad_feature] > all_data[bad_feature].mean() + 2 * all_data[bad_feature].std()) | 
    (all_data[bad_feature] < all_data[bad_feature].mean() - 2 * all_data[bad_feature].std())
].dropna(how='all').index

nn_bad_data = all_data.loc[np.concatenate([bad_index - 1, bad_index, bad_index + 1])].sort_values(by='ID', ascending=True).drop_duplicates()
bad_data = all_data.loc[bad_index].sort_values(by='ID', ascending=True)

In [6]:
len(bad_data)

53

In [7]:
# 上下记录均值替代异常值
for idx, line in bad_data.iterrows():
    ID = line['ID']
    col_index = line[bad_feature][ 
        (line[bad_feature] > all_data[bad_feature].mean() + 2 * all_data[bad_feature].std())| 
        (line[bad_feature] < all_data[bad_feature].mean() - 2 * all_data[bad_feature].std())
    ].index
    index = all_data[all_data['ID'] == ID].index
    
    before_offset = 1
    while (idx + before_offset)in bad_index:
        before_offset += 1

    after_offset = 1
    while (idx + after_offset) in bad_index:
        after_offset += 1
    
    replace_value = (all_data.loc[index - before_offset, col_index].values + all_data.loc[index + after_offset, col_index].values) / 2
    all_data.loc[index, col_index] = replace_value[0]

### 拆分数据

In [8]:
test_data = all_data[all_data['ID'].isin(df_result['ID'])].drop(['发电量'], axis=1).reset_index().drop(['index'], axis=1)
len(test_data)

8409

In [9]:
train_data = all_data.drop(all_data[all_data['ID'].isin(df_result['ID'])].index).reset_index().drop(['index'], axis=1)
len(train_data)

9000

### 去除重复值

In [10]:
train_data.drop_duplicates(train_data.columns.drop('ID'), keep='first', inplace=True)

### Importance

In [None]:
# train_data['ID'] = train_data['ID'].apply(lambda x: x % 190)
# forest_imp = RandomForestRegressor(n_estimators=150, max_features='log2', random_state=2, n_jobs=8)
# X = train_data.drop(['发电量'], axis=1)
# y = train_data['发电量']
# forest_imp.fit(X, y)
# importance = forest_imp.feature_importances_

# indices = np.argsort(importance)[:: -1]
# print(indices, indices.shape)

# for f in range(X.shape[1]):
#     print("%2d) %-*s %f" %
#           (f + 1, 30, train_data.drop(['发电量'], axis=1).columns[indices[f]], importance[indices[f]]))

### Model: Xgboost, Sklearn_GBDT, RandomForest, LightGBM

In [11]:
def generate_train_data(train_data, test_data, poly=False, select=False):
    # 1. ID % 190
#     train_data['ID'] = train_data['ID'].apply(lambda x: x % 190)
#     sub_data = test_data
#     sub_data['ID'].apply(lambda x: x % 190)
#     y = train_data['发电量']
#     X = train_data.drop(['发电量'], axis=1)
    
    # 2. ID
#     y = train_data['发电量']
#     X = train_data.drop(['发电量'], axis=1)
#     sub_data = test_data
    
    # 3. without ID
    y = train_data['发电量']
    X = train_data.drop(['发电量','ID'], axis=1)
    sub_data = test_data.drop(['ID'], axis=1)
    
    # 4. shuffle
#     X, y = shuffle(X, y, random_state=123)
#     X, y = shuffle(X, y, random_state=1234)

    if poly:
        from sklearn.preprocessing import PolynomialFeatures
        poly = PolynomialFeatures(degree=2, interaction_only=True)
        X = poly.fit_transform(X)
        sub_data = poly.transform(sub_data)
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

    if select:
        from sklearn.feature_selection import SelectFromModel
        sm = SelectFromModel(GradientBoostingRegressor(random_state=2))
        X_train = sm.fit_transform(X_train, y_train)
        X_test = sm.transform(X_test)
        sub_data = sm.transform(sub_data)
        
    return X_train, X_test, y_train, y_test, sub_data

In [12]:
def cal_score(mse):
    return 1 / (1 + math.sqrt(mse))

In [13]:
X_train, X_test, y_train, y_test, sub_data = generate_train_data(train_data, test_data, poly=True, select=True)
print(X_train.shape, sub_data.shape)

(7134, 54) (8409, 54)


In [14]:
xgbt = xgb.XGBRegressor(n_estimators=300, max_depth=3, random_state=2, n_jobs=8)
gbdt = GradientBoostingRegressor(n_estimators=300, max_depth=3, max_features='log2', random_state=2)
forest = RandomForestRegressor(n_estimators=100, max_features='log2', random_state=2, n_jobs=8)

lgb_params = {}
lgb_params['n_estimators'] = 300
lgb_params['max_depth'] = 3 
lgb_params['random_state'] = 2
lgb = LGBMRegressor(**lgb_params)

In [15]:
def train(X_train, y_train):
    xgbt.fit(X_train, y_train)
    gbdt.fit(X_train, y_train)
    forest.fit(X_train, y_train)
    lgb.fit(X_train, y_train)

def predict(X_test, y_test):
    y_pred_xgb = xgbt.predict(X_test)
    mse_xgb = mean_squared_error(y_test.values, y_pred_xgb)
    
    y_pred_gbdt = gbdt.predict(X_test)
    mse_gbdt = mean_squared_error(y_test.values, y_pred_gbdt)
    
    y_pred_forest = forest.predict(X_test)
    mse_forest = mean_squared_error(y_true=y_test, y_pred=y_pred_forest)
    
    y_pred_lgb = lgb.predict(X_test)
    mse_lgb = mean_squared_error(y_true=y_test, y_pred=y_pred_lgb)
    
    res = pd.DataFrame()
    res['model'] = np.array(['XGBoost', 'Sklearn_GBDT', 'RandomForest', 'LightGBM'])
    res['mse'] = np.array([mse_xgb, mse_gbdt, mse_forest, mse_lgb])
    res['score'] = np.array([cal_score(mse_xgb), cal_score(mse_gbdt), cal_score(mse_forest), cal_score(mse_lgb)])
    return res

def cross_validation_using_mse(X_train, y_train, cv=5):
    scores_xgb = cross_val_score(xgbt, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    xgb_avg = np.average(-scores_xgb)
    print('Average XGB - MSE:', xgb_avg, ' - Score:', cal_score(xgb_avg))
    
    scores_gbdt = cross_val_score(gbdt, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    gbdt_avg = np.average(-scores_gbdt)
    print('Average GBDT - MSE:', gbdt_avg, ' - Score:', cal_score(gbdt_avg))
    
    scores_forest = cross_val_score(forest, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    rf_avg = np.average(-scores_forest)
    print('Average RF - MSE:', rf_avg, ' - Score:', cal_score(rf_avg))
    
    scores_lgb = cross_val_score(lgb, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
    lgb_avg = np.average(-scores_lgb)
    print('Average LGB - MSE:', lgb_avg, ' - Score:', cal_score(lgb_avg))
    
    res = pd.DataFrame({
        'XGBoost': -scores_xgb,
        'Sklearn_GBDT': -scores_gbdt,
        'RandomForest': -scores_forest,
        'LightGBM': -scores_lgb
    })
    
    return res

In [16]:
train(X_train, y_train)
predict(X_test, y_test)

Unnamed: 0,model,mse,score
0,XGBoost,0.016908,0.884931
1,Sklearn_GBDT,0.02053,0.874675
2,RandomForest,0.016276,0.886858
3,LightGBM,0.016375,0.886551


In [17]:
cross_validation_using_mse(np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test]), cv=5)

Average XGB - MSE: 0.03248307780460091  - Score: 0.8472920289633665
Average GBDT - MSE: 0.033259116671348274  - Score: 0.8457583538929327
Average RF - MSE: 0.030792564684868108  - Score: 0.8507176759148936
Average LGB - MSE: 0.030453712226382056  - Score: 0.8514189474729728


Unnamed: 0,LightGBM,RandomForest,Sklearn_GBDT,XGBoost
0,0.013062,0.013684,0.016057,0.013085
1,0.023412,0.021713,0.025226,0.029165
2,0.013821,0.014352,0.017467,0.01601
3,0.085465,0.087725,0.087286,0.087944
4,0.016508,0.016488,0.020259,0.016211


In [18]:
xgbt2 = xgb.XGBRegressor(n_estimators=300, max_depth=3, random_state=2, n_jobs=8)
gbdt2 = GradientBoostingRegressor(n_estimators=300, max_depth=3, max_features='log2', random_state=2)
forest2 = RandomForestRegressor(n_estimators=100, max_features='log2', random_state=2, n_jobs=8)
lgb2 = LGBMRegressor(**lgb_params)

In [19]:
xgbt2.fit(np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test]))
gbdt2.fit(np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test]))
forest2.fit(np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test]))
lgb2.fit(np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test]))

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=3,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=300, n_jobs=-1, num_leaves=31, objective=None,
       random_state=2, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

### KNN

In [20]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(np.concatenate([X_train, X_test]), np.concatenate([y_train, y_test]))

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [21]:
scores_knn = cross_val_score(knn, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(-scores_knn)
knn_avg = np.average(-scores_knn)
print('Average XGB - MSE:', knn_avg, ' - Score:', cal_score(knn_avg))

[0.0210029  0.03720661 0.02300897 0.07728633 0.07190166]
Average XGB - MSE: 0.04608129499247423  - Score: 0.823271892786727


### stack

In [22]:
all_X_train = np.concatenate([X_train, X_test])
all_y_train = np.concatenate([y_train, y_test])
regrs = [xgbt2, gbdt2, forest2, lgb2, knn]

In [23]:
stack_X_train = np.zeros((all_X_train.shape[0],len(regrs)))
stack_X_sub = np.zeros((sub_data.shape[0],len(regrs)))

for j, regr in enumerate(regrs):
    stack_X_train[:, j] = regr.predict(all_X_train)
    stack_X_sub[:, j] = regr.predict(sub_data)

stacker = RandomForestRegressor(n_estimators=70, random_state=2)
mse_stack = cross_val_score(stacker, stack_X_train, all_y_train, cv=5, scoring='neg_mean_squared_error')
stack_avg = np.average(-mse_stack)
print(-mse_stack)
print('Average XGB - MSE:', stack_avg, ' - Score:', cal_score(stack_avg))

[0.00169305 0.00289093 0.00168124 0.01063128 0.00199901]
Average XGB - MSE: 0.0037791044520296226  - Score: 0.9420858195951028


In [24]:
stacker.fit(stack_X_train, all_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=70, n_jobs=1,
           oob_score=False, random_state=2, verbose=0, warm_start=False)

### Output

In [20]:
def output(sub_data, model='gbdt'):
    if model == 'nn':
        sub_data_nn = min_max_scaler.transform(np.array(sub_data))
        sub_data_nn = poly.transform(sub_data_nn)
        pred = NN.predict(sub_data_nn)

    elif model == 'gbdt':
        pred = gbdt2.predict(sub_data)

    elif model == 'xgb':
        pred = xgbt2.predict(sub_data)

    elif model == 'rf':
        pred = forest2.predict(sub_data)
    
    elif model == 'lgb':
        pred = lgb2.predict(sub_data)
        
    return pred

In [26]:
# pred_stack = stacker.predict(stack_X_sub)
# df_result['score'] = pred_stack

In [21]:
df_result['score'] = output(sub_data, 'rf')

In [22]:
index = df_result[df_result['ID'].isin(special_missing_ID)].index
df_result.loc[index, 'score'] = 0.379993053
df_result[df_result['ID'].isin(special_missing_ID)]

Unnamed: 0,ID,score
0,1,0.379993
425,940,0.379993
754,1694,0.379993
841,1879,0.379993
1276,2823,0.379993
1427,3202,0.379993
1979,4459,0.379993
2068,4648,0.379993
2139,4821,0.379993
2217,5010,0.379993


In [24]:
df_result

Unnamed: 0,ID,score
0,1,0.379993
1,9,1.316774
2,13,2.139053
3,17,3.398114
4,18,3.661115
5,21,4.144628
6,23,4.214166
7,25,4.799182
8,26,4.949822
9,28,5.208117


In [26]:
df_result.to_csv('submit_rf_poly_select_dropdup_replaceoutlier_nega20.csv', index=False, header=False)