In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import lightgbm

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
rep_cols = {'ID':'ID', 
 '板温':'board_t', 
 '现场温度':'env_t', 
 '光照强度':'light_strength', 
 '转换效率':'efficiency', 
 '转换效率A':'efficiency_A', 
 '转换效率B':'efficiency_B', 
 '转换效率C':'efficiency_C', 
 '电压A':'V_A',
 '电压B':'V_B', 
 '电压C':'V_C', 
 '电流A':'I_A', 
 '电流B':'I_B', 
 '电流C':'I_C', 
 '功率A':'P_A', 
 '功率B':'P_B', 
 '功率C':'P_C', 
 '平均功率':'P_avg', 
 '风速':'wind_speed',
 '风向':'wind_direction', 
 '发电量':'y'
}

In [3]:
def my_val(preds, train_data):
    label = train_data.get_label()
    return 'score', 1/(1+np.sqrt(mean_squared_error(preds, label))), True

In [4]:
def my_obj(preds, train_data):
    labels = train_deata.get_label()
    

In [5]:
train = pd.read_csv('../data/public.train.csv')
test = pd.read_csv('../data/public.test.csv')

train_len = train.shape[0]

df = pd.concat([train, test])

In [6]:
df.rename(index=str, columns=rep_cols, inplace=True)

In [7]:
df.head()

Unnamed: 0,ID,light_strength,P_A,P_B,P_C,y,P_avg,board_t,env_t,V_A,...,V_C,I_A,I_B,I_C,efficiency,efficiency_A,efficiency_B,efficiency_C,wind_direction,wind_speed
0,10,34,976.86,155.98,1087.5,1.437752,740.11,-19.14,-17.4,729,...,725,1.34,0.22,1.5,80.55,106.32,16.98,118.36,272,0.6
1,11,30,1128.4,172.08,1132.56,1.692575,811.01,-18.73,-17.3,728,...,726,1.55,0.24,1.56,99.9,139.0,21.2,139.51,275,0.8
2,12,41,1279.25,166.06,1310.4,1.975787,918.57,-17.54,-17.0,731,...,720,1.75,0.23,1.82,82.48,114.86,14.91,117.66,283,1.1
3,14,53,1474.6,225.37,1517.34,2.370656,1072.44,-15.43,-16.6,730,...,726,2.02,0.31,2.09,73.98,101.72,15.55,104.67,280,0.9
4,15,65,1548.51,233.28,1674.4,2.532091,1152.06,-14.6,-16.3,727,...,728,2.13,0.32,2.3,64.62,86.86,13.09,93.92,280,1.1


In [8]:
#异常值处理
cols = [c for c in df.columns.tolist() if c!='y' and c!='ID']
for c in cols:
    df[c+'_is_out_of_upper'] = (df[c]>df[c].quantile(0.99)).astype(np.int32)
    df[c+'_is_out_of_lower'] = (df[c]<df[c].quantile(0.01)).astype(np.int32)
    df[c] = np.clip(df[c],df[c].quantile(0.01),df[c].quantile(0.99))

In [9]:
target = ['P_A','P_avg','I_A','P_C','I_C','P_B','I_B']

for c in target:
    df['log_'+c] = np.log1p(df[c])

In [11]:
# 第一阶，用其他值预测某些特征

params_layer1 = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mae',
    'num_leaves': 48,
    'learning_rate': 0.2,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'verbose': 0
}
kf_layer1 = KFold(5, shuffle=True, random_state=1991)

layer1_result = df[['ID']]

for t in target:
    print('Now, we are processing',t)
    # 特征中去掉t
    predictor_t = [c for c in df.columns.tolist() if c not in['ID','y',t,t+'_is_out_of_upper',t+'_is_out_of_lower']]
    
    # 初始化结果
    val_preds = np.zeros(df.shape[0])
    
    for n_fold, (tra_idx, val_idx) in enumerate(kf_layer1.split(df)):
        tra = df.iloc[tra_idx]
        # 删掉有问题的数据
        tra.drop(tra[(tra[t+'_is_out_of_upper']==1)|(tra[t+'_is_out_of_lower']==1)].index, inplace=True)
        
        val = df.iloc[val_idx]
        # 删掉有问题的数据
        val_c = val.drop(val[(val[t+'_is_out_of_upper']==1)|(val[t+'_is_out_of_lower']==1)].index)

        train_set = lightgbm.Dataset(
            tra[predictor_t],
            tra[t]
        )

        validation_set = lightgbm.Dataset(
            val_c[predictor_t],
            val_c[t]
        )

        model = lightgbm.train(params_layer1, train_set, 
                               num_boost_round=8000,
                              valid_sets= [validation_set],
                              valid_names=['valid'],
                              early_stopping_rounds=100,
                              verbose_eval=1000)

        val_preds[val_idx] = model.predict(val[predictor_t])
        
    # 将结果保存起来
    layer1_result['predicted_'+t] = val_preds

Now, we are processing P_A
Training until validation scores don't improve for 100 rounds.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


[1000]	valid's l1: 19.5699
[2000]	valid's l1: 19.1661
[3000]	valid's l1: 18.9809
[4000]	valid's l1: 18.881
[5000]	valid's l1: 18.7997
Early stopping, best iteration is:
[5861]	valid's l1: 18.7601
Training until validation scores don't improve for 100 rounds.
[1000]	valid's l1: 22.412
[2000]	valid's l1: 21.6915
[3000]	valid's l1: 21.4276
[4000]	valid's l1: 20.9447
[5000]	valid's l1: 20.7911
[6000]	valid's l1: 20.6377
Early stopping, best iteration is:
[5903]	valid's l1: 20.6355
Training until validation scores don't improve for 100 rounds.
[1000]	valid's l1: 21.9198
[2000]	valid's l1: 21.2014
[3000]	valid's l1: 20.8011
Early stopping, best iteration is:
[3312]	valid's l1: 20.7189
Training until validation scores don't improve for 100 rounds.
[1000]	valid's l1: 20.8502
[2000]	valid's l1: 20.4632
[3000]	valid's l1: 20.2324
Early stopping, best iteration is:
[3536]	valid's l1: 20.121
Training until validation scores don't improve for 100 rounds.
[1000]	valid's l1: 21.3021
[2000]	valid's l1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Now, we are processing P_avg
Training until validation scores don't improve for 100 rounds.
[1000]	valid's l1: 14.0323
[2000]	valid's l1: 13.6525
[3000]	valid's l1: 13.5146
Early stopping, best iteration is:
[3673]	valid's l1: 13.464
Training until validation scores don't improve for 100 rounds.
[1000]	valid's l1: 13.2075
[2000]	valid's l1: 12.9191
Early stopping, best iteration is:
[2846]	valid's l1: 12.7925
Training until validation scores don't improve for 100 rounds.
[1000]	valid's l1: 13.6669
[2000]	valid's l1: 13.2924
[3000]	valid's l1: 13.0858
[4000]	valid's l1: 13.0194
[5000]	valid's l1: 12.975
Early stopping, best iteration is:
[5375]	valid's l1: 12.9535
Training until validation scores don't improve for 100 rounds.
[1000]	valid's l1: 14.6364
[2000]	valid's l1: 14.3162
[3000]	valid's l1: 14.1967
Early stopping, best iteration is:
[3760]	valid's l1: 14.1098
Training until validation scores don't improve for 100 rounds.
[1000]	valid's l1: 13.7579
[2000]	valid's l1: 13.1464
[3000

In [12]:
layer1_result.head()

Unnamed: 0,ID,predicted_P_A,predicted_P_avg,predicted_I_A,predicted_P_C,predicted_I_C,predicted_P_B,predicted_I_B
0,10,966.9927,745.956065,1.378123,1105.88718,1.519377,112.660774,0.261436
1,11,1143.93398,791.085711,1.506057,1160.905395,1.611262,158.551712,0.234952
2,12,1268.876903,908.922733,1.792034,1316.487492,1.886203,147.423895,0.191875
3,14,1488.13708,1056.086816,2.016126,1552.175131,2.096158,245.03884,0.33996
4,15,1540.428434,1167.471279,2.061485,1678.449627,2.283592,191.410674,0.265982


In [13]:
layer1_result.to_pickle('../feature/predicted_value.pkl')

In [18]:
layer1_result = pd.read_pickle('../feature/predicted_value.pkl')

In [19]:
df = df.merge(layer1_result, on='ID', how='left')

In [20]:
for t in target:
    df['diff_'+t] = df[t]-df['predicted_'+t]

In [10]:
train = df.iloc[0:train_len]
test = df.iloc[train_len:]

In [13]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'mse',
    'num_leaves': 31,
    'learning_rate': 0.08,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'verbose': 0
}

In [12]:
predictor = [c for c in train.columns.tolist() if c not in['ID','y']]

In [14]:
test_predicts = []
val_preds = []
for idx, seed in enumerate([1,2,3,4,5]):
    kf = KFold(5, shuffle=True, random_state=seed)
    
    val_preds.append(np.zeros(train.shape[0]))
    for n_fold, (tra_idx, val_idx) in enumerate(kf.split(train)):
        tra = train.iloc[tra_idx]
        val = train.iloc[val_idx]

        train_set = lightgbm.Dataset(
            tra[predictor],
            tra['y']
        )

        validation_set = lightgbm.Dataset(
            val[predictor],
            val['y']
        )

        model = lightgbm.train(params, train_set, num_boost_round=5000,
                              valid_sets= [validation_set],
                              valid_names=['valid'],
                              early_stopping_rounds=100,
                               feval=my_val,
                              verbose_eval=500)

        val_preds[idx][val_idx] = model.predict(val[predictor])
        test_predicts.append(model.predict(test[predictor]))

Training until validation scores don't improve for 100 rounds.
[500]	valid's l2: 0.0594366	valid's score: 0.80399
Early stopping, best iteration is:
[426]	valid's l2: 0.0593033	valid's score: 0.804167
Training until validation scores don't improve for 100 rounds.
[500]	valid's l2: 0.0226903	valid's score: 0.869087
[1000]	valid's l2: 0.0192116	valid's score: 0.878267
[1500]	valid's l2: 0.0186703	valid's score: 0.879786
Early stopping, best iteration is:
[1481]	valid's l2: 0.0186448	valid's score: 0.879859
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[114]	valid's l2: 0.013582	valid's score: 0.895623
Training until validation scores don't improve for 100 rounds.
[500]	valid's l2: 0.0530051	valid's score: 0.812857
[1000]	valid's l2: 0.0505087	valid's score: 0.816499
[1500]	valid's l2: 0.0501073	valid's score: 0.817096
Early stopping, best iteration is:
[1524]	valid's l2: 0.0500525	valid's score: 0.817178
Training until validation scores

In [15]:
print('local cv:',1/(1+np.sqrt(mean_squared_error(train['y'],np.mean(val_preds,axis=0)))))

local cv: 0.852215878345


In [16]:
pd.Series(model.feature_importance(importance_type='gain'),
          index=model.feature_name()).sort_values(ascending=False).head(10)

P_A        177202.330046
log_P_A    144375.902258
I_A         52886.129968
P_C         23843.209814
P_avg       19786.504029
log_P_C     11110.939180
I_C          4295.495398
log_P_B      3999.880512
log_I_C      3550.889470
log_I_A      2343.036369
dtype: float64

In [27]:
test['ans'] = np.mean(test_predicts, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [28]:
test[['ID','ans']].to_csv('../result/0727-8534-bagging.csv',header=False, index=False)