In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import lightgbm

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

%matplotlib inline

In [2]:
rep_cols = {'ID':'ID', 
 '板温':'board_t', 
 '现场温度':'env_t', 
 '光照强度':'light_strength', 
 '转换效率':'efficiency', 
 '转换效率A':'efficiency_A', 
 '转换效率B':'efficiency_B', 
 '转换效率C':'efficiency_C', 
 '电压A':'V_A',
 '电压B':'V_B', 
 '电压C':'V_C', 
 '电流A':'I_A', 
 '电流B':'I_B', 
 '电流C':'I_C', 
 '功率A':'P_A', 
 '功率B':'P_B', 
 '功率C':'P_C', 
 '平均功率':'P_avg', 
 '风速':'wind_speed',
 '风向':'wind_direction', 
 '发电量':'y'
}

In [3]:
def my_val(preds, train_data):
    label = train_data.get_label()
    return 'score', 1/(1+np.sqrt(mean_squared_error(preds, label))), True

In [None]:
def my_obj(preds, train_data):
    labels = train_deata.get_label()
    

In [4]:
train = pd.read_csv('../data/public.train.csv')
test = pd.read_csv('../data/public.test.csv')

train_len = train.shape[0]

df = pd.concat([train, test])

In [5]:
df.rename(index=str, columns=rep_cols, inplace=True)

In [6]:
df.head()

Unnamed: 0,ID,light_strength,P_A,P_B,P_C,y,P_avg,board_t,env_t,V_A,...,V_C,I_A,I_B,I_C,efficiency,efficiency_A,efficiency_B,efficiency_C,wind_direction,wind_speed
0,10,34,976.86,155.98,1087.5,1.437752,740.11,-19.14,-17.4,729,...,725,1.34,0.22,1.5,80.55,106.32,16.98,118.36,272,0.6
1,11,30,1128.4,172.08,1132.56,1.692575,811.01,-18.73,-17.3,728,...,726,1.55,0.24,1.56,99.9,139.0,21.2,139.51,275,0.8
2,12,41,1279.25,166.06,1310.4,1.975787,918.57,-17.54,-17.0,731,...,720,1.75,0.23,1.82,82.48,114.86,14.91,117.66,283,1.1
3,14,53,1474.6,225.37,1517.34,2.370656,1072.44,-15.43,-16.6,730,...,726,2.02,0.31,2.09,73.98,101.72,15.55,104.67,280,0.9
4,15,65,1548.51,233.28,1674.4,2.532091,1152.06,-14.6,-16.3,727,...,728,2.13,0.32,2.3,64.62,86.86,13.09,93.92,280,1.1


In [7]:
#异常值处理
cols = [c for c in df.columns.tolist() if c!='y' and c!='ID']
for c in cols:
    df[c] = np.clip(df[c],df[c].quantile(0.01),df[c].quantile(0.99))

In [9]:
train = df.iloc[0:train_len]
test = df.iloc[train_len:]

In [10]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l2',
    'metric': 'mse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [11]:
predictor = [c for c in train.columns.tolist() if c!='y' and c!='ID']

In [12]:
kf = KFold(5, shuffle=True, random_state=2018)

In [13]:
test_predicts = []
val_preds = np.zeros(train.shape[0])
for n_fold, (tra_idx, val_idx) in enumerate(kf.split(train)):
    tra = train.iloc[tra_idx]
    val = train.iloc[val_idx]
    
    train_set = lightgbm.Dataset(
        tra[predictor],
        tra['y']
    )
    
    validation_set = lightgbm.Dataset(
        val[predictor],
        val['y']
    )
    
    model = lightgbm.train(params, train_set, num_boost_round=5000,
                          valid_sets= [validation_set],
                          valid_names=['valid'],
                          early_stopping_rounds=100,
                           feval=my_val,
                          verbose_eval=100)

    val_preds[val_idx] = model.predict(val[predictor])
    test_predicts.append(model.predict(test[predictor]))

Training until validation scores don't improve for 100 rounds.
[100]	valid's l2: 0.115208	valid's score: 0.74659
[200]	valid's l2: 0.099653	valid's score: 0.760064
[300]	valid's l2: 0.0930849	valid's score: 0.766226
[400]	valid's l2: 0.0892482	valid's score: 0.769975
[500]	valid's l2: 0.0870964	valid's score: 0.772129
[600]	valid's l2: 0.0861401	valid's score: 0.773098
[700]	valid's l2: 0.0852497	valid's score: 0.774009
[800]	valid's l2: 0.0847041	valid's score: 0.77457
[900]	valid's l2: 0.0840564	valid's score: 0.775239
[1000]	valid's l2: 0.0835562	valid's score: 0.775759
[1100]	valid's l2: 0.083197	valid's score: 0.776133
[1200]	valid's l2: 0.0828824	valid's score: 0.776462
[1300]	valid's l2: 0.0827022	valid's score: 0.776651
[1400]	valid's l2: 0.0827175	valid's score: 0.776635
[1500]	valid's l2: 0.0826004	valid's score: 0.776758
[1600]	valid's l2: 0.0824657	valid's score: 0.776899
[1700]	valid's l2: 0.0823382	valid's score: 0.777033
[1800]	valid's l2: 0.0823305	valid's score: 0.7770

In [17]:
test['ans'] = np.mean(test_predicts, axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [18]:
test[['ID','ans']].to_csv('../result/baseline.csv',header=False, index=False)