In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# tools
import joblib

# preprocess
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# model
import lightgbm as lgb

global quantile

In [107]:
def add_variable(df): 
    
    df['sum_energy'] = df['DHI'] + df['DNI']

    df['theta'] = 0
    condition_list = [
        (df['Hour'] == 6) | (df['Hour'] == 19),
        (df['Hour'] == 7) | (df['Hour'] == 18),
        (df['Hour'] == 8) | (df['Hour'] == 17),
        (df['Hour'] == 9) | (df['Hour'] == 16),
        (df['Hour'] == 10) | (df['Hour'] == 15),
        (df['Hour'] == 11) | (df['Hour'] == 14),
        (df['Hour'] == 12) | (df['Hour'] == 13)
    ]

    choice_list = [0, 10, 20, 30, 40, 50, 60]

    df['theta'] = np.select(condition_list, choice_list)
    
    # GHI
    df['GHI'] = df['DNI'] * np.cos(df['theta']) + df['DHI']

    condition_list = [
        ((df['Hour'] >= 0) & (df['Hour'] <= 7)) | ((df['Hour'] >= 18) & (df['Hour'] <= 23)),
        ((df['Hour'] > 7) & (df['Hour'] < 10)) | ((df['Hour'] >= 15) & (df['Hour'] < 18)),
        (df['Hour'] >=10) & (df['Hour'] < 15),
    ]
    choice_list = [0, 2, 1]
    df['time'] = np.select(condition_list, choice_list)
    
    df['target0'] = df['TARGET']
    

    return pd.DataFrame(df)

In [137]:
def compute_quantile_loss(y_true, y_pred, *quantile):
    quantile = 0.9
    residual = y_true - y_pred.label
    loss = np.mean(np.maximum(quantile * residual, (quantile - 1) * residual))
    return 'quantile_loss', loss, False

In [118]:
df = pd.read_csv('/home/ys/repo/solar_prediction/data/train/train.csv')
df = add_variable(df)

X = df.drop('TARGET', axis = 1)[:-96]
y = df['TARGET'][96:]

In [119]:
df

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,sum_energy,theta,GHI,time,target0
0,0,0,0,0,0,1.5,69.08,-12,0.0,0,0,0.0,0,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0,0,0,0.0,0,0.0
2,0,1,0,0,0,1.6,71.78,-12,0.0,0,0,0.0,0,0.0
3,0,1,30,0,0,1.6,71.75,-12,0.0,0,0,0.0,0,0.0
4,0,2,0,0,0,1.6,75.20,-12,0.0,0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52555,1094,21,30,0,0,2.4,70.70,-4,0.0,0,0,0.0,0,0.0
52556,1094,22,0,0,0,2.4,66.79,-4,0.0,0,0,0.0,0,0.0
52557,1094,22,30,0,0,2.2,66.78,-4,0.0,0,0,0.0,0,0.0
52558,1094,23,0,0,0,2.1,67.72,-4,0.0,0,0,0.0,0,0.0


In [120]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, shuffle = False)
X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size = 0.2, shuffle = False)

In [121]:
train_ds = lgb.Dataset(X_train, label = y_train)
valid_ds = lgb.Dataset(X_valid, label = y_valid)
test_ds = lgb.Dataset(X_test, label = y_test)

In [122]:
params = {'learning_rate':0.05,
         'max_depth':-1,
         'boosting':'gbdt',
         'objective':'regression',
         'metric':'compute_quanitle_loss',
          'feature_fraction':0.9,
          'bagging_fraction':0.8,
          'bagging_freq':1000,
          'min_data_in_leaf':500,
         }

In [138]:
result = pd.DataFrame([])
result['y_test'] = y_test.values

for i in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
    *quantile = i
    print('quantile: ', quantile)
    model = lgb.train(
        train_set = train_ds,
        valid_sets = [valid_ds, train_ds],
        valid_names = ['eval','train'],
        params = {'metric':'compute_quantile_loss'},
        early_stopping_rounds = 500,
        verbose_eval = 500,
        num_boost_round = 100000,
        feval = compute_quantile_loss,
        )
    joblib.dump(model, 'model_{}.pkl'.format(i))
    
    #result['q_{}'.format(i)] = (model.predict(X_test)) * 0.8
    print('check iteration #: ',i)
    print('-----------------------')

quantile:  0.9
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1981
[LightGBM] [Info] Number of data points in the train set: 41971, number of used features: 13
[LightGBM] [Info] Start training from score 17.775091
Training until validation scores don't improve for 500 rounds
[500]	train's quantile_loss: 1.55126	eval's quantile_loss: 3.76079
Early stopping, best iteration is:
[54]	train's quantile_loss: 2.56554	eval's quantile_loss: 3.15881
check iteration #:  0.9
-----------------------


In [144]:

final_result = pd.DataFrame([])

for i in range(0,81):

    path = '/home/ys/repo/solar_prediction/data/test/' + str(i) +'.csv'
    print(path)
    df = pd.read_csv(path)
    df = add_variable(df)

    X2 = df.drop('TARGET', axis = 1)
    y2 = df['TARGET']

    result2 = pd.DataFrame([])
    result2['id'] = 0
    result2['hour'] = 0
    result2['day'] = 0
    result2['day'] = result2['day'].astype('int32')


    for j,k in enumerate([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]):
        quantile = k
        
        file_name = '/home/ys/repo/solar_prediction/02_model/YS/model_{}.pkl'.format(k)
        model = joblib.load(file_name)

        result2['q_{}'.format(k)] = model.predict(X2)

        X2['Minute'] = X2['Minute'].astype('str')
        X2['Minute'] = X2['Minute'].replace('0','00')

        for h in range(len(X2)):
            result2['id'].loc[h] = '{}.csv_Day{}_{}h{}m'.format(i, X2['Day'].iloc[h]+2, X2['Hour'].iloc[h], X2['Minute'].iloc[h])
            result2['hour'].loc[h] = X2['Hour'].iloc[h]
            result2['day'].loc[h] = X2['Day'].iloc[h]+2
        
        for h in range(len(X2)):
            if (result2['hour'].loc[h] < 7) | (result2['hour'].loc[h] > 19):
                result2['q_{}'.format(k)].iloc[h] = 0
            if result2['q_{}'.format(k)].loc[h] < 0:
                result2['q_{}'.format(k)].iloc[h] = 0
        
        X2['Minute'] = X2['Minute'].astype('int')
        
    result2 = result2[240:]
    final_result = pd.concat([final_result, result2])
    
            

/home/ys/repo/solar_prediction/data/test/0.csv
/home/ys/repo/solar_prediction/data/test/1.csv
/home/ys/repo/solar_prediction/data/test/2.csv
/home/ys/repo/solar_prediction/data/test/3.csv
/home/ys/repo/solar_prediction/data/test/4.csv
/home/ys/repo/solar_prediction/data/test/5.csv
/home/ys/repo/solar_prediction/data/test/6.csv
/home/ys/repo/solar_prediction/data/test/7.csv
/home/ys/repo/solar_prediction/data/test/8.csv
/home/ys/repo/solar_prediction/data/test/9.csv
/home/ys/repo/solar_prediction/data/test/10.csv
/home/ys/repo/solar_prediction/data/test/11.csv
/home/ys/repo/solar_prediction/data/test/12.csv
/home/ys/repo/solar_prediction/data/test/13.csv
/home/ys/repo/solar_prediction/data/test/14.csv
/home/ys/repo/solar_prediction/data/test/15.csv
/home/ys/repo/solar_prediction/data/test/16.csv
/home/ys/repo/solar_prediction/data/test/17.csv
/home/ys/repo/solar_prediction/data/test/18.csv
/home/ys/repo/solar_prediction/data/test/19.csv
/home/ys/repo/solar_prediction/data/test/20.csv
/h

In [145]:
final_result

Unnamed: 0,id,hour,day,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
240,0.csv_Day7_0h00m,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
241,0.csv_Day7_0h30m,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
242,0.csv_Day7_1h00m,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.csv_Day7_1h30m,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
244,0.csv_Day7_2h00m,2.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
331,80.csv_Day8_21h30m,21.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
332,80.csv_Day8_22h00m,22.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
333,80.csv_Day8_22h30m,22.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
334,80.csv_Day8_23h00m,23.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [146]:
final_result = final_result.drop(['hour','day'], axis =1 )
final_result = final_result.reset_index()
final_result = final_result.drop('index', axis = 1)

In [147]:
final_result.to_csv('submit_final5.csv', index = False)

In [143]:
final_result

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7772,80.csv_Day8_22h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7773,80.csv_Day8_22h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7774,80.csv_Day8_23h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
