# 태양광 발전량 예측 경진대회

## IMPORT LIBRARY & DATA

In [1]:
import warnings
warnings.filterwarnings(action='ignore')
import os, sys
from tqdm import tqdm

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
from lightgbm import LGBMRegressor
import catboost as catb
from catboost import CatBoostRegressor

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [2]:
# Report the versions of the main libraries used in this notebook.
for _module in (pd, np, sns, sklearn, lgbm, catb):
    print(_module.__version__)

1.0.5
1.19.3
0.10.1
0.23.1
3.0.0
0.24.1


In [3]:
# Locations of the training CSV, the directory of test CSVs, and the sample
# submission file.
# NOTE(review): these are absolute paths on the author's Windows machine and
# must be adjusted before the notebook can run anywhere else.
TRAIN_PATH = r'C:\Users\Wyatt\wyatt37\Data\solarpanel\train\train.csv'
TEST_PATH = r'C:\Users\Wyatt\wyatt37\Data\solarpanel\test'
SUBMISSION_PATH = r'C:\Users\Wyatt\wyatt37\Data\solarpanel\sample_submission.csv'

In [4]:
# Load the training table and the sample submission template.
train = pd.read_csv(TRAIN_PATH)
submission = pd.read_csv(SUBMISSION_PATH)

## Preprocessing

In [5]:
def sum_hour_minute(train):
    """Merge the ``Hour`` and ``Minute`` columns into a fractional hour.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with numeric ``Hour`` and ``Minute`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``Minute`` holds the fraction of an hour
        (30 -> 0.5) and ``Hour`` holds ``Hour + Minute / 60``.
        The input frame is left untouched.
    """
    # Work on a copy: mutating the caller's frame meant that calling the
    # function twice on the same object added the minutes a second time.
    result = train.copy()
    # Exact minute-to-hour conversion. The previous ``round(m * 0.017, 1)``
    # happened to be right for 0 and 30 only (15 became 0.3, for instance).
    result['Minute'] = result['Minute'] / 60
    # Add the fractional part to the hour.
    result['Hour'] = result['Hour'] + result['Minute']

    return result

In [6]:
# Replace Hour with the combined Hour + Minute value.
train = sum_hour_minute(train)

In [7]:
# Build the target columns: TARGET_1 is TARGET taken 48 rows ahead, and
# TARGET_2 is TARGET_1 taken another 48 rows ahead (96 rows ahead in total).
# NOTE(review): this treats 48 rows as one day (30-minute steps) — confirm
# against the data.
train['TARGET_1'] = train.TARGET.shift(-48)
train['TARGET_2'] = train.TARGET_1.shift(-48)

In [8]:
# Drop the columns that are not used for training.
train.drop(['Day', 'Minute'], axis=1, inplace=True)

### Columns Shift

test 데이터를 최대한 활용하기 위해 하루 단위로 변수를 shift해서 옆으로 붙여 줍니다. (unstack 개념)

In [9]:
train.columns

Index(['Hour', 'DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET', 'TARGET_1',
       'TARGET_2'],
      dtype='object')

In [10]:
# Columns to shift
shifted_columns = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']

# Number of days to include; lags run in steps of 48 rows from 48 up to
# (but excluding) periods*48.
periods = 6 # use n days of data (including day t) # 6 days gave the best performance
lags = list(np.arange(48, periods*48, 48))

In [11]:
def shift_columns(data, shifted_columns, lags):
    """Return a copy of ``data`` extended with lagged versions of columns.

    For every lag in ``lags`` and every column in ``shifted_columns`` a new
    column named ``<column>_lag_<lag>`` is appended, holding that column
    shifted down by ``lag`` rows. ``data`` itself is not modified.
    """
    out = data.copy()
    for step in lags:
        suffix = '_lag_%s' % step
        for name in shifted_columns:
            out[name + suffix] = out[name].shift(step)

    return out

In [12]:
# The un-shifted train frame is still needed to build the test set,
# so keep the shifted result in a new DataFrame.
shifted_train = shift_columns(train, shifted_columns, lags)
shifted_train.shape

(52560, 39)

In [13]:
shifted_train.dropna(inplace=True) # shifting introduces NaN values, so those rows must be dropped
shifted_train.shape

(52224, 39)

In [14]:
# 48 rows per day x 7 days x 52 weeks x 2 years + 48 rows (cut off at the front) = 34992
# The first 34992 rows form the training part; the rest is the validation part.
X = shifted_train[:34992]
val = shifted_train[34992:]

In [15]:
def make_train_test_val(X, val):
    """Split the train and validation frames into features and two targets.

    The features are every column except ``TARGET_1`` and ``TARGET_2``.
    The shape of each of the six pieces is printed, and the pieces are
    returned in the order
    ``x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2``.
    """
    def _split(frame):
        # Boolean column mask that excludes both target columns.
        keep = (frame.columns != 'TARGET_1') & (frame.columns != 'TARGET_2')
        return frame.iloc[:, keep], frame['TARGET_1'], frame['TARGET_2']

    x_train, y_train_1, y_train_2 = _split(X)
    x_val, y_val_1, y_val_2 = _split(val)

    pieces = (x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2)
    for piece in pieces:
        print(piece.shape)

    return pieces

In [16]:
# Split the train and validation frames into feature matrices and the two targets.
x_train, y_train_1, y_train_2, x_val, y_val_1, y_val_2 = make_train_test_val(X, val)

(34992, 37)
(34992,)
(34992,)
(17232, 37)
(17232,)
(17232,)


In [17]:
x_train

Unnamed: 0,Hour,DHI,DNI,WS,RH,T,TARGET,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,TARGET_lag_48,DHI_lag_96,DNI_lag_96,WS_lag_96,RH_lag_96,T_lag_96,TARGET_lag_96,DHI_lag_144,DNI_lag_144,WS_lag_144,RH_lag_144,T_lag_144,TARGET_lag_144,DHI_lag_192,DNI_lag_192,WS_lag_192,RH_lag_192,T_lag_192,TARGET_lag_192,DHI_lag_240,DNI_lag_240,WS_lag_240,RH_lag_240,T_lag_240,TARGET_lag_240
240,0.0,0,0,1.5,77.41,1,0.0,0.0,0.0,2.1,72.10,-5.0,0.0,0.0,0.0,1.8,77.76,-14.0,0.0,0.0,0.0,2.2,73.80,-8.0,0.0,0.0,0.0,1.6,90.66,-10.0,0.0,0.0,0.0,1.5,69.08,-12.0,0.0
241,0.5,0,0,1.7,77.42,1,0.0,0.0,0.0,2.0,72.10,-5.0,0.0,0.0,0.0,2.0,77.77,-13.0,0.0,0.0,0.0,2.1,68.20,-8.0,0.0,0.0,0.0,1.6,90.68,-10.0,0.0,0.0,0.0,1.5,69.06,-12.0,0.0
242,1.0,0,0,1.9,76.77,1,0.0,0.0,0.0,2.0,72.14,-5.0,0.0,0.0,0.0,2.2,77.69,-13.0,0.0,0.0,0.0,2.1,69.06,-8.0,0.0,0.0,0.0,1.6,88.11,-11.0,0.0,0.0,0.0,1.6,71.78,-12.0,0.0
243,1.5,0,0,2.0,76.77,1,0.0,0.0,0.0,2.0,72.11,-5.0,0.0,0.0,0.0,2.2,71.57,-13.0,0.0,0.0,0.0,2.1,69.04,-8.0,0.0,0.0,0.0,1.6,88.11,-11.0,0.0,0.0,0.0,1.6,71.75,-12.0,0.0
244,2.0,0,0,2.2,76.39,1,0.0,0.0,0.0,2.0,72.94,-5.0,0.0,0.0,0.0,2.2,72.62,-13.0,0.0,0.0,0.0,2.2,69.38,-8.0,0.0,0.0,0.0,1.6,90.85,-11.0,0.0,0.0,0.0,1.6,75.20,-12.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35227,21.5,0,0,3.0,83.41,-11,0.0,0.0,0.0,1.5,74.69,-6.0,0.0,0.0,0.0,1.3,79.92,-5.0,0.0,0.0,0.0,1.9,68.61,-2.0,0.0,0.0,0.0,1.0,78.26,-12.0,0.0,0.0,0.0,2.5,64.00,-1.0,0.0
35228,22.0,0,0,2.8,86.02,-11,0.0,0.0,0.0,1.8,79.87,-6.0,0.0,0.0,0.0,1.3,78.19,-5.0,0.0,0.0,0.0,1.9,68.93,-2.0,0.0,0.0,0.0,1.1,77.77,-12.0,0.0,0.0,0.0,2.6,61.82,-1.0,0.0
35229,22.5,0,0,2.8,85.99,-11,0.0,0.0,0.0,1.9,79.86,-6.0,0.0,0.0,0.0,1.3,78.18,-5.0,0.0,0.0,0.0,1.9,68.93,-2.0,0.0,0.0,0.0,1.1,77.76,-12.0,0.0,0.0,0.0,2.6,61.81,-1.0,0.0
35230,23.0,0,0,2.9,84.16,-11,0.0,0.0,0.0,2.0,78.80,-6.0,0.0,0.0,0.0,1.4,76.79,-6.0,0.0,0.0,0.0,2.0,68.66,-2.0,0.0,0.0,0.0,1.2,76.81,-13.0,0.0,0.0,0.0,2.6,60.16,-1.0,0.0


In [18]:
# Load the test set.

df_test = []

for i in range(81):
    # os.path.join picks the platform separator instead of gluing on a
    # hard-coded '/'.
    file_path = os.path.join(TEST_PATH, str(i) + '.csv')
    temp = pd.read_csv(file_path)
    # Apply the same preprocessing as for the training data -----------------
    temp = sum_hour_minute(temp)  # merge Hour and Minute
    temp = shift_columns(temp, shifted_columns, lags)  # add lagged columns
    temp = temp.drop(['Day', 'Minute'], axis=1)  # drop unused columns

    # Keep only the last 48 rows of each file (the final day, per the
    # original comment).
    df_test.append(temp.iloc[-48:])

X_test = pd.concat(df_test)
print(X_test.shape)

(3888, 37)


## Modeling

### LGBM

In [19]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """Fit one LightGBM quantile regressor and predict the test set.

    Parameters
    ----------
    q : float
        Quantile level passed to LightGBM as ``alpha``.
    X_train, Y_train
        Training features and target.
    X_valid, Y_valid
        Validation features and target, used as the early-stopping set.
    X_test
        Features to predict.

    Returns
    -------
    pred : pandas.Series
        Test predictions rounded to two decimals.
    model : LGBMRegressor
        The fitted model.
    """
    # (a) Modeling
    # ``bagging_fraction`` was previously passed alongside ``subsample``;
    # LightGBM treats the two names as the same parameter, so only the
    # scikit-learn style name is kept (same value, no alias warning).
    # NOTE(review): per the LightGBM docs row subsampling only takes effect
    # when ``subsample_freq`` is non-zero; it is left unset here so that the
    # fitted models do not change — confirm whether bagging was intended.
    model = LGBMRegressor(objective='quantile',  # required for quantile regression
                          alpha=q,  # quantile level to estimate
                          n_estimators=10000,
                          learning_rate=0.027,
                          subsample=0.7)

    # Stop once the validation quantile loss has not improved for 300 rounds.
    model.fit(X_train,
              Y_train,
              eval_metric=['quantile'],
              eval_set=[(X_valid, Y_valid)],
              early_stopping_rounds=300, verbose=500)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

# Target 예측

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """Train one LGBM model per quantile and collect the test predictions.

    Iterates over the module-level ``quantiles`` list, printing each level
    before fitting.

    Returns
    -------
    LGBM_models : list
        The fitted models, in the order of ``quantiles``.
    LGBM_actual_pred : pandas.DataFrame
        One column of test predictions per quantile; the columns are
        labelled with the quantile levels.
    """
    LGBM_models = []
    predictions = []

    for q in quantiles:  # one model per quantile level
        print(q)
        pred, model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        predictions.append(pred)

    # Concatenate once at the end instead of growing a DataFrame inside the
    # loop (which re-copied all earlier columns on every iteration and
    # started from an empty frame).
    LGBM_actual_pred = pd.concat(predictions, axis=1) if predictions else pd.DataFrame()
    LGBM_actual_pred.columns = quantiles  # label the columns with the quantile levels

    return LGBM_models, LGBM_actual_pred

In [20]:
# Target1: models and test predictions for TARGET_1
lgbm_models_1, results_1 = train_data(x_train, y_train_1, x_val, y_val_1, X_test)

# Target2: models and test predictions for TARGET_2
lgbm_models_2, results_2 = train_data(x_train, y_train_2, x_val, y_val_2, X_test)

0.1
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[114]	valid_0's quantile: 1.45124
0.2
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[140]	valid_0's quantile: 2.33869
0.3
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.78605
Early stopping, best iteration is:
[311]	valid_0's quantile: 2.776
0.4
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.88616
Early stopping, best iteration is:
[254]	valid_0's quantile: 2.8856
0.5
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.76079
Early stopping, best iteration is:
[351]	valid_0's quantile: 2.75406
0.6
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.45842
Early stopping, best iteration is:
[374]	valid_0's quantile: 2.45314
0.7
Training until validation scores don't improve for 30

In [21]:
# Reload a fresh submission template, write the TARGET_1 predictions into the
# rows whose id contains "Day7" and the TARGET_2 predictions into the rows
# whose id contains "Day8", then display a sample of rows.
submission = pd.read_csv(SUBMISSION_PATH)
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values
submission[14:33]

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
14,0.csv_Day7_7h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.csv_Day7_7h30m,0.37,0.35,0.48,0.47,1.47,1.52,2.06,2.94,4.89
16,0.csv_Day7_8h00m,2.24,2.92,3.54,3.74,5.23,5.85,6.57,7.53,7.95
17,0.csv_Day7_8h30m,3.22,6.01,6.74,6.05,9.51,10.18,12.53,12.24,15.0
18,0.csv_Day7_9h00m,4.84,7.76,10.29,12.03,15.28,16.48,20.25,21.97,20.16
19,0.csv_Day7_9h30m,6.39,9.97,12.12,15.73,20.12,17.66,25.95,27.58,29.77
20,0.csv_Day7_10h00m,8.31,12.16,15.48,18.15,25.27,20.79,29.08,28.32,32.0
21,0.csv_Day7_10h30m,10.45,16.67,20.98,23.71,28.53,28.5,36.74,36.43,34.47
22,0.csv_Day7_11h00m,13.53,19.56,26.08,31.32,33.97,37.09,40.58,42.35,42.52
23,0.csv_Day7_11h30m,13.74,19.03,23.7,30.95,32.96,37.25,42.48,44.48,42.23


In [22]:
# Keep a snapshot of the LightGBM submission; it is used in the ensemble later.
submission_lgbm_shifted = submission.copy()

### CatB

In [23]:
# base model
def CATB(X_train, Y_train, X_valid, Y_valid, X_test):
    """Fit a single CatBoost model and derive nine quantile columns from it.

    One ``CatBoostRegressor`` with the ``'Quantile'`` objective is trained
    with early stopping on the validation set. Its test prediction (rounded
    to two decimals) is then multiplied by ``q + 0.5`` for every quantile
    level ``q`` to build the per-quantile columns.

    NOTE(review): the quantile columns are a rescaling of one prediction,
    not separately fitted quantiles — confirm this heuristic is intended.

    Returns
    -------
    model : CatBoostRegressor
        The fitted model.
    CATB_actual_pred : pandas.DataFrame
        One column per quantile level, labelled with the level.
    """
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # (a) Modeling
    model = CatBoostRegressor(
        objective='Quantile',
        iterations=100000,
        learning_rate=0.027,
    )
    # Early stopping bounds the very large iteration budget.
    model.fit(
        X_train,
        Y_train,
        eval_set=[(X_valid, Y_valid)],
        early_stopping_rounds=300,
        verbose=500,
    )

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))

    # One scaled copy of the prediction per quantile level.
    scaled = [pred * (q + 0.5) for q in quantiles]
    CATB_actual_pred = pd.concat(scaled, axis=1)
    CATB_actual_pred.columns = quantiles  # label the columns with the quantile levels

    return model, CATB_actual_pred

In [24]:
# Target1: CatBoost model and test predictions for TARGET_1
models_1, results_1 = CATB(x_train, y_train_1, x_val, y_val_1, X_test)
# Target2: CatBoost model and test predictions for TARGET_2
models_2, results_2 = CATB(x_train, y_train_2, x_val, y_val_2, X_test)

0:	learn: 8.6976866	test: 8.7241896	best: 8.7241896 (0)	total: 146ms	remaining: 4h 4m 2s
500:	learn: 2.3144752	test: 2.7393779	best: 2.7285693 (309)	total: 3.92s	remaining: 12m 59s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 2.728569253
bestIteration = 309

Shrink model to first 310 iterations.
0:	learn: 8.7029060	test: 8.7194443	best: 8.7194443 (0)	total: 11.5ms	remaining: 19m 14s
500:	learn: 2.3811997	test: 2.8122450	best: 2.8114702 (482)	total: 3.79s	remaining: 12m 32s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 2.811470231
bestIteration = 482

Shrink model to first 483 iterations.


In [25]:
# Reload a fresh submission template, write the TARGET_1 predictions into the
# rows whose id contains "Day7" and the TARGET_2 predictions into the rows
# whose id contains "Day8", then display a sample of rows.
submission = pd.read_csv(SUBMISSION_PATH)
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values
submission[14:33]

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
14,0.csv_Day7_7h00m,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
15,0.csv_Day7_7h30m,1.014,1.183,1.352,1.521,1.69,1.859,2.028,2.197,2.366
16,0.csv_Day7_8h00m,2.634,3.073,3.512,3.951,4.39,4.829,5.268,5.707,6.146
17,0.csv_Day7_8h30m,5.88,6.86,7.84,8.82,9.8,10.78,11.76,12.74,13.72
18,0.csv_Day7_9h00m,9.156,10.682,12.208,13.734,15.26,16.786,18.312,19.838,21.364
19,0.csv_Day7_9h30m,12.12,14.14,16.16,18.18,20.2,22.22,24.24,26.26,28.28
20,0.csv_Day7_10h00m,14.616,17.052,19.488,21.924,24.36,26.796,29.232,31.668,34.104
21,0.csv_Day7_10h30m,17.556,20.482,23.408,26.334,29.26,32.186,35.112,38.038,40.964
22,0.csv_Day7_11h00m,19.122,22.309,25.496,28.683,31.87,35.057,38.244,41.431,44.618
23,0.csv_Day7_11h30m,18.912,22.064,25.216,28.368,31.52,34.672,37.824,40.976,44.128


In [26]:
# Keep a snapshot of the CatBoost submission; it is used in the ensemble later.
submission_catb_shifted = submission.copy()

### 48 Models

30분 단위로 데이터를 묶어서 각기 다른 48개의 model에 학습 후 각기 다른 48개의 예측을 합니다.

각 모델에 쓰이는 train 데이터가 약 1000개로 매우 적습니다. 따라서 6days shift 보다는 3days shift가 더 높은 성능을 보입니다.

In [28]:
# Reload the raw training data and merge Hour and Minute again, since the
# earlier `train` frame had columns added and dropped.
train = pd.read_csv(TRAIN_PATH)
train = sum_hour_minute(train)

In [29]:
# Columns to shift
shifted_columns = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']

# Number of days to include; lags run in steps of 48 rows from 48 up to
# (but excluding) periods*48.
periods = 3 # use n days of data (including day t) # 3 days is used for the 48-model approach
lags = list(np.arange(48, periods*48, 48))

In [30]:
# Add the lagged columns to the reloaded training frame.
train = shift_columns(train, shifted_columns, lags)
train.shape

(52560, 21)

In [31]:
# Build 48 separate training sets, one per distinct Hour value (0.0, 0.5, ..., 23.5).

trains = []

for i in range(48):
    idx = i*0.5
    # .copy() yields an independent frame, so the column assignments below do
    # not write into a filtered slice of `train`.
    subset = train[train.Hour == idx].copy()
    # Within one slot the targets are the next and the second-next row.
    # NOTE(review): presumably consecutive rows are consecutive days — confirm.
    subset['TARGET_1'] = subset['TARGET'].shift(-1)
    subset['TARGET_2'] = subset['TARGET'].shift(-2)
    # Remove the rows left incomplete by the shifts, then the unused columns.
    subset = subset.dropna().drop(['Day', 'Minute'], axis=1)

    # Later cells refer to train_0, train_15, ... by name, so keep publishing
    # each frame as a module-level variable.
    globals()['train_{}'.format(i)] = subset
    trains.append(subset)

In [32]:
train_0.shape, train_15.shape, train_45.shape

((1091, 21), (1091, 21), (1091, 21))

In [33]:
# The shift settings changed, so the test set is loaded again.

df_test = []

for i in range(81):
    # os.path.join picks the platform separator instead of gluing on a
    # hard-coded '/'.
    file_path = os.path.join(TEST_PATH, str(i) + '.csv')
    temp = pd.read_csv(file_path)
    # Apply the same preprocessing as for the training data -----------------
    temp = sum_hour_minute(temp)  # merge Hour and Minute
    temp = shift_columns(temp, shifted_columns, lags)  # add lagged columns
    # Keep only the last 48 rows of each file (the final day, per the
    # original comment).
    df_test.append(temp.iloc[-48:])

X_test = pd.concat(df_test)
print(X_test.shape)

(3888, 21)


In [34]:
# Likewise, build 48 test sets, one per distinct Hour value.

tests = []

for i in range(48):
    idx = i*0.5
    # drop() returns a new frame, so nothing is modified in place on a
    # filtered slice of X_test.
    subset = X_test[X_test.Hour == idx].drop(['Day', 'Minute'], axis=1)

    # A later cell refers to test_0 by name, so keep publishing each frame
    # as a module-level variable.
    globals()['test_{}'.format(i)] = subset
    tests.append(subset)

In [35]:
train_0.shape, test_0.shape

((1091, 21), (81, 19))

In [36]:
# Lists that collect the per-slot predictions for each target.
results_1 = []
results_2 = []

target_columns = ['TARGET_1', 'TARGET_2']
# Number of leading rows of each slot used for training; the rest validates.
# NOTE(review): 730 presumably corresponds to two years of days — confirm.
n_train_rows = 730

for tr, te in zip(trains, tests):
    # Hour is a single value within each slot, so it is removed.
    # errors='ignore' keeps the cell re-runnable (Hour may already be gone)
    # without a bare `except` that would hide unrelated failures.
    tr.drop(columns=['Hour'], inplace=True, errors='ignore')
    te.drop(columns=['Hour'], inplace=True, errors='ignore')

    # Split tr into the training part and the validation part.
    X = tr[:n_train_rows]
    val = tr[n_train_rows:]

    # Select the features by name rather than by "all but the last two
    # columns", so the split does not depend on column order.
    x_train = X.drop(columns=target_columns)
    y_train_1 = X['TARGET_1']
    y_train_2 = X['TARGET_2']

    x_val = val.drop(columns=target_columns)
    y_val_1 = val['TARGET_1']
    y_val_2 = val['TARGET_2']

    _, result_1 = train_data(x_train, y_train_1, x_val, y_val_1, te)
    _, result_2 = train_data(x_train, y_train_2, x_val, y_val_2, te)

    results_1.append(result_1)
    results_2.append(result_2)

0.1
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.2
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.3
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.4
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.5
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.6
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.7
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.8
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1]	valid_

In [37]:
# Row offsets 0, 96, 192, ... : one entry per block of 96 submission rows.
rg = np.arange(0, 7776, 96)

# Per the original notes, each block of 96 rows holds Day7 in rows 0-47 and
# Day8 in rows 48-95. The i-th result frame therefore goes to rows
# i, 96+i, 192+i, ... for Day7, and to the same rows shifted by 48 for Day8.
for offset, per_slot in ((0, results_1), (48, results_2)):
    for i, res in enumerate(per_slot):  # one result frame per time slot
        submission.iloc[rg + i + offset, 1:] = res.values

In [38]:
submission[14:33]

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
14,0.csv_Day7_7h00m,0.0,0.0,0.0,0.0,0.0,0.01,-0.02,0.01,0.1
15,0.csv_Day7_7h30m,0.44,0.41,0.53,0.54,0.72,1.19,0.97,1.68,2.11
16,0.csv_Day7_8h00m,5.23,6.21,7.66,8.36,8.23,8.65,8.7,8.1,11.84
17,0.csv_Day7_8h30m,5.82,9.21,12.54,13.28,14.43,15.55,14.15,13.86,17.44
18,0.csv_Day7_9h00m,9.9,19.05,20.61,20.37,23.28,23.23,24.58,23.41,26.07
19,0.csv_Day7_9h30m,14.61,18.18,24.0,25.83,26.68,29.88,31.32,32.88,33.82
20,0.csv_Day7_10h00m,22.2,26.3,26.51,27.74,30.73,34.01,37.65,39.09,38.54
21,0.csv_Day7_10h30m,20.75,28.54,31.4,32.85,33.43,38.37,40.69,41.6,46.7
22,0.csv_Day7_11h00m,22.64,37.2,40.49,41.54,42.86,44.24,46.96,49.13,50.76
23,0.csv_Day7_11h30m,25.35,36.7,43.25,41.08,45.48,49.32,49.3,49.6,53.1


In [39]:
# Keep a snapshot of the 48-model submission; it is used in the ensemble below.
submission_48models = submission.copy()

## Ensemble

In [40]:
# Extract the prediction values of the three submissions as arrays,
# with the id column moved into the index.
multi_res = submission_48models.set_index('id').values
catb_res = submission_catb_shifted.set_index('id').values
lgbm_res = submission_lgbm_shifted.set_index('id').values

In [41]:
# Ensemble: element-wise, unweighted mean of the three prediction arrays.
ens_res = (multi_res + lgbm_res + catb_res) / 3

In [42]:
# There are many non-zero values between 0 and 0.1.
# In case they cost score, everything below 0.1 is set to 0.
# A boolean mask replaces the element-by-element double loop.
ens_res[ens_res < 0.1] = 0

In [43]:
# Write the ensembled values into the quantile columns of the submission.
submission.loc[:, "q_0.1":] = ens_res

In [44]:
submission[14:33]

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
14,0.csv_Day7_7h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.csv_Day7_7h30m,0.608,0.647667,0.787333,0.843667,1.293333,1.523,1.686,2.272333,3.122
16,0.csv_Day7_8h00m,3.368,4.067667,4.904,5.350333,5.95,6.443,6.846,7.112333,8.645333
17,0.csv_Day7_8h30m,4.973333,7.36,9.04,9.383333,11.246667,12.17,12.813333,12.946667,15.386667
18,0.csv_Day7_9h00m,7.965333,12.497333,14.369333,15.378,17.94,18.832,21.047333,21.739333,22.531333
19,0.csv_Day7_9h30m,11.04,14.096667,17.426667,19.913333,22.333333,23.253333,27.17,28.906667,30.623333
20,0.csv_Day7_10h00m,15.042,18.504,20.492667,22.604667,26.786667,27.198667,31.987333,33.026,34.881333
21,0.csv_Day7_10h30m,16.252,21.897333,25.262667,27.631333,30.406667,33.018667,37.514,38.689333,40.711333
22,0.csv_Day7_11h00m,18.430667,26.356333,30.688667,33.847667,36.233333,38.795667,41.928,44.303667,45.966
23,0.csv_Day7_11h30m,19.334,25.931333,30.722,33.466,36.653333,40.414,43.201333,45.018667,46.486


In [46]:
#submission.to_csv('./submission/submission_210126-3_ensemble-best-trio-finalcode.csv', index=False)