# 48 Models

정지훈 선생의 아이디어로, 하루를 30분 단위의 48개 시간대로 나누고 시간대마다 모델을 하나씩 학습해 예측하는 48개 모델 아키텍처입니다.

In [5]:
import load_dtypes as ld
import warnings
warnings.filterwarnings(action='ignore')
import os, sys

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

import numpy as np
from scipy.spatial import distance

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import KFold

In [6]:
# Input locations. These are absolute paths on the author's Windows machine;
# point them at your own copy of the data before running.
TRAIN_PATH = r'C:\Users\Wyatt\wyatt37\Data\solarpanel\train\train.csv'
# Directory holding the numbered test files (0.csv, 1.csv, ...).
TEST_PATH = r'C:\Users\Wyatt\wyatt37\Data\solarpanel\test'
SUBMISSION_PATH = r'C:\Users\Wyatt\wyatt37\Data\solarpanel\sample_submission.csv'

In [259]:
# Load the training table through the project helper `ld.load_dtypes`.
# NOTE(review): judging by the info() output below, the helper prints the path
# and returns downcast dtypes (int8/int16/float32) - confirm in load_dtypes.
train = ld.load_dtypes(TRAIN_PATH)
train.info()
# Submission template whose prediction columns are filled in further down.
submission = pd.read_csv(SUBMISSION_PATH)
# Only the first test file is loaded here; the full test set is read in a later cell.
test = ld.load_dtypes(TEST_PATH + '/0.csv')

C:\Users\Wyatt\wyatt37\Data\solarpanel\train\train.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52560 entries, 0 to 52559
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Day     52560 non-null  int16  
 1   Hour    52560 non-null  int8   
 2   Minute  52560 non-null  int8   
 3   DHI     52560 non-null  int16  
 4   DNI     52560 non-null  int16  
 5   WS      52560 non-null  float32
 6   RH      52560 non-null  float32
 7   T       52560 non-null  int8   
 8   TARGET  52560 non-null  float32
dtypes: float32(3), int16(3), int8(3)
memory usage: 1.1 MB
C:\Users\Wyatt\wyatt37\Data\solarpanel\test/0.csv


In [260]:
# Combine the Hour and Minute columns into a single fractional-hour value.
def sum_hour_minute(train):
    """Fold ``Minute`` into ``Hour`` so each row carries one fractional hour.

    The frame is modified in place and also returned, so both
    ``df = sum_hour_minute(df)`` and a bare call work.

    Args:
        train: DataFrame with numeric ``Hour`` and ``Minute`` columns.

    Returns:
        The same DataFrame, with ``Minute`` expressed as a fraction of an
        hour (30 -> 0.5) and ``Hour`` equal to hour + that fraction.
    """
    # Divide by 60 for an exact conversion; scaling by a rounded 1/60 (0.017)
    # and rounding again is only right for minute values of 0 and 30.
    train['Minute'] = train['Minute'] / 60
    # Add the fractional part to the hour.
    train['Hour'] = train['Hour'] + train['Minute']

    return train

In [261]:
train = sum_hour_minute(train)
train

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0.0,0.0,0,0,1.5,69.080002,-12,0.0
1,0,0.5,0.5,0,0,1.5,69.059998,-12,0.0
2,0,1.0,0.0,0,0,1.6,71.779999,-12,0.0
3,0,1.5,0.5,0,0,1.6,71.750000,-12,0.0
4,0,2.0,0.0,0,0,1.6,75.199997,-12,0.0
...,...,...,...,...,...,...,...,...,...
52555,1094,21.5,0.5,0,0,2.4,70.699997,-4,0.0
52556,1094,22.0,0.0,0,0,2.4,66.790001,-4,0.0
52557,1094,22.5,0.5,0,0,2.2,66.779999,-4,0.0
52558,1094,23.0,0.0,0,0,2.1,67.720001,-4,0.0


## Based 48 Model

48개의 데이터셋으로 나눠줘야 합니다. 글로벌 변수 설정을 통해 48개의 데이터셋을 만들어줍니다. 데이터가 많지 않으니, 컬럼을 늘리지 않고 베이스라인으로만 가봅니다.

In [262]:
# Split the training frame into 48 tables, one per half-hour slot of the day.
trains = []

for i in range(48):
    idx = i * 0.5  # slot i <-> fractional hour 0.0, 0.5, ..., 23.5
    # Work on an explicit copy so the new columns are added to an independent
    # frame instead of to a filtered selection of `train`.
    train_slot = train[train.Hour == idx].copy()
    # Inside one slot consecutive rows are the same time of day on later rows
    # of the source (the displayed index steps by 48), so shifting by -1 / -2
    # yields the targets one and two rows ahead.
    train_slot['TARGET_1'] = train_slot['TARGET'].shift(-1)
    train_slot['TARGET_2'] = train_slot['TARGET'].shift(-2)
    # dropna() removes every row with a missing value, e.g. the trailing rows
    # whose shifted targets do not exist; Day/Minute are not used as features.
    train_slot = train_slot.dropna().drop(['Day', 'Minute'], axis=1)

    # Later cells refer to train_0, train_15, ... by name, so publish each
    # table as a global as well as collecting it in the list.
    globals()['train_{}'.format(i)] = train_slot
    trains.append(train_slot)

1093개의 데이터를 가진 48개의 테이블을 만들었습니다.

In [263]:
trains[:2]

[       Hour  DHI  DNI   WS         RH   T  TARGET  TARGET_1  TARGET_2
 0       0.0    0    0  1.5  69.080002 -12     0.0       0.0       0.0
 48      0.0    0    0  1.6  90.660004 -10     0.0       0.0       0.0
 96      0.0    0    0  2.2  73.800003  -8     0.0       0.0       0.0
 144     0.0    0    0  1.8  77.760002 -14     0.0       0.0       0.0
 192     0.0    0    0  2.1  72.099998  -5     0.0       0.0       0.0
 ...     ...  ...  ...  ...        ...  ..     ...       ...       ...
 52224   0.0    0    0  1.0  60.779999  -4     0.0       0.0       0.0
 52272   0.0    0    0  1.4  78.230003  -6     0.0       0.0       0.0
 52320   0.0    0    0  1.8  80.360001  -8     0.0       0.0       0.0
 52368   0.0    0    0  2.6  52.590000  -1     0.0       0.0       0.0
 52416   0.0    0    0  3.4  50.470001  -1     0.0       0.0       0.0
 
 [1093 rows x 9 columns],
        Hour  DHI  DNI   WS         RH   T  TARGET  TARGET_1  TARGET_2
 1       0.5    0    0  1.5  69.059998 -12     0.

In [264]:
train_0

Unnamed: 0,Hour,DHI,DNI,WS,RH,T,TARGET,TARGET_1,TARGET_2
0,0.0,0,0,1.5,69.080002,-12,0.0,0.0,0.0
48,0.0,0,0,1.6,90.660004,-10,0.0,0.0,0.0
96,0.0,0,0,2.2,73.800003,-8,0.0,0.0,0.0
144,0.0,0,0,1.8,77.760002,-14,0.0,0.0,0.0
192,0.0,0,0,2.1,72.099998,-5,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
52224,0.0,0,0,1.0,60.779999,-4,0.0,0.0,0.0
52272,0.0,0,0,1.4,78.230003,-6,0.0,0.0,0.0
52320,0.0,0,0,1.8,80.360001,-8,0.0,0.0,0.0
52368,0.0,0,0,2.6,52.590000,-1,0.0,0.0,0.0


이제 48개의 테이블을 x_train, x_val, y_train, y_val로 찢어서 학습을 시켜야 합니다. 그리고 x_test를 받아와야 합니다.

In [265]:
# Load the test set: read every numbered test file and keep its last day.

df_test = []

for file_no in range(81):
    csv_path = '{}/{}.csv'.format(TEST_PATH, file_no)
    # Same preprocessing as the training data: merge Hour and Minute.
    frame = sum_hour_minute(pd.read_csv(csv_path))
    # The final 48 rows are the last day of the file (48 half-hour slots).
    df_test.append(frame.iloc[-48:])

X_test = pd.concat(df_test)
print(X_test.shape)

(3888, 9)


In [266]:
X_test

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
288,6,0.0,0.0,0,0,0.8,80.92,-2.8,0.0
289,6,0.5,0.5,0,0,0.9,81.53,-2.9,0.0
290,6,1.0,0.0,0,0,1.0,79.91,-3.0,0.0
291,6,1.5,0.5,0,0,0.9,79.91,-3.0,0.0
292,6,2.0,0.0,0,0,0.9,77.20,-3.0,0.0
...,...,...,...,...,...,...,...,...,...
331,6,21.5,0.5,0,0,0.8,63.35,13.7,0.0
332,6,22.0,0.0,0,0,0.7,64.82,13.1,0.0
333,6,22.5,0.5,0,0,0.7,66.10,12.8,0.0
334,6,23.0,0.0,0,0,0.6,67.64,12.4,0.0


In [267]:
# Split the stacked test rows into 48 tables, one per half-hour slot,
# mirroring the layout of the per-slot training tables.
tests = []

for i in range(48):
    idx = i * 0.5  # slot i <-> fractional hour 0.0, 0.5, ..., 23.5
    # drop() returns a new frame, so nothing is mutated in place on a
    # filtered selection of X_test.  No target columns are built here:
    # the test rows are the inputs to predict from.
    test_slot = X_test[X_test.Hour == idx].drop(['Day', 'Minute'], axis=1)

    # Later cells refer to test_0, ... by name.
    globals()['test_{}'.format(i)] = test_slot
    tests.append(test_slot)

인덱스를 찾을 수 없네요. 인덱스는 대충 다른 변수들 짬뽕해서 찾겠습니다. 일단 81개의 테이블을 합치고, 각 시간에 대해서 찢었습니다.

In [268]:
train_0.head()

Unnamed: 0,Hour,DHI,DNI,WS,RH,T,TARGET,TARGET_1,TARGET_2
0,0.0,0,0,1.5,69.080002,-12,0.0,0.0,0.0
48,0.0,0,0,1.6,90.660004,-10,0.0,0.0,0.0
96,0.0,0,0,2.2,73.800003,-8,0.0,0.0,0.0
144,0.0,0,0,1.8,77.760002,-14,0.0,0.0,0.0
192,0.0,0,0,2.1,72.099998,-5,0.0,0.0,0.0


In [269]:
test_0.head()

Unnamed: 0,Hour,DHI,DNI,WS,RH,T,TARGET
288,0.0,0,0,0.8,80.92,-2.8,0.0
288,0.0,0,0,1.7,52.4,-10.7,0.0
288,0.0,0,0,3.3,61.4,-1.4,0.0
288,0.0,0,0,2.7,57.37,-6.7,0.0
288,0.0,0,0,2.2,67.95,-14.2,0.0


In [270]:
train_0.shape, test_0.shape, train_15.shape, train_47.shape

((1093, 9), (81, 7), (1093, 9), (1093, 9))

In [52]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Fit one quantile model and predict: (a) modeling, (b) predictions.
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """Train a LightGBM regressor for quantile ``q`` and predict ``X_test``.

    Args:
        q: Quantile level passed to LightGBM as ``alpha``.
        X_train, Y_train: Features and target used for fitting.
        X_valid, Y_valid: Held-out set monitored for early stopping.
        X_test: Rows to predict.

    Returns:
        Tuple ``(pred, model)``: the predictions as a pandas Series rounded
        to two decimals, and the fitted ``LGBMRegressor``.
    """
    # (a) Modeling
    model = LGBMRegressor(objective='quantile',  # required for quantile regression
                          alpha=q,               # the quantile to estimate
                          n_estimators=10000,    # upper bound only; early stopping decides
                          learning_rate=0.027,
                          # `subsample` is the scikit-learn-style alias of
                          # `bagging_fraction`, so the value is given once.
                          # NOTE(review): LightGBM documents that bagging only
                          # takes effect with a non-zero bagging frequency,
                          # which is not set here - confirm the intent.
                          subsample=0.7)

    # Stop once the validation quantile loss has not improved for 300 rounds.
    # NOTE(review): recent LightGBM releases configure early stopping and
    # logging through callbacks - confirm against the installed version.
    model.fit(X_train,
              Y_train,
              eval_metric = ['quantile'],
              eval_set=[(X_valid, Y_valid)],
              early_stopping_rounds=300, verbose=False)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

# Predict the target for every quantile.

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """Fit one model per entry of the global ``quantiles`` list.

    Args:
        X_train, Y_train: Features and target used for fitting.
        X_valid, Y_valid: Validation set forwarded to ``LGBM``.
        X_test: Rows to predict.

    Returns:
        Tuple ``(LGBM_models, LGBM_actual_pred)``: the fitted models in
        quantile order, and a DataFrame with one row per test row and one
        column per quantile (columns labelled with the quantile values).
    """
    LGBM_models = []
    preds = []

    for q in quantiles:  # one pass per quantile
        print(q)
        pred, model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        preds.append(pred)

    # Concatenate once at the end instead of growing a frame inside the loop.
    LGBM_actual_pred = pd.concat(preds, axis=1) if preds else pd.DataFrame()
    LGBM_actual_pred.columns = quantiles  # label the columns by quantile

    return LGBM_models, LGBM_actual_pred

48개의 train과 test를 넣어 각자 찢어서 학습하고, 결과를 반환하여 리스트에 저장하여 내뿜습니다.

In [65]:
# Lists that collect one prediction table per time slot.
results_1 = []  # predictions for TARGET_1 (target one row ahead)
results_2 = []  # predictions for TARGET_2 (target two rows ahead)

target_cols = ['TARGET_1', 'TARGET_2']
n_fit_rows = 730  # leading rows used for fitting; the rest is the validation set

for tr, te in zip(trains, tests):
    # Hour is a single constant inside each slot table, so it is removed.
    # errors='ignore' keeps the cell re-runnable and handles the two frames
    # independently: `te` is still cleaned when `tr` has no Hour column left.
    tr.drop(columns=['Hour'], errors='ignore', inplace=True)
    te.drop(columns=['Hour'], errors='ignore', inplace=True)

    # Split tr into a fitting part and a validation part.
    X = tr[:n_fit_rows]
    val = tr[n_fit_rows:]

    # Separate the features from the two targets by name, not by position.
    x_train = X.drop(columns=target_cols)
    y_train_1 = X['TARGET_1']
    y_train_2 = X['TARGET_2']

    x_val = val.drop(columns=target_cols)
    y_val_1 = val['TARGET_1']
    y_val_2 = val['TARGET_2']

    # One model per quantile for each target; only the predictions are kept.
    _, result_1 = train_data(x_train, y_train_1, x_val, y_val_1, te)
    _, result_2 = train_data(x_train, y_train_2, x_val, y_val_2, te)

    results_1.append(result_1)
    results_2.append(result_2)

0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
0.1
0.2
0.3
0.4
0.5
0.6
0.7


In [221]:
# Starting row of each test file's section in the submission (stride of 96).
rg = np.arange(0, 7776, 96)

# The submission holds, per test file, rows 0-47 for day 7 and rows 48-95 for
# day 8, followed by the next file.  results_1 has one table per time slot
# (48 in total), so slot k goes to rows k, 96 + k, 192 + k, ... - one row
# for each of the 81 files.
for slot_no, slot_pred in enumerate(results_1):
    submission.iloc[rg + slot_no, 1:] = slot_pred.values

In [222]:
# Starting row of each test file's section in the submission (stride of 96).
rg = np.arange(0, 7776, 96)

# The submission holds, per test file, rows 0-47 for day 7 and rows 48-95 for
# day 8, followed by the next file.  results_2 has one table per time slot
# (48 in total); its rows belong to the day-8 half, hence the extra offset of
# 48: slot k goes to rows 48 + k, 144 + k, 240 + k, ...
for slot_no, slot_pred in enumerate(results_2):
    submission.iloc[rg + slot_no + 48, 1:] = slot_pred.values

In [225]:
# lag model 3 window decross model

#submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
#submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values
submission[16:33]

#submission.to_csv('./submission/submission_210115-5_48models.csv', index=False)

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
16,0.csv_Day7_8h00m,5.16,6.47,7.18,6.69,7.42,8.11,8.72,11.39,18.74
17,0.csv_Day7_8h30m,4.89,10.42,11.76,12.79,14.9,15.91,16.54,17.48,21.63
18,0.csv_Day7_9h00m,10.93,10.81,12.41,17.22,21.8,23.99,25.01,37.48,41.34
19,0.csv_Day7_9h30m,15.54,22.07,24.72,28.73,29.14,29.02,32.76,33.36,42.91
20,0.csv_Day7_10h00m,20.1,25.28,29.48,31.82,30.13,34.56,37.08,41.45,51.54
21,0.csv_Day7_10h30m,19.38,34.36,36.06,40.94,39.72,42.62,42.84,43.58,49.72
22,0.csv_Day7_11h00m,25.87,38.41,40.55,47.94,51.59,48.27,47.57,51.73,50.74
23,0.csv_Day7_11h30m,25.22,36.93,42.38,43.31,45.85,43.45,49.87,49.56,52.56
24,0.csv_Day7_12h00m,19.53,34.48,39.54,44.64,42.16,46.17,47.94,53.94,58.53
25,0.csv_Day7_12h30m,21.23,31.41,37.33,45.11,48.09,48.06,47.5,50.4,54.24


## Shifted 48 model

base 모델이 1.90이 나왔으니, shifted로 가보겠습니다.

In [226]:
# Reload everything from disk so the shifted experiment starts from the raw
# tables rather than from frames modified by earlier cells.
train = ld.load_dtypes(TRAIN_PATH)
train.info()
# Fresh submission template for this experiment.
submission = pd.read_csv(SUBMISSION_PATH)
# Only the first test file is loaded here; the full test set is read in a later cell.
test = ld.load_dtypes(TEST_PATH + '/0.csv')

C:\Users\Wyatt\wyatt37\Data\solarpanel\train\train.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52560 entries, 0 to 52559
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Day     52560 non-null  int16  
 1   Hour    52560 non-null  int8   
 2   Minute  52560 non-null  int8   
 3   DHI     52560 non-null  int16  
 4   DNI     52560 non-null  int16  
 5   WS      52560 non-null  float32
 6   RH      52560 non-null  float32
 7   T       52560 non-null  int8   
 8   TARGET  52560 non-null  float32
dtypes: float32(3), int16(3), int8(3)
memory usage: 1.1 MB
C:\Users\Wyatt\wyatt37\Data\solarpanel\test/0.csv


In [227]:
# Combine the Hour and Minute columns into a single fractional-hour value.
def sum_hour_minute(train):
    """Fold ``Minute`` into ``Hour`` so each row carries one fractional hour.

    The frame is modified in place and also returned, so both
    ``df = sum_hour_minute(df)`` and a bare call work.

    Args:
        train: DataFrame with numeric ``Hour`` and ``Minute`` columns.

    Returns:
        The same DataFrame, with ``Minute`` expressed as a fraction of an
        hour (30 -> 0.5) and ``Hour`` equal to hour + that fraction.
    """
    # Divide by 60 for an exact conversion; scaling by a rounded 1/60 (0.017)
    # and rounding again is only right for minute values of 0 and 30.
    train['Minute'] = train['Minute'] / 60
    # Add the fractional part to the hour.
    train['Hour'] = train['Hour'] + train['Minute']

    return train

In [228]:
train = sum_hour_minute(train)
train

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0.0,0.0,0,0,1.5,69.080002,-12,0.0
1,0,0.5,0.5,0,0,1.5,69.059998,-12,0.0
2,0,1.0,0.0,0,0,1.6,71.779999,-12,0.0
3,0,1.5,0.5,0,0,1.6,71.750000,-12,0.0
4,0,2.0,0.0,0,0,1.6,75.199997,-12,0.0
...,...,...,...,...,...,...,...,...,...
52555,1094,21.5,0.5,0,0,2.4,70.699997,-4,0.0
52556,1094,22.0,0.0,0,0,2.4,66.790001,-4,0.0
52557,1094,22.5,0.5,0,0,2.2,66.779999,-4,0.0
52558,1094,23.0,0.0,0,0,2.1,67.720001,-4,0.0


In [232]:
# Columns for which lagged copies are created.
shifted_columns = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']
# Lags in rows; multiples of 48, i.e. whole days at 48 rows per day.
lags = [48, 96, 144, 192, 240] # plan: remove lags one at a time, starting from the end

In [233]:
def shift_columns(data, shifted_columns, lags):
    """Return a copy of ``data`` extended with lagged versions of columns.

    For every lag and every name in ``shifted_columns`` a column called
    ``<name>_lag_<lag>`` is appended, holding that column shifted down by
    ``lag`` rows (the first ``lag`` rows become NaN).  The input frame is
    left untouched.
    """
    lagged = data.copy()
    # Same creation order as nested loops: lags outermost, columns innermost.
    pairs = [(step, name) for step in lags for name in shifted_columns]
    for step, name in pairs:
        lagged[f'{name}_lag_{step}'] = lagged[name].shift(step)

    return lagged

In [234]:
train = shift_columns(train, shifted_columns, lags)
train.shape

(52560, 39)

In [236]:
# Split the lag-extended training frame into 48 tables, one per half-hour slot.
trains = []

for i in range(48):
    idx = i * 0.5  # slot i <-> fractional hour 0.0, 0.5, ..., 23.5
    # Work on an explicit copy so the new columns are added to an independent
    # frame instead of to a filtered selection of `train`.
    train_slot = train[train.Hour == idx].copy()
    # Inside one slot consecutive rows are 48 source rows apart (see the
    # displayed index), so shifting by -1 / -2 yields the targets one and two
    # rows ahead.
    train_slot['TARGET_1'] = train_slot['TARGET'].shift(-1)
    train_slot['TARGET_2'] = train_slot['TARGET'].shift(-2)
    # dropna() removes every row with a missing value: the leading rows whose
    # lag columns are empty and the trailing rows without shifted targets.
    train_slot = train_slot.dropna().drop(['Day', 'Minute'], axis=1)

    # Later cells refer to train_0, train_15, ... by name, so publish each
    # table as a global as well as collecting it in the list.
    globals()['train_{}'.format(i)] = train_slot
    trains.append(train_slot)

1088개의 데이터를 가진 48개의 테이블을 만들었습니다.

In [238]:
trains[:1]

[       Hour  DHI  DNI   WS         RH  T  TARGET  DHI_lag_48  DNI_lag_48  \
 240     0.0    0    0  1.5  77.410004  1     0.0         0.0         0.0   
 288     0.0    0    0  1.9  86.510002 -2     0.0         0.0         0.0   
 336     0.0    0    0  4.1  55.669998  1     0.0         0.0         0.0   
 384     0.0    0    0  2.5  84.720001 -7     0.0         0.0         0.0   
 432     0.0    0    0  3.8  77.349998 -2     0.0         0.0         0.0   
 ...     ...  ...  ...  ...        ... ..     ...         ...         ...   
 52224   0.0    0    0  1.0  60.779999 -4     0.0         0.0         0.0   
 52272   0.0    0    0  1.4  78.230003 -6     0.0         0.0         0.0   
 52320   0.0    0    0  1.8  80.360001 -8     0.0         0.0         0.0   
 52368   0.0    0    0  2.6  52.590000 -1     0.0         0.0         0.0   
 52416   0.0    0    0  3.4  50.470001 -1     0.0         0.0         0.0   
 
        WS_lag_48  RH_lag_48  T_lag_48  TARGET_lag_48  DHI_lag_96  DNI_lag

In [242]:
train_0.shape, train_15.shape, train_45.shape

# Each table lost 5 rows (1093 -> 1088) and grew from 9 to 39 columns.

((1088, 39), (1088, 39), (1088, 39))

이제 48개의 테이블을 x_train, x_val, y_train, y_val로 찢어서 학습을 시켜야 합니다. 그리고 x_test를 받아와야 합니다.

In [243]:
# Load the test set: read every numbered test file and keep its last day.

df_test = []

for file_no in range(81):
    csv_path = '{}/{}.csv'.format(TEST_PATH, file_no)
    # Same preprocessing as the training data: merge Hour and Minute, then
    # add the lag columns while the earlier days of the file are still there.
    frame = sum_hour_minute(pd.read_csv(csv_path))
    frame = shift_columns(frame, shifted_columns, lags)
    # The final 48 rows are the last day of the file (48 half-hour slots).
    df_test.append(frame.iloc[-48:])

X_test = pd.concat(df_test)
print(X_test.shape)

(3888, 39)


In [244]:
X_test

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,TARGET_lag_48,DHI_lag_96,DNI_lag_96,WS_lag_96,RH_lag_96,T_lag_96,TARGET_lag_96,DHI_lag_144,DNI_lag_144,WS_lag_144,RH_lag_144,T_lag_144,TARGET_lag_144,DHI_lag_192,DNI_lag_192,WS_lag_192,RH_lag_192,T_lag_192,TARGET_lag_192,DHI_lag_240,DNI_lag_240,WS_lag_240,RH_lag_240,T_lag_240,TARGET_lag_240
288,6,0.0,0.0,0,0,0.8,80.92,-2.8,0.0,0.0,0.0,2.1,52.83,-4.4,0.0,0.0,0.0,2.4,57.25,-6.0,0.0,0.0,0.0,1.1,59.09,0.1,0.0,0.0,0.0,1.3,40.27,3.1,0.0,0.0,0.0,1.7,26.93,3.6,0.0
289,6,0.5,0.5,0,0,0.9,81.53,-2.9,0.0,0.0,0.0,2.0,54.44,-4.8,0.0,0.0,0.0,2.4,57.25,-6.0,0.0,0.0,0.0,1.2,61.20,-0.4,0.0,0.0,0.0,1.3,40.55,3.0,0.0,0.0,0.0,1.7,27.12,3.5,0.0
290,6,1.0,0.0,0,0,1.0,79.91,-3.0,0.0,0.0,0.0,1.9,52.78,-5.1,0.0,0.0,0.0,2.5,55.26,-6.1,0.0,0.0,0.0,1.3,65.76,-0.8,0.0,0.0,0.0,1.2,40.27,2.9,0.0,0.0,0.0,1.7,28.00,3.4,0.0
291,6,1.5,0.5,0,0,0.9,79.91,-3.0,0.0,0.0,0.0,1.8,53.59,-5.3,0.0,0.0,0.0,2.5,55.26,-6.1,0.0,0.0,0.0,1.4,66.24,-0.9,0.0,0.0,0.0,1.2,40.56,2.8,0.0,0.0,0.0,1.7,28.40,3.2,0.0
292,6,2.0,0.0,0,0,0.9,77.20,-3.0,0.0,0.0,0.0,1.7,52.63,-5.5,0.0,0.0,0.0,2.6,54.25,-6.0,0.0,0.0,0.0,1.5,69.70,-1.0,0.0,0.0,0.0,1.2,41.81,2.6,0.0,0.0,0.0,1.7,30.53,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,6,21.5,0.5,0,0,0.8,63.35,13.7,0.0,0.0,0.0,0.5,74.13,12.5,0.0,0.0,0.0,1.0,61.43,13.1,0.0,0.0,0.0,1.1,79.32,12.2,0.0,0.0,0.0,0.8,64.85,11.1,0.0,0.0,0.0,1.2,68.24,9.3,0.0
332,6,22.0,0.0,0,0,0.7,64.82,13.1,0.0,0.0,0.0,0.7,73.54,12.0,0.0,0.0,0.0,1.0,62.61,12.5,0.0,0.0,0.0,1.2,80.41,11.7,0.0,0.0,0.0,1.0,66.71,10.3,0.0,0.0,0.0,1.2,69.78,8.7,0.0
333,6,22.5,0.5,0,0,0.7,66.10,12.8,0.0,0.0,0.0,0.9,75.01,11.7,0.0,0.0,0.0,1.0,64.77,12.0,0.0,0.0,0.0,1.1,83.10,11.2,0.0,0.0,0.0,1.2,68.97,9.8,0.0,0.0,0.0,1.3,71.70,8.3,0.0
334,6,23.0,0.0,0,0,0.6,67.64,12.4,0.0,0.0,0.0,1.1,74.47,11.3,0.0,0.0,0.0,1.0,65.31,11.5,0.0,0.0,0.0,1.1,84.37,10.8,0.0,0.0,0.0,1.3,69.70,9.3,0.0,0.0,0.0,1.4,71.52,7.9,0.0


In [245]:
# Split the stacked test rows into 48 tables, one per half-hour slot,
# mirroring the layout of the per-slot training tables.
tests = []

for i in range(48):
    idx = i * 0.5  # slot i <-> fractional hour 0.0, 0.5, ..., 23.5
    # drop() returns a new frame, so nothing is mutated in place on a
    # filtered selection of X_test.  No target columns are built here:
    # the test rows are the inputs to predict from.
    test_slot = X_test[X_test.Hour == idx].drop(['Day', 'Minute'], axis=1)

    # Later cells refer to test_0, ... by name.
    globals()['test_{}'.format(i)] = test_slot
    tests.append(test_slot)

인덱스를 찾을 수 없네요. 인덱스는 대충 다른 변수들 짬뽕해서 찾겠습니다. 일단 81개의 테이블을 합치고, 각 시간에 대해서 찢었습니다.

In [246]:
train_0.head()

Unnamed: 0,Hour,DHI,DNI,WS,RH,T,TARGET,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,TARGET_lag_48,DHI_lag_96,DNI_lag_96,WS_lag_96,RH_lag_96,T_lag_96,TARGET_lag_96,DHI_lag_144,DNI_lag_144,WS_lag_144,RH_lag_144,T_lag_144,TARGET_lag_144,DHI_lag_192,DNI_lag_192,WS_lag_192,RH_lag_192,T_lag_192,TARGET_lag_192,DHI_lag_240,DNI_lag_240,WS_lag_240,RH_lag_240,T_lag_240,TARGET_lag_240,TARGET_1,TARGET_2
240,0.0,0,0,1.5,77.410004,1,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0,1.8,77.760002,-14.0,0.0,0.0,0.0,2.2,73.800003,-8.0,0.0,0.0,0.0,1.6,90.660004,-10.0,0.0,0.0,0.0,1.5,69.080002,-12.0,0.0,0.0,0.0
288,0.0,0,0,1.9,86.510002,-2,0.0,0.0,0.0,1.5,77.410004,1.0,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0,1.8,77.760002,-14.0,0.0,0.0,0.0,2.2,73.800003,-8.0,0.0,0.0,0.0,1.6,90.660004,-10.0,0.0,0.0,0.0
336,0.0,0,0,4.1,55.669998,1,0.0,0.0,0.0,1.9,86.510002,-2.0,0.0,0.0,0.0,1.5,77.410004,1.0,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0,1.8,77.760002,-14.0,0.0,0.0,0.0,2.2,73.800003,-8.0,0.0,0.0,0.0
384,0.0,0,0,2.5,84.720001,-7,0.0,0.0,0.0,4.1,55.669998,1.0,0.0,0.0,0.0,1.9,86.510002,-2.0,0.0,0.0,0.0,1.5,77.410004,1.0,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0,1.8,77.760002,-14.0,0.0,0.0,0.0
432,0.0,0,0,3.8,77.349998,-2,0.0,0.0,0.0,2.5,84.720001,-7.0,0.0,0.0,0.0,4.1,55.669998,1.0,0.0,0.0,0.0,1.9,86.510002,-2.0,0.0,0.0,0.0,1.5,77.410004,1.0,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0


In [247]:
test_0.head()

Unnamed: 0,Hour,DHI,DNI,WS,RH,T,TARGET,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,TARGET_lag_48,DHI_lag_96,DNI_lag_96,WS_lag_96,RH_lag_96,T_lag_96,TARGET_lag_96,DHI_lag_144,DNI_lag_144,WS_lag_144,RH_lag_144,T_lag_144,TARGET_lag_144,DHI_lag_192,DNI_lag_192,WS_lag_192,RH_lag_192,T_lag_192,TARGET_lag_192,DHI_lag_240,DNI_lag_240,WS_lag_240,RH_lag_240,T_lag_240,TARGET_lag_240
288,0.0,0,0,0.8,80.92,-2.8,0.0,0.0,0.0,2.1,52.83,-4.4,0.0,0.0,0.0,2.4,57.25,-6.0,0.0,0.0,0.0,1.1,59.09,0.1,0.0,0.0,0.0,1.3,40.27,3.1,0.0,0.0,0.0,1.7,26.93,3.6,0.0
288,0.0,0,0,1.7,52.4,-10.7,0.0,0.0,0.0,1.8,57.06,-15.6,0.0,0.0,0.0,0.7,62.81,-4.9,0.0,0.0,0.0,3.9,54.93,-4.2,0.0,0.0,0.0,0.7,64.13,-10.8,0.0,0.0,0.0,2.0,60.67,-10.7,0.0
288,0.0,0,0,3.3,61.4,-1.4,0.0,0.0,0.0,2.5,62.34,-5.2,0.0,0.0,0.0,2.3,65.99,0.4,0.0,0.0,0.0,2.3,62.8,-2.1,0.0,0.0,0.0,1.5,63.09,0.1,0.0,0.0,0.0,2.0,40.35,-7.0,0.0
288,0.0,0,0,2.7,57.37,-6.7,0.0,0.0,0.0,3.2,54.47,-9.1,0.0,0.0,0.0,2.4,74.08,-6.1,0.0,0.0,0.0,0.7,69.75,-7.5,0.0,0.0,0.0,1.3,48.36,-8.5,0.0,0.0,0.0,2.4,53.17,-13.8,0.0
288,0.0,0,0,2.2,67.95,-14.2,0.0,0.0,0.0,5.6,65.18,-4.8,0.0,0.0,0.0,2.0,82.69,1.9,0.0,0.0,0.0,1.2,68.06,-0.3,0.0,0.0,0.0,1.0,73.39,-3.4,0.0,0.0,0.0,0.6,63.89,-5.9,0.0


In [248]:
train_0.shape, test_0.shape

((1088, 39), (81, 37))

In [255]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Fit one quantile model and predict: (a) modeling, (b) predictions.
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """Train a LightGBM regressor for quantile ``q`` and predict ``X_test``.

    Args:
        q: Quantile level passed to LightGBM as ``alpha``.
        X_train, Y_train: Features and target used for fitting.
        X_valid, Y_valid: Held-out set monitored for early stopping.
        X_test: Rows to predict.

    Returns:
        Tuple ``(pred, model)``: the predictions as a pandas Series rounded
        to two decimals, and the fitted ``LGBMRegressor``.
    """
    # (a) Modeling
    model = LGBMRegressor(objective='quantile',  # required for quantile regression
                          alpha=q,               # the quantile to estimate
                          n_estimators=10000,    # upper bound only; early stopping decides
                          learning_rate=0.027,
                          # `subsample` is the scikit-learn-style alias of
                          # `bagging_fraction`, so the value is given once.
                          # NOTE(review): LightGBM documents that bagging only
                          # takes effect with a non-zero bagging frequency,
                          # which is not set here - confirm the intent.
                          subsample=0.7)

    # Stop once the validation quantile loss has not improved for 300 rounds;
    # progress is logged every 100 rounds.
    # NOTE(review): recent LightGBM releases configure early stopping and
    # logging through callbacks - confirm against the installed version.
    model.fit(X_train,
              Y_train,
              eval_metric = ['quantile'],
              eval_set=[(X_valid, Y_valid)],
              early_stopping_rounds=300, verbose=100)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

# Predict the target for every quantile.

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """Fit one model per entry of the global ``quantiles`` list.

    Args:
        X_train, Y_train: Features and target used for fitting.
        X_valid, Y_valid: Validation set forwarded to ``LGBM``.
        X_test: Rows to predict.

    Returns:
        Tuple ``(LGBM_models, LGBM_actual_pred)``: the fitted models in
        quantile order, and a DataFrame with one row per test row and one
        column per quantile (columns labelled with the quantile values).
    """
    LGBM_models = []
    preds = []

    for q in quantiles:  # one pass per quantile
        print(q)
        pred, model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        preds.append(pred)

    # Concatenate once at the end instead of growing a frame inside the loop.
    LGBM_actual_pred = pd.concat(preds, axis=1) if preds else pd.DataFrame()
    LGBM_actual_pred.columns = quantiles  # label the columns by quantile

    return LGBM_models, LGBM_actual_pred

48개의 train과 test를 넣어 각자 찢어서 학습하고, 결과를 반환하여 리스트에 저장하여 내뿜습니다.

In [256]:
# Lists that collect one prediction table per time slot.
results_1 = []  # predictions for TARGET_1 (target one row ahead)
results_2 = []  # predictions for TARGET_2 (target two rows ahead)

target_cols = ['TARGET_1', 'TARGET_2']
n_fit_rows = 730  # leading rows used for fitting; the rest is the validation set

for tr, te in zip(trains, tests):
    # Hour is a single constant inside each slot table, so it is removed.
    # errors='ignore' keeps the cell re-runnable and handles the two frames
    # independently: `te` is still cleaned when `tr` has no Hour column left.
    tr.drop(columns=['Hour'], errors='ignore', inplace=True)
    te.drop(columns=['Hour'], errors='ignore', inplace=True)

    # Split tr into a fitting part and a validation part.
    X = tr[:n_fit_rows]
    val = tr[n_fit_rows:]

    # Separate the features from the two targets by name, not by position.
    x_train = X.drop(columns=target_cols)
    y_train_1 = X['TARGET_1']
    y_train_2 = X['TARGET_2']

    x_val = val.drop(columns=target_cols)
    y_val_1 = val['TARGET_1']
    y_val_2 = val['TARGET_2']

    # One model per quantile for each target; only the predictions are kept.
    _, result_1 = train_data(x_train, y_train_1, x_val, y_val_1, te)
    _, result_2 = train_data(x_train, y_train_2, x_val, y_val_2, te)

    results_1.append(result_1)
    results_2.append(result_2)

0.1
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.2
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.3
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.4
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.5
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early 

In [251]:
# Starting row of each test file's section in the submission (stride of 96).
rg = np.arange(0, 7776, 96)

# The submission holds, per test file, rows 0-47 for day 7 and rows 48-95 for
# day 8, followed by the next file.  results_1 has one table per time slot
# (48 in total), so slot k goes to rows k, 96 + k, 192 + k, ... - one row
# for each of the 81 files.
for slot_no, slot_pred in enumerate(results_1):
    submission.iloc[rg + slot_no, 1:] = slot_pred.values

In [252]:
# Starting row of each test file's section in the submission (stride of 96).
rg = np.arange(0, 7776, 96)

# The submission holds, per test file, rows 0-47 for day 7 and rows 48-95 for
# day 8, followed by the next file.  results_2 has one table per time slot
# (48 in total); its rows belong to the day-8 half, hence the extra offset of
# 48: slot k goes to rows 48 + k, 144 + k, 240 + k, ...
for slot_no, slot_pred in enumerate(results_2):
    submission.iloc[rg + slot_no + 48, 1:] = slot_pred.values

In [258]:
submission[16:33]

#submission.to_csv('./submission/submission_210115-6_shifted-48models.csv', index=False)

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
16,0.csv_Day7_8h00m,5.48,6.7,7.27,8.04,7.4,7.93,8.14,8.14,9.12
17,0.csv_Day7_8h30m,7.26,9.92,10.99,12.01,12.41,15.74,15.68,17.35,18.77
18,0.csv_Day7_9h00m,13.32,19.39,22.62,20.16,22.28,23.58,24.53,25.44,28.01
19,0.csv_Day7_9h30m,12.31,14.14,17.12,21.79,27.72,30.96,32.48,35.39,37.51
20,0.csv_Day7_10h00m,15.6,27.81,28.58,29.81,29.3,35.64,37.29,39.16,43.18
21,0.csv_Day7_10h30m,21.48,32.21,30.75,37.33,34.77,41.57,42.33,44.37,46.44
22,0.csv_Day7_11h00m,23.61,33.17,37.94,40.37,39.76,44.91,47.67,49.96,51.6
23,0.csv_Day7_11h30m,18.52,34.36,38.69,39.39,41.63,47.1,47.7,49.96,52.84
24,0.csv_Day7_12h00m,20.11,35.12,35.83,32.02,37.61,44.67,48.39,51.59,56.66
25,0.csv_Day7_12h30m,20.25,32.66,37.74,37.82,39.44,45.46,47.66,49.85,52.57


1. base 48 모델이 1.9099, base model이 2.25 정도라는 걸 보면 엄청 개선된 모델인 것이다. 그렇다면 shifted에서 좋은 성능을 낼 수도 있다.
2. 6 days shifted model이 1.9435로 성능이 떨어졌다. column이 많아지고 data가 줄어들어서 생긴 일일 수도 있다. days를 2부터 시작해서 점수를 측정해봐야겠다.
3. shift model을 버리고, 버렸던 변수들을 다시 불러올 수도 있다. 소수의 변수를 추가한다면 점수가 오를 수도 있을 것이다.

## Shifted 2days 48 model

In [7]:
# Reload everything from disk so the single-lag experiment starts from the raw
# tables rather than from frames modified by earlier cells.
train = ld.load_dtypes(TRAIN_PATH)
train.info()
# Fresh submission template for this experiment.
submission = pd.read_csv(SUBMISSION_PATH)
# Only the first test file is loaded here; the full test set is read in a later cell.
test = ld.load_dtypes(TEST_PATH + '/0.csv')

C:\Users\Wyatt\wyatt37\Data\solarpanel\train\train.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52560 entries, 0 to 52559
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Day     52560 non-null  int16  
 1   Hour    52560 non-null  int8   
 2   Minute  52560 non-null  int8   
 3   DHI     52560 non-null  int16  
 4   DNI     52560 non-null  int16  
 5   WS      52560 non-null  float32
 6   RH      52560 non-null  float32
 7   T       52560 non-null  int8   
 8   TARGET  52560 non-null  float32
dtypes: float32(3), int16(3), int8(3)
memory usage: 1.1 MB
C:\Users\Wyatt\wyatt37\Data\solarpanel\test/0.csv


In [8]:
# Combine the Hour and Minute columns into a single fractional-hour value.
def sum_hour_minute(train):
    """Fold ``Minute`` into ``Hour`` so each row carries one fractional hour.

    The frame is modified in place and also returned, so both
    ``df = sum_hour_minute(df)`` and a bare call work.

    Args:
        train: DataFrame with numeric ``Hour`` and ``Minute`` columns.

    Returns:
        The same DataFrame, with ``Minute`` expressed as a fraction of an
        hour (30 -> 0.5) and ``Hour`` equal to hour + that fraction.
    """
    # Divide by 60 for an exact conversion; scaling by a rounded 1/60 (0.017)
    # and rounding again is only right for minute values of 0 and 30.
    train['Minute'] = train['Minute'] / 60
    # Add the fractional part to the hour.
    train['Hour'] = train['Hour'] + train['Minute']

    return train

In [9]:
train = sum_hour_minute(train)
train

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0.0,0.0,0,0,1.5,69.080002,-12,0.0
1,0,0.5,0.5,0,0,1.5,69.059998,-12,0.0
2,0,1.0,0.0,0,0,1.6,71.779999,-12,0.0
3,0,1.5,0.5,0,0,1.6,71.750000,-12,0.0
4,0,2.0,0.0,0,0,1.6,75.199997,-12,0.0
...,...,...,...,...,...,...,...,...,...
52555,1094,21.5,0.5,0,0,2.4,70.699997,-4,0.0
52556,1094,22.0,0.0,0,0,2.4,66.790001,-4,0.0
52557,1094,22.5,0.5,0,0,2.2,66.779999,-4,0.0
52558,1094,23.0,0.0,0,0,2.1,67.720001,-4,0.0


In [10]:
# Columns for which lagged copies are created.
shifted_columns = ['DHI', 'DNI', 'WS', 'RH', 'T', 'TARGET']
lags = [48] # shift by one day only (48 rows)

In [11]:
def shift_columns(data, shifted_columns, lags):
    """Return a copy of ``data`` extended with lagged versions of columns.

    For every lag and every name in ``shifted_columns`` a column called
    ``<name>_lag_<lag>`` is appended, holding that column shifted down by
    ``lag`` rows (the first ``lag`` rows become NaN).  The input frame is
    left untouched.
    """
    lagged = data.copy()
    # Same creation order as nested loops: lags outermost, columns innermost.
    pairs = [(step, name) for step in lags for name in shifted_columns]
    for step, name in pairs:
        lagged[f'{name}_lag_{step}'] = lagged[name].shift(step)

    return lagged

In [12]:
train = shift_columns(train, shifted_columns, lags)
train.shape

(52560, 15)

In [13]:
# Split the lag-extended training frame into 48 tables, one per half-hour slot.
trains = []

for i in range(48):
    idx = i * 0.5  # slot i <-> fractional hour 0.0, 0.5, ..., 23.5
    # Work on an explicit copy so the new columns are added to an independent
    # frame instead of to a filtered selection of `train`.
    train_slot = train[train.Hour == idx].copy()
    # Inside one slot consecutive rows are 48 source rows apart (see the
    # displayed index), so shifting by -1 / -2 yields the targets one and two
    # rows ahead.
    train_slot['TARGET_1'] = train_slot['TARGET'].shift(-1)
    train_slot['TARGET_2'] = train_slot['TARGET'].shift(-2)
    # dropna() removes every row with a missing value: the leading rows whose
    # lag columns are empty and the trailing rows without shifted targets.
    train_slot = train_slot.dropna().drop(['Day', 'Minute'], axis=1)

    # Later cells refer to train_0, train_15, ... by name, so publish each
    # table as a global as well as collecting it in the list.
    globals()['train_{}'.format(i)] = train_slot
    trains.append(train_slot)

1092개의 데이터를 가진 48개의 테이블을 만들었습니다.

In [14]:
trains[:1]

[       Hour  DHI  DNI   WS         RH   T  TARGET  DHI_lag_48  DNI_lag_48  \
 48      0.0    0    0  1.6  90.660004 -10     0.0         0.0         0.0   
 96      0.0    0    0  2.2  73.800003  -8     0.0         0.0         0.0   
 144     0.0    0    0  1.8  77.760002 -14     0.0         0.0         0.0   
 192     0.0    0    0  2.1  72.099998  -5     0.0         0.0         0.0   
 240     0.0    0    0  1.5  77.410004   1     0.0         0.0         0.0   
 ...     ...  ...  ...  ...        ...  ..     ...         ...         ...   
 52224   0.0    0    0  1.0  60.779999  -4     0.0         0.0         0.0   
 52272   0.0    0    0  1.4  78.230003  -6     0.0         0.0         0.0   
 52320   0.0    0    0  1.8  80.360001  -8     0.0         0.0         0.0   
 52368   0.0    0    0  2.6  52.590000  -1     0.0         0.0         0.0   
 52416   0.0    0    0  3.4  50.470001  -1     0.0         0.0         0.0   
 
        WS_lag_48  RH_lag_48  T_lag_48  TARGET_lag_48  TARGET_

In [15]:
train_0.shape, train_15.shape, train_45.shape

# With the single 48-row lag each table has 1092 rows and 15 columns
# (see the output below), not the 1088 x 39 of the five-lag run.

((1092, 15), (1092, 15), (1092, 15))

이제 48개의 테이블을 x_train, x_val, y_train, y_val로 찢어서 학습을 시켜야 합니다. 그리고 x_test를 받아와야 합니다.

In [16]:
# Load the test set: read every numbered test file and keep its last day.

df_test = []

for file_no in range(81):
    csv_path = '{}/{}.csv'.format(TEST_PATH, file_no)
    # Same preprocessing as the training data: merge Hour and Minute, then
    # add the lag columns while the earlier days of the file are still there.
    frame = sum_hour_minute(pd.read_csv(csv_path))
    frame = shift_columns(frame, shifted_columns, lags)
    # The final 48 rows are the last day of the file (48 half-hour slots).
    df_test.append(frame.iloc[-48:])

X_test = pd.concat(df_test)
print(X_test.shape)

(3888, 15)


In [17]:
X_test

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,TARGET_lag_48
288,6,0.0,0.0,0,0,0.8,80.92,-2.8,0.0,0.0,0.0,2.1,52.83,-4.4,0.0
289,6,0.5,0.5,0,0,0.9,81.53,-2.9,0.0,0.0,0.0,2.0,54.44,-4.8,0.0
290,6,1.0,0.0,0,0,1.0,79.91,-3.0,0.0,0.0,0.0,1.9,52.78,-5.1,0.0
291,6,1.5,0.5,0,0,0.9,79.91,-3.0,0.0,0.0,0.0,1.8,53.59,-5.3,0.0
292,6,2.0,0.0,0,0,0.9,77.20,-3.0,0.0,0.0,0.0,1.7,52.63,-5.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,6,21.5,0.5,0,0,0.8,63.35,13.7,0.0,0.0,0.0,0.5,74.13,12.5,0.0
332,6,22.0,0.0,0,0,0.7,64.82,13.1,0.0,0.0,0.0,0.7,73.54,12.0,0.0
333,6,22.5,0.5,0,0,0.7,66.10,12.8,0.0,0.0,0.0,0.9,75.01,11.7,0.0
334,6,23.0,0.0,0,0,0.6,67.64,12.4,0.0,0.0,0.0,1.1,74.47,11.3,0.0


In [18]:
# Split the stacked test rows into 48 tables, one per half-hour slot,
# mirroring the layout of the per-slot training tables.
tests = []

for i in range(48):
    idx = i * 0.5  # slot i <-> fractional hour 0.0, 0.5, ..., 23.5
    # drop() returns a new frame, so nothing is mutated in place on a
    # filtered selection of X_test.  No target columns are built here:
    # the test rows are the inputs to predict from.
    test_slot = X_test[X_test.Hour == idx].drop(['Day', 'Minute'], axis=1)

    # Later cells refer to test_0, ... by name.
    globals()['test_{}'.format(i)] = test_slot
    tests.append(test_slot)

In [246]:
train_0.head()

Unnamed: 0,Hour,DHI,DNI,WS,RH,T,TARGET,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,TARGET_lag_48,DHI_lag_96,DNI_lag_96,WS_lag_96,RH_lag_96,T_lag_96,TARGET_lag_96,DHI_lag_144,DNI_lag_144,WS_lag_144,RH_lag_144,T_lag_144,TARGET_lag_144,DHI_lag_192,DNI_lag_192,WS_lag_192,RH_lag_192,T_lag_192,TARGET_lag_192,DHI_lag_240,DNI_lag_240,WS_lag_240,RH_lag_240,T_lag_240,TARGET_lag_240,TARGET_1,TARGET_2
240,0.0,0,0,1.5,77.410004,1,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0,1.8,77.760002,-14.0,0.0,0.0,0.0,2.2,73.800003,-8.0,0.0,0.0,0.0,1.6,90.660004,-10.0,0.0,0.0,0.0,1.5,69.080002,-12.0,0.0,0.0,0.0
288,0.0,0,0,1.9,86.510002,-2,0.0,0.0,0.0,1.5,77.410004,1.0,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0,1.8,77.760002,-14.0,0.0,0.0,0.0,2.2,73.800003,-8.0,0.0,0.0,0.0,1.6,90.660004,-10.0,0.0,0.0,0.0
336,0.0,0,0,4.1,55.669998,1,0.0,0.0,0.0,1.9,86.510002,-2.0,0.0,0.0,0.0,1.5,77.410004,1.0,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0,1.8,77.760002,-14.0,0.0,0.0,0.0,2.2,73.800003,-8.0,0.0,0.0,0.0
384,0.0,0,0,2.5,84.720001,-7,0.0,0.0,0.0,4.1,55.669998,1.0,0.0,0.0,0.0,1.9,86.510002,-2.0,0.0,0.0,0.0,1.5,77.410004,1.0,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0,1.8,77.760002,-14.0,0.0,0.0,0.0
432,0.0,0,0,3.8,77.349998,-2,0.0,0.0,0.0,2.5,84.720001,-7.0,0.0,0.0,0.0,4.1,55.669998,1.0,0.0,0.0,0.0,1.9,86.510002,-2.0,0.0,0.0,0.0,1.5,77.410004,1.0,0.0,0.0,0.0,2.1,72.099998,-5.0,0.0,0.0,0.0


In [19]:
test_0.head()

Unnamed: 0,Hour,DHI,DNI,WS,RH,T,TARGET,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,TARGET_lag_48
288,0.0,0,0,0.8,80.92,-2.8,0.0,0.0,0.0,2.1,52.83,-4.4,0.0
288,0.0,0,0,1.7,52.4,-10.7,0.0,0.0,0.0,1.8,57.06,-15.6,0.0
288,0.0,0,0,3.3,61.4,-1.4,0.0,0.0,0.0,2.5,62.34,-5.2,0.0
288,0.0,0,0,2.7,57.37,-6.7,0.0,0.0,0.0,3.2,54.47,-9.1,0.0
288,0.0,0,0,2.2,67.95,-14.2,0.0,0.0,0.0,5.6,65.18,-4.8,0.0


In [20]:
train_0.shape, test_0.shape

((1092, 15), (81, 13))

In [21]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

# Fit one quantile model and predict: (a) modeling, (b) predictions.
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """Train a LightGBM regressor for quantile ``q`` and predict ``X_test``.

    Args:
        q: Quantile level passed to LightGBM as ``alpha``.
        X_train, Y_train: Features and target used for fitting.
        X_valid, Y_valid: Held-out set monitored for early stopping.
        X_test: Rows to predict.

    Returns:
        Tuple ``(pred, model)``: the predictions as a pandas Series rounded
        to two decimals, and the fitted ``LGBMRegressor``.
    """
    # (a) Modeling
    model = LGBMRegressor(objective='quantile',  # required for quantile regression
                          alpha=q,               # the quantile to estimate
                          n_estimators=10000,    # upper bound only; early stopping decides
                          learning_rate=0.027,
                          # `subsample` is the scikit-learn-style alias of
                          # `bagging_fraction`, so the value is given once.
                          # NOTE(review): LightGBM documents that bagging only
                          # takes effect with a non-zero bagging frequency,
                          # which is not set here - confirm the intent.
                          subsample=0.7)

    # Stop once the validation quantile loss has not improved for 300 rounds;
    # progress is logged every 100 rounds.
    # NOTE(review): recent LightGBM releases configure early stopping and
    # logging through callbacks - confirm against the installed version.
    model.fit(X_train,
              Y_train,
              eval_metric = ['quantile'],
              eval_set=[(X_valid, Y_valid)],
              early_stopping_rounds=300, verbose=100)

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

# Predict the target at every quantile level

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """Fit one quantile model per entry of ``quantiles`` and gather predictions.

    Args:
        X_train: Training features.
        Y_train: Training target.
        X_valid: Validation features, forwarded to ``LGBM``.
        Y_valid: Validation target, forwarded to ``LGBM``.
        X_test: Features to predict on.

    Returns:
        A ``(LGBM_models, LGBM_actual_pred)`` tuple. ``LGBM_models`` is the
        list of fitted models in ``quantiles`` order. ``LGBM_actual_pred`` is
        a DataFrame with one column per quantile (column labels are the
        quantile levels) holding the predictions for ``X_test``.
    """
    LGBM_models = []
    quantile_preds = []

    for q in quantiles:  # one model per quantile level
        print(q)
        pred, model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        quantile_preds.append(pred)

    # Concatenate once at the end instead of growing a frame inside the loop.
    # This avoids re-copying the accumulated columns on every iteration and
    # avoids concatenating onto an empty placeholder frame.
    if quantile_preds:
        LGBM_actual_pred = pd.concat(quantile_preds, axis=1)
    else:
        # No quantiles configured: keep returning an empty frame.
        LGBM_actual_pred = pd.DataFrame()

    LGBM_actual_pred.columns = quantiles  # label each column with its quantile level

    return LGBM_models, LGBM_actual_pred

48개의 train/test 쌍을 하나씩 꺼내 train을 학습용과 검증용으로 나누어 학습하고, 각 쌍의 예측 결과를 리스트에 저장합니다.

In [None]:
# Lists that collect one prediction frame per (train, test) pair,
# one list for each of the two targets.
results_1 = []
results_2 = []

for tr, te in zip(trains, tests):
    # Hour is a single value within each frame, so it is dropped.
    # errors='ignore' makes a re-run (Hour already removed) a no-op without a
    # bare except. It also handles tr and te independently, so a missing
    # column in one no longer skips the drop on the other.
    tr.drop(columns=['Hour'], inplace=True, errors='ignore')
    te.drop(columns=['Hour'], inplace=True, errors='ignore')

    # Split tr by position: the first 730 rows fit the models, the rest validate.
    X = tr[:730]
    val = tr[730:]

    # Everything except the last two columns is a feature; the two targets
    # (TARGET_1, TARGET_2) are read by name below.
    feature_cols = X.columns[:-2]

    x_train = X[feature_cols]
    y_train_1 = X['TARGET_1']
    y_train_2 = X['TARGET_2']

    x_val = val[feature_cols]
    y_val_1 = val['TARGET_1']
    y_val_2 = val['TARGET_2']

    # Select the test features by name so they line up with the training
    # columns. A missing column now fails loudly here.
    x_test = te[feature_cols]

    _, result_1 = train_data(x_train, y_train_1, x_val, y_val_1, x_test)
    _, result_2 = train_data(x_train, y_train_2, x_val, y_val_2, x_test)

    results_1.append(result_1)
    results_2.append(result_2)

0.1
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.2
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.3
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.4
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early stopping, best iteration is:
[1]	valid_0's quantile: 0
0.5
Training until validation scores don't improve for 300 rounds
[100]	valid_0's quantile: 0
[200]	valid_0's quantile: 0
[300]	valid_0's quantile: 0
Early 

In [None]:
# Start row of every 96-row block in the submission: 0, 96, 192, ... (81 offsets).
rg = np.arange(0, 7776, 96)

# Within each 96-row block, rows 0-47 are day 7 and rows 48-95 are day 8.
# results_1 fills the day-7 half (offset 0) and results_2 the day-8 half
# (offset 48). The i-th result frame goes to row i of that half in every
# block, i.e. rows i, 96 + i, 192 + i, ... plus the half offset.
for day_offset, per_slot_results in ((0, results_1), (48, results_2)):
    for i, res in enumerate(per_slot_results):
        # Column 0 is left alone; the quantile columns start at position 1.
        submission.iloc[rg + i + day_offset, 1:] = res.values

In [None]:
# Spot-check a slice of the filled submission (rows 16 through 32).
submission[16:33]

# Saving is left disabled; uncomment the line below to write the submission CSV.
#submission.to_csv('./submission/submission_210115-6_shifted-48models.csv', index=False)