In [1]:
# load_dtypes()
# DF 불러올 때부터 축소해서 가져오는 함수.

def load_dtypes(file_path):
    print(file_path)
    tmp = pd.read_csv(file_path, nrows=0) # 애초에 파일을 불러올 때
    col_dtypes= {}
    for col in tmp.columns:
        df = pd.read_csv(file_path, usecols=[col]) # 애초에 파일을 불러올 때
        
        dtype = df[col].dtype
        
        if (dtype == 'int8' or dtype == 'int16' or dtype == 'int32' or dtype == 'int64'):
            c_min = df[col].min()
            c_max = df[col].max()
            dtype = 'int'
            
        elif dtype == 'float32' or dtype == 'float64':
            c_min = df[col].min()
            c_max = df[col].max()
            dtype = 'float'
            
        elif dtype == 'object':
            n_unique = df[col].nunique()
            threshold = n_unique / df.shape[0]
            
        if dtype == 'int':
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                col_dtype = 'int8'
            elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                col_dtype = 'uint8'
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                col_dtype = 'int16'
            elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                col_dtype = 'uint16'
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                col_dtype = 'int32'
            elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                col_dtype = 'uint32'
            elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                col_dtype = 'int64'
            elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                col_dtype = 'uint64'
                
        elif dtype == 'float':
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                col_dtype = 'float32'
            else:
                col_dtype = 'float64'
                
        elif dtype == 'object':
            if threshold > 0.7:
                col_dtype = 'object'
            else:
                col_dtype = 'category'
                
        col_dtypes[col] = col_dtype
        

    df = pd.read_csv(file_path, dtype=col_dtypes)
    
    return df

# libraries

In [2]:
import pandas as pd
import numpy as np
import os
import glob
import random
import warnings
warnings.filterwarnings("ignore")

In [3]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

# data load

In [4]:
TRAIN_PATH = r'C:\Users\Wyatt\wyatt37\Data\solarpanel\train\train.csv'
TEST_PATH = r'C:\Users\Wyatt\wyatt37\Data\solarpanel\test'
SUBMISSION_PATH = r'C:\Users\Wyatt\wyatt37\Data\solarpanel\sample_submission.csv'

In [5]:
train = load_dtypes(TRAIN_PATH)
train.info()

C:\Users\Wyatt\wyatt37\Data\solarpanel\train\train.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52560 entries, 0 to 52559
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Day     52560 non-null  int16  
 1   Hour    52560 non-null  int8   
 2   Minute  52560 non-null  int8   
 3   DHI     52560 non-null  int16  
 4   DNI     52560 non-null  int16  
 5   WS      52560 non-null  float32
 6   RH      52560 non-null  float32
 7   T       52560 non-null  int8   
 8   TARGET  52560 non-null  float32
dtypes: float32(3), int16(3), int8(3)
memory usage: 1.1 MB


In [6]:
submission = pd.read_csv(SUBMISSION_PATH)

# Preprocessing

In [7]:
def create_lag_feats(data, lags, cols):
    
    lag_cols = []
    temp = data.copy()
    for col in cols:
        for lag in lags:
            temp[col + '_lag_%s'%lag] = temp[col].shift(lag) # 기본적으로 lag만큼 shift 합니다.
            temp['Target1'] = temp['TARGET'] # lag만큼 shift 된 값입니다.
            temp['Target2'] = temp['TARGET'].shift(-lag).fillna(method='ffill') # lag만큼 한 번 더 shift  
            lag_cols.append(col + '_lag_%s'%lag)

    return temp, lag_cols

In [10]:
def preprocess_data(data, target_lags=[48], weather_lags=[48], is_train=True):
    
    temp = data.copy()

    if is_train==True:          
    
        temp, temp_lag_cols1 = create_lag_feats(temp, target_lags, ['TARGET']) # TARGET을 shift 합니다.
        temp, temp_lag_cols2 = create_lag_feats(temp, weather_lags,
                                                ['DHI', 'DNI', 'WS', 'RH', 'T']) # 나머지 변수들 shift
     
        return temp[['Hour'] + temp_lag_cols1 + temp_lag_cols2 + ['Target1', 'Target2']].dropna()

    elif is_train==False:    
        
        temp, temp_lag_cols1 = create_lag_feats(temp, target_lags, ['TARGET'])
        temp, temp_lag_cols2 = create_lag_feats(temp, weather_lags, ['DHI', 'DNI', 'WS', 'RH', 'T'])
                              
        return temp[['Hour'] + temp_lag_cols1 + temp_lag_cols2].dropna() # targer1,2 를 제외하고 반환

In [11]:
df_train = preprocess_data(train,
                           target_lags=[48], # 48만큼 shift 합니다. 즉 1day를 shift 합니다.
                           weather_lags=[48], # 마찬가지.
                           is_train=True) # train 용입니다.

In [12]:
df_train

Unnamed: 0,Hour,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,Target1,Target2
48,0,0.0,0.0,0.0,1.5,69.080002,-12.0,0.0,0.0
49,0,0.0,0.0,0.0,1.5,69.059998,-12.0,0.0,0.0
50,1,0.0,0.0,0.0,1.6,71.779999,-12.0,0.0,0.0
51,1,0.0,0.0,0.0,1.6,71.750000,-12.0,0.0,0.0
52,2,0.0,0.0,0.0,1.6,75.199997,-12.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
52555,21,0.0,0.0,0.0,2.4,68.379997,-2.0,0.0,0.0
52556,22,0.0,0.0,0.0,2.4,71.089996,-3.0,0.0,0.0
52557,22,0.0,0.0,0.0,2.2,71.110001,-3.0,0.0,0.0
52558,23,0.0,0.0,0.0,2.1,74.989998,-4.0,0.0,0.0


48개를 shift 해서 데이터가 48개 줄었고, <br>
모든 변수를 48개 만큼 shift 했습니다. <br>
Hour는 동일하기 때문에 변화가 없고,<br>
Target1은 48(하루)개 만큼 shift된 값이며,<br>
Target2는 96(이틀)개 만큼 shift된 값입니다.<br>
TARGET_lag_48은 shift 되었지만, 다른 변수들과 동일한 시게열성을 갖기 때문에 그대로라고 볼 수 있습니다.

In [13]:
# test set load

df_test = []

for i in range(81):
    file_path = TEST_PATH + '/' + str(i) + '.csv'
    temp = pd.read_csv(file_path)
    temp = preprocess_data(temp, target_lags=[48], weather_lags=[48], is_train=False).iloc[-48:]
    df_test.append(temp)

X_test = pd.concat(df_test)
X_test.shape

(3888, 7)

test set은 총 81개의 파일로 나눠져 있습니다. 그러나 어차피 예측은 다 함께 할 것입니다. 그러니 모두 불러와서 합쳐줍니다. 그러나 마찬가지로 shift를 해서 가져와줍니다.<br>
하지만 원래대로라면 27216(336x81)개의 데이터가 불러와져야 합니다.<br>
그렇지만 3888개만 불러와집니다. train 데이터와 마찬가지로 shift를 하고 다른 걸 쳐내버렸기 때문입니다.<br>
그리고 뒤에 하루치만 가져옵니다. .iloc[-48:] 로 말이죠.<br>
그래서 3888(48x81)의 shape을 가지게 됩니다.

In [14]:
X_test

Unnamed: 0,Hour,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48
288,0,0.0,0.0,0.0,2.1,52.83,-4.4
289,0,0.0,0.0,0.0,2.0,54.44,-4.8
290,1,0.0,0.0,0.0,1.9,52.78,-5.1
291,1,0.0,0.0,0.0,1.8,53.59,-5.3
292,2,0.0,0.0,0.0,1.7,52.63,-5.5
...,...,...,...,...,...,...,...
331,21,0.0,0.0,0.0,0.5,74.13,12.5
332,22,0.0,0.0,0.0,0.7,73.54,12.0
333,22,0.0,0.0,0.0,0.9,75.01,11.7
334,23,0.0,0.0,0.0,1.1,74.47,11.3


In [15]:
X_test.duplicated().sum()

45

In [16]:
X_test[X_test.duplicated()]

Unnamed: 0,Hour,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48
331,21,0.0,0.0,0.0,0.5,82.74,-2.6
329,20,0.0,0.0,0.0,1.7,58.56,-11.4
331,21,0.0,0.0,0.0,1.8,58.17,-11.4
333,22,0.0,0.0,0.0,1.8,57.15,-11.3
335,23,0.0,0.0,0.0,3.3,62.46,-1.4
333,22,0.0,0.0,0.0,2.1,54.12,-4.3
327,19,0.0,0.0,0.0,2.0,73.02,-2.4
333,22,0.0,0.0,0.0,1.8,73.5,-2.5
289,0,0.0,0.0,0.0,1.0,72.94,1.5
291,1,0.0,0.0,0.0,1.0,71.24,1.6


In [17]:
(X_train_1, # shift 전처리한 df_train에서 독립변수만 뽑습니다. 70% 를 먹습니다.
 X_valid_1, # valid set으로 30% 를 먹습니다.
 Y_train_1, # shift 전처리한 df_train에서 종속변수를 뽑습니다. 아래 코드에 의해 target1을 뽑습니다.
 Y_valid_1
) = train_test_split(df_train.iloc[:, :-2], # 독립변수들
                     df_train.iloc[:, -2], # 종속변수 뒤에서 두번째. 즉 target1
                     test_size=0.3,
                     random_state=42)

X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(df_train.iloc[:, :-2], # 독립변수들
                                                              df_train.iloc[:, -1], # 종속변수 target2
                                                              test_size=0.3,
                                                              random_state=42)

In [18]:
X_train_1.shape, X_valid_1.shape, Y_train_1.shape, Y_valid_1.shape

((36758, 7), (15754, 7), (36758,), (15754,))

In [19]:
X_train_2.shape, X_valid_2.shape, Y_train_2.shape, Y_valid_2.shape

((36758, 7), (15754, 7), (36758,), (15754,))

In [20]:
X_train_1.head(1)

Unnamed: 0,Hour,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48
7436,22,0.0,0.0,0.0,2.2,95.650002,11.0


In [21]:
X_test.head(1)

Unnamed: 0,Hour,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48
288,0,0.0,0.0,0.0,2.1,52.83,-4.4


train, valid 셋을 두개 씩 뽑습니다. 종속변수1과 종속변수2 를 따로 학습합니다.

# Train & Predict

In [73]:
quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

In [75]:
# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    
    # (a) Modeling  
    model = LGBMRegressor(objective='quantile', # quantile로 받아줘야 quatile regression이 가능합니다.
                          alpha=q, # 이건 나중에 quantile 받을 인자입니다.
                          n_estimators=10000, # 10000만번을 돌리겠다고? 미쳤어?
                          bagging_fraction=0.7,
                          learning_rate=0.027,
                          subsample=0.7)                   
                         
                         
    model.fit(X_train,
              Y_train,
              eval_metric = ['quantile'],
              eval_set=[(X_valid, Y_valid)],
              early_stopping_rounds=300, verbose=500) # 대신 얼리 스토핑을 넣었네요.

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

In [76]:
# Target 예측

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):

    LGBM_models=[]
    LGBM_actual_pred = pd.DataFrame()

    for q in quantiles: # 9번 반복할 겁니다. quantile 마다.
        print(q)
        pred , model = LGBM(q, # 퀀틸 박아줍니다.
                            X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model) # 모델을 어펜드 해줍니다.
        LGBM_actual_pred = pd.concat([LGBM_actual_pred,pred],axis=1) #pred를 컨캣해줍니다.

    LGBM_actual_pred.columns=quantiles # 컬럼은 퀀틸로 박아줍니다.
    
    return LGBM_models, LGBM_actual_pred

In [77]:
# Target1
models_1, results_1 = train_data(X_train_1, Y_train_1, X_valid_1, Y_valid_1, X_test)
results_1.sort_index()[:48]

0.1
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 1.35131
Early stopping, best iteration is:
[479]	valid_0's quantile: 1.35122
0.2
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.16397
[1000]	valid_0's quantile: 2.15059
Early stopping, best iteration is:
[1062]	valid_0's quantile: 2.14954
0.3
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.57479
[1000]	valid_0's quantile: 2.55241
[1500]	valid_0's quantile: 2.54134
[2000]	valid_0's quantile: 2.5362
[2500]	valid_0's quantile: 2.5361
Early stopping, best iteration is:
[2215]	valid_0's quantile: 2.53576
0.4
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.70647
[1000]	valid_0's quantile: 2.68191
[1500]	valid_0's quantile: 2.67048
[2000]	valid_0's quantile: 2.66027
[2500]	valid_0's quantile: 2.65363
[3000]	valid_0's quantile: 2.64729
[3500]	valid_0's quantile: 2.64598
[4000]	v

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# Target2
models_2, results_2 = train_data(X_train_2, Y_train_2, X_valid_2, Y_valid_2, X_test)
results_2.sort_index()[:48]

0.1
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 1.412
Early stopping, best iteration is:
[401]	valid_0's quantile: 1.41133
0.2
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.25903
[1000]	valid_0's quantile: 2.24865
[1500]	valid_0's quantile: 2.24581
Early stopping, best iteration is:
[1250]	valid_0's quantile: 2.24508
0.3
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.70267
[1000]	valid_0's quantile: 2.68566
[1500]	valid_0's quantile: 2.67307
[2000]	valid_0's quantile: 2.66351
[2500]	valid_0's quantile: 2.65836
[3000]	valid_0's quantile: 2.6563
[3500]	valid_0's quantile: 2.65327
[4000]	valid_0's quantile: 2.65164
[4500]	valid_0's quantile: 2.65103
Early stopping, best iteration is:
[4212]	valid_0's quantile: 2.65065
0.4
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.85523
[1000]	valid_0's quantile: 2.82934
[1500]	va

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
results_1.sort_index().iloc[:48]

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
results_2.sort_index()

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
3883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3884,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
print(results_1.shape, results_2.shape)

(3888, 9) (3888, 9)


# Submission

In [82]:
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values
submission

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7772,80.csv_Day8_22h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7773,80.csv_Day8_22h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7774,80.csv_Day8_23h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [83]:
submission.to_csv('./submission/submission_baseline.csv', index=False)

베이스라인 코드의 전체적인 아키텍쳐를 살펴보겠습니다.

일단 이 베이스라인 코드는 시계열을 고려하지 않은 모델입니다. 단순히 다중회귀모델로써 아키텍쳐를 구성하였고, 독립변수로는 시간, 기존의 타겟변수, 그리고 날씨변수 5개가 들어갑니다.

독립변수는 t일의 데이터입니다. 종속변수는 두개이며, 그래서 모델도 두개를 굴립니다. 종속변수는 t+1일의 발전량, t+2일의 발전량 이렇게 두개입니다.

즉 정리하자면, t일의 독립변수와 t+1일의 발전량(model1), t+2일의 발전량(model2)을 종속변수로 학습시킨 다중선형회귀모델을 통해 테스트 데이터의 t일의 독립변수를 통해 t+1일의 발전량과, t+2일의 발전량을 예측합니다.

핵심 아이디어는 이렇습니다. 모든 데이터를 활용하여, t+1의 종속변수와 t+2의 종속변수에 대한 선형회귀식을 만들어 데이터 테이블을 집어넣으면 t+1의 종속변수와 t+2일의 종속변수를 내뱉도록 되어 있습니다. 여기에 중요한 점은 t시점의 종속변수를 독립변수로 집어넣었다는 것입니다.

그렇게 해서 나온 결과값이 2.5 정도. 나쁘지 않습니다. 베이스라인 코드가 나오기 전에 단 한명을 제외하고는 이 코드를 이기지 못했거든요. 다만 변수도 있습니다. 이 모델은 굉장히 일반화되어 있는 모델이고, 최근의 시점에 가중치를 주지 못하는 모델입니다. 학습한 기간의 데이터에 대한 균등한 가중치를 가지고 있다고 보면 됩니다.

-------------------------------------------------------

그래서 저는 최근의 데이터에 조금 더 가중치를 줄 수 있는, 즉 시계열성을 가진 모델을 만들고자 합니다.

제가 만들고 싶은 모델은 시계열예측을 통해 독립변수들의 값을 예측해서 다시 타겟값을 예측하는 모델입니다.

1. 전체 학습 데이터를 가지고 다중선형회귀 모델을 만든다.
2. 시계열예측을 통해 각 독립변수를 예측하는 모델을 만든다.
3. 2번의 모델에 테스트 데이터를 넣어서 7일, 8일의 독립변수를 추출한다.
4. 3번의 7일, 8일의 독립변수와 1번의 다중선형회귀 모델을 통해 타겟값을 예측한다.

이게 제가 생각하고 있는 모델입니다. 왜냐하면 독립변수들의 trend와 seasonality가 너무 명확하고 residual이 적습니다. 충분히 가능하지 않을까 생각합니다.

심심하니까 번외편으로 성능좋은 catb 써볼게요

In [98]:
from catboost import CatBoostRegressor

In [120]:
# Get the model and the predictions in (a) - (b)
def CATB(q, X_train, Y_train, X_valid, Y_valid, X_test):
    
    # (a) Modeling  
    model = CatBoostRegressor(objective='Quantile', # quantile로 받아줘야 quatile regression이 가능합니다.
                              loss_function='Quantile:{}'.format(q), # 이건 나중에 quantile 받을 인자입니다.
                              iterations=10000, # 10000만번을 돌리겠다고? 미쳤어?
                              learning_rate=0.027)#,
                              #task_type="GPU",
                              #devices='0:1')                   
                         
                         
    model.fit(X_train,
              Y_train,
              eval_set=[(X_valid, Y_valid)],
              early_stopping_rounds=300,
              verbose=500) # 대신 얼리 스토핑을 넣었네요.

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

In [121]:
# Target 예측

def train_catb(X_train, Y_train, X_valid, Y_valid, X_test):

    CATB_models=[]
    CATB_actual_pred = pd.DataFrame()

    for q in quantiles: # 9번 반복할 겁니다. quantile 마다.
        print(q)
        pred , model = CATB(q, # 퀀틸 박아줍니다.
                            X_train, Y_train, X_valid, Y_valid, X_test)
        CATB_models.append(model) # 모델을 어펜드 해줍니다.
        CATB_actual_pred = pd.concat([CATB_actual_pred,pred],axis=1) #pred를 컨캣해줍니다.

    CATB_actual_pred.columns=quantiles # 컬럼은 퀀틸로 박아줍니다.
    
    return CATB_models, CATB_actual_pred

In [117]:
# Target1
models_1, results_1 = train_catb(X_train_1, Y_train_1, X_valid_1, Y_valid_1, X_test)
results_1.sort_index()[:48]

0.1
0:	learn: 8.8598621	test: 8.9955230	best: 8.9955230 (0)	total: 55.6ms	remaining: 9m 16s
500:	learn: 8.1499623	test: 8.2748250	best: 8.2748250 (500)	total: 30.7s	remaining: 9m 41s
1000:	learn: 7.4911686	test: 7.6020158	best: 7.6020158 (1000)	total: 1m 1s	remaining: 9m 15s
1500:	learn: 6.9428075	test: 7.0415852	best: 7.0415852 (1500)	total: 1m 32s	remaining: 8m 45s
2000:	learn: 6.4476509	test: 6.5329371	best: 6.5329371 (2000)	total: 2m 3s	remaining: 8m 15s
2500:	learn: 6.0454225	test: 6.1186410	best: 6.1186410 (2500)	total: 2m 34s	remaining: 7m 43s
3000:	learn: 5.6552081	test: 5.7178316	best: 5.7178316 (3000)	total: 3m 4s	remaining: 7m 10s
3500:	learn: 5.3030748	test: 5.3587750	best: 5.3587750 (3500)	total: 3m 34s	remaining: 6m 38s
4000:	learn: 5.0232764	test: 5.0759599	best: 5.0759599 (4000)	total: 4m 4s	remaining: 6m 6s
4500:	learn: 4.7728832	test: 4.8224285	best: 4.8224285 (4500)	total: 4m 34s	remaining: 5m 35s
5000:	learn: 4.5386778	test: 4.5892328	best: 4.5892328 (5000)	total: 5

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.1,0.09,0.04,0.03,0.04,0.07,0.04,0.06,0.01
1,0.04,0.06,0.03,-0.0,0.05,0.07,0.04,0.09,0.02
2,0.09,0.06,0.03,0.05,-0.01,0.04,0.07,0.05,0.01
3,0.1,0.07,0.06,0.06,0.0,0.07,0.08,0.05,0.01
4,0.02,0.09,0.04,0.03,-0.02,0.04,0.07,0.06,0.02
5,0.03,0.07,0.04,0.04,-0.01,0.05,0.07,0.04,0.03
6,-0.05,0.06,0.0,-0.03,-0.11,0.01,0.03,-0.02,-0.03
7,-0.08,0.04,-0.0,-0.0,-0.1,0.01,0.05,-0.01,-0.05
8,-0.09,0.02,-0.0,-0.05,-0.09,0.02,-0.05,-0.02,-0.07
9,-0.08,0.02,-0.02,-0.02,-0.12,0.02,-0.02,-0.01,-0.05


In [122]:
# Target2
models_2, results_2 = train_catb(X_train_2, Y_train_2, X_valid_2, Y_valid_2, X_test)
results_2.sort_index()[:48]

0.1
0:	learn: 8.6349136	test: 8.7808516	best: 8.7808516 (0)	total: 87.3ms	remaining: 14m 33s
500:	learn: 2.7791585	test: 2.9010672	best: 2.9010672 (500)	total: 24.3s	remaining: 7m 40s
1000:	learn: 2.7226183	test: 2.8541882	best: 2.8541882 (1000)	total: 47.4s	remaining: 7m 5s
1500:	learn: 2.6725503	test: 2.8264682	best: 2.8264682 (1500)	total: 1m 9s	remaining: 6m 35s
2000:	learn: 2.6200208	test: 2.8044295	best: 2.8043788 (1990)	total: 1m 32s	remaining: 6m 10s
2500:	learn: 2.5666520	test: 2.7879388	best: 2.7879388 (2500)	total: 1m 55s	remaining: 5m 45s
3000:	learn: 2.5371003	test: 2.7796325	best: 2.7796325 (3000)	total: 2m 17s	remaining: 5m 20s
3500:	learn: 2.5010208	test: 2.7664364	best: 2.7664364 (3500)	total: 2m 39s	remaining: 4m 56s
4000:	learn: 2.4880946	test: 2.7625294	best: 2.7624103 (3934)	total: 3m 2s	remaining: 4m 33s
4500:	learn: 2.4625582	test: 2.7545174	best: 2.7545062 (4497)	total: 3m 24s	remaining: 4m 10s
5000:	learn: 2.4425831	test: 2.7490730	best: 2.7490730 (5000)	total:

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
5,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
6,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
7,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
8,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
9,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0


In [123]:
results_1.sort_index().iloc[:48]

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,0.1,0.09,0.04,0.03,0.04,0.07,0.04,0.06,0.01
1,0.04,0.06,0.03,-0.0,0.05,0.07,0.04,0.09,0.02
2,0.09,0.06,0.03,0.05,-0.01,0.04,0.07,0.05,0.01
3,0.1,0.07,0.06,0.06,0.0,0.07,0.08,0.05,0.01
4,0.02,0.09,0.04,0.03,-0.02,0.04,0.07,0.06,0.02
5,0.03,0.07,0.04,0.04,-0.01,0.05,0.07,0.04,0.03
6,-0.05,0.06,0.0,-0.03,-0.11,0.01,0.03,-0.02,-0.03
7,-0.08,0.04,-0.0,-0.0,-0.1,0.01,0.05,-0.01,-0.05
8,-0.09,0.02,-0.0,-0.05,-0.09,0.02,-0.05,-0.02,-0.07
9,-0.08,0.02,-0.02,-0.02,-0.12,0.02,-0.02,-0.01,-0.05


In [124]:
results_2.sort_index()

Unnamed: 0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
...,...,...,...,...,...,...,...,...,...
3883,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
3884,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
3885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
print(results_1.shape, results_2.shape)

(3888, 9) (3888, 9)


In [126]:
submission.loc[submission.id.str.contains("Day7"), "q_0.1":] = results_1.sort_index().values
submission.loc[submission.id.str.contains("Day8"), "q_0.1":] = results_2.sort_index().values
submission

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.10,0.09,0.04,0.03,0.04,0.07,0.04,0.06,0.01
1,0.csv_Day7_0h30m,0.04,0.06,0.03,-0.00,0.05,0.07,0.04,0.09,0.02
2,0.csv_Day7_1h00m,0.09,0.06,0.03,0.05,-0.01,0.04,0.07,0.05,0.01
3,0.csv_Day7_1h30m,0.10,0.07,0.06,0.06,0.00,0.07,0.08,0.05,0.01
4,0.csv_Day7_2h00m,0.02,0.09,0.04,0.03,-0.02,0.04,0.07,0.06,0.02
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,-0.00,-0.00,-0.00,-0.00,-0.00,-0.00,-0.00,-0.00,-0.00
7772,80.csv_Day8_22h00m,-0.00,-0.00,-0.00,-0.00,-0.00,-0.00,-0.00,-0.00,-0.00
7773,80.csv_Day8_22h30m,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
7774,80.csv_Day8_23h00m,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [127]:
submission.to_csv('./submission/submission_baseline_catb_1.csv', index=False)

3.36으로 오히려 떨어졌네요. 역시 로스가 떨어지지 않는 것이!