In [1]:
import warnings
warnings.filterwarnings('ignore')

# 데이터 읽기를 위한 라이브러리
import numpy as np
np.random.seed(0)
import pandas as pd
import gc, os, time
import scipy as sp
from pandas import DataFrame, Series
from datetime import datetime, date, timedelta
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import decomposition

# 탐색적 데이터 분석을 위한 라이브러리
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew, norm, probplot, boxcox

# 모델링을 위한 라이브러리
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_regression, VarianceThreshold

import pickle

pd.set_option('display.max_columns',999)

from tqdm import tqdm
from sklearn.model_selection import TimeSeriesSplit

def RMSE(y, y_pred):
    rmse = mean_squared_error(y, y_pred) ** 0.5
    return rmse

In [2]:
# 학습용 데이터 
train_sensor = pd.read_csv('train_sensor.csv')
train_quality = pd.read_csv('train_quality.csv')
# 평가용 데이터 
predict_sensor = pd.read_csv('predict_sensor.csv')

In [3]:
def make_dataset(X, y=None):
    
    # -----------------------------------
    # train_sensor (X 인자)
    # -----------------------------------
    ''' column을 param_alias 로만 pivot table 만들기. '''
    df_X = X.copy()
    df_X = df_X.sort_values(by='end_time',ascending=True)
    df_X['step_id'] = df_X['step_id'].apply(lambda x: str(x).zfill(2))
    # step_id 와 param_alias 를 결합한 임시 컬럼 step_param 을 생성합니다. ex. 17_EPD_para4
    df_X['step_param'] = df_X[['step_id', 'param_alias']].apply(lambda x: '_'.join(x), axis=1)
    df_X_tmp = df_X.pivot_table(index = ['module_name','key_val'], columns = 'step_param', values='mean_val', aggfunc='sum')
    # 데이터 통합을 위해 인덱스를 key_val 로 재설정합니다. 
    df_X_tmp = df_X_tmp.reset_index(level=[0, 1])
    df_X_tmp.set_index('key_val', inplace=True)

    # -----------------------------------
    # 시간 데이터 
    # -----------------------------------
    ''' step별 end_time을 column으로 pivot table 만들기 '''
    df_X['end_time_tmp'] = df_X.apply(lambda x: x['step_id'] + '_end_time', axis=1)
    df_X['end_time'] = pd.to_datetime(df_X['end_time'])
    # end_time 은 센서 데이터가 각 para 별로 서버에 도달한 시간으로 스텝 내 오차가 발생할 수 있습니다. 동일 스텝 구간내 공정 완료 시간이 다른 경우, min 함수를 사용하여 최초 수집된 time을 가져옵니다.
    df_time_tmp = df_X.pivot_table(index = ['key_val'], columns = 'end_time_tmp', values='end_time', aggfunc=lambda x : min(x.unique()))
    df_time_tmp = df_time_tmp.reset_index()
    df_time_tmp.set_index('key_val', inplace=True)

    # -----------------------------------
    # train_quality (y 인자)
    # -----------------------------------

    if y is None : # 평가용 데이터 
        col_target = []
        col_idx = ['module_name', 'key_val']
        df_complete = pd.concat([df_X_tmp, df_time_tmp], axis=1).reset_index()
        df_complete.rename(columns={'index':'key_val'},inplace=True)
    else : # 학습용 데이터 
        df_y = y.copy()
        df_y.set_index('key_val', inplace=True)
        col_target = ['y']
        col_idx = ['module_name', 'key_val', 'end_dt_tm']
        # 센서 데이터, 시간데이터, 품질지표에 대하여 인덱스(key_val)기준으로 데이터프레임을 통합합니다.
        df_complete = pd.concat([df_X_tmp, df_time_tmp, df_y], axis=1).reset_index()
        # 컬럼 이름을 변경합니다.  
        df_complete.rename(columns={'msure_val':'y'}, inplace=True)
        df_complete.rename(columns={'index':'key_val'},inplace=True)


    # 컬럼 순서를 정렬합니다. 
    col_feats = df_X['step_param'].unique().tolist()
    col_feats.sort()
    col_time = [s for s in df_complete.columns.tolist() if "_end_time" in s]
    col_all = col_idx + col_target + col_feats + col_time
    df_complete = df_complete[col_all]
    # 처음 step이 시작된 시점을 기준으로 다시 정렬(APC value를 먹고 들어가는 값을 기준으로 정렬하고 싶었음.)
    df_complete = df_complete.set_index(['module_name','key_val','04_end_time']).sort_index(level=[0,2,1],ascending=True).reset_index()
    df_complete = df_complete[col_all]
    
    # 컬럼을 소문자로 변경합니다. 
    df_complete.columns = df_complete.columns.str.lower()

    return df_complete

# 학습용 데이터 
train = make_dataset(train_sensor, train_quality)
# 평가용 데이터 
predict = make_dataset(predict_sensor)

In [4]:
print(len(train.columns))
print(len(predict.columns))

676
674


In [5]:
# 전체 및 개별 공정 소요시간 변수를 생성하는 함수입니다.
def gen_duration_feats(df, lst_stepsgap):
    
    # 전체 공정 소요시간(초) 변수를 생성합니다. 
    df['gen_tmdiff'] = (df['20_end_time'] - df['04_end_time']).dt.total_seconds()
    
    # 개별 스텝간 공정 소요시간(초) 변수를 생성합니다. 
    # ex. gen_tmdiff_0406 : 04 스텝 공정 완료 시간과 06 스텝 공정 완료 시간의 차이 
    
    for stepgap in lst_stepsgap:
        df[f'gen_tmdiff_{stepgap}'] = (df[f'{stepgap[2:]}_end_time'] - df[f'{stepgap[:2]}_end_time']).dt.total_seconds()

    return df

# 4. 데이터 전처리

In [6]:
# 전처리를 위한 학습용 데이터와 평가용 데이터를 복사합니다.
df_train = train.copy()
df_predict = predict.copy()
del train

In [7]:
# -----------------------------------
# 3 장 EDA 분석에 필요한 변수를 선언합니다.
# -----------------------------------

# 센서 컬럼과 날짜 컬럼을 정의합니다. 
col_sensor = df_train.iloc[:, 4:-7].columns.tolist() 
col_time = df_train.filter(regex='end').columns.tolist() 

assert len(col_sensor) == 665
assert len(col_time) == 8 

# 3.4절 공정 소요시간 분석에 필요한 변수를 정의합니다. 
lst_steps = ['04','06','12','13','17','18', '20']
lst_stepsgap = ['0406','0612','1213','1317','1718','1820']

''' step별로 fdc para명 따로 수집 '''
lst_sensors = []
for step in lst_steps:
    _ = [col for col in col_sensor if col[:2] == step]
    lst_sensors.append(_)

sensors_nm = list(map(lambda x: x[3:], lst_sensors[0]))

# 시간과 관련한 분석을 진행하기 위하여 날짜형으로 변환합니다. 
df_train[col_time] = df_train[col_time].apply(pd.to_datetime)

In [8]:
# 전체 및 개별 공정 소요시간 7개의 변수를 생성합니다(3.4절)
lst_stepsgap = ['0406','0612','1213','1317','1718','1820']
df_train = gen_duration_feats(df_train, lst_stepsgap)
df_predict = gen_duration_feats(df_predict, lst_stepsgap)
df_train.filter(regex='tmdiff').head(2)

Unnamed: 0,gen_tmdiff,gen_tmdiff_0406,gen_tmdiff_0612,gen_tmdiff_1213,gen_tmdiff_1317,gen_tmdiff_1718,gen_tmdiff_1820
0,1912.0,146.0,846.0,16.0,477.0,16.0,411.0
1,1911.0,145.0,847.0,16.0,476.0,16.0,411.0


# Cyclic Transformation 적용

In [11]:
# adding datetime features
df_train['04_end_time'] = pd.to_datetime(df_train['04_end_time'])
df_train['hour'] = df_train['04_end_time'].dt.hour
df_train['month'] = df_train['04_end_time'].dt.month
df_train['day'] = df_train['04_end_time'].dt.day
df_train['date'] = df_train['04_end_time'].dt.date
df_train['weekday'] = df_train['04_end_time'].dt.weekday

In [12]:
# adding datetime features
df_predict['04_end_time'] = pd.to_datetime(df_predict['04_end_time'])
df_predict['hour'] = df_predict['04_end_time'].dt.hour
df_predict['month'] = df_predict['04_end_time'].dt.month
df_predict['day'] = df_predict['04_end_time'].dt.day
df_predict['date'] = df_predict['04_end_time'].dt.date
df_predict['weekday'] = df_predict['04_end_time'].dt.weekday

In [13]:
## cyclic transformation on hour
df_train['hour_sin'] = np.sin(2 * np.pi * df_train['hour']/23.0)
df_train['hour_cos'] = np.cos(2 * np.pi * df_train['hour']/23.0)
## cyclic transformation on date 
df_train['date_sin'] = -np.sin(2 * np.pi * (df_train['month']+df_train['day']/31)/12)
df_train['date_cos'] = -np.cos(2 * np.pi * (df_train['month']+df_train['day']/31)/12)
## cyclic transformation on month
df_train['month_sin'] = -np.sin(2 * np.pi * df_train['month']/12.0)
df_train['month_cos'] = -np.cos(2 * np.pi * df_train['month']/12.0)
## cyclic transformation on weekday
df_train['weekday_sin'] = -np.sin(2 * np.pi * (df_train['weekday']+1)/7.0)
df_train['weekday_cos'] = -np.cos(2 * np.pi * (df_train['weekday']+1)/7.0)

In [14]:
## cyclic transformation on hour
df_predict['hour_sin'] = np.sin(2 * np.pi * df_predict['hour']/23.0)
df_predict['hour_cos'] = np.cos(2 * np.pi * df_predict['hour']/23.0)
## cyclic transformation on date 
df_predict['date_sin'] = -np.sin(2 * np.pi * (df_predict['month']+df_predict['day']/31)/12)
df_predict['date_cos'] = -np.cos(2 * np.pi * (df_predict['month']+df_predict['day']/31)/12)
## cyclic transformation on month
df_predict['month_sin'] = -np.sin(2 * np.pi * df_predict['month']/12.0)
df_predict['month_cos'] = -np.cos(2 * np.pi * df_predict['month']/12.0)
## cyclic transformation on weekday
df_predict['weekday_sin'] = -np.sin(2 * np.pi * (df_predict['weekday']+1)/7.0)
df_predict['weekday_cos'] = -np.cos(2 * np.pi * (df_predict['weekday']+1)/7.0)

In [15]:
df_train.head(1)

Unnamed: 0,module_name,key_val,end_dt_tm,y,04_efem_para2,04_efem_para25,04_efem_para78,04_epd_para4,04_epd_para40,04_epd_para63,04_epd_para80,04_esc_para84,04_esc_para94,04_fr_para28,04_fr_para35,04_fr_para61,04_fr_para69,04_gas_para10,04_gas_para13,04_gas_para15,04_gas_para19,04_gas_para21,04_gas_para26,04_gas_para27,04_gas_para33,04_gas_para36,04_gas_para39,04_gas_para46,04_gas_para48,04_gas_para50,04_gas_para51,04_gas_para52,04_gas_para59,04_gas_para6,04_gas_para70,04_gas_para71,04_gas_para73,04_gas_para74,04_gas_para85,04_he_para1,04_he_para22,04_he_para88,04_he_para95,04_hv_para3,04_hv_para45,04_hv_para47,04_hv_para56,04_position_para72,04_power_para14,04_power_para49,04_power_para57,04_power_para68,04_power_para76,04_power_para82,04_pressure_para91,04_temp_para11,04_temp_para12,04_temp_para17,04_temp_para18,04_temp_para20,04_temp_para23,04_temp_para24,04_temp_para32,04_temp_para38,04_temp_para53,04_temp_para54,04_temp_para55,04_temp_para58,04_temp_para60,04_temp_para65,04_temp_para66,04_temp_para79,04_temp_para86,04_temp_para87,04_temp_para92,04_temp_para93,04_time_para16,04_time_para29,04_time_para30,04_time_para34,04_time_para37,04_time_para41,04_time_para42,04_time_para43,04_time_para44,04_time_para5,04_time_para62,04_time_para64,04_time_para67,04_time_para7,04_time_para75,04_time_para77,04_time_para8,04_time_para81,04_time_para83,04_time_para89,04_time_para9,04_time_para90,04_tmp_para31,06_efem_para2,06_efem_para25,06_efem_para78,06_epd_para4,06_epd_para40,06_epd_para63,06_epd_para80,06_esc_para84,06_esc_para94,06_fr_para28,06_fr_para35,06_fr_para61,06_fr_para69,06_gas_para10,06_gas_para13,06_gas_para15,06_gas_para19,06_gas_para21,06_gas_para26,06_gas_para27,06_gas_para33,06_gas_para36,06_gas_para39,06_gas_para46,06_gas_para48,06_gas_para50,06_gas_para51,06_gas_para52,06_gas_para59,06_gas_para6,06_gas_para70,06_gas_para71,06_gas_para73,06_gas_para74,06_gas_para85,06_he_para1,06_he_para22,06_he_para88,06_he_para95,06_hv_para3,06_hv_para45,06_hv_para47,06_hv_para56,06_position_para72,06_power_para14,06_power_para49,06_power_para57,06_power_para68,06_power_para76,06_power_para82,06_pressure_para91,06_temp_para11,06_temp_para12,06_temp_para17,06_temp_para18,06_temp_para20,06_temp_para23,06_temp_para24,06_temp_para32,06_temp_para38,06_temp_para53,06_temp_para54,06_temp_para55,06_temp_para58,06_temp_para60,06_temp_para65,06_temp_para66,06_temp_para79,06_temp_para86,06_temp_para87,06_temp_para92,06_temp_para93,06_time_para16,06_time_para29,06_time_para30,06_time_para34,06_time_para37,06_time_para41,06_time_para42,06_time_para43,06_time_para44,06_time_para5,06_time_para62,06_time_para64,06_time_para67,06_time_para7,06_time_para75,06_time_para77,06_time_para8,06_time_para81,06_time_para83,06_time_para89,06_time_para9,06_time_para90,06_tmp_para31,12_efem_para2,12_efem_para25,12_efem_para78,12_epd_para4,12_epd_para40,12_epd_para63,12_epd_para80,12_esc_para84,12_esc_para94,12_fr_para28,12_fr_para35,12_fr_para61,12_fr_para69,12_gas_para10,12_gas_para13,12_gas_para15,12_gas_para19,12_gas_para21,12_gas_para26,12_gas_para27,12_gas_para33,12_gas_para36,12_gas_para39,12_gas_para46,12_gas_para48,12_gas_para50,12_gas_para51,12_gas_para52,12_gas_para59,12_gas_para6,12_gas_para70,12_gas_para71,12_gas_para73,12_gas_para74,12_gas_para85,12_he_para1,12_he_para22,12_he_para88,12_he_para95,12_hv_para3,12_hv_para45,12_hv_para47,12_hv_para56,12_position_para72,12_power_para14,12_power_para49,12_power_para57,12_power_para68,12_power_para76,12_power_para82,12_pressure_para91,12_temp_para11,12_temp_para12,12_temp_para17,12_temp_para18,12_temp_para20,12_temp_para23,12_temp_para24,12_temp_para32,12_temp_para38,12_temp_para53,12_temp_para54,12_temp_para55,12_temp_para58,12_temp_para60,12_temp_para65,12_temp_para66,12_temp_para79,12_temp_para86,12_temp_para87,12_temp_para92,12_temp_para93,12_time_para16,12_time_para29,12_time_para30,12_time_para34,12_time_para37,12_time_para41,12_time_para42,12_time_para43,12_time_para44,12_time_para5,12_time_para62,12_time_para64,12_time_para67,12_time_para7,12_time_para75,12_time_para77,12_time_para8,12_time_para81,12_time_para83,12_time_para89,12_time_para9,12_time_para90,12_tmp_para31,13_efem_para2,13_efem_para25,13_efem_para78,13_epd_para4,13_epd_para40,13_epd_para63,13_epd_para80,13_esc_para84,13_esc_para94,13_fr_para28,13_fr_para35,13_fr_para61,13_fr_para69,13_gas_para10,13_gas_para13,13_gas_para15,13_gas_para19,13_gas_para21,13_gas_para26,13_gas_para27,13_gas_para33,13_gas_para36,13_gas_para39,13_gas_para46,13_gas_para48,13_gas_para50,13_gas_para51,13_gas_para52,13_gas_para59,13_gas_para6,13_gas_para70,13_gas_para71,13_gas_para73,13_gas_para74,13_gas_para85,13_he_para1,13_he_para22,13_he_para88,13_he_para95,13_hv_para3,13_hv_para45,13_hv_para47,13_hv_para56,13_position_para72,13_power_para14,13_power_para49,13_power_para57,13_power_para68,13_power_para76,13_power_para82,13_pressure_para91,13_temp_para11,13_temp_para12,13_temp_para17,13_temp_para18,13_temp_para20,13_temp_para23,13_temp_para24,13_temp_para32,13_temp_para38,13_temp_para53,13_temp_para54,13_temp_para55,13_temp_para58,13_temp_para60,13_temp_para65,13_temp_para66,13_temp_para79,13_temp_para86,13_temp_para87,13_temp_para92,13_temp_para93,13_time_para16,13_time_para29,13_time_para30,13_time_para34,13_time_para37,13_time_para41,13_time_para42,13_time_para43,13_time_para44,13_time_para5,13_time_para62,13_time_para64,13_time_para67,13_time_para7,13_time_para75,13_time_para77,13_time_para8,13_time_para81,13_time_para83,13_time_para89,13_time_para9,13_time_para90,13_tmp_para31,17_efem_para2,17_efem_para25,17_efem_para78,17_epd_para4,17_epd_para40,17_epd_para63,17_epd_para80,17_esc_para84,17_esc_para94,17_fr_para28,17_fr_para35,17_fr_para61,17_fr_para69,17_gas_para10,17_gas_para13,17_gas_para15,17_gas_para19,17_gas_para21,17_gas_para26,17_gas_para27,17_gas_para33,17_gas_para36,17_gas_para39,17_gas_para46,17_gas_para48,17_gas_para50,17_gas_para51,17_gas_para52,17_gas_para59,17_gas_para6,17_gas_para70,17_gas_para71,17_gas_para73,17_gas_para74,17_gas_para85,17_he_para1,17_he_para22,17_he_para88,17_he_para95,17_hv_para3,17_hv_para45,17_hv_para47,17_hv_para56,17_position_para72,17_power_para14,17_power_para49,17_power_para57,17_power_para68,17_power_para76,17_power_para82,17_pressure_para91,17_temp_para11,17_temp_para12,17_temp_para17,17_temp_para18,17_temp_para20,17_temp_para23,17_temp_para24,17_temp_para32,17_temp_para38,17_temp_para53,17_temp_para54,17_temp_para55,17_temp_para58,17_temp_para60,17_temp_para65,17_temp_para66,17_temp_para79,17_temp_para86,17_temp_para87,17_temp_para92,17_temp_para93,17_time_para16,17_time_para29,17_time_para30,17_time_para34,17_time_para37,17_time_para41,17_time_para42,17_time_para43,17_time_para44,17_time_para5,17_time_para62,17_time_para64,17_time_para67,17_time_para7,17_time_para75,17_time_para77,17_time_para8,17_time_para81,17_time_para83,17_time_para89,17_time_para9,17_time_para90,17_tmp_para31,18_efem_para2,18_efem_para25,18_efem_para78,18_epd_para4,18_epd_para40,18_epd_para63,18_epd_para80,18_esc_para84,18_esc_para94,18_fr_para28,18_fr_para35,18_fr_para61,18_fr_para69,18_gas_para10,18_gas_para13,18_gas_para15,18_gas_para19,18_gas_para21,18_gas_para26,18_gas_para27,18_gas_para33,18_gas_para36,18_gas_para39,18_gas_para46,18_gas_para48,18_gas_para50,18_gas_para51,18_gas_para52,18_gas_para59,18_gas_para6,18_gas_para70,18_gas_para71,18_gas_para73,18_gas_para74,18_gas_para85,18_he_para1,18_he_para22,18_he_para88,18_he_para95,18_hv_para3,18_hv_para45,18_hv_para47,18_hv_para56,18_position_para72,18_power_para14,18_power_para49,18_power_para57,18_power_para68,18_power_para76,18_power_para82,18_pressure_para91,18_temp_para11,18_temp_para12,18_temp_para17,18_temp_para18,18_temp_para20,18_temp_para23,18_temp_para24,18_temp_para32,18_temp_para38,18_temp_para53,18_temp_para54,18_temp_para55,18_temp_para58,18_temp_para60,18_temp_para65,18_temp_para66,18_temp_para79,18_temp_para86,18_temp_para87,18_temp_para92,18_temp_para93,18_time_para16,18_time_para29,18_time_para30,18_time_para34,18_time_para37,18_time_para41,18_time_para42,18_time_para43,18_time_para44,18_time_para5,18_time_para62,18_time_para64,18_time_para67,18_time_para7,18_time_para75,18_time_para77,18_time_para8,18_time_para81,18_time_para83,18_time_para89,18_time_para9,18_time_para90,18_tmp_para31,20_efem_para2,20_efem_para25,20_efem_para78,20_epd_para4,20_epd_para40,20_epd_para63,20_epd_para80,20_esc_para84,20_esc_para94,20_fr_para28,20_fr_para35,20_fr_para61,20_fr_para69,20_gas_para10,20_gas_para13,20_gas_para15,20_gas_para19,20_gas_para21,20_gas_para26,20_gas_para27,20_gas_para33,20_gas_para36,20_gas_para39,20_gas_para46,20_gas_para48,20_gas_para50,20_gas_para51,20_gas_para52,20_gas_para59,20_gas_para6,20_gas_para70,20_gas_para71,20_gas_para73,20_gas_para74,20_gas_para85,20_he_para1,20_he_para22,20_he_para88,20_he_para95,20_hv_para3,20_hv_para45,20_hv_para47,20_hv_para56,20_position_para72,20_power_para14,20_power_para49,20_power_para57,20_power_para68,20_power_para76,20_power_para82,20_pressure_para91,20_temp_para11,20_temp_para12,20_temp_para17,20_temp_para18,20_temp_para20,20_temp_para23,20_temp_para24,20_temp_para32,20_temp_para38,20_temp_para53,20_temp_para54,20_temp_para55,20_temp_para58,20_temp_para60,20_temp_para65,20_temp_para66,20_temp_para79,20_temp_para86,20_temp_para87,20_temp_para92,20_temp_para93,20_time_para16,20_time_para29,20_time_para30,20_time_para34,20_time_para37,20_time_para41,20_time_para42,20_time_para43,20_time_para44,20_time_para5,20_time_para62,20_time_para64,20_time_para67,20_time_para7,20_time_para75,20_time_para77,20_time_para8,20_time_para81,20_time_para83,20_time_para89,20_time_para9,20_time_para90,20_tmp_para31,04_end_time,06_end_time,12_end_time,13_end_time,17_end_time,18_end_time,20_end_time,gen_tmdiff,gen_tmdiff_0406,gen_tmdiff_0612,gen_tmdiff_1213,gen_tmdiff_1317,gen_tmdiff_1718,gen_tmdiff_1820,hour,month,day,date,weekday,hour_sin,hour_cos,date_sin,date_cos,month_sin,month_cos,weekday_sin,weekday_cos
0,EQ10_PM1,LOT5_21,2021-10-03 07:10:22,1260.0892,1631.273,1639.727,5.06654,0.0,0.0,0.0,0.0,2999.0,0.0,-2999.0,0.0,0.0,2999.0,0.0,0.0,40.0,0.0,2.41871,30.04839,0.0,0.0,2.477097,0.0,0.0,0.0,0.0,11.6,24.95807,0.09697,2.834516,0.0,0.0,44.90645,50.1,70.26363,15.0,0.906452,0.726923,30.0,0.09,299.9667,150.0,0.241482,7.033333,0.0,0.0,1000.0,0.0,1062.933,0.0,45.02667,0.598182,25.1,-0.021212,15.0697,26.60606,150.1848,20.0,19.96061,-9.842424,149.8545,47.52941,22.94848,24.5,44.51515,21.45152,90.0,0.579091,22.04243,149.7182,35.01515,149.8545,2460.4,723.6,2460.4,2460.4,132.3333,132.3333,2460.4,2460.4,2460.4,0.0,2460.4,132.3333,2460.4,132.3333,2460.4,2460.4,132.3333,2460.4,132.3333,2460.4,132.3333,2460.4,2.351515,1631.445,1640.059,5.074046,0.0,-1.201783,0.0,0.0,2999.0,0.0,-2999.0,0.0,0.0,2999.0,0.0,0.0,0.0,0.0,3.481353,32.89549,0.0,0.0,3.674587,0.0,0.0,0.0,0.0,0.0,34.09699,179.697,3.162481,15.0,120.9215,33.00075,0.0,0.0,15.0,0.900752,0.704688,30.0,0.0,0.0,0.0,0.0,9.39697,0.0,0.007576,2000.0,0.0,1525.651,0.0,25.00379,0.59763,25.1,0.042222,15.06667,26.62222,149.9948,20.0,19.96,-12.28519,150.0393,42.75735,22.95185,24.5,36.92593,22.02148,89.99185,0.579111,23.38593,150.4793,35.00296,150.0393,2460.417,723.6166,2460.417,2460.417,132.3333,132.3333,2460.417,2460.417,2460.417,0.0,2460.417,132.3333,2460.417,132.3333,2460.417,2460.417,132.3333,2460.417,132.3333,2460.417,132.3333,2460.417,2.619259,1631.368,1642.6316,5.038704,0.0,0.0,0.0,0.0,2999.037,0.0,2999.0,0.0,0.0,2999.0,0.0,99.3,0.0,0.0,5.136,35.01454,32.99474,0.0,5.382364,0.0,132.9,0.0,0.0,0.0,33.22364,184.5947,4.430727,0.0,0.0,31.81273,36.0,0.0,20.0,1.3,1.4,50.0,0.299804,272.0,434.6111,0.560196,16.19445,6827.481,0.0,4000.0,0.0,979.4445,15001.09,19.99815,0.601053,25.1,-0.005263,14.7,14.77193,150.0,70.26667,14.95088,-31.40175,149.9702,0.0,22.94386,24.5,18.0,20.67544,90.0,0.577368,26.92456,150.1,35.0,149.9702,2460.483,723.6667,2460.483,2460.483,132.4,132.4,2460.483,2460.483,2460.483,137.0,2460.483,132.4,2460.483,132.4,2460.483,2460.483,132.4,2460.483,132.4,2460.483,132.4,2460.483,3.71579,1630.923,1639.692,5.079873,0.0,0.0,0.0,0.0,2998.9,0.0,2998.9,0.0,0.0,2999.0,0.0,144.6,0.0,0.0,4.876666,35.09167,24.66923,0.0,5.1225,0.0,34.93077,100.0,0.0,7.0,33.16667,118.6,4.275833,0.0,0.0,31.85833,0.0,0.0,20.0,1.3,1.3,50.0,0.23,272.0,434.8,0.388571,18.03,7558.8,0.0,4000.0,0.0,895.5,15001.33,15.01,0.601538,25.1,0.0,14.66154,14.0,150.0,73.5,15.03077,-30.98462,150.1923,0.0,22.94615,24.5,17.30769,20.66154,90.0,0.577692,26.9,150.1,35.0,150.1923,2460.4834,723.6667,2460.483,2460.483,132.4,132.4,2460.483,2460.4834,2460.483,137.0,2460.483,132.4,2460.483,132.4,2460.483,2460.483,132.4,2460.483,132.4,2460.483,132.4,2460.483,3.553846,1632.684,1641.842,5.084785,0.0,0.0,0.0,0.0,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,99.3,0.0,0.0,5.144909,35.00182,47.99649,0.0,5.394,0.0,117.8983,0.0,0.0,0.0,33.20182,184.5947,4.451818,0.0,0.0,31.91636,36.0,0.0,20.0,1.3,1.392,50.0,0.290588,272.0,434.7963,0.550196,16.37408,6800.796,0.0,4000.0,0.0,977.8889,15001.11,19.99815,0.601754,25.09474,0.045614,14.69123,12.0,150.0,72.56667,14.9386,-31.54912,150.014,0.0,22.94912,24.5,14.0,20.7,89.99123,0.577895,27.05439,150.0,35.0,150.014,2460.7,723.8834,2460.7,2460.7,132.6167,132.6167,2460.7,2460.7,2460.7,137.0,2460.7,132.6167,2460.7,132.6167,2460.7,2460.7,132.6167,2460.7,132.6167,2460.7,132.6167,2460.7,3.768421,1632.077,1637.385,5.116795,0.0,0.0,0.0,0.0,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,159.8,0.0,0.0,4.812728,35.05455,24.67692,0.0,5.018182,0.0,19.88462,100.0,0.0,7.0,33.14545,109.6,4.236363,0.0,0.0,31.93636,0.0,0.0,20.0,1.3,1.3,50.0,0.218571,272.0,434.7,0.338571,17.7,7649.3,0.0,4000.0,0.0,881.0,15001.22,15.04,0.602308,25.1,0.023077,14.7,12.0,150.0,72.8125,14.94615,-31.41538,150.0692,0.0,22.96154,24.5,14.0,20.73846,90.0,0.577692,27.11538,150.0,35.0,150.0692,2460.7,723.8834,2460.7,2460.7,132.6167,132.6167,2460.7,2460.7,2460.7,137.0,2460.7,132.6167,2460.7,132.6167,2460.7,2460.7,132.6167,2460.7,132.6167,2460.7,132.6167,2460.7,3.5,1632.209,1642.458,5.056325,999.9999,0.0,0.0,0.0,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,60.0,0.0,0.0,4.839175,35.075,29.99478,119.9,5.357225,6.1,56.99975,0.0,99.9301,0.0,35.79625,94.91542,4.005325,0.0,0.0,28.90425,0.0,0.0,20.0,1.2985,1.283797,50.0,0.27,300.0,300.0,0.659293,10.67644,5507.023,0.0,2800.0,0.0,856.4361,10200.11,35.001,0.600323,25.1,-0.010945,14.6908,13.73632,149.9821,52.21481,14.94627,-29.72363,149.8346,0.0,22.94279,24.5,16.27115,20.19403,89.99477,0.576269,25.88433,149.942,34.93358,149.8346,2460.817,724.0167,2460.817,2460.817,132.7333,132.7333,2460.817,2460.817,2460.817,137.0,2460.817,132.7333,2460.817,132.7333,2460.817,2460.817,132.7333,2460.817,132.7333,2460.817,132.7333,2460.817,3.399254,2021-10-02 22:14:27,2021-10-02 22:16:53,2021-10-02 22:30:59,2021-10-02 22:31:15,2021-10-02 22:39:12,2021-10-02 22:39:28,2021-10-02 22:46:19,1912.0,146.0,846.0,16.0,477.0,16.0,411.0,22,10,2,2021-10-02,5,-0.269797,0.962917,0.848644,-0.528964,0.866025,-0.5,0.781831,-0.62349


In [16]:
df_predict.head(1)

Unnamed: 0,module_name,key_val,04_efem_para2,04_efem_para25,04_efem_para78,04_epd_para4,04_epd_para40,04_epd_para63,04_epd_para80,04_esc_para84,04_esc_para94,04_fr_para28,04_fr_para35,04_fr_para61,04_fr_para69,04_gas_para10,04_gas_para13,04_gas_para15,04_gas_para19,04_gas_para21,04_gas_para26,04_gas_para27,04_gas_para33,04_gas_para36,04_gas_para39,04_gas_para46,04_gas_para48,04_gas_para50,04_gas_para51,04_gas_para52,04_gas_para59,04_gas_para6,04_gas_para70,04_gas_para71,04_gas_para73,04_gas_para74,04_gas_para85,04_he_para1,04_he_para22,04_he_para88,04_he_para95,04_hv_para3,04_hv_para45,04_hv_para47,04_hv_para56,04_position_para72,04_power_para14,04_power_para49,04_power_para57,04_power_para68,04_power_para76,04_power_para82,04_pressure_para91,04_temp_para11,04_temp_para12,04_temp_para17,04_temp_para18,04_temp_para20,04_temp_para23,04_temp_para24,04_temp_para32,04_temp_para38,04_temp_para53,04_temp_para54,04_temp_para55,04_temp_para58,04_temp_para60,04_temp_para65,04_temp_para66,04_temp_para79,04_temp_para86,04_temp_para87,04_temp_para92,04_temp_para93,04_time_para16,04_time_para29,04_time_para30,04_time_para34,04_time_para37,04_time_para41,04_time_para42,04_time_para43,04_time_para44,04_time_para5,04_time_para62,04_time_para64,04_time_para67,04_time_para7,04_time_para75,04_time_para77,04_time_para8,04_time_para81,04_time_para83,04_time_para89,04_time_para9,04_time_para90,04_tmp_para31,06_efem_para2,06_efem_para25,06_efem_para78,06_epd_para4,06_epd_para40,06_epd_para63,06_epd_para80,06_esc_para84,06_esc_para94,06_fr_para28,06_fr_para35,06_fr_para61,06_fr_para69,06_gas_para10,06_gas_para13,06_gas_para15,06_gas_para19,06_gas_para21,06_gas_para26,06_gas_para27,06_gas_para33,06_gas_para36,06_gas_para39,06_gas_para46,06_gas_para48,06_gas_para50,06_gas_para51,06_gas_para52,06_gas_para59,06_gas_para6,06_gas_para70,06_gas_para71,06_gas_para73,06_gas_para74,06_gas_para85,06_he_para1,06_he_para22,06_he_para88,06_he_para95,06_hv_para3,06_hv_para45,06_hv_para47,06_hv_para56,06_position_para72,06_power_para14,06_power_para49,06_power_para57,06_power_para68,06_power_para76,06_power_para82,06_pressure_para91,06_temp_para11,06_temp_para12,06_temp_para17,06_temp_para18,06_temp_para20,06_temp_para23,06_temp_para24,06_temp_para32,06_temp_para38,06_temp_para53,06_temp_para54,06_temp_para55,06_temp_para58,06_temp_para60,06_temp_para65,06_temp_para66,06_temp_para79,06_temp_para86,06_temp_para87,06_temp_para92,06_temp_para93,06_time_para16,06_time_para29,06_time_para30,06_time_para34,06_time_para37,06_time_para41,06_time_para42,06_time_para43,06_time_para44,06_time_para5,06_time_para62,06_time_para64,06_time_para67,06_time_para7,06_time_para75,06_time_para77,06_time_para8,06_time_para81,06_time_para83,06_time_para89,06_time_para9,06_time_para90,06_tmp_para31,12_efem_para2,12_efem_para25,12_efem_para78,12_epd_para4,12_epd_para40,12_epd_para63,12_epd_para80,12_esc_para84,12_esc_para94,12_fr_para28,12_fr_para35,12_fr_para61,12_fr_para69,12_gas_para10,12_gas_para13,12_gas_para15,12_gas_para19,12_gas_para21,12_gas_para26,12_gas_para27,12_gas_para33,12_gas_para36,12_gas_para39,12_gas_para46,12_gas_para48,12_gas_para50,12_gas_para51,12_gas_para52,12_gas_para59,12_gas_para6,12_gas_para70,12_gas_para71,12_gas_para73,12_gas_para74,12_gas_para85,12_he_para1,12_he_para22,12_he_para88,12_he_para95,12_hv_para3,12_hv_para45,12_hv_para47,12_hv_para56,12_position_para72,12_power_para14,12_power_para49,12_power_para57,12_power_para68,12_power_para76,12_power_para82,12_pressure_para91,12_temp_para11,12_temp_para12,12_temp_para17,12_temp_para18,12_temp_para20,12_temp_para23,12_temp_para24,12_temp_para32,12_temp_para38,12_temp_para53,12_temp_para54,12_temp_para55,12_temp_para58,12_temp_para60,12_temp_para65,12_temp_para66,12_temp_para79,12_temp_para86,12_temp_para87,12_temp_para92,12_temp_para93,12_time_para16,12_time_para29,12_time_para30,12_time_para34,12_time_para37,12_time_para41,12_time_para42,12_time_para43,12_time_para44,12_time_para5,12_time_para62,12_time_para64,12_time_para67,12_time_para7,12_time_para75,12_time_para77,12_time_para8,12_time_para81,12_time_para83,12_time_para89,12_time_para9,12_time_para90,12_tmp_para31,13_efem_para2,13_efem_para25,13_efem_para78,13_epd_para4,13_epd_para40,13_epd_para63,13_epd_para80,13_esc_para84,13_esc_para94,13_fr_para28,13_fr_para35,13_fr_para61,13_fr_para69,13_gas_para10,13_gas_para13,13_gas_para15,13_gas_para19,13_gas_para21,13_gas_para26,13_gas_para27,13_gas_para33,13_gas_para36,13_gas_para39,13_gas_para46,13_gas_para48,13_gas_para50,13_gas_para51,13_gas_para52,13_gas_para59,13_gas_para6,13_gas_para70,13_gas_para71,13_gas_para73,13_gas_para74,13_gas_para85,13_he_para1,13_he_para22,13_he_para88,13_he_para95,13_hv_para3,13_hv_para45,13_hv_para47,13_hv_para56,13_position_para72,13_power_para14,13_power_para49,13_power_para57,13_power_para68,13_power_para76,13_power_para82,13_pressure_para91,13_temp_para11,13_temp_para12,13_temp_para17,13_temp_para18,13_temp_para20,13_temp_para23,13_temp_para24,13_temp_para32,13_temp_para38,13_temp_para53,13_temp_para54,13_temp_para55,13_temp_para58,13_temp_para60,13_temp_para65,13_temp_para66,13_temp_para79,13_temp_para86,13_temp_para87,13_temp_para92,13_temp_para93,13_time_para16,13_time_para29,13_time_para30,13_time_para34,13_time_para37,13_time_para41,13_time_para42,13_time_para43,13_time_para44,13_time_para5,13_time_para62,13_time_para64,13_time_para67,13_time_para7,13_time_para75,13_time_para77,13_time_para8,13_time_para81,13_time_para83,13_time_para89,13_time_para9,13_time_para90,13_tmp_para31,17_efem_para2,17_efem_para25,17_efem_para78,17_epd_para4,17_epd_para40,17_epd_para63,17_epd_para80,17_esc_para84,17_esc_para94,17_fr_para28,17_fr_para35,17_fr_para61,17_fr_para69,17_gas_para10,17_gas_para13,17_gas_para15,17_gas_para19,17_gas_para21,17_gas_para26,17_gas_para27,17_gas_para33,17_gas_para36,17_gas_para39,17_gas_para46,17_gas_para48,17_gas_para50,17_gas_para51,17_gas_para52,17_gas_para59,17_gas_para6,17_gas_para70,17_gas_para71,17_gas_para73,17_gas_para74,17_gas_para85,17_he_para1,17_he_para22,17_he_para88,17_he_para95,17_hv_para3,17_hv_para45,17_hv_para47,17_hv_para56,17_position_para72,17_power_para14,17_power_para49,17_power_para57,17_power_para68,17_power_para76,17_power_para82,17_pressure_para91,17_temp_para11,17_temp_para12,17_temp_para17,17_temp_para18,17_temp_para20,17_temp_para23,17_temp_para24,17_temp_para32,17_temp_para38,17_temp_para53,17_temp_para54,17_temp_para55,17_temp_para58,17_temp_para60,17_temp_para65,17_temp_para66,17_temp_para79,17_temp_para86,17_temp_para87,17_temp_para92,17_temp_para93,17_time_para16,17_time_para29,17_time_para30,17_time_para34,17_time_para37,17_time_para41,17_time_para42,17_time_para43,17_time_para44,17_time_para5,17_time_para62,17_time_para64,17_time_para67,17_time_para7,17_time_para75,17_time_para77,17_time_para8,17_time_para81,17_time_para83,17_time_para89,17_time_para9,17_time_para90,17_tmp_para31,18_efem_para2,18_efem_para25,18_efem_para78,18_epd_para4,18_epd_para40,18_epd_para63,18_epd_para80,18_esc_para84,18_esc_para94,18_fr_para28,18_fr_para35,18_fr_para61,18_fr_para69,18_gas_para10,18_gas_para13,18_gas_para15,18_gas_para19,18_gas_para21,18_gas_para26,18_gas_para27,18_gas_para33,18_gas_para36,18_gas_para39,18_gas_para46,18_gas_para48,18_gas_para50,18_gas_para51,18_gas_para52,18_gas_para59,18_gas_para6,18_gas_para70,18_gas_para71,18_gas_para73,18_gas_para74,18_gas_para85,18_he_para1,18_he_para22,18_he_para88,18_he_para95,18_hv_para3,18_hv_para45,18_hv_para47,18_hv_para56,18_position_para72,18_power_para14,18_power_para49,18_power_para57,18_power_para68,18_power_para76,18_power_para82,18_pressure_para91,18_temp_para11,18_temp_para12,18_temp_para17,18_temp_para18,18_temp_para20,18_temp_para23,18_temp_para24,18_temp_para32,18_temp_para38,18_temp_para53,18_temp_para54,18_temp_para55,18_temp_para58,18_temp_para60,18_temp_para65,18_temp_para66,18_temp_para79,18_temp_para86,18_temp_para87,18_temp_para92,18_temp_para93,18_time_para16,18_time_para29,18_time_para30,18_time_para34,18_time_para37,18_time_para41,18_time_para42,18_time_para43,18_time_para44,18_time_para5,18_time_para62,18_time_para64,18_time_para67,18_time_para7,18_time_para75,18_time_para77,18_time_para8,18_time_para81,18_time_para83,18_time_para89,18_time_para9,18_time_para90,18_tmp_para31,20_efem_para2,20_efem_para25,20_efem_para78,20_epd_para4,20_epd_para40,20_epd_para63,20_epd_para80,20_esc_para84,20_esc_para94,20_fr_para28,20_fr_para35,20_fr_para61,20_fr_para69,20_gas_para10,20_gas_para13,20_gas_para15,20_gas_para19,20_gas_para21,20_gas_para26,20_gas_para27,20_gas_para33,20_gas_para36,20_gas_para39,20_gas_para46,20_gas_para48,20_gas_para50,20_gas_para51,20_gas_para52,20_gas_para59,20_gas_para6,20_gas_para70,20_gas_para71,20_gas_para73,20_gas_para74,20_gas_para85,20_he_para1,20_he_para22,20_he_para88,20_he_para95,20_hv_para3,20_hv_para45,20_hv_para47,20_hv_para56,20_position_para72,20_power_para14,20_power_para49,20_power_para57,20_power_para68,20_power_para76,20_power_para82,20_pressure_para91,20_temp_para11,20_temp_para12,20_temp_para17,20_temp_para18,20_temp_para20,20_temp_para23,20_temp_para24,20_temp_para32,20_temp_para38,20_temp_para53,20_temp_para54,20_temp_para55,20_temp_para58,20_temp_para60,20_temp_para65,20_temp_para66,20_temp_para79,20_temp_para86,20_temp_para87,20_temp_para92,20_temp_para93,20_time_para16,20_time_para29,20_time_para30,20_time_para34,20_time_para37,20_time_para41,20_time_para42,20_time_para43,20_time_para44,20_time_para5,20_time_para62,20_time_para64,20_time_para67,20_time_para7,20_time_para75,20_time_para77,20_time_para8,20_time_para81,20_time_para83,20_time_para89,20_time_para9,20_time_para90,20_tmp_para31,04_end_time,06_end_time,12_end_time,13_end_time,17_end_time,18_end_time,20_end_time,gen_tmdiff,gen_tmdiff_0406,gen_tmdiff_0612,gen_tmdiff_1213,gen_tmdiff_1317,gen_tmdiff_1718,gen_tmdiff_1820,hour,month,day,date,weekday,hour_sin,hour_cos,date_sin,date_cos,month_sin,month_cos,weekday_sin,weekday_cos
0,EQ10_PM1,LOT380_17,1626.8125,1635.9062,5.141955,0.0,0.0,0.0,0.0,2999.0,0.0,-2999.0,0.0,0.0,2999.0,0.0,0.0,40.0,0.0,2.296333,30.066668,0.0,0.0,2.341,0.0,0.0,0.0,0.0,9.3,24.963333,0.1,2.659333,0.0,0.0,44.883335,50.1,70.296875,15.0,0.8,0.824,30.0,0.0,0.0,0.0,0.0,7.096552,0.0,0.0,1000.0,0.0,1078.7931,0.0,44.99655,0.600625,25.1,0.0,15.084375,28.71875,150.16562,20.0,19.971874,-9.809375,149.87813,44.969696,22.946875,24.5,43.78125,21.421875,90.0,0.580313,22.05625,149.8625,35.021873,149.87813,2965.1,1228.3,2965.1,37.8,637.01666,37.8,2965.1,2965.1,2965.1,0.0,2965.1,37.8,2965.1,637.01666,2965.1,2965.1,37.8,2965.1,637.01666,2965.1,37.8,37.8,2.378125,1627.4822,1637.7142,5.141538,98.89975,-0.114427,-0.11261,0.0,2999.0,0.0,-2999.0,0.0,0.0,2999.0,0.0,0.0,0.0,0.0,3.314455,32.89909,0.0,0.0,3.538273,0.0,0.0,0.0,0.0,0.0,34.09727,179.7,2.963,15.0,120.92411,32.986362,0.0,0.0,15.0,0.789091,0.895238,30.0,0.0,0.0,0.0,0.0,9.4,0.0,0.0,2500.0,0.0,1679.6422,0.0,25.0,0.600625,25.1,0.041964,15.082143,26.267857,150.10893,20.0,19.957144,-12.7,150.04018,42.230087,22.951786,24.5,38.535713,22.144644,89.998215,0.579911,23.786608,150.44643,35.01518,150.04018,2965.1,1228.3,2965.1,37.816666,637.01666,37.816666,2965.1,2965.1,2965.1,0.0,2965.1,37.816666,2965.1,637.01666,2965.1,2965.1,37.816666,2965.1,637.01666,2965.1,37.816666,37.816666,2.603571,1626.6316,1637.2632,5.148175,0.0,0.0,0.0,0.0,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,99.3,0.0,0.0,4.888727,35.02727,32.98772,0.0,5.198727,0.0,132.89648,0.0,0.0,0.0,33.18727,184.6,4.131273,0.0,0.0,31.896364,36.0,0.0,20.0,1.1,1.396,50.0,0.31,258.72223,484.0,0.541765,16.196297,6868.815,0.0,4000.0,0.0,981.5926,15001.151,19.998148,0.602807,25.1,0.054386,14.692983,12.017544,150.0,71.083336,14.940351,-31.598246,150.02808,0.0,22.93158,24.5,17.0,20.68772,90.00702,0.57807,26.970175,150.04736,35.0,150.02808,2965.15,1228.35,2965.15,37.85,637.06665,37.85,2965.15,2965.15,2965.15,113.2,2965.15,37.85,2965.15,637.06665,2965.15,2965.15,37.85,2965.15,637.06665,2965.15,37.85,37.85,3.691228,1626.3077,1637.3846,5.091155,0.0,0.0,0.0,0.0,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,144.6,0.0,0.0,4.614545,35.00909,24.684616,0.0,4.924545,0.0,34.93077,100.0,0.0,7.0,33.2,118.6,3.976364,0.0,0.0,31.9,0.0,0.0,20.0,1.1,1.266667,50.0,0.248571,259.0,484.0,0.367143,18.04,7601.4,0.0,4000.0,0.0,900.4,15001.333,15.01,0.602308,25.1,0.007692,14.7,12.0,150.0,73.0,14.969231,-31.184616,150.16924,0.0,22.984615,24.5,17.0,20.638462,90.03077,0.579231,26.923077,150.0,35.0,150.16924,2965.15,1228.35,2965.15,37.85,637.06665,37.85,2965.15,2965.15,2965.15,113.2,2965.15,37.85,2965.15,637.06665,2965.15,2965.15,37.85,2965.15,637.06665,2965.15,37.85,37.85,3.461539,1626.1052,1634.3684,5.169695,0.0,0.0,0.0,0.0,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,99.3,0.0,0.0,4.902727,35.04727,47.991226,0.0,5.218364,0.0,117.89825,0.0,0.0,0.0,33.17818,184.6,4.146545,0.0,0.0,31.901817,36.0,0.0,20.0,1.1,1.398,50.0,0.310196,258.72223,484.0,0.532157,16.366667,6837.4814,0.0,4000.0,0.0,980.2778,15001.227,19.996296,0.602807,25.1,0.003509,14.7,10.0,150.0,72.11667,14.938597,-31.589474,150.01404,0.0,22.947369,24.5,15.964912,20.708773,90.01755,0.579298,27.1,150.1,34.970177,150.01404,2965.3667,1228.55,2965.3667,38.066666,637.2833,38.066666,2965.3667,2965.3667,2965.3667,113.2,2965.3667,38.066666,2965.3667,637.2833,2965.3667,2965.3667,38.066666,2965.3667,637.2833,2965.3667,38.066666,38.066666,3.740351,1629.0,1634.5,5.194314,0.0,0.0,0.0,0.0,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,159.8,0.0,0.0,4.557,35.02,24.708334,0.0,4.844,0.0,20.091667,100.0,0.0,7.0,33.16,109.6,3.927,0.0,0.0,31.94,0.0,0.0,20.0,1.06,1.24,50.0,0.23,258.8889,484.0,0.32,17.766666,7691.1113,0.0,4000.0,0.0,885.7778,15001.25,15.066667,0.601667,25.1,-0.2,14.7,9.916667,150.0,74.375,14.908334,-30.941668,150.24167,0.0,22.958334,24.5,15.166667,20.7,89.98333,0.5775,27.116667,150.1,35.0,150.24167,2965.3667,1228.55,2965.3667,38.066666,637.2833,38.066666,2965.3667,2965.3667,2965.3667,113.2,2965.3667,38.066666,2965.3667,637.2833,2965.3667,2965.3667,38.066666,2965.3667,637.2833,2965.3667,38.066666,38.066666,3.516667,1627.4414,1637.0997,5.138525,999.9999,0.0,0.0,0.0,2999.0051,0.0,2999.0,0.0,0.0,2999.0,0.0,60.0,0.0,0.0,4.588847,35.07193,29.995262,119.9,5.187869,7.6,56.999752,0.0,99.92918,0.0,35.80075,94.92344,3.713809,0.0,0.0,28.892733,0.0,0.0,20.0,1.097995,1.301523,50.0,0.271747,300.0,300.0,0.649772,10.699749,5522.9297,0.0,2800.0,0.0,860.7638,10200.045,35.0,0.602369,25.1,-0.00399,14.691771,12.274314,149.97456,52.044556,14.947382,-29.786533,149.83441,0.0,22.942394,24.5,17.608479,20.189028,90.002495,0.578653,25.906235,149.94913,34.942642,149.83441,2965.4834,1228.6833,2965.4834,38.2,637.4,38.2,2965.4834,2965.4834,2965.4834,113.2,2965.4834,38.2,2965.4834,637.4,2965.4834,2965.4834,38.2,2965.4834,637.4,2965.4834,38.2,38.2,3.393516,2021-10-28 08:15:36,2021-10-28 08:17:39,2021-10-28 08:30:54,2021-10-28 08:31:10,2021-10-28 08:39:06,2021-10-28 08:39:22,2021-10-28 08:46:13,1837.0,123.0,795.0,16.0,476.0,16.0,411.0,8,10,28,2021-10-28,3,0.81697,-0.57668,0.543222,-0.839589,0.866025,-0.5,0.433884,0.900969


# Category 변수 처리

In [17]:
''' CATEGORY 변수 처리 및 NUM FEATURE 정의 '''
module2idx = {}
for i, module in enumerate(df_train['module_name'].unique()):
    module2idx[module] = i
    
# eq2idx = {}
# for i, eq in enumerate(df_train['module_name_eq'].unique()):
#     eq2idx[eq] = i
    
def col2cat(df, col, dict):
    df[col] = df[col].apply(lambda x: dict[x])
    df[col] = df[col].astype('category')
    return df[col]

# module_name cat 화
col2cat(df_train, 'module_name', module2idx)
col2cat(df_predict, 'module_name', module2idx)
# eq cat 화
# col2cat(df_train, 'module_name_eq', eq2idx)
# col2cat(df_predict, 'module_name_eq', eq2idx)
    
# num feature 정의(y 제외하고 해야함.)
cat_features = ['module_name']
num_features = list(df_train.columns[df_train.dtypes==float])
num_features.remove('y')

col_numerical = num_features + df_train.filter(regex='^gen_').columns.tolist()

In [18]:
# threshold 인자를 변경하여 분산의 기준값을 수동으로 선택할 수 있습니다.
thresholder = VarianceThreshold(threshold=0)
_ = thresholder.fit_transform(df_train[col_numerical])

 # 분산이 0이면 True 이므로 제거할 컬럼을 추출합니다.  
mask = ~thresholder.get_support()
cols_var_drop = np.asarray(col_numerical)[mask].tolist()
print(f'** {len(cols_var_drop)} Features to Drop by Low Variance')
print(f'{cols_var_drop}')

** 67 Features to Drop by Low Variance
['04_epd_para4', '04_epd_para40', '04_epd_para63', '04_epd_para80', '04_he_para1', '04_power_para14', '04_power_para49', '04_power_para68', '04_power_para82', '04_temp_para24', '04_time_para5', '06_epd_para80', '06_he_para1', '06_hv_para3', '06_hv_para45', '06_hv_para47', '06_hv_para56', '06_power_para14', '06_power_para68', '06_power_para82', '06_temp_para24', '06_time_para5', '12_epd_para4', '12_epd_para40', '12_epd_para63', '12_epd_para80', '12_he_para1', '12_he_para95', '12_power_para49', '12_power_para68', '12_temp_para54', '13_epd_para4', '13_epd_para40', '13_epd_para63', '13_epd_para80', '13_he_para1', '13_he_para95', '13_power_para49', '13_power_para68', '13_temp_para54', '17_epd_para4', '17_epd_para40', '17_epd_para63', '17_epd_para80', '17_he_para1', '17_he_para95', '17_power_para49', '17_power_para68', '17_temp_para54', '18_epd_para4', '18_epd_para40', '18_epd_para63', '18_epd_para80', '18_he_para1', '18_he_para95', '18_power_para49', '

In [19]:
df_train.drop(cols_var_drop,axis=1, inplace=True)
df_predict.drop(cols_var_drop,axis=1, inplace=True)

# 모델링 진행

In [20]:
df_final = df_train.copy()
df_predict_final = df_predict.copy()

In [21]:
def prep_cate_feats(df_tr, df_te, feat_nm):

    df_merge = pd.concat([df_tr, df_te])

    # 컬럼명과 범주형 변수의 레벨명을 이용한 새로운 컬럼명을 자동생성합니다. 
    # ex. module_name_eq -> module_name_eq_EQ01, module_name_eq_EQ02, etc.
    df_merge = pd.get_dummies(df_merge, columns=[feat_nm])

    df_tr = df_merge.iloc[:df_tr.shape[0], :].reset_index(drop=True)
    df_te = df_merge.iloc[df_tr.shape[0]:, :].reset_index(drop=True)

    return df_tr, df_te

# module_name_eq 의 원-핫 인코딩 변수를 생성합니다.
# df_train, df_predict = prep_cate_feats(df_train, df_predict, 'module_name_eq')
df_final, df_predict_final = prep_cate_feats(df_final, df_predict_final, 'module_name')

In [22]:
# num feature 정의(y 제외하고 해야함.)
num_features = list(df_final.columns[df_final.dtypes==float])
module_col = df_final.filter(regex='module_name').columns.tolist()
num_features.remove('y')

COLS = num_features+module_col

In [23]:
df_final.head(1)

Unnamed: 0,key_val,end_dt_tm,y,04_efem_para2,04_efem_para25,04_efem_para78,04_esc_para84,04_esc_para94,04_fr_para28,04_fr_para35,04_fr_para61,04_fr_para69,04_gas_para10,04_gas_para13,04_gas_para15,04_gas_para19,04_gas_para21,04_gas_para26,04_gas_para27,04_gas_para33,04_gas_para36,04_gas_para39,04_gas_para46,04_gas_para48,04_gas_para50,04_gas_para51,04_gas_para52,04_gas_para59,04_gas_para6,04_gas_para70,04_gas_para71,04_gas_para73,04_gas_para74,04_gas_para85,04_he_para22,04_he_para88,04_he_para95,04_hv_para3,04_hv_para45,04_hv_para47,04_hv_para56,04_position_para72,04_power_para57,04_power_para76,04_pressure_para91,04_temp_para11,04_temp_para12,04_temp_para17,04_temp_para18,04_temp_para20,04_temp_para23,04_temp_para32,04_temp_para38,04_temp_para53,04_temp_para54,04_temp_para55,04_temp_para58,04_temp_para60,04_temp_para65,04_temp_para66,04_temp_para79,04_temp_para86,04_temp_para87,04_temp_para92,04_temp_para93,04_time_para16,04_time_para29,04_time_para30,04_time_para34,04_time_para37,04_time_para41,04_time_para42,04_time_para43,04_time_para44,04_time_para62,04_time_para64,04_time_para67,04_time_para7,04_time_para75,04_time_para77,04_time_para8,04_time_para81,04_time_para83,04_time_para89,04_time_para9,04_time_para90,04_tmp_para31,06_efem_para2,06_efem_para25,06_efem_para78,06_epd_para4,06_epd_para40,06_epd_para63,06_esc_para84,06_esc_para94,06_fr_para28,06_fr_para35,06_fr_para61,06_fr_para69,06_gas_para10,06_gas_para13,06_gas_para15,06_gas_para19,06_gas_para21,06_gas_para26,06_gas_para27,06_gas_para33,06_gas_para36,06_gas_para39,06_gas_para46,06_gas_para48,06_gas_para50,06_gas_para51,06_gas_para52,06_gas_para59,06_gas_para6,06_gas_para70,06_gas_para71,06_gas_para73,06_gas_para74,06_gas_para85,06_he_para22,06_he_para88,06_he_para95,06_position_para72,06_power_para49,06_power_para57,06_power_para76,06_pressure_para91,06_temp_para11,06_temp_para12,06_temp_para17,06_temp_para18,06_temp_para20,06_temp_para23,06_temp_para32,06_temp_para38,06_temp_para53,06_temp_para54,06_temp_para55,06_temp_para58,06_temp_para60,06_temp_para65,06_temp_para66,06_temp_para79,06_temp_para86,06_temp_para87,06_temp_para92,06_temp_para93,06_time_para16,06_time_para29,06_time_para30,06_time_para34,06_time_para37,06_time_para41,06_time_para42,06_time_para43,06_time_para44,06_time_para62,06_time_para64,06_time_para67,06_time_para7,06_time_para75,06_time_para77,06_time_para8,06_time_para81,06_time_para83,06_time_para89,06_time_para9,06_time_para90,06_tmp_para31,12_efem_para2,12_efem_para25,12_efem_para78,12_esc_para84,12_esc_para94,12_fr_para28,12_fr_para35,12_fr_para61,12_fr_para69,12_gas_para10,12_gas_para13,12_gas_para15,12_gas_para19,12_gas_para21,12_gas_para26,12_gas_para27,12_gas_para33,12_gas_para36,12_gas_para39,12_gas_para46,12_gas_para48,12_gas_para50,12_gas_para51,12_gas_para52,12_gas_para59,12_gas_para6,12_gas_para70,12_gas_para71,12_gas_para73,12_gas_para74,12_gas_para85,12_he_para22,12_he_para88,12_hv_para3,12_hv_para45,12_hv_para47,12_hv_para56,12_position_para72,12_power_para14,12_power_para57,12_power_para76,12_power_para82,12_pressure_para91,12_temp_para11,12_temp_para12,12_temp_para17,12_temp_para18,12_temp_para20,12_temp_para23,12_temp_para24,12_temp_para32,12_temp_para38,12_temp_para53,12_temp_para55,12_temp_para58,12_temp_para60,12_temp_para65,12_temp_para66,12_temp_para79,12_temp_para86,12_temp_para87,12_temp_para92,12_temp_para93,12_time_para16,12_time_para29,12_time_para30,12_time_para34,12_time_para37,12_time_para41,12_time_para42,12_time_para43,12_time_para44,12_time_para5,12_time_para62,12_time_para64,12_time_para67,12_time_para7,12_time_para75,12_time_para77,12_time_para8,12_time_para81,12_time_para83,12_time_para89,12_time_para9,12_time_para90,12_tmp_para31,13_efem_para2,13_efem_para25,13_efem_para78,13_esc_para84,13_esc_para94,13_fr_para28,13_fr_para35,13_fr_para61,13_fr_para69,13_gas_para10,13_gas_para13,13_gas_para15,13_gas_para19,13_gas_para21,13_gas_para26,13_gas_para27,13_gas_para33,13_gas_para36,13_gas_para39,13_gas_para46,13_gas_para48,13_gas_para50,13_gas_para51,13_gas_para52,13_gas_para59,13_gas_para6,13_gas_para70,13_gas_para71,13_gas_para73,13_gas_para74,13_gas_para85,13_he_para22,13_he_para88,13_hv_para3,13_hv_para45,13_hv_para47,13_hv_para56,13_position_para72,13_power_para14,13_power_para57,13_power_para76,13_power_para82,13_pressure_para91,13_temp_para11,13_temp_para12,13_temp_para17,13_temp_para18,13_temp_para20,13_temp_para23,13_temp_para24,13_temp_para32,13_temp_para38,13_temp_para53,13_temp_para55,13_temp_para58,13_temp_para60,13_temp_para65,13_temp_para66,13_temp_para79,13_temp_para86,13_temp_para87,13_temp_para92,13_temp_para93,13_time_para16,13_time_para29,13_time_para30,13_time_para34,13_time_para37,13_time_para41,13_time_para42,13_time_para43,13_time_para44,13_time_para5,13_time_para62,13_time_para64,13_time_para67,13_time_para7,13_time_para75,13_time_para77,13_time_para8,13_time_para81,13_time_para83,13_time_para89,13_time_para9,13_time_para90,13_tmp_para31,17_efem_para2,17_efem_para25,17_efem_para78,17_esc_para84,17_esc_para94,17_fr_para28,17_fr_para35,17_fr_para61,17_fr_para69,17_gas_para10,17_gas_para13,17_gas_para15,17_gas_para19,17_gas_para21,17_gas_para26,17_gas_para27,17_gas_para33,17_gas_para36,17_gas_para39,17_gas_para46,17_gas_para48,17_gas_para50,17_gas_para51,17_gas_para52,17_gas_para59,17_gas_para6,17_gas_para70,17_gas_para71,17_gas_para73,17_gas_para74,17_gas_para85,17_he_para22,17_he_para88,17_hv_para3,17_hv_para45,17_hv_para47,17_hv_para56,17_position_para72,17_power_para14,17_power_para57,17_power_para76,17_power_para82,17_pressure_para91,17_temp_para11,17_temp_para12,17_temp_para17,17_temp_para18,17_temp_para20,17_temp_para23,17_temp_para24,17_temp_para32,17_temp_para38,17_temp_para53,17_temp_para55,17_temp_para58,17_temp_para60,17_temp_para65,17_temp_para66,17_temp_para79,17_temp_para86,17_temp_para87,17_temp_para92,17_temp_para93,17_time_para16,17_time_para29,17_time_para30,17_time_para34,17_time_para37,17_time_para41,17_time_para42,17_time_para43,17_time_para44,17_time_para5,17_time_para62,17_time_para64,17_time_para67,17_time_para7,17_time_para75,17_time_para77,17_time_para8,17_time_para81,17_time_para83,17_time_para89,17_time_para9,17_time_para90,17_tmp_para31,18_efem_para2,18_efem_para25,18_efem_para78,18_esc_para84,18_esc_para94,18_fr_para28,18_fr_para35,18_fr_para61,18_fr_para69,18_gas_para10,18_gas_para13,18_gas_para15,18_gas_para19,18_gas_para21,18_gas_para26,18_gas_para27,18_gas_para33,18_gas_para36,18_gas_para39,18_gas_para46,18_gas_para48,18_gas_para50,18_gas_para51,18_gas_para52,18_gas_para59,18_gas_para6,18_gas_para70,18_gas_para71,18_gas_para73,18_gas_para74,18_gas_para85,18_he_para22,18_he_para88,18_hv_para3,18_hv_para45,18_hv_para47,18_hv_para56,18_position_para72,18_power_para14,18_power_para57,18_power_para68,18_power_para76,18_power_para82,18_pressure_para91,18_temp_para11,18_temp_para12,18_temp_para17,18_temp_para18,18_temp_para20,18_temp_para23,18_temp_para24,18_temp_para32,18_temp_para38,18_temp_para53,18_temp_para55,18_temp_para58,18_temp_para60,18_temp_para65,18_temp_para66,18_temp_para79,18_temp_para86,18_temp_para87,18_temp_para92,18_temp_para93,18_time_para16,18_time_para29,18_time_para30,18_time_para34,18_time_para37,18_time_para41,18_time_para42,18_time_para43,18_time_para44,18_time_para5,18_time_para62,18_time_para64,18_time_para67,18_time_para7,18_time_para75,18_time_para77,18_time_para8,18_time_para81,18_time_para83,18_time_para89,18_time_para9,18_time_para90,18_tmp_para31,20_efem_para2,20_efem_para25,20_efem_para78,20_epd_para4,20_esc_para84,20_esc_para94,20_fr_para28,20_fr_para35,20_fr_para61,20_fr_para69,20_gas_para10,20_gas_para13,20_gas_para15,20_gas_para19,20_gas_para21,20_gas_para26,20_gas_para27,20_gas_para33,20_gas_para36,20_gas_para39,20_gas_para46,20_gas_para48,20_gas_para50,20_gas_para51,20_gas_para52,20_gas_para59,20_gas_para6,20_gas_para70,20_gas_para71,20_gas_para73,20_gas_para74,20_gas_para85,20_he_para22,20_he_para88,20_hv_para3,20_hv_para45,20_hv_para56,20_position_para72,20_power_para14,20_power_para57,20_power_para68,20_power_para76,20_power_para82,20_pressure_para91,20_temp_para11,20_temp_para12,20_temp_para17,20_temp_para18,20_temp_para20,20_temp_para23,20_temp_para24,20_temp_para32,20_temp_para38,20_temp_para53,20_temp_para55,20_temp_para58,20_temp_para60,20_temp_para65,20_temp_para66,20_temp_para79,20_temp_para86,20_temp_para87,20_temp_para92,20_temp_para93,20_time_para16,20_time_para29,20_time_para30,20_time_para34,20_time_para37,20_time_para41,20_time_para42,20_time_para43,20_time_para44,20_time_para5,20_time_para62,20_time_para64,20_time_para67,20_time_para7,20_time_para75,20_time_para77,20_time_para8,20_time_para81,20_time_para83,20_time_para89,20_time_para9,20_time_para90,20_tmp_para31,04_end_time,06_end_time,12_end_time,13_end_time,17_end_time,18_end_time,20_end_time,gen_tmdiff,gen_tmdiff_0406,gen_tmdiff_0612,gen_tmdiff_1213,gen_tmdiff_1317,gen_tmdiff_1718,gen_tmdiff_1820,hour,month,day,date,weekday,hour_sin,hour_cos,date_sin,date_cos,weekday_sin,weekday_cos,module_name_0,module_name_1,module_name_2,module_name_3,module_name_4,module_name_5,module_name_6,module_name_7,module_name_8,module_name_9,module_name_10,module_name_11,module_name_12,module_name_13,module_name_14,module_name_15,module_name_16,module_name_17,module_name_18,module_name_19,module_name_20,module_name_21,module_name_22,module_name_23,module_name_24,module_name_25,module_name_26,module_name_27,module_name_28,module_name_29,module_name_30,module_name_31,module_name_32,module_name_33,module_name_34,module_name_35,module_name_36,module_name_37,module_name_38,module_name_39,module_name_40,module_name_41,module_name_42,module_name_43,module_name_44,module_name_45,module_name_46
0,LOT5_21,2021-10-03 07:10:22,1260.0892,1631.273,1639.727,5.06654,2999.0,0.0,-2999.0,0.0,0.0,2999.0,0.0,0.0,40.0,0.0,2.41871,30.04839,0.0,0.0,2.477097,0.0,0.0,0.0,0.0,11.6,24.95807,0.09697,2.834516,0.0,0.0,44.90645,50.1,70.26363,0.906452,0.726923,30.0,0.09,299.9667,150.0,0.241482,7.033333,1000.0,1062.933,45.02667,0.598182,25.1,-0.021212,15.0697,26.60606,150.1848,19.96061,-9.842424,149.8545,47.52941,22.94848,24.5,44.51515,21.45152,90.0,0.579091,22.04243,149.7182,35.01515,149.8545,2460.4,723.6,2460.4,2460.4,132.3333,132.3333,2460.4,2460.4,2460.4,2460.4,132.3333,2460.4,132.3333,2460.4,2460.4,132.3333,2460.4,132.3333,2460.4,132.3333,2460.4,2.351515,1631.445,1640.059,5.074046,0.0,-1.201783,0.0,2999.0,0.0,-2999.0,0.0,0.0,2999.0,0.0,0.0,0.0,0.0,3.481353,32.89549,0.0,0.0,3.674587,0.0,0.0,0.0,0.0,0.0,34.09699,179.697,3.162481,15.0,120.9215,33.00075,0.0,0.0,0.900752,0.704688,30.0,9.39697,0.007576,2000.0,1525.651,25.00379,0.59763,25.1,0.042222,15.06667,26.62222,149.9948,19.96,-12.28519,150.0393,42.75735,22.95185,24.5,36.92593,22.02148,89.99185,0.579111,23.38593,150.4793,35.00296,150.0393,2460.417,723.6166,2460.417,2460.417,132.3333,132.3333,2460.417,2460.417,2460.417,2460.417,132.3333,2460.417,132.3333,2460.417,2460.417,132.3333,2460.417,132.3333,2460.417,132.3333,2460.417,2.619259,1631.368,1642.6316,5.038704,2999.037,0.0,2999.0,0.0,0.0,2999.0,0.0,99.3,0.0,0.0,5.136,35.01454,32.99474,0.0,5.382364,0.0,132.9,0.0,0.0,0.0,33.22364,184.5947,4.430727,0.0,0.0,31.81273,36.0,0.0,1.3,1.4,0.299804,272.0,434.6111,0.560196,16.19445,6827.481,4000.0,979.4445,15001.09,19.99815,0.601053,25.1,-0.005263,14.7,14.77193,150.0,70.26667,14.95088,-31.40175,149.9702,22.94386,24.5,18.0,20.67544,90.0,0.577368,26.92456,150.1,35.0,149.9702,2460.483,723.6667,2460.483,2460.483,132.4,132.4,2460.483,2460.483,2460.483,137.0,2460.483,132.4,2460.483,132.4,2460.483,2460.483,132.4,2460.483,132.4,2460.483,132.4,2460.483,3.71579,1630.923,1639.692,5.079873,2998.9,0.0,2998.9,0.0,0.0,2999.0,0.0,144.6,0.0,0.0,4.876666,35.09167,24.66923,0.0,5.1225,0.0,34.93077,100.0,0.0,7.0,33.16667,118.6,4.275833,0.0,0.0,31.85833,0.0,0.0,1.3,1.3,0.23,272.0,434.8,0.388571,18.03,7558.8,4000.0,895.5,15001.33,15.01,0.601538,25.1,0.0,14.66154,14.0,150.0,73.5,15.03077,-30.98462,150.1923,22.94615,24.5,17.30769,20.66154,90.0,0.577692,26.9,150.1,35.0,150.1923,2460.4834,723.6667,2460.483,2460.483,132.4,132.4,2460.483,2460.4834,2460.483,137.0,2460.483,132.4,2460.483,132.4,2460.483,2460.483,132.4,2460.483,132.4,2460.483,132.4,2460.483,3.553846,1632.684,1641.842,5.084785,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,99.3,0.0,0.0,5.144909,35.00182,47.99649,0.0,5.394,0.0,117.8983,0.0,0.0,0.0,33.20182,184.5947,4.451818,0.0,0.0,31.91636,36.0,0.0,1.3,1.392,0.290588,272.0,434.7963,0.550196,16.37408,6800.796,4000.0,977.8889,15001.11,19.99815,0.601754,25.09474,0.045614,14.69123,12.0,150.0,72.56667,14.9386,-31.54912,150.014,22.94912,24.5,14.0,20.7,89.99123,0.577895,27.05439,150.0,35.0,150.014,2460.7,723.8834,2460.7,2460.7,132.6167,132.6167,2460.7,2460.7,2460.7,137.0,2460.7,132.6167,2460.7,132.6167,2460.7,2460.7,132.6167,2460.7,132.6167,2460.7,132.6167,2460.7,3.768421,1632.077,1637.385,5.116795,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,159.8,0.0,0.0,4.812728,35.05455,24.67692,0.0,5.018182,0.0,19.88462,100.0,0.0,7.0,33.14545,109.6,4.236363,0.0,0.0,31.93636,0.0,0.0,1.3,1.3,0.218571,272.0,434.7,0.338571,17.7,7649.3,4000.0,0.0,881.0,15001.22,15.04,0.602308,25.1,0.023077,14.7,12.0,150.0,72.8125,14.94615,-31.41538,150.0692,22.96154,24.5,14.0,20.73846,90.0,0.577692,27.11538,150.0,35.0,150.0692,2460.7,723.8834,2460.7,2460.7,132.6167,132.6167,2460.7,2460.7,2460.7,137.0,2460.7,132.6167,2460.7,132.6167,2460.7,2460.7,132.6167,2460.7,132.6167,2460.7,132.6167,2460.7,3.5,1632.209,1642.458,5.056325,999.9999,2999.0,0.0,2999.0,0.0,0.0,2999.0,0.0,60.0,0.0,0.0,4.839175,35.075,29.99478,119.9,5.357225,6.1,56.99975,0.0,99.9301,0.0,35.79625,94.91542,4.005325,0.0,0.0,28.90425,0.0,0.0,1.2985,1.283797,0.27,300.0,0.659293,10.67644,5507.023,2800.0,0.0,856.4361,10200.11,35.001,0.600323,25.1,-0.010945,14.6908,13.73632,149.9821,52.21481,14.94627,-29.72363,149.8346,22.94279,24.5,16.27115,20.19403,89.99477,0.576269,25.88433,149.942,34.93358,149.8346,2460.817,724.0167,2460.817,2460.817,132.7333,132.7333,2460.817,2460.817,2460.817,137.0,2460.817,132.7333,2460.817,132.7333,2460.817,2460.817,132.7333,2460.817,132.7333,2460.817,132.7333,2460.817,3.399254,2021-10-02 22:14:27,2021-10-02 22:16:53,2021-10-02 22:30:59,2021-10-02 22:31:15,2021-10-02 22:39:12,2021-10-02 22:39:28,2021-10-02 22:46:19,1912.0,146.0,846.0,16.0,477.0,16.0,411.0,22,10,2,2021-10-02,5,-0.269797,0.962917,0.848644,-0.528964,0.781831,-0.62349,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# 모델링

In [24]:
from catboost import CatBoostRegressor
import xgboost as xgb

import optuna
from optuna.samplers import TPESampler

In [27]:
E = df_final[df_final['gen_tmdiff']<=1870]
L = df_final[df_final['gen_tmdiff']>1870]
E_predict = df_predict_final[df_predict_final['gen_tmdiff']<=1870]
L_predict = df_predict_final[df_predict_final['gen_tmdiff']>1870]

In [28]:
# mean_squared_error 의 매개변수 squared 가 False 이면 RMSE 를 반환합니다.
def rmse(y_true, y_pred):
    return round(mean_squared_error(y_true, y_pred, squared=False), 4)

In [42]:
def objective(trial):
    params_xgb = {
        'optimizer':trial.suggest_categorical('optimizer',['gbtree','dart']),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1.0),
        'colsample_bytree': trial.suggest_int('colsample_bytree', 0.3,1.0),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'max_depth': trial.suggest_int("max_depth", 4, 12),
        'random_state': trial.suggest_categorical('random_state', [0]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'tree_methold':'gpu_hist',
        'gpu_id':'0'
    }
    
    # 학습 데이터 중 일부를 검증 데이터 셋으로 분할합니다. 
    X_train, X_valid, y_train, y_valid = train_test_split(E[COLS], np.log1p(E['y']), test_size=0.15, shuffle=True, random_state=71)

    model = xgb.XGBRegressor(**params_xgb)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=35,
        verbose=False
    )

    xgb_pred = model.predict(X_valid)
    rmse_val = rmse(y_valid, xgb_pred)
    
    return rmse_val

In [43]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="xgb_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective, n_trials=30)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2022-07-09 16:45:17,556][0m A new study created in memory with name: xgb_parameter_opt[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:17,796][0m Trial 0 finished with value: 6.6423 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.7319939418114051, 'colsample_bytree': 0, 'subsample': 0.7190609389379257, 'learning_rate': 0.018410729205738687, 'n_estimators': 1644, 'max_depth': 4, 'random_state': 0, 'min_child_weight': 260}. Best is trial 0 with value: 6.6423.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:18,148][0m Trial 1 finished with value: 0.0041 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.020584494295802447, 'colsample_bytree': 0, 'subsample': 0.978936896513396, 'learning_rate': 0.2595942550311264, 'n_estimators': 2202, 'max_depth': 5, 'random_state': 0, 'min_child_weight': 56}. Best is trial 1 with value: 0.0041.[0m
[32m[I 2022-07-09 16:45:18,330][0m Trial 2 finished with value: 6.6423 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.43194501864211576, 'colsample_bytree': 0, 'subsample': 0.5038603981386294, 'learning_rate': 0.10952662748632554, 'n_estimators': 1481, 'max_depth': 6, 'random_state': 0, 'min_child_weight': 110}. Best is trial 1 with value: 0.0041.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:18,864][0m Trial 3 finished with value: 0.0041 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.19967378215835974, 'colsample_bytree': 0, 'subsample': 0.6599641068895281, 'learning_rate': 0.10150667045928574, 'n_estimators': 559, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 52}. Best is trial 1 with value: 0.0041.[0m
[32m[I 2022-07-09 16:45:19,047][0m Trial 4 finished with value: 6.6423 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.9656320330745594, 'colsample_bytree': 0, 'subsample': 0.8658781436815228, 'learning_rate': 0.032925293631105246, 'n_estimators': 1067, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 133}. Best is trial 1 with value: 0.0041.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:19,230][0m Trial 5 finished with value: 6.6423 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.034388521115218396, 'colsample_bytree': 0, 'subsample': 0.9365242814551473, 'learning_rate': 0.02752069685079053, 'n_estimators': 6659, 'max_depth': 6, 'random_state': 0, 'min_child_weight': 157}. Best is trial 1 with value: 0.0041.[0m
[32m[I 2022-07-09 16:45:19,409][0m Trial 6 finished with value: 6.6423 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.9695846277645586, 'colsample_bytree': 0, 'subsample': 0.8425929763527802, 'learning_rate': 0.3946212980759094, 'n_estimators': 8959, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 277}. Best is trial 1 with value: 0.0041.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



[32m[I 2022-07-09 16:45:19,589][0m Trial 7 finished with value: 6.6423 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.045227288910538066, 'colsample_bytree': 0, 'subsample': 0.527731231534285, 'learning_rate': 0.04574578205475402, 'n_estimators': 2786, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 108}. Best is trial 1 with value: 0.0041.[0m



Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:21,973][0m Trial 8 finished with value: 0.0041 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.14092422497476265, 'colsample_bytree': 0, 'subsample': 0.8615378865278278, 'learning_rate': 0.01338626158454391, 'n_estimators': 9871, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 60}. Best is trial 1 with value: 0.0041.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:22,282][0m Trial 9 finished with value: 0.0041 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.7068573438476171, 'colsample_bytree': 0, 'subsample': 0.810305017628691, 'learning_rate': 0.20434554984161393, 'n_estimators': 833, 'max_depth': 7, 'random_state': 0, 'min_child_weight': 35}. Best is trial 1 with value: 0.0041.[0m
[32m[I 2022-07-09 16:45:22,480][0m Trial 10 finished with value: 6.6423 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.3637276736216312, 'colsample_bytree': 0, 'subsample': 0.3052831235214052, 'learning_rate': 0.41534235816464743, 'n_estimators': 4280, 'max_depth': 4, 'random_state': 0, 'min_child_weight': 197}. Best is trial 1 with value: 0.0041.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:23,188][0m Trial 11 finished with value: 0.0037 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.26120125603452177, 'colsample_bytree': 0, 'subsample': 0.9928645659901887, 'learning_rate': 0.11976079885579768, 'n_estimators': 3625, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 15}. Best is trial 11 with value: 0.0037.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:23,769][0m Trial 12 finished with value: 0.0038 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.2400665831386536, 'colsample_bytree': 0, 'subsample': 0.9965656351535926, 'learning_rate': 0.19854019827517552, 'n_estimators': 4085, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 21}. Best is trial 11 with value: 0.0037.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:24,152][0m Trial 13 finished with value: 0.0036 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.31425937505629714, 'colsample_bytree': 0, 'subsample': 0.970341879073611, 'learning_rate': 0.15532959780770886, 'n_estimators': 4795, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 6}. Best is trial 13 with value: 0.0036.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:24,743][0m Trial 14 finished with value: 0.0038 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.6068362668275169, 'colsample_bytree': 0, 'subsample': 0.7328684439242217, 'learning_rate': 0.07513631820837491, 'n_estimators': 6230, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 15}. Best is trial 13 with value: 0.0036.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:25,461][0m Trial 15 finished with value: 0.003 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.33091284180681496, 'colsample_bytree': 0, 'subsample': 0.9122407019367289, 'learning_rate': 0.12223184461378751, 'n_estimators': 5526, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 7}. Best is trial 15 with value: 0.003.[0m
[32m[I 2022-07-09 16:45:25,664][0m Trial 16 finished with value: 6.6423 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.5217057720839351, 'colsample_bytree': 0, 'subsample': 0.524595125033872, 'learning_rate': 0.052852280851053264, 'n_estimators': 5937, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 86}. Best is trial 15 with value: 0.003.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



[32m[I 2022-07-09 16:45:25,876][0m Trial 17 finished with value: 6.6423 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.369788201113915, 'colsample_bytree': 0, 'subsample': 0.9128355935376671, 'learning_rate': 0.17459609198826648, 'n_estimators': 7656, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 192}. Best is trial 15 with value: 0.003.[0m



Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:26,637][0m Trial 18 finished with value: 0.0029 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.535876953841101, 'colsample_bytree': 0, 'subsample': 0.7699045450262089, 'learning_rate': 0.07528651228099323, 'n_estimators': 4963, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 4}. Best is trial 18 with value: 0.0029.[0m
[32m[I 2022-07-09 16:45:26,846][0m Trial 19 finished with value: 6.6423 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.5147624588882621, 'colsample_bytree': 0, 'subsample': 0.7561517224263441, 'learning_rate': 0.07154731028191787, 'n_estimators': 5461, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 84}. Best is trial 18 with value: 0.0029.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.



[32m[I 2022-07-09 16:45:27,042][0m Trial 20 finished with value: 6.6423 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.7964786889421052, 'colsample_bytree': 0, 'subsample': 0.5984426662658658, 'learning_rate': 0.0425666591878479, 'n_estimators': 7479, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 235}. Best is trial 18 with value: 0.0029.[0m



Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:27,586][0m Trial 21 finished with value: 0.0037 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.3259288841222604, 'colsample_bytree': 0, 'subsample': 0.8031835239072828, 'learning_rate': 0.12509011112593493, 'n_estimators': 4729, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 3}. Best is trial 18 with value: 0.0029.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:27,880][0m Trial 22 finished with value: 0.0041 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.45251260594649734, 'colsample_bytree': 0, 'subsample': 0.908881833667849, 'learning_rate': 0.29054525343938103, 'n_estimators': 3049, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 40}. Best is trial 18 with value: 0.0029.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:28,281][0m Trial 23 finished with value: 0.0035 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.5660181633858341, 'colsample_bytree': 0, 'subsample': 0.9131593272288857, 'learning_rate': 0.14813358619476363, 'n_estimators': 4828, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 4}. Best is trial 18 with value: 0.0029.[0m
[32m[I 2022-07-09 16:45:28,479][0m Trial 24 finished with value: 6.6423 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.5985328587048739, 'colsample_bytree': 0, 'subsample': 0.7792280438705914, 'learning_rate': 0.08683811892104702, 'n_estimators': 6886, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 80}. Best is trial 18 with value: 0.0029.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:29,232][0m Trial 25 finished with value: 0.0041 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.6129199221268027, 'colsample_bytree': 0, 'subsample': 0.6939815835482199, 'learning_rate': 0.05741893923267266, 'n_estimators': 5211, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 34}. Best is trial 18 with value: 0.0029.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:29,640][0m Trial 26 finished with value: 0.0041 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.8510132170389461, 'colsample_bytree': 0, 'subsample': 0.8848208812831008, 'learning_rate': 0.14865054445969544, 'n_estimators': 3492, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 66}. Best is trial 18 with value: 0.0029.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:30,187][0m Trial 27 finished with value: 0.0035 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.45915399237750215, 'colsample_bytree': 0, 'subsample': 0.808948554712387, 'learning_rate': 0.2630878228477416, 'n_estimators': 5778, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 33}. Best is trial 18 with value: 0.0029.[0m
[32m[I 2022-07-09 16:45:30,408][0m Trial 28 finished with value: 6.6423 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.4443840764964726, 'colsample_bytree': 0, 'subsample': 0.6116579958192485, 'learning_rate': 0.28880739311964615, 'n_estimators': 8366, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 108}. Best is trial 18 with value: 0.0029.[0m


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "optimizer", "tree_methold" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:45:31,794][0m Trial 29 finished with value: 0.0041 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.7068329287099647, 'colsample_bytree': 0, 'subsample': 0.6805589732130196, 'learning_rate': 0.025317706255244734, 'n_estimators': 5750, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 35}. Best is trial 18 with value: 0.0029.[0m


Best Score: 0.0029
Best trial: {'optimizer': 'gbtree', 'reg_lambda': 0.535876953841101, 'colsample_bytree': 0, 'subsample': 0.7699045450262089, 'learning_rate': 0.07528651228099323, 'n_estimators': 4963, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 4}


In [45]:
model_xgb = xgb.XGBRegressor(**study.best_params, tree_method='gpu_hist',gpu_id='0')
model_xgb.fit(E[COLS], np.log1p(E['y']))

Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0, gamma=0, gpu_id='0',
             importance_type='gain', interaction_constraints='',
             learning_rate=0.07528651228099323, max_delta_step=0, max_depth=12,
             min_child_weight=4, missing=nan, monotone_constraints='()',
             n_estimators=4963, n_jobs=12, num_parallel_tree=1,
             optimizer='gbtree', random_state=0, reg_alpha=0,
             reg_lambda=0.535876953841101, scale_pos_weight=1,
             subsample=0.7699045450262089, tree_method='gpu_hist',
             validate_parameters=1, verbosity=None)

# L_XGB

In [48]:
def objective2(trial):
    params_xgb = {
        'optimizer':trial.suggest_categorical('optimizer',['gbtree','dart']),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1.0),
        'colsample_bytree': trial.suggest_int('colsample_bytree', 0.3,1.0),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'max_depth': trial.suggest_int("max_depth", 4, 12),
        'random_state': trial.suggest_categorical('random_state', [0]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'tree_method':'gpu_hist',
        'gpu_id':'0'
    }
    
    # 학습 데이터 중 일부를 검증 데이터 셋으로 분할합니다. 
    X_train, X_valid, y_train, y_valid = train_test_split(L[COLS], np.log1p(L['y']), test_size=0.15, shuffle=True, random_state=71)

    model = xgb.XGBRegressor(**params_xgb)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=35,
        verbose=False
    )

    xgb_pred = model.predict(X_valid)
    rmse_val = rmse(y_valid, xgb_pred)
    
    return rmse_val

In [49]:
sampler = TPESampler(seed=42)
study2 = optuna.create_study(
    study_name="xgb_parameter_opt2",
    direction="minimize",
    sampler=sampler,
)
study2.optimize(objective2, n_trials=30)
print("Best Score:", study2.best_value)
print("Best trial:", study2.best_trial.params)

[32m[I 2022-07-09 16:48:29,365][0m A new study created in memory with name: xgb_parameter_opt2[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:31,770][0m Trial 0 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.7319939418114051, 'colsample_bytree': 0, 'subsample': 0.7190609389379257, 'learning_rate': 0.018410729205738687, 'n_estimators': 1644, 'max_depth': 4, 'random_state': 0, 'min_child_weight': 260}. Best is trial 0 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:32,175][0m Trial 1 finished with value: 0.005 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.020584494295802447, 'colsample_bytree': 0, 'subsample': 0.978936896513396, 'learning_rate': 0.2595942550311264, 'n_estimators': 2202, 'max_depth': 5, 'random_state': 0, 'min_child_weight': 56}. Best is trial 1 with value: 0.005.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:32,689][0m Trial 2 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.43194501864211576, 'colsample_bytree': 0, 'subsample': 0.5038603981386294, 'learning_rate': 0.10952662748632554, 'n_estimators': 1481, 'max_depth': 6, 'random_state': 0, 'min_child_weight': 110}. Best is trial 1 with value: 0.005.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:33,616][0m Trial 3 finished with value: 0.0048 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.19967378215835974, 'colsample_bytree': 0, 'subsample': 0.6599641068895281, 'learning_rate': 0.10150667045928574, 'n_estimators': 559, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 52}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:35,906][0m Trial 4 finished with value: 0.0049 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.9656320330745594, 'colsample_bytree': 0, 'subsample': 0.8658781436815228, 'learning_rate': 0.032925293631105246, 'n_estimators': 1067, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 133}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:38,463][0m Trial 5 finished with value: 0.005 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.034388521115218396, 'colsample_bytree': 0, 'subsample': 0.9365242814551473, 'learning_rate': 0.02752069685079053, 'n_estimators': 6659, 'max_depth': 6, 'random_state': 0, 'min_child_weight': 157}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:38,770][0m Trial 6 finished with value: 0.0053 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.9695846277645586, 'colsample_bytree': 0, 'subsample': 0.8425929763527802, 'learning_rate': 0.3946212980759094, 'n_estimators': 8959, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 277}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:40,178][0m Trial 7 finished with value: 0.0052 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.045227288910538066, 'colsample_bytree': 0, 'subsample': 0.527731231534285, 'learning_rate': 0.04574578205475402, 'n_estimators': 2786, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 108}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:43,756][0m Trial 8 finished with value: 0.0049 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.14092422497476265, 'colsample_bytree': 0, 'subsample': 0.8615378865278278, 'learning_rate': 0.01338626158454391, 'n_estimators': 9871, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 60}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:44,612][0m Trial 9 finished with value: 0.0049 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.7068573438476171, 'colsample_bytree': 0, 'subsample': 0.810305017628691, 'learning_rate': 0.20434554984161393, 'n_estimators': 833, 'max_depth': 7, 'random_state': 0, 'min_child_weight': 35}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:45,589][0m Trial 10 finished with value: 0.0049 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.3091901711466168, 'colsample_bytree': 0, 'subsample': 0.3052831235214052, 'learning_rate': 0.09317719298044715, 'n_estimators': 4323, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 5}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:46,175][0m Trial 11 finished with value: 0.007 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.9469618763901493, 'colsample_bytree': 0, 'subsample': 0.64286551197029, 'learning_rate': 0.0464808506704097, 'n_estimators': 152, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 197}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:47,840][0m Trial 12 finished with value: 0.005 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.6247568732023709, 'colsample_bytree': 0, 'subsample': 0.63444528539133, 'learning_rate': 0.0548743611236223, 'n_estimators': 3990, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 113}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:48,328][0m Trial 13 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.3025562205835817, 'colsample_bytree': 0, 'subsample': 0.7435029410857881, 'learning_rate': 0.12754447170883343, 'n_estimators': 5262, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 196}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:50,003][0m Trial 14 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.5149704140209432, 'colsample_bytree': 0, 'subsample': 0.3888893908007609, 'learning_rate': 0.027227944242606474, 'n_estimators': 3114, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 155}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:50,624][0m Trial 15 finished with value: 0.0051 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.805829554446391, 'colsample_bytree': 0, 'subsample': 0.5408842799901548, 'learning_rate': 0.07014070679615513, 'n_estimators': 131, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 86}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:51,625][0m Trial 16 finished with value: 0.0051 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.29424160044289926, 'colsample_bytree': 0, 'subsample': 0.7336147848297419, 'learning_rate': 0.17149778440546237, 'n_estimators': 5937, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 4}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:55,207][0m Trial 17 finished with value: 0.005 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.1795775547090383, 'colsample_bytree': 0, 'subsample': 0.9108066765665903, 'learning_rate': 0.01062555622659351, 'n_estimators': 9775, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 61}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:48:59,393][0m Trial 18 finished with value: 0.0049 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.1673897185199577, 'colsample_bytree': 0, 'subsample': 0.7986200949884399, 'learning_rate': 0.010879674352538393, 'n_estimators': 7370, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 45}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:00,155][0m Trial 19 finished with value: 0.0048 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.5859682255690069, 'colsample_bytree': 0, 'subsample': 0.6013164580325275, 'learning_rate': 0.2048335878587522, 'n_estimators': 837, 'max_depth': 7, 'random_state': 0, 'min_child_weight': 27}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:00,566][0m Trial 20 finished with value: 0.0049 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.4969401180737596, 'colsample_bytree': 0, 'subsample': 0.5984426662658658, 'learning_rate': 0.43641643276192993, 'n_estimators': 3410, 'max_depth': 7, 'random_state': 0, 'min_child_weight': 29}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:01,514][0m Trial 21 finished with value: 0.0049 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.33411331432097297, 'colsample_bytree': 0, 'subsample': 0.36312183706319945, 'learning_rate': 0.0936959266567672, 'n_estimators': 4280, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 7}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:01,914][0m Trial 22 finished with value: 0.0052 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.41022042258930985, 'colsample_bytree': 0, 'subsample': 0.33144384748487543, 'learning_rate': 0.2506382205081364, 'n_estimators': 2243, 'max_depth': 7, 'random_state': 0, 'min_child_weight': 7}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:02,476][0m Trial 23 finished with value: 0.0051 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.1642338258496323, 'colsample_bytree': 0, 'subsample': 0.48816200931364906, 'learning_rate': 0.1371541207135209, 'n_estimators': 8402, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 79}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:02,785][0m Trial 24 finished with value: 0.0051 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.5781168250133443, 'colsample_bytree': 0, 'subsample': 0.679586955296413, 'learning_rate': 0.30905633075783373, 'n_estimators': 7390, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 78}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:03,590][0m Trial 25 finished with value: 0.0049 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.3816494115978138, 'colsample_bytree': 0, 'subsample': 0.4224483329672565, 'learning_rate': 0.09003685300432625, 'n_estimators': 4759, 'max_depth': 6, 'random_state': 0, 'min_child_weight': 30}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:04,482][0m Trial 26 finished with value: 0.005 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.3920143747831992, 'colsample_bytree': 0, 'subsample': 0.41843248710863135, 'learning_rate': 0.07270830507431202, 'n_estimators': 799, 'max_depth': 6, 'random_state': 0, 'min_child_weight': 28}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:05,030][0m Trial 27 finished with value: 0.005 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.27704583279509687, 'colsample_bytree': 0, 'subsample': 0.3061567391399682, 'learning_rate': 0.17637811503431974, 'n_estimators': 2303, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 19}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:05,771][0m Trial 28 finished with value: 0.0049 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.2570308852630384, 'colsample_bytree': 0, 'subsample': 0.5944293885572675, 'learning_rate': 0.14392194491198518, 'n_estimators': 3601, 'max_depth': 4, 'random_state': 0, 'min_child_weight': 83}. Best is trial 3 with value: 0.0048.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:49:06,962][0m Trial 29 finished with value: 0.0049 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.22132384160066584, 'colsample_bytree': 0, 'subsample': 0.6001284979244605, 'learning_rate': 0.14751728694233782, 'n_estimators': 1653, 'max_depth': 4, 'random_state': 0, 'min_child_weight': 91}. Best is trial 3 with value: 0.0048.[0m


Best Score: 0.0048
Best trial: {'optimizer': 'dart', 'reg_lambda': 0.19967378215835974, 'colsample_bytree': 0, 'subsample': 0.6599641068895281, 'learning_rate': 0.10150667045928574, 'n_estimators': 559, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 52}


In [50]:
model_xgb2 = xgb.XGBRegressor(**study2.best_params, tree_method='gpu_hist',gpu_id='0')
model_xgb2.fit(L[COLS], np.log1p(L['y']))

Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0, gamma=0, gpu_id='0',
             importance_type='gain', interaction_constraints='',
             learning_rate=0.10150667045928574, max_delta_step=0, max_depth=9,
             min_child_weight=52, missing=nan, monotone_constraints='()',
             n_estimators=559, n_jobs=12, num_parallel_tree=1, optimizer='dart',
             random_state=0, reg_alpha=0, reg_lambda=0.19967378215835974,
             scale_pos_weight=1, subsample=0.6599641068895281,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [51]:
predict.loc[E_predict.index,'xgb_pred'] = np.exp(model_xgb.predict(E_predict[COLS]))
predict.loc[L_predict.index,'xgb_pred'] = np.exp(model_xgb2.predict(L_predict[COLS]))

In [52]:
df_submission = predict[['key_val', 'xgb_pred']] 
df_submission.head()

Unnamed: 0,key_val,xgb_pred
0,LOT380_17,1268.52832
1,LOT122_18,1264.2229
2,LOT313_18,1262.627563
3,LOT459_12,1263.685303
4,LOT459_18,1264.122803


In [53]:
# 예측 파일을 저장합니다. 
# 제출용 파일 이름은 cds_submission_팀명_차수.csv 형태로 제출합니다.
df_submission.set_index('key_val', inplace=True)
df_submission.to_csv('cds_submission_데이터조무사_18.csv')

# 전체 XGBOOST

In [54]:
def objective(trial):
    params_xgb = {
        'optimizer':trial.suggest_categorical('optimizer',['gbtree','dart']),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1.0),
        'colsample_bytree': trial.suggest_int('colsample_bytree', 0.3,1.0),
        'subsample': trial.suggest_float('subsample', 0.3, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.5),
        'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
        'max_depth': trial.suggest_int("max_depth", 4, 12),
        'random_state': trial.suggest_categorical('random_state', [0]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'tree_method':'gpu_hist',
        'gpu_id':'0'
    }
    
    # 학습 데이터 중 일부를 검증 데이터 셋으로 분할합니다. 
    X_train, X_valid, y_train, y_valid = train_test_split(df_final[COLS], np.log1p(df_final['y']), test_size=0.15, shuffle=True, random_state=71)

    model = xgb.XGBRegressor(**params_xgb)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=35,
        verbose=False
    )

    xgb_pred = model.predict(X_valid)
    rmse_val = rmse(y_valid, xgb_pred)
    
    return rmse_val

In [55]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="xgb_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective, n_trials=30)
print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

[32m[I 2022-07-09 16:50:47,787][0m A new study created in memory with name: xgb_parameter_opt[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:50:50,465][0m Trial 0 finished with value: 0.0056 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.7319939418114051, 'colsample_bytree': 0, 'subsample': 0.7190609389379257, 'learning_rate': 0.018410729205738687, 'n_estimators': 1644, 'max_depth': 4, 'random_state': 0, 'min_child_weight': 260}. Best is trial 0 with value: 0.0056.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:50:51,051][0m Trial 1 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.020584494295802447, 'colsample_bytree': 0, 'subsample': 0.978936896513396, 'learning_rate': 0.2595942550311264, 'n_estimators': 2202, 'max_depth': 5, 'random_state': 0, 'min_child_weight': 56}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:50:51,609][0m Trial 2 finished with value: 0.0056 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.43194501864211576, 'colsample_bytree': 0, 'subsample': 0.5038603981386294, 'learning_rate': 0.10952662748632554, 'n_estimators': 1481, 'max_depth': 6, 'random_state': 0, 'min_child_weight': 110}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:50:52,521][0m Trial 3 finished with value: 0.0055 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.19967378215835974, 'colsample_bytree': 0, 'subsample': 0.6599641068895281, 'learning_rate': 0.10150667045928574, 'n_estimators': 559, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 52}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:50:54,106][0m Trial 4 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.9656320330745594, 'colsample_bytree': 0, 'subsample': 0.8658781436815228, 'learning_rate': 0.032925293631105246, 'n_estimators': 1067, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 133}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:50:56,396][0m Trial 5 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.034388521115218396, 'colsample_bytree': 0, 'subsample': 0.9365242814551473, 'learning_rate': 0.02752069685079053, 'n_estimators': 6659, 'max_depth': 6, 'random_state': 0, 'min_child_weight': 157}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:50:56,706][0m Trial 6 finished with value: 0.0056 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.9695846277645586, 'colsample_bytree': 0, 'subsample': 0.8425929763527802, 'learning_rate': 0.3946212980759094, 'n_estimators': 8959, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 277}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:50:58,398][0m Trial 7 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.045227288910538066, 'colsample_bytree': 0, 'subsample': 0.527731231534285, 'learning_rate': 0.04574578205475402, 'n_estimators': 2786, 'max_depth': 11, 'random_state': 0, 'min_child_weight': 108}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:01,965][0m Trial 8 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.14092422497476265, 'colsample_bytree': 0, 'subsample': 0.8615378865278278, 'learning_rate': 0.01338626158454391, 'n_estimators': 9871, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 60}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:02,513][0m Trial 9 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.7068573438476171, 'colsample_bytree': 0, 'subsample': 0.810305017628691, 'learning_rate': 0.20434554984161393, 'n_estimators': 833, 'max_depth': 7, 'random_state': 0, 'min_child_weight': 35}. Best is trial 1 with value: 0.0053.[0m
[32m[I 2022-07-09 16:51:02,737][0m Trial 10 finished with value: 6.6424 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.3637276736216312, 'colsample_bytree': 0, 'subsample': 0.3052831235214052, 'learning_rate': 0.41534235816464743, 'n_estimators': 4280, 'max_depth': 4, 'random_state': 0, 'min_child_weight': 197}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:03,325][0m Trial 11 finished with value: 0.0054 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.6407399438625951, 'colsample_bytree': 0, 'subsample': 0.9952980603637356, 'learning_rate': 0.19194036307491474, 'n_estimators': 3625, 'max_depth': 7, 'random_state': 0, 'min_child_weight': 12}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:03,829][0m Trial 12 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.6882171771487813, 'colsample_bytree': 0, 'subsample': 0.7822650211522437, 'learning_rate': 0.20145290975009167, 'n_estimators': 5639, 'max_depth': 6, 'random_state': 0, 'min_child_weight': 25}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:04,304][0m Trial 13 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.5825158354173895, 'colsample_bytree': 0, 'subsample': 0.9641738995667064, 'learning_rate': 0.20486876181579894, 'n_estimators': 2788, 'max_depth': 5, 'random_state': 0, 'min_child_weight': 69}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:04,667][0m Trial 14 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.3213180375606428, 'colsample_bytree': 0, 'subsample': 0.7434982260511511, 'learning_rate': 0.3016707900952943, 'n_estimators': 352, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 81}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:05,627][0m Trial 15 finished with value: 0.0054 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.832661538561756, 'colsample_bytree': 0, 'subsample': 0.9053141223608127, 'learning_rate': 0.08478694898898, 'n_estimators': 2607, 'max_depth': 7, 'random_state': 0, 'min_child_weight': 9}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:06,091][0m Trial 16 finished with value: 0.0056 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.5344696976659984, 'colsample_bytree': 0, 'subsample': 0.6079120952892296, 'learning_rate': 0.1346374870408681, 'n_estimators': 5378, 'max_depth': 5, 'random_state': 0, 'min_child_weight': 190}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:07,362][0m Trial 17 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.8467469983228296, 'colsample_bytree': 0, 'subsample': 0.8136020598682961, 'learning_rate': 0.05755939074118105, 'n_estimators': 1857, 'max_depth': 7, 'random_state': 0, 'min_child_weight': 38}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:08,199][0m Trial 18 finished with value: 0.0056 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.8360728147769023, 'colsample_bytree': 0, 'subsample': 0.3220379233645285, 'learning_rate': 0.05108088722155475, 'n_estimators': 7247, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 95}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:08,545][0m Trial 19 finished with value: 0.0055 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.2662890013604837, 'colsample_bytree': 0, 'subsample': 0.6938839037354738, 'learning_rate': 0.27970646032891194, 'n_estimators': 4221, 'max_depth': 5, 'random_state': 0, 'min_child_weight': 153}. Best is trial 1 with value: 0.0053.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:10,245][0m Trial 20 finished with value: 0.0051 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.4934004933386089, 'colsample_bytree': 0, 'subsample': 0.9981377934532484, 'learning_rate': 0.13703825810468487, 'n_estimators': 3436, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 3}. Best is trial 20 with value: 0.0051.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:11,345][0m Trial 21 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.8187568380821293, 'colsample_bytree': 0, 'subsample': 0.9845522926062795, 'learning_rate': 0.06300088429941299, 'n_estimators': 2522, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 43}. Best is trial 20 with value: 0.0051.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:11,832][0m Trial 22 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.47146278399967734, 'colsample_bytree': 0, 'subsample': 0.9274417895934663, 'learning_rate': 0.14113560369789424, 'n_estimators': 3545, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 11}. Best is trial 20 with value: 0.0051.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:13,261][0m Trial 23 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.1472613596667962, 'colsample_bytree': 0, 'subsample': 0.902563552383456, 'learning_rate': 0.07567745002693821, 'n_estimators': 1987, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 4}. Best is trial 20 with value: 0.0051.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:14,104][0m Trial 24 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.11315151444651189, 'colsample_bytree': 0, 'subsample': 0.9074626154682007, 'learning_rate': 0.07910034107656362, 'n_estimators': 4325, 'max_depth': 9, 'random_state': 0, 'min_child_weight': 4}. Best is trial 20 with value: 0.0051.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:14,671][0m Trial 25 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.4132881621248071, 'colsample_bytree': 0, 'subsample': 0.9935879093110618, 'learning_rate': 0.1316024776964484, 'n_estimators': 3415, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 45}. Best is trial 20 with value: 0.0051.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:15,030][0m Trial 26 finished with value: 0.0054 and parameters: {'optimizer': 'gbtree', 'reg_lambda': 0.2401028640708824, 'colsample_bytree': 0, 'subsample': 0.9030967195975954, 'learning_rate': 0.4950753678354835, 'n_estimators': 2387, 'max_depth': 10, 'random_state': 0, 'min_child_weight': 64}. Best is trial 20 with value: 0.0051.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:15,491][0m Trial 27 finished with value: 0.0053 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.12499189791283251, 'colsample_bytree': 0, 'subsample': 0.9424701904793276, 'learning_rate': 0.25998862505769205, 'n_estimators': 1872, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 90}. Best is trial 20 with value: 0.0051.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:15,885][0m Trial 28 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.0010680606531620132, 'colsample_bytree': 0, 'subsample': 0.7670978178455348, 'learning_rate': 0.28880739311964615, 'n_estimators': 6161, 'max_depth': 12, 'random_state': 0, 'min_child_weight': 121}. Best is trial 20 with value: 0.0051.[0m


Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2022-07-09 16:51:17,402][0m Trial 29 finished with value: 0.0054 and parameters: {'optimizer': 'dart', 'reg_lambda': 0.1956574071361431, 'colsample_bytree': 0, 'subsample': 0.4363638319909704, 'learning_rate': 0.03633341898559157, 'n_estimators': 4714, 'max_depth': 4, 'random_state': 0, 'min_child_weight': 25}. Best is trial 20 with value: 0.0051.[0m


Best Score: 0.0051
Best trial: {'optimizer': 'dart', 'reg_lambda': 0.4934004933386089, 'colsample_bytree': 0, 'subsample': 0.9981377934532484, 'learning_rate': 0.13703825810468487, 'n_estimators': 3436, 'max_depth': 8, 'random_state': 0, 'min_child_weight': 3}


In [56]:
model_xgb = xgb.XGBRegressor(**study.best_params, tree_method='gpu_hist',gpu_id='0')
model_xgb.fit(df_final[COLS], np.log1p(df_final['y']))

Parameters: { "optimizer" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0, gamma=0, gpu_id='0',
             importance_type='gain', interaction_constraints='',
             learning_rate=0.13703825810468487, max_delta_step=0, max_depth=8,
             min_child_weight=3, missing=nan, monotone_constraints='()',
             n_estimators=3436, n_jobs=12, num_parallel_tree=1,
             optimizer='dart', random_state=0, reg_alpha=0,
             reg_lambda=0.4934004933386089, scale_pos_weight=1,
             subsample=0.9981377934532484, tree_method='gpu_hist',
             validate_parameters=1, verbosity=None)

In [59]:
predict['xgb_pred_all'] = np.exp(model_xgb.predict(df_predict_final[COLS]))
df_submission = predict[['key_val', 'xgb_pred_all']] 
df_submission.head()

Unnamed: 0,key_val,xgb_pred_all
0,LOT380_17,1265.726562
1,LOT122_18,1265.573364
2,LOT313_18,1267.249023
3,LOT459_12,1261.981079
4,LOT459_18,1261.86377


In [60]:
# 예측 파일을 저장합니다. 
# 제출용 파일 이름은 cds_submission_팀명_차수.csv 형태로 제출합니다.
df_submission.set_index('key_val', inplace=True)
df_submission.to_csv('cds_submission_데이터조무사_19.csv')

# CATBOOST REGRESSOR

In [None]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

In [None]:
train_idxs = list(df_final.groupby('module_name')['key_val'].nunique().values)
predict_idxs = list(df_predict_final.groupby('module_name')['key_val'].nunique().values)

In [None]:
len(train_idxs)

In [None]:
cv = LeaveOneOut()
cats = [CatBoostRegressor() for i in range(47)]
idexss = 0
for i in range(len(train_idxs)):
    if i==0:
        train_idx = range(train_idxs[i])
        idexss += train_idxs[i]
    else:
        train_idx = range(train_idxs[i-1], idexss+train_idxs[i])
        idexss += train_idxs[i]
    X = df_final.loc[train_idx, num_features]
    y = df_final.loc[train_idx, 'y']
    cat = cats[i]
    scores = cross_val_score(cat, X, y, scoring='neg_mean_squared_error',
                             cv=cv, n_jobs=-1)
    scores = np.sqrt(scores)
    print(np.mean(scores))

# Feature Selection

In [None]:
from probatus.feature_elimination import EarlyStoppingShapRFECV

In [None]:
# Run feature elimination
shap_elimination = EarlyStoppingShapRFECV(
    clf=model, step=0.2, cv=10, scoring='neg_mean_squared_error', early_stopping_rounds=15, n_jobs=-1, eval_metric='rmse')
report = shap_elimination.fit_compute(X_XGB, np.log1p(y))

# Make plots
performance_plot = shap_elimination.plot()

In [None]:
report

In [None]:
# Get final feature set
''' num_features는 마지막에 남길 feature 수임. '''
final_features_set = shap_elimination.get_reduced_features_set(num_features=69)

The provided number of features has not been achieved at any stage of the process. You can select one of the following: [402, 322, 258, 207, 166, 133, 107, 86, 69, 56, 45, 36, 29, 24, 20, 16, 13, 11, 9, 8, 7, 6, 5, 4, 3, 2, 1]

In [None]:
final_features_set

# 예측 결과 제출

In [None]:
df_predict.head()

In [None]:
pred_X = pd.get_dummies(df_predict[COLS])

In [None]:
predict['msure_val'] = np.exp(cat.predict(df_predict_final[COLS]))
df_submission = predict[['key_val', 'msure_val']] 
df_submission.head()

In [None]:
# 예측값에 결측치가 포함되어 있는지 확인합니다.
df_submission.isnull().sum()

In [None]:
# 예측값의 갯수가 평가용 데이터의 갯수와 동일한지 확인합니다.
assert len(df_submission) == len(predict)
print(f'No. of Predict DataSet : {len(predict)}\nNo. of Submission DataSet : {len(df_submission)}')

In [None]:
# 예측 파일을 저장합니다. 
# 제출용 파일 이름은 cds_submission_팀명_차수.csv 형태로 제출합니다.
df_submission.set_index('key_val', inplace=True)
df_submission.to_csv('cds_submission_데이터조무사_12.csv')

# module 별 LeaveOneOut으로 모델링해보기

In [None]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score

In [None]:
train_idxs = list(df_final.groupby('module_name')['key_val'].nunique().values)
predict_idxs = list(df_predict_final.groupby('module_name')['key_val'].nunique().values)

In [None]:
cv = LeaveOneOut()
cats = [CatBoostRegressor() for i in range(47)]
for i in range(len(idx)):
    if i==0:
        train_idx = range(idx[i])
    else:
        train_idx = range(idx[i-1], idx[i])
    

scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error',
                         cv=cv, n_jobs=-1)

In [None]:
mean(scores)

# Optuna + CatRegressor

In [None]:
cats = [CatBoostRegressor() for i in range(47)]
idexss = 0
for i,idx in enumerate(train_idxs):
    if i==0:
        train_idx = range(idx)
        idexss += idx
    else:
        train_idx = range(idexss, idexss+idx)
        idexss += idx
    X = df_final.loc[train_idx, num_features]
    y = df_final.loc[train_idx, 'y']
    
    def objective_CAT(trial):
        param = {
          "random_state":42,
          'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.5),
          'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
    #       "n_estimators":trial.suggest_int("n_estimators", 100, 10000),
          "max_depth":trial.suggest_int("max_depth", 4, 12),
          'random_strength' :trial.suggest_int('random_strength', 0, 30),
    #       "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
          "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
          "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
          "max_bin": trial.suggest_int("max_bin", 200, 400),
          'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
          'boosting_type':trial.suggest_categorical('boosting_type', ['Plain', 'Ordered']),
          'task_type':'GPU',
          'devices':'0:7',
          'iterations':100,
          'rsm':1
      }
        X_cat = X
        y_cat = pd.Series(np.log1p(y))
        cv = LeaveOneOut()
        cat = cats[i]
        cat.set_params(**param)
        scores = cross_val_score(cat, X_cat, y_cat, scoring='neg_mean_squared_error',
                                 cv=cv)
        scores = np.sqrt(-scores)

        return np.mean(scores)
    
    sampler = TPESampler(seed=42)
    study = optuna.create_study(
            study_name="cat_parameter_opt",
            direction="minimize",
            sampler=sampler
    )
    study.optimize(objective_CAT, n_trials=3)
    print("Best Score:", study.best_value)
    print("Best trial:", study.best_trial.params)
    CAT = CatBoostRegressor(**study.best_params)
    CAT.fit(X, np.log1p(y),verbose=0)
    cats[i] = CAT
    print('{}번째 모델 훈련이 완료되었습니다.'.format(i+1))

In [None]:
pred = []
idexss_pred = 0
for i, (idx,model) in enumerate(zip(predict_idxs, cats)):
    if i == 0:
        pred_idx = range(idx)
        idexss_pred += idx
    else:
        pred_idx = range(idexss_pred, idexss_pred+idx)
        idexss_pred += idx
    X_pred = df_predict_final.loc[pred_idx, num_features]
    CAT_pred = model.predict(X_pred)
    pred.extend(CAT_pred)

In [None]:
predict['msure_val'] = np.exp(pred)
df_submission = predict[['key_val', 'msure_val']] 
df_submission.head()

In [None]:
len(df_submission)

In [None]:
# 예측 파일을 저장합니다. 
# 제출용 파일 이름은 cds_submission_팀명_차수.csv 형태로 제출합니다.
df_submission.set_index('key_val', inplace=True)
df_submission.to_csv('cds_submission_데이터조무사_15.csv')

# Cat 기본모델과 LooCV

In [None]:
base_cat_params = {'task_type':'GPU','devices':'0:7','iterations':100,'rsm':1}
cats = [CatBoostRegressor(random_state=1,**base_cat_params) for i in range(47)]
idexss = 0
for i in range(len(train_idxs)):
    if i==0:
        train_idx = range(train_idxs[i])
        idexss += train_idxs[i]
    else:
        train_idx = range(train_idxs[i-1], idexss+train_idxs[i])
        idexss += train_idxs[i]
    X = df_final.loc[train_idx, num_features]
    y = df_final.loc[train_idx, 'y']
    
    
    X_cat = X
    y_cat = pd.Series(np.log1p(y))
    cv = LeaveOneOut()
    cat = cats[i]
    scores = cross_val_score(cat, X_cat, y_cat, scoring='neg_mean_squared_error',
                                cv=cv)
    scores = np.sqrt(-scores)
    print(np.mean(scores))
    
    cat.fit(X, np.log1p(y))
    cats[i] = CAT
    print('{}번째 모델 훈련이 완료되었습니다.'.format(i+1))

In [None]:
cats = []
idexss = 0
for i in range(len(train_idxs)):
    if i==0:
        train_idx = range(train_idxs[i])
        idexss += train_idxs[i]
    else:
        train_idx = range(train_idxs[i-1], idexss+train_idxs[i])
        idexss += train_idxs[i]
    X = df_final.loc[train_idx, num_features]
    y = df_final.loc[train_idx, 'y']
    
    def objective_CAT(trial):
        param = {
          "random_state":42,
          'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.5),
          'bagging_temperature' :trial.suggest_loguniform('bagging_temperature', 0.01, 100.00),
    #       "n_estimators":trial.suggest_int("n_estimators", 100, 10000),
          "max_depth":trial.suggest_int("max_depth", 4, 12),
          'random_strength' :trial.suggest_int('random_strength', 0, 30),
    #       "colsample_bylevel":trial.suggest_float("colsample_bylevel", 0.4, 1.0),
          "l2_leaf_reg":trial.suggest_float("l2_leaf_reg",1e-8,3e-5),
          "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
          "max_bin": trial.suggest_int("max_bin", 200, 400),
          'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
          'boosting_type':trial.suggest_categorical('boosting_type', ['Plain', 'Ordered']),
          'task_type':'GPU',
          'devices':'0:8',
          'iterations':50,
          'rsm':1
      }
        X_cat = X
        y_cat = pd.Series(np.log1p(y))
        loo = LeaveOneOut()
        cat = CatBoostRegressor(**param)
        loo.get_n_splits(X_cat)
        rmsle = []
        for train_idx, test_idx in loo.split(X_cat):
            cat.fit(X_cat.iloc[train_idx,:], y_cat.iloc[train_idx], eval_set=[(X_cat.iloc[test_idx,:],y_cat.iloc[test_idx])], early_stopping_rounds=15, silent=True)
            cat_pred = cat.predict(X_cat.iloc[test_idx,:])
            rmsle_val = np.sqrt(mean_squared_error(y_cat.iloc[test_idx], cat_pred))
            rmsle.append(rmsle_val)
        return np.mean(rmsle)
    
    sampler = TPESampler(seed=42)
    study = optuna.create_study(
            study_name="cat_parameter_opt",
            direction="minimize",
            sampler=sampler
    )
    study.optimize(objective_CAT, n_trials=3)
    print("Best Score:", study.best_value)
    print("Best trial:", study.best_trial.params)
    X_cat = X
    y_cat = pd.Series(np.log1p(y))
    loo = LeaveOneOut()
    CAT = CatBoostRegressor(**study.best_params)
    for train_idx, test_idx in loo.split(X_cat):
            CAT.fit(X_cat.iloc[train_idx,:], y_cat.iloc[train_idx], eval_set=[(X_cat.iloc[test_idx,:], y_cat.iloc[test_idx])], early_stopping_rounds=15)
    cats.append(CAT)
    print('{}번째 모델 훈련이 완료되었습니다.'.format(i+1))

In [None]:
import lightgbm as lgb

In [None]:
cats = []
idexss = 0
for i in range(len(train_idxs)):
    if i==0:
        train_idx = range(train_idxs[i])
        idexss += train_idxs[i]
    else:
        train_idx = range(train_idxs[i-1], idexss+train_idxs[i])
        idexss += train_idxs[i]
    X = df_final.loc[train_idx, num_features]
    y = df_final.loc[train_idx, 'y']
    
    def objective(trial):
        lgbm_param = {
            'objective': 'regression',
            'verbose': -1,
            'metric': 'rmsle', 
            'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True), 
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
            'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
            'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
            'max_depth': trial.suggest_int('max_depth',3, 15),
            'learning_rate': trial.suggest_uniform("learning_rate", 0.01, 0.5),
            'n_estimators': trial.suggest_int('n_estimators', 100, 10000),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'subsample': trial.suggest_loguniform('subsample', 0.4, 1),
            'random_state':0
        }
        X_cat = X
        y_cat = pd.Series(np.log1p(y))
        loo = LeaveOneOut()
        cat = lgb.LGBMRegressor(**lgbm_param)
        loo.get_n_splits(X_cat)
        rmsle = []
        for train_idx, test_idx in loo.split(X_cat):
            cat.fit(X_cat.iloc[train_idx,:], y_cat.iloc[train_idx], eval_set=[(X_cat.iloc[test_idx,:],y_cat.iloc[test_idx])], early_stopping_rounds=15, eval_metric='rmse')
            cat_pred = cat.predict(X_cat.iloc[test_idx,:])
            rmsle_val = np.sqrt(mean_squared_error(y_cat.iloc[test_idx], cat_pred))
            rmsle.append(rmsle_val)
        return np.mean(rmsle)
    
    sampler = TPESampler(seed=42)
    study = optuna.create_study(
            study_name="lgb_parameter_opt",
            direction="minimize",
            sampler=sampler
    )
    study.optimize(objective, n_trials=3)
    print("Best Score:", study.best_value)
    print("Best trial:", study.best_trial.params)
    
    X_cat = X
    y_cat = pd.Series(np.log1p(y))
    loo = LeaveOneOut()
    CAT =  lgb.LGBMRegressor(**study.best_params)
    for train_idx, test_idx in loo.split(X_cat):
            CAT.fit(X_cat.iloc[train_idx,:], y_cat.iloc[train_idx], eval_set=[(X_cat.iloc[test_idx,:], y_cat.iloc[test_idx])], early_stopping_rounds=15, eval_metric='rmse')
    cats.append(CAT)
    print('{}번째 모델 훈련이 완료되었습니다.'.format(i+1))