In [3]:
import random
import pandas as pd
import numpy as np
import os
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [21]:
#데이터 불러오기
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')
building_info =  pd.read_csv('./data/building_info.csv')

train_df = train_df.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
test_df = test_df.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})
translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data Center',
    '백화점및아울렛': 'Department Store and Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '지식산업센터': 'Knowledge Industry Center',
    '할인마트': 'Discount Mart',
    '호텔및리조트': 'Hotel and Resort'
}

building_info['building_type'] = building_info['building_type'].replace(translation_dict)


train_df.head()

Unnamed: 0,num_date_time,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


In [22]:
test_df.head()

Unnamed: 0,num_date_time,building_number,date_time,temperature,rainfall,windspeed,humidity
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77


In [23]:
building_info.head()

Unnamed: 0,building_number,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,Other Buildings,110634.0,39570.0,-,-,-
1,2,Other Buildings,122233.47,99000.0,-,-,-
2,3,Other Buildings,171243.0,113950.0,40,-,-
3,4,Other Buildings,74312.98,34419.62,60,-,-
4,5,Other Buildings,205884.0,150000.0,-,2557,1000


Train 과 Test의 차이: columns에 일조, 일사, 전력 소비량이 없다.

## 전처리 (병합, 빈칸 채우기, 형 변환, num_date_time 삭제)

In [24]:
def preprocessing(df):
  #공통적인 부분만 변경

  #결측값을 0으로 채웁니다 안바꿔도 됨
  df = df.fillna(0)
  #시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
  df['date_time'] = pd.to_datetime(df['date_time'], format='%Y%m%d %H')
  #람다식을 이용해 일시에서 달,일,시간을 잘라낸것
  df['month'] = df['date_time'].dt.month
  df['day'] = df['date_time'].dt.day
  df['hour'] = df['date_time'].dt.hour

  # 'building_number'를 기준으로 두 데이터프레임 병합 및 전처리
  merged_df = pd.merge(df, building_info, on='building_number',how='right')
  #merge후 전처리
  merged_df['solar_power_capacity'] = merged_df['solar_power_capacity'].replace('-', 0)
  merged_df['ess_capacity'] = merged_df['ess_capacity'].replace('-', 0)
  merged_df['pcs_capacity'] = merged_df['pcs_capacity'].replace('-', 0)

  merged_df['solar_power_capacity'] = merged_df['solar_power_capacity'].astype('float64')
  merged_df['ess_capacity'] = merged_df['ess_capacity'].astype('float64')
  merged_df['pcs_capacity'] = merged_df['pcs_capacity'].astype('float64')

  return merged_df.drop(columns=['num_date_time'])


In [25]:
merged_train_df = preprocessing(train_df)
merged_test_df = preprocessing(test_df)

merged_train_df.head()

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,month,day,hour,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,2022-06-01 00:00:00,18.6,0.0,0.9,42.0,0.0,0.0,1085.28,6,1,0,Other Buildings,110634.0,39570.0,0.0,0.0,0.0
1,1,2022-06-01 01:00:00,18.0,0.0,1.1,45.0,0.0,0.0,1047.36,6,1,1,Other Buildings,110634.0,39570.0,0.0,0.0,0.0
2,1,2022-06-01 02:00:00,17.7,0.0,1.5,45.0,0.0,0.0,974.88,6,1,2,Other Buildings,110634.0,39570.0,0.0,0.0,0.0
3,1,2022-06-01 03:00:00,16.7,0.0,1.4,48.0,0.0,0.0,953.76,6,1,3,Other Buildings,110634.0,39570.0,0.0,0.0,0.0
4,1,2022-06-01 04:00:00,18.4,0.0,2.8,43.0,0.0,0.0,986.4,6,1,4,Other Buildings,110634.0,39570.0,0.0,0.0,0.0


In [26]:
merged_test_df.head()

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,month,day,hour,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity
0,1,2022-08-25 00:00:00,23.5,0.0,2.2,72,8,25,0,Other Buildings,110634.0,39570.0,0.0,0.0,0.0
1,1,2022-08-25 01:00:00,23.0,0.0,0.9,72,8,25,1,Other Buildings,110634.0,39570.0,0.0,0.0,0.0
2,1,2022-08-25 02:00:00,22.7,0.0,1.5,75,8,25,2,Other Buildings,110634.0,39570.0,0.0,0.0,0.0
3,1,2022-08-25 03:00:00,22.1,0.0,1.3,78,8,25,3,Other Buildings,110634.0,39570.0,0.0,0.0,0.0
4,1,2022-08-25 04:00:00,21.8,0.0,1.0,77,8,25,4,Other Buildings,110634.0,39570.0,0.0,0.0,0.0


## 추가 전처리

In [8]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series
from xgboost import XGBRegressor

pd.set_option('display.max_columns', 30)

In [28]:
#######################################
## 건물별, 요일별, 시간별 발전량 평균 넣어주기
#######################################
power_mean = pd.pivot_table(merged_train_df, values = 'power_consumption', index = ['building_number', 'hour', 'day'], aggfunc = np.mean).reset_index()
power_mean

Unnamed: 0,building_number,hour,day,power_consumption
0,1,0,1,1371.52
1,1,0,2,1620.16
2,1,0,3,1565.12
3,1,0,4,1539.12
4,1,0,5,1675.20
...,...,...,...,...
74395,100,23,27,586.08
74396,100,23,28,580.44
74397,100,23,29,651.72
74398,100,23,30,669.36


In [29]:
#######################################
## 건물별 시간별 발전량 평균 넣어주기
#######################################
tqdm.pandas()
merged_train_df['day_hour_mean'] = merged_train_df.progress_apply(lambda x : power_mean.loc[(power_mean.building_number == x['building_number']) & (power_mean.hour == x['hour']) & (power_mean.day == x['day']) ,'power_consumption'].values[0], axis = 1)

100%|██████████| 204000/204000 [02:07<00:00, 1606.02it/s]


In [30]:
merged_train_df['day_hour_mean'].head()

0    1371.52
1    1323.52
2    1236.48
3    1189.92
4    1210.08
Name: day_hour_mean, dtype: float64

In [31]:
#######################################
## 건물별 시간별 발전량 표준편차 넣어주기
#######################################
power_hour_std = pd.pivot_table(merged_train_df, values = 'power_consumption', index = ['building_number', 'hour'], aggfunc = np.std).reset_index()
tqdm.pandas()
merged_train_df['hour_std'] = merged_train_df.progress_apply(lambda x : power_hour_std.loc[(power_hour_std.building_number == x['building_number']) & (power_hour_std.hour == x['hour']) ,'power_consumption'].values[0], axis = 1)


100%|██████████| 204000/204000 [01:07<00:00, 3011.34it/s]


In [32]:
### 공휴일 변수 추가
merged_train_df['holiday'] = merged_train_df.apply(lambda x : 0 if x['day']<5 else 1, axis = 1)
merged_train_df.loc[('2022-08-17'<=merged_train_df.date_time)&(merged_train_df.date_time<'2022-08-18'), 'holiday'] = 1

## https://dacon.io/competitions/official/235680/codeshare/2366?page=1&dtype=recent
merged_train_df['sin_time'] = np.sin(2*np.pi*merged_train_df.hour/24)
merged_train_df['cos_time'] = np.cos(2*np.pi*merged_train_df.hour/24)

## https://dacon.io/competitions/official/235736/codeshare/2743?page=1&dtype=recent
merged_train_df['THI'] = 9/5*merged_train_df['temperature'] - 0.55*(1-merged_train_df['humidity']/100)*(9/5*merged_train_df['humidity']-26)+32

In [33]:
merged_train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204000 entries, 0 to 203999
Data columns (total 24 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   building_number       204000 non-null  int64         
 1   date_time             204000 non-null  datetime64[ns]
 2   temperature           204000 non-null  float64       
 3   rainfall              204000 non-null  float64       
 4   windspeed             204000 non-null  float64       
 5   humidity              204000 non-null  float64       
 6   sunshine              204000 non-null  float64       
 7   solar_radiation       204000 non-null  float64       
 8   power_consumption     204000 non-null  float64       
 9   month                 204000 non-null  int64         
 10  day                   204000 non-null  int64         
 11  hour                  204000 non-null  int64         
 12  building_type         204000 non-null  object        
 13 

In [34]:
merged_train_df.head()

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,month,day,hour,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,day_hour_mean,hour_std,holiday,sin_time,cos_time,THI
0,1,2022-06-01 00:00:00,18.6,0.0,0.9,42.0,0.0,0.0,1085.28,6,1,0,Other Buildings,110634.0,39570.0,0.0,0.0,0.0,1371.52,446.882767,0,0.0,1.0,49.6576
1,1,2022-06-01 01:00:00,18.0,0.0,1.1,45.0,0.0,0.0,1047.36,6,1,1,Other Buildings,110634.0,39570.0,0.0,0.0,0.0,1323.52,439.662704,0,0.258819,0.965926,47.7625
2,1,2022-06-01 02:00:00,17.7,0.0,1.5,45.0,0.0,0.0,974.88,6,1,2,Other Buildings,110634.0,39570.0,0.0,0.0,0.0,1236.48,412.071906,0,0.5,0.866025,47.2225
3,1,2022-06-01 03:00:00,16.7,0.0,1.4,48.0,0.0,0.0,953.76,6,1,3,Other Buildings,110634.0,39570.0,0.0,0.0,0.0,1189.92,391.205981,0,0.707107,0.707107,44.7856
4,1,2022-06-01 04:00:00,18.4,0.0,2.8,43.0,0.0,0.0,986.4,6,1,4,Other Buildings,110634.0,39570.0,0.0,0.0,0.0,1210.08,381.099697,0,0.866025,0.5,49.0061


In [5]:
def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])
for num in range(1,101,1):
    temp = merged_train_df[merged_train_df['building_number'] == num]
    cdh = CDH(temp['temperature'].values)
    cdhs = np.concatenate([cdhs, cdh])
merged_train_df['CDH'] = cdhs

In [4]:
## save the preprocessed data
#merged_train_df.to_csv('./data/train_preprocessed.csv')
merged_train_df = pd.read_csv('./data/train_preprocessed.csv')
merged_train_df = merged_train_df.drop(columns = ['Unnamed: 0', 'building_type', 'date_time'])

## 원핫 인코딩(안해도 됨)

In [8]:
# merged_train_df = pd.get_dummies(merged_train_df, prefix=["bt:"], columns=["building_type"])
# merged_train_df.head()
# merged_test_df = pd.get_dummies(merged_test_df, prefix=["bt:"], columns=["building_type"])
# merged_test_df.head()

1.7.3


AttributeError: module 'xgboost.core' has no attribute 'ENVIRONMENT_AVAILABLE_GPU'

## 성능 평가지표 정의 및 SMAPE 정의

In [6]:
#### alpha를 argument로 받는 함수로 실제 objective function을 wrapping하여 alpha값을 쉽게 조정할 수 있도록 작성했습니다.
# custom objective function for forcing model not to underestimate
def weighted_mse(alpha = 1):
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype("float")
        grad = np.where(residual>0, -2*alpha*residual, -2*residual)
        hess = np.where(residual>0, 2*alpha, 2.0)
        return grad, hess
    return weighted_mse_fixed

 #점수 측정을 위한 코드
def SMAPE(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

## 최적의 파라미터 탐색

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit, KFold, TimeSeriesSplit
from sklearn.metrics import make_scorer
df = pd.DataFrame(columns = ['n_estimators', 'eta', 'min_child_weight','max_depth', 'colsample_bytree', 'subsample'])
preds = np.array([])

grid = {'n_estimators' : [100], 'eta' : [0.01], 'min_child_weight' : np.arange(1, 8, 1),
        'max_depth' : np.arange(3,9,1) , 'colsample_bytree' :np.arange(0.8, 1.0, 0.1),
        'subsample' :np.arange(0.8, 1.0, 0.1)} # fix the n_estimators & eta(learning rate)

#tscv = TimeSeriesSplit(n_splits=3)
smape_score = make_scorer(SMAPE, greater_is_better=False)

for i in tqdm(np.arange(1, 101, 1)):
    y = merged_train_df.loc[merged_train_df.building_number == i, 'power_consumption']
    x = merged_train_df.loc[merged_train_df.building_number == i].drop(['power_consumption'], axis=1)
    y_train, y_test, x_train, x_test = temporal_train_test_split(y = y, X = x, test_size = 168)

    pds = PredefinedSplit(np.append(-np.ones(len(x_train)-168), np.zeros(168)))
    gcv = GridSearchCV(estimator = XGBRegressor(seed=0),
                       param_grid=grid, scoring=smape_score, cv=pds, refit=True, verbose=True)
    
    # gcv = GridSearchCV(estimator = XGBRegressor(seed = 0, gpu_id = 1, tree_method = 'gpu_hist', predictor= 'gpu_predictor'),
    #                     param_grid = grid, scoring = smape_score, cv = pds, refit = True, verbose = True)

    gcv.fit(x_train, y_train)
    best = gcv.best_estimator_
    params = gcv.best_params_
    print(params)
    pred = best.predict(x_test)
    building = 'building'+str(i)
    print(building + '|| SMAPE : {}'.format(SMAPE(y_test, pred)))
    preds = np.append(preds, pred)
    df = pd.concat([df, pd.DataFrame(params, index=[0])], axis=0)
    df.to_csv('./hyperparameter_xgb2.csv', index=False) # save the tuned parameters


  0%|          | 0/100 [00:00<?, ?it/s]

Fitting 1 folds for each of 168 candidates, totalling 168 fits


  1%|          | 1/100 [00:15<24:51, 15.06s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 8, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building1|| SMAPE : 45.947052485590625
Fitting 1 folds for each of 168 candidates, totalling 168 fits


  2%|▏         | 2/100 [00:30<24:31, 15.01s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 8, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.8}
building2|| SMAPE : 43.66444289190987
Fitting 1 folds for each of 168 candidates, totalling 168 fits


  3%|▎         | 3/100 [00:44<24:10, 14.95s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 8, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.9}
building3|| SMAPE : 46.539450318596074
Fitting 1 folds for each of 168 candidates, totalling 168 fits


  4%|▍         | 4/100 [00:56<21:59, 13.74s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.9}
building4|| SMAPE : 45.05464216438577
Fitting 1 folds for each of 168 candidates, totalling 168 fits


  5%|▌         | 5/100 [01:12<22:44, 14.36s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building5|| SMAPE : 47.92517783184984
Fitting 1 folds for each of 168 candidates, totalling 168 fits


  6%|▌         | 6/100 [01:25<21:41, 13.84s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 6, 'n_estimators': 100, 'subsample': 0.9}
building6|| SMAPE : 47.391389844680106
Fitting 1 folds for each of 168 candidates, totalling 168 fits


  7%|▋         | 7/100 [01:40<22:02, 14.22s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building7|| SMAPE : 38.54832927297091
Fitting 1 folds for each of 168 candidates, totalling 168 fits


  8%|▊         | 8/100 [01:52<20:56, 13.65s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building8|| SMAPE : 44.70964734512367
Fitting 1 folds for each of 168 candidates, totalling 168 fits


  9%|▉         | 9/100 [02:06<21:05, 13.90s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.9}
building9|| SMAPE : 46.31988939973536
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 10%|█         | 10/100 [02:20<20:30, 13.68s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building10|| SMAPE : 35.57886365222239
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 11%|█         | 11/100 [02:32<19:31, 13.16s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.8}
building11|| SMAPE : 43.99683190058922
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 12%|█▏        | 12/100 [02:45<19:35, 13.36s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 6, 'n_estimators': 100, 'subsample': 0.9}
building12|| SMAPE : 46.06998211435019
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 13%|█▎        | 13/100 [02:58<19:08, 13.21s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 6, 'n_estimators': 100, 'subsample': 0.9}
building13|| SMAPE : 46.82056821965615
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 14%|█▍        | 14/100 [03:12<19:15, 13.44s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.8}
building14|| SMAPE : 37.76213594286436
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 15%|█▌        | 15/100 [03:25<18:45, 13.24s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building15|| SMAPE : 43.106140464949
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 16%|█▌        | 16/100 [03:39<18:47, 13.42s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building16|| SMAPE : 44.24857902922196
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 17%|█▋        | 17/100 [03:55<19:50, 14.34s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.8}
building17|| SMAPE : 49.34614314081953
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 18%|█▊        | 18/100 [04:11<19:56, 14.59s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.9}
building18|| SMAPE : 47.71633009751964
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 19%|█▉        | 19/100 [04:26<20:12, 14.97s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building19|| SMAPE : 48.77516222853498
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 20%|██        | 20/100 [04:42<20:17, 15.22s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.8}
building20|| SMAPE : 56.21168813728917
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 21%|██        | 21/100 [04:58<20:11, 15.33s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.8}
building21|| SMAPE : 50.53876606601669
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 22%|██▏       | 22/100 [05:11<19:12, 14.78s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 6, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.9}
building22|| SMAPE : 45.002055698162145
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 23%|██▎       | 23/100 [05:25<18:27, 14.38s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.9}
building23|| SMAPE : 44.54035774091185
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 24%|██▍       | 24/100 [05:40<18:27, 14.57s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building24|| SMAPE : 43.950420857080815
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 25%|██▌       | 25/100 [05:55<18:20, 14.67s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.9}
building25|| SMAPE : 45.30247559362088
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 26%|██▌       | 26/100 [06:10<18:24, 14.93s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building26|| SMAPE : 44.13974803358078
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 27%|██▋       | 27/100 [06:25<18:14, 15.00s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 8, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building27|| SMAPE : 44.28225084563625
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 28%|██▊       | 28/100 [06:40<18:00, 15.01s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.9}
building28|| SMAPE : 41.426360107428714
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 29%|██▉       | 29/100 [06:54<17:23, 14.70s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.8}
building29|| SMAPE : 43.134084967452324
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 30%|███       | 30/100 [07:11<17:42, 15.17s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 6, 'n_estimators': 100, 'subsample': 0.9}
building30|| SMAPE : 49.36318717329855
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 31%|███       | 31/100 [07:26<17:23, 15.12s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 7, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building31|| SMAPE : 43.59066259164819
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 32%|███▏      | 32/100 [07:33<14:38, 12.92s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building32|| SMAPE : 44.816629908172125
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 33%|███▎      | 33/100 [07:41<12:36, 11.29s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building33|| SMAPE : 45.555701450699765
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 34%|███▍      | 34/100 [07:49<11:26, 10.39s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}
building34|| SMAPE : 45.24398506204396
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 35%|███▌      | 35/100 [07:57<10:17,  9.51s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building35|| SMAPE : 45.23481999471565
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 36%|███▌      | 36/100 [08:04<09:18,  8.73s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building36|| SMAPE : 45.01395546996144
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 37%|███▋      | 37/100 [08:18<10:56, 10.42s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building37|| SMAPE : 47.85006199749607
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 38%|███▊      | 38/100 [08:34<12:38, 12.24s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 6, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.9}
building38|| SMAPE : 45.75824516159831
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 39%|███▉      | 39/100 [08:51<13:37, 13.40s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.9}
building39|| SMAPE : 48.20934481462406
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 40%|████      | 40/100 [09:07<14:12, 14.21s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 8, 'min_child_weight': 6, 'n_estimators': 100, 'subsample': 0.9}
building40|| SMAPE : 49.55298917513037
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 41%|████      | 41/100 [09:22<14:12, 14.45s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 100, 'subsample': 0.9}
building41|| SMAPE : 46.719661878268745
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 42%|████▏     | 42/100 [09:37<14:19, 14.81s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 6, 'n_estimators': 100, 'subsample': 0.8}
building42|| SMAPE : 47.46832914889027
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 43%|████▎     | 43/100 [09:52<14:08, 14.88s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 8, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.8}
building43|| SMAPE : 43.82450026293067
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 44%|████▍     | 44/100 [10:07<13:45, 14.74s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 6, 'n_estimators': 100, 'subsample': 0.9}
building44|| SMAPE : 43.74556600929688
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 45%|████▌     | 45/100 [10:20<13:12, 14.41s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.9}
building45|| SMAPE : 46.681650154691496
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 46%|████▌     | 46/100 [10:35<13:08, 14.60s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building46|| SMAPE : 46.77753016159643
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 47%|████▋     | 47/100 [10:49<12:42, 14.39s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building47|| SMAPE : 42.63742241584887
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 48%|████▊     | 48/100 [11:04<12:26, 14.35s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building48|| SMAPE : 45.57529391854206
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 49%|████▉     | 49/100 [11:17<11:56, 14.06s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.9}
building49|| SMAPE : 48.18749465431288
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 50%|█████     | 50/100 [11:30<11:33, 13.88s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building50|| SMAPE : 44.82843871189365
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 51%|█████     | 51/100 [11:45<11:28, 14.06s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 8, 'min_child_weight': 3, 'n_estimators': 100, 'subsample': 0.9}
building51|| SMAPE : 45.33086009295845
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 52%|█████▏    | 52/100 [12:00<11:27, 14.31s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 4, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building52|| SMAPE : 45.33930027020286
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 53%|█████▎    | 53/100 [12:18<12:00, 15.34s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 5, 'min_child_weight': 2, 'n_estimators': 100, 'subsample': 0.8}
building53|| SMAPE : 55.02618617352693
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 54%|█████▍    | 54/100 [12:35<12:12, 15.91s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building54|| SMAPE : 50.26047017702054
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 55%|█████▌    | 55/100 [12:45<10:44, 14.32s/it]

{'colsample_bytree': 0.9, 'eta': 0.01, 'max_depth': 6, 'min_child_weight': 7, 'n_estimators': 100, 'subsample': 0.9}
building55|| SMAPE : 45.02853595435476
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 56%|█████▌    | 56/100 [12:55<09:24, 12.83s/it]

{'colsample_bytree': 0.8, 'eta': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}
building56|| SMAPE : 44.6373313333808
Fitting 1 folds for each of 168 candidates, totalling 168 fits


 56%|█████▌    | 56/100 [13:01<10:13, 13.95s/it]


KeyboardInterrupt: 

## 1등코드 원본


In [38]:
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.metrics import make_scorer
df = pd.DataFrame(columns = ['n_estimators', 'eta', 'min_child_weight','max_depth', 'colsample_bytree', 'subsample'])
preds = np.array([])

grid = {'n_estimators' : [100], 'eta' : [0.01], 'min_child_weight' : np.arange(1, 8, 1),
        'max_depth' : np.arange(3,9,1) , 'colsample_bytree' :np.arange(0.8, 1.0, 0.1),
        'subsample' :np.arange(0.8, 1.0, 0.1)} # fix the n_estimators & eta(learning rate)

for i in tqdm(np.arange(1, 101, 1)):
    y = merged_train_df.loc[merged_train_df.building_number == i, 'power_consumption']
    x = merged_train_df.loc[merged_train_df.building_number == i, ].drop('power_consumption', axis=1)
    y_train, y_test, x_train, x_test = temporal_train_test_split(y = y, X = x, test_size = 168)

    smape_score = make_scorer(SMAPE, graeter_is_better=False)
    pds = PredefinedSplit(np.append(-np.ones(len(x)-168), np.zeros(168)))
    gcv = GridSearchCV(estimator = XGBRegressor(seed = 0, gpu_id = 1,
                                                 tree_method = 'gpu_hist', predictor= 'gpu_predictor'),
                         param_grid = grid, scoring = smape_score, cv = pds, refit = True, verbose = True)
    # gcv = GridSearchCV(estimator = XGBRegressor(seed = 0, gpu_id = 1,
    #                                             tree_method = 'gpu_hist', predictor= 'gpu_predictor'),
    #                    param_grid = grid, scoring = smape_score, cv = pds, refit = True, verbose = True)


    gcv.fit(x_train, y_train)
    best = gcv.best_estimator_
    params = gcv.best_params_
    print(params)
    pred = best.predict(x_test)
    building = 'building'+str(i)
    print(building + '|| SMAPE : {}'.format(SMAPE(y_test, pred)))
    preds = np.append(preds, pred)
    df = pd.concat([df, pd.DataFrame(params, index = [0])], axis = 0)
    df.to_csv('./hyperparameter_xgb.csv', index = False) # save the tuned parameters

  0%|          | 0/100 [00:00<?, ?it/s]


Fitting 1 folds for each of 168 candidates, totalling 168 fits


IndexError: indices are out-of-bounds

## 

## 학습

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

# 데이터셋을 로드합니다. 이 예에서는 pandas DataFrame을 사용하고 있습니다.
# df = pandas.read_csv('your_dataset.csv')

models = {}

for building_number in range(1, 101):  # 건물 번호는 1부터 100까지 있다고 가정합니다.
    # 건물 번호에 따라 데이터를 분할합니다.
    building_data = df[df['building_number'] == building_number]

    # 특징과 목표 변수를 분할합니다.
    X = building_data.drop('target', axis=1)
    y = building_data['target']

    # 훈련 세트와 테스트 세트를 분할합니다.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # XGBoost 모델을 훈련시킵니다.
    model = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)
    model.fit(X_train, y_train)

    # 모델을 딕셔너리에 저장합니다.
    models[building_number] = model


## 테스트

In [None]:
from sklearn.model_selection import train_test_split
merged_train_df_Y = merged_train_df['power_consumption']
X_train, X_test, y_train, y_test = train_test_split(merged_train_df.drop(columns=['power_consumption','sunshine','windspeed', 'solar_radiation','total_area','cooling_area']), merged_train_df_Y, test_size=0.2, random_state=42)

In [None]:
# 테스트 데이터셋을 로드합니다. 이 예에서는 pandas DataFrame을 사용하고 있습니다.
# test_df = pandas.read_csv('your_test_dataset.csv')

predictions = {}

for building_number in models.keys():
    # 건물 번호에 따라 데이터를 분할합니다.
    building_data = test_df[test_df['building_number'] == building_number]

    # 특징을 분할합니다.
    X_test = building_data.drop('target', axis=1)

    # 해당 건물 번호의 모델로 예측을 수행합니다.
    y_pred = models[building_number].predict(X_test)

    # 예측 결과를 딕셔너리에 저장합니다.
    predictions[building_number] = y_pred


In [None]:
model = XGBRegressor(n_estimators=1000, max_depth=5, eta=0.05, min_child_weight= 1, num_boost_around= 1, sub_sample= 0.5)
model.fit(merged_train_df.drop(columns=['power_consumption','sunshine','windspeed', 'solar_radiation','total_area','cooling_area']), merged_train_df_Y)

In [None]:
preds = model.predict(X_test)
#점수 산출
print("SMAPE: %0.2f" % (SMAPE(y_test,preds)))

In [None]:
import xgboost as xgb
import matplotlib.pyplot as plt

# 피처 중요도 확인
feature_importances = model.feature_importances_

# 피처 중요도를 출력합니다.(위에서 5개만)
for i, feat_importance in enumerate(feature_importances):
  if i <6:
    print(f"Feature {i}: {feat_importance}")

# 피처 중요도를 시각화합니다.
xgb.plot_importance(model)
plt.show()

## 제출

In [None]:
pred = model.predict(merged_test_df.drop(columns=['windspeed','total_area','cooling_area']))
submission = pd.read_csv('/content/drive/MyDrive/data/sample_submission.csv')
submission['answer'] = pred
submission.to_csv('/content/drive/MyDrive/data/baseline_submission_0723.csv', index=False)