## Import

In [19]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
data_path = 'C:\Workspace/power_consumption_comp\data'
import warnings
warnings.filterwarnings(action='ignore') 

## Fixed Random-Seed

In [20]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Load Data

In [21]:
train_df = pd.read_csv('C:\Workspace/power_consumption_comp\data/train.csv')
test_df = pd.read_csv('C:\Workspace/power_consumption_comp\data/test.csv')

## Train Data Pre-Processing

In [22]:
#결측값을 0으로 채웁니다 안바꿔도 됨
train_df = train_df.fillna(0)

In [23]:
#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
train_df['month'] = train_df['일시'].apply(lambda x : int(x[4:6]))
train_df['day'] = train_df['일시'].apply(lambda x : int(x[6:8]))
train_df['time'] = train_df['일시'].apply(lambda x : int(x[9:11]))

In [24]:
train_x = train_df.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
train_y = train_df['전력소비량(kWh)']

## Regression Model Fit

In [25]:
model = RandomForestRegressor(n_jobs=-1)
#model = RandomForestRegressor()


In [26]:
# Split for Tuning
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(train_x, train_y, \
                                                  test_size=0.01, random_state=42)

XX_train, XX_test, yy_train, yy_test=train_test_split(X_test, y_test, \
                                                  test_size=0.2, random_state=42)


In [10]:
from sklearn.model_selection import train_test_split
#train_df8 = train_df[train_df['month'] == 8].copy()
train_df8 = train_df[train_df['month'].isin([8, 7, 6])].copy()
train_x8 = train_df8.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
train_y8 = train_df8['전력소비량(kWh)']
XX_train, XX_test, yy_train, yy_test=train_test_split(train_x8, train_y8, \
                                                  test_size=0.2, random_state=42)

In [29]:
# Tuning
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

parameters = {'max_depth': [25], 'n_estimators': [426],
              'min_samples_split': [2], 'min_samples_leaf': [1]}
#parameters = {'max_depth': [16], 'n_estimators': [400],
#               'min_samples_split': [2], 'min_samples_leaf': [1]}

grid_dclf = GridSearchCV(model, param_grid=parameters, n_jobs=-1, cv=5)
#grid_dclf = GridSearchCV(model, param_grid=parameters, cv=5)
grid_dclf.fit(XX_train, yy_train)

print('GridSearchCV 최적 하이퍼 파라미터:', grid_dclf.best_params_)
print('GridSearchCV 최고 R 제곱 값: {0:.4f}'.format(grid_dclf.best_score_))
best_dclf = grid_dclf.best_estimator_

# GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행. 
dpredictions = best_dclf.predict(XX_test)    
r2 = r2_score(yy_test, dpredictions)
print('테스트 세트에서의 DecisionTreeRegressor R 제곱 값: {0:.4f}'.format(r2))


GridSearchCV 최적 하이퍼 파라미터: {'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 426}
GridSearchCV 최고 R 제곱 값: 0.8068
테스트 세트에서의 DecisionTreeRegressor R 제곱 값: 0.8916


In [30]:
# GridSearchCV의 최적 하이퍼 파라미터로 학습된 Estimator로 예측 및 평가 수행. 
dpredictions = best_dclf.predict(X_test)
#yy_test_array = yy_test.values  # DataFrame을 1차원 배열로 변환
r2 = r2_score(y_test, dpredictions)
print('더 큰 테스트 세트에서의 DecisionTreeRegressor R 제곱 값 : {0:.4f}'.format(r2))


더 큰 테스트 세트에서의 DecisionTreeRegressor R 제곱 값 : 0.9629


In [14]:
model1 = RandomForestRegressor(n_estimators=426, max_depth=25, min_samples_split=2, min_samples_leaf=1, n_jobs=-1)
model1.fit(XX_train, yy_train)
model2 = RandomForestRegressor(n_estimators=425, max_depth=25, min_samples_split=2, min_samples_leaf=1, n_jobs=-1)
model2.fit(XX_train, yy_train)
model3 = RandomForestRegressor(n_estimators=426, max_depth=26, min_samples_split=2, min_samples_leaf=1, n_jobs=-1)
model3.fit(XX_train, yy_train)
model4 = RandomForestRegressor(n_estimators=425, max_depth=26, min_samples_split=2, min_samples_leaf=1, n_jobs=-1)
model4.fit(XX_train, yy_train)
model5 = RandomForestRegressor(n_estimators=400, max_depth=16, min_samples_split=2, min_samples_leaf=1, n_jobs=-1)
model5.fit(XX_train, yy_train)

predictions_model1 = model1.predict(X_test)
predictions_model2 = model2.predict(X_test)
predictions_model3 = model3.predict(X_test)
predictions_model4 = model4.predict(X_test)
predictions_model5 = model5.predict(X_test)

# 예측 결과를 평균하여 앙상블 예측값을 계산합니다.
ensemble_predictions = (predictions_model1 + predictions_model2 +
                        predictions_model3 + predictions_model4 +
                        predictions_model5) / 5

# 앙상블 예측값의 성능을 평가합니다.
r2_ensemble = r2_score(y_test, ensemble_predictions)
print('앙상블 모델의 R 제곱 값: {0:.4f}'.format(r2_ensemble))


앙상블 모델의 R 제곱 값: 0.9643


In [16]:
from sklearn.ensemble import VotingRegressor

# VotingRegressor를 생성하고 다섯 개의 모델을 넣어줍니다.
voting_regressor = VotingRegressor(estimators=[('model1', model1), ('model2', model2), ('model3', model3),
                                               ('model4', model4), ('model5', model5)])

# VotingRegressor를 학습시킵니다.
voting_regressor.fit(XX_train, yy_train)

# 예측 결과를 얻습니다.
voting_predictions = voting_regressor.predict(X_test)

# 앙상블 예측값의 성능을 평가합니다.
r2_voting = r2_score(y_test, voting_predictions)
print('보팅 앙상블의 R 제곱 값: {0:.4f}'.format(r2_voting))


보팅 앙상블의 R 제곱 값: 0.9641


In [17]:
from sklearn.ensemble import StackingRegressor

# StackingRegressor를 생성하고 다섯 개의 모델을 넣어줍니다.
stacking_regressor = StackingRegressor(estimators=[('model1', model1), ('model2', model2), ('model3', model3),
                                                   ('model4', model4), ('model5', model5)],
                                       final_estimator=RandomForestRegressor())

# StackingRegressor를 학습시킵니다.
stacking_regressor.fit(XX_train, yy_train)

# 예측 결과를 얻습니다.
stacking_predictions = stacking_regressor.predict(X_test)

# 앙상블 예측값의 성능을 평가합니다.
r2_stacking = r2_score(y_test, stacking_predictions)
print('스태킹 앙상블의 R 제곱 값: {0:.4f}'.format(r2_stacking))

스태킹 앙상블의 R 제곱 값: 0.9410


In [31]:
model = RandomForestRegressor(n_estimators=426, max_depth=25, min_samples_split=2, min_samples_leaf=1, n_jobs=-1)
model.fit(train_x, train_y)
#model.fit(XX_train, yy_train)

## Test Data Pre-Processing

In [32]:
test_df['month'] = test_df['일시'].apply(lambda x : int(x[4:6]))
test_df['day'] = test_df['일시'].apply(lambda x : int(x[6:8]))
test_df['time'] = test_df['일시'].apply(lambda x : int(x[9:11]))

In [33]:
test_x = test_df.drop(columns=['num_date_time', '일시'])

## Inference

In [36]:
preds = model.predict(test_x)

## Submission

In [37]:
submission = pd.read_csv('C:\Workspace/power_consumption_comp\data/sample_submission.csv')
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [38]:
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2071.983099
1,1_20220825 01,2079.595493
2,1_20220825 02,1998.223099
3,1_20220825 03,1958.334648
4,1_20220825 04,1948.348169
...,...,...
16795,100_20220831 19,879.469859
16796,100_20220831 20,785.738592
16797,100_20220831 21,744.081690
16798,100_20220831 22,642.161127


In [39]:
submission.to_csv('C:\Workspace/power_consumption_comp\data/baseline_submission_426.csv', index=False)