## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
data_path = 'C:\Workspace/power_consumption_comp\data'
import warnings
warnings.filterwarnings(action='ignore') 

## Fixed Random-Seed
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Train Data Pre-Processing

In [2]:
## Load Data
train_df = pd.read_csv('C:\Workspace/power_consumption_comp\data/train.csv')
test_df = pd.read_csv('C:\Workspace/power_consumption_comp\data/test.csv')
building_info  = pd.read_csv('C:\Workspace/power_consumption_comp\data/building_info.csv')

#데이터를 building_info와 합치기
def preprocessing(df, merg=1):
    #결측값을 0으로 채웁니다 안바꿔도 됨
    df = df.fillna(0)
    #시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
    #람다식을 이용해 일시에서 달,일,시간을 잘라낸것
    df['month'] = df['일시'].apply(lambda x : int(x[4:6]))
    df['day'] = df['일시'].apply(lambda x : int(x[6:8]))
    df['time'] = df['일시'].apply(lambda x : int(x[9:11]))
    #df = df[df['month'].isin([8, 7])].copy()
    if merg == 1:
        # 'building_number'를 기준으로 두 데이터프레임 병합 및 전처리
        merged_df = pd.merge(df, building_info, on='건물번호')
        merged_df['태양광용량(kW)'] = merged_df['태양광용량(kW)'].replace('-', 0)
        merged_df['ESS저장용량(kWh)'] = merged_df['ESS저장용량(kWh)'].replace('-', 0)
        merged_df['PCS용량(kW)'] = merged_df['PCS용량(kW)'].replace('-', 0)

        merged_df['태양광용량(kW)'] = merged_df['태양광용량(kW)'].astype('float64')
        merged_df['ESS저장용량(kWh)'] = merged_df['ESS저장용량(kWh)'].astype('float64')
        merged_df['PCS용량(kW)'] = merged_df['PCS용량(kW)'].astype('float64')

        merged_df = merged_df.drop(columns=['num_date_time', '일시','건물번호','건물유형'])
        #merged_df.info()
        return merged_df
    df = df.drop(columns=['num_date_time', '일시','건물번호'])
    return df

In [3]:
# # 병합된 데이터프레임의 상관계수 계산
# # 데이터프레임 생성
# corr_df = pd.DataFrame({
#     '전소_연면적': [merged_df['전력소비량(kWh)'].corr(merged_df['연면적(m2)'])], 
#     '전소_냉방면적': [merged_df['전력소비량(kWh)'].corr(merged_df['냉방면적(m2)'])], 
#     '전소_태양광용량': [merged_df['전력소비량(kWh)'].corr(merged_df['태양광용량(kW)'])],
#     '전소_일조': [merged_df['전력소비량(kWh)'].corr(merged_df['일조(hr)'])],
#     '전소_일사': [merged_df['전력소비량(kWh)'].corr(merged_df['일사(MJ/m2)'])],
#     '전소_기온': [merged_df['전력소비량(kWh)'].corr(merged_df['기온(C)'])],
#     '전소_습도': [merged_df['전력소비량(kWh)'].corr(merged_df['습도(%)'])],
#     '전소_강수량': [merged_df['전력소비량(kWh)'].corr(merged_df['강수량(mm)'])],
#     })

# # 상관계수 행렬 출력
# corr_df

In [4]:
# #데이터와 label분리를 위한 함수
# label_list = ['전력소비량(kWh)','일조(hr)','일사(MJ/m2)']
# def make_labels(df, level = 0, best_parameter = False, scale = 576):
#     #하이퍼 파라미터 튜닝 할때만 작동함
#     if best_parameter:
#         df = df[0:scale][:]
#     #데이터와 label분리
    
#     if level == 1:
#         train_y = df['전력소비량(kWh)']
#         train_x = df.drop(columns=['전력소비량(kWh)'])
#         return train_x.sort_index(axis=1),train_y
#     if level == 2:
#         train_y = df['일조(hr)']
#         train_x = df.drop(columns=['전력소비량(kWh)','일조(hr)'])
#         return train_x.sort_index(axis=1),train_y
#     if level == 3:
#         train_y = df['일사(MJ/m2)']
#         train_x = df.drop(columns=['전력소비량(kWh)','일조(hr)','일사(MJ/m2)'])   
#         return train_x.sort_index(axis=1),train_y
    
#     test_x = df
#     return test_x.sort_index(axis=1)

# # 테스트 데이터 = 일조x 일사x 전소x -> 모델1 -> 일사pred, 일조x 전소x -> 모델2 -> 일조pred, 전소x -> 모델3 -> 전소pred
# # 모델1 = 일조x 일사x 전소x 데이터셋 = level=3
# # 모델2 = 일조x 전소x 데이터셋 = level=2
# # 모델3 = 전소x 데이터셋 = level=1
    

In [5]:
#데이터와 label분리를 위한 함수
label_list = ['전력소비량(kWh)','일조(hr)','일사(MJ/m2)']
def make_labels(df, level = 0, best_parameter = False, scale = 576):
    #하이퍼 파라미터 튜닝 할때만 작동함
    if best_parameter:
        df = df[0:scale][:]
    #데이터와 label분리
    z_threshold = 3 # Z-score 기준값 설정 (일반적으로 3을 사용)

    if level == 1:
        train_y = df['전력소비량(kWh)']
        train_x = df.drop(columns=['전력소비량(kWh)'])
        z_scores = np.abs((train_y - train_y.mean()) / train_y.std())
        train_x = train_x[z_scores < z_threshold]
        train_y = train_y[z_scores < z_threshold]
        return train_x.sort_index(axis=1),train_y
    if level == 2:
        train_y = df['일조(hr)']
        train_x = df.drop(columns=['전력소비량(kWh)','일조(hr)'])
        z_scores = np.abs((train_y - train_y.mean()) / train_y.std())
        train_x = train_x[z_scores < z_threshold]
        train_y = train_y[z_scores < z_threshold]
        return train_x.sort_index(axis=1),train_y
    if level == 3:
        train_y = df['일사(MJ/m2)']
        train_x = df.drop(columns=['전력소비량(kWh)','일조(hr)','일사(MJ/m2)'])
        z_scores = np.abs((train_y - train_y.mean()) / train_y.std())
        train_x = train_x[z_scores < z_threshold]
        train_y = train_y[z_scores < z_threshold]
        return train_x.sort_index(axis=1),train_y
    
    test_x = df
    return test_x.sort_index(axis=1)



In [6]:
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# SMAPE 계산 함수 정의
def SMAPE(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

#모델의 반복선언을 막기위함
def Model(parameters):
    return RandomForestRegressor(**parameters,n_jobs=-1)


In [7]:
#제출전 pretest 모델 병합이후 레이블 만들고 나누고 fit, smaple점수까지 출력
from sklearn.model_selection import train_test_split
class Train_Test_Submit:
    def __init__(self, df, best_paraemter, level = 0):
        super(Train_Test_Submit)
        self.df = df
        self.best_paraemter = best_paraemter
        self.train_x, self.train_y = make_labels(df,level)
        #데이터프레임,베스트 파라미터 가지고 데이터,라벨 분리
        
    def pretest(self):
        X_train, X_test, y_train, y_test=train_test_split(self.train_x, self.train_y, test_size=0.2, random_state=42)
        self.train_x = X_train
        self.train_y = y_train
        model = self.train(self)
        preds = model.predict(X_test)
        #점수 산출
        print("SMAPE: %0.2f" % (SMAPE(y_test,preds)))
        
        return model, preds, y_test
        
    def train(self):
        #모델 선언
        model = Model(self.best_paraemter)
        model.fit(self.train_x, self.train_y)
        return model
    
    def submit(self,pred):
        pred = pred
        submission = pd.read_csv('C:\Workspace/power_consumption_comp\data/sample_submission.csv')
        submission['answer'] = pred
        submission.to_csv('C:\Workspace/power_consumption_comp\data/baseline_submission_0722.csv', index=False)

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import os

def best_parameter(df,level):
    
    os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
    
    #모델에 맞게 변경하기
    parameters = {'max_depth':[8,16,25],'n_estimators':[100,200,400],
             'min_samples_split':[2,8,16], 'min_samples_leaf':[1,6,12]}
    #parameters = {'max_depth':[19],'n_estimators':[108],
    #         'min_samples_split':[16], 'min_samples_leaf':[8]}

    train_x,train_y = make_labels(df, level, best_parameter=True)
    smape_score = make_scorer(SMAPE, greater_is_better=False)
    
    grid_dclf = GridSearchCV(Model(parameters), param_grid=parameters,scoring=smape_score,n_jobs=-1, cv=5)
    grid_dclf.fit(train_x, train_y)
    
    mean_scores = np.mean(grid_dclf.cv_results_['mean_test_score'])
    std_scores = np.std(grid_dclf.cv_results_['mean_test_score'])
   
    
    print('GridSearchCV 최적 하이퍼 파라미터 :',grid_dclf.best_params_)
    print("GridSearchCV 베스트 SMAPE 점수:%0.4f" %((-1)*grid_dclf.best_score_))
    print("GridSearchCV SMAPE 평균:%0.4f" %((-1)*mean_scores))
    print("GridSearchCV SMAPE 표준편차:%0.4f" %((std_scores)))
    
    return grid_dclf.best_params_

merged_df = preprocessing(train_df)
bp_final = best_parameter(merged_df, level=1)

# 테스트 데이터 = 일조x 일사x 전소x -> 모델1 -> 일사pred, 일조x 전소x -> 모델2 -> 일조pred, 전소x -> 모델3 -> 전소pred
# 모델1 = 일조x 일사x 전소x 데이터셋 = level=3
# 모델2 = 일조x 전소x 데이터셋 = level=2
# 모델3 = 전소x 데이터셋 = level=1

GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 16, 'min_samples_leaf': 6, 'min_samples_split': 8, 'n_estimators': 100}
GridSearchCV 베스트 SMAPE 점수:10.6647
GridSearchCV SMAPE 평균:11.1146
GridSearchCV SMAPE 표준편차:0.2917


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import os

def best_parameter_ilzo(df,level):
    
    os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
    
    #모델에 맞게 변경하기
    #parameters = {'max_depth':[8,16,25],'n_estimators':[100,200,400],
    #         'min_samples_split':[2,8,16], 'min_samples_leaf':[1,6,12]}
    parameters = {'max_depth':[5,6,7],'n_estimators':[100,130],
             'min_samples_split':[4,6,8], 'min_samples_leaf':[4,6,8]}

    train_x,train_y = make_labels(df, True, level)
    smape_score = make_scorer(SMAPE, greater_is_better=False)
    
    grid_dclf = GridSearchCV(Model(parameters), param_grid=parameters,scoring=smape_score,n_jobs=-1, cv=5)
    grid_dclf.fit(train_x, train_y)
    
    mean_scores = np.mean(grid_dclf.cv_results_['mean_test_score'])
    std_scores = np.std(grid_dclf.cv_results_['mean_test_score'])
   
    
    print('GridSearchCV 최적 하이퍼 파라미터 :',grid_dclf.best_params_)
    print("GridSearchCV 베스트 SMAPE 점수:%0.4f" %((-1)*grid_dclf.best_score_))
    print("GridSearchCV SMAPE 평균:%0.4f" %((-1)*mean_scores))
    print("GridSearchCV SMAPE 표준편차:%0.4f" %((std_scores)))
    
    return grid_dclf.best_params_

ilil_df = preprocessing(train_df, merg=0)
bp_ilzo = best_parameter_ilzo(ilil_df, level=2)

GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 4, 'n_estimators': 100}
GridSearchCV 베스트 SMAPE 점수:10.6259
GridSearchCV SMAPE 평균:10.7872
GridSearchCV SMAPE 표준편차:0.1154


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import os

def best_parameter_ilsa(df,level):
    
    os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
    
    #모델에 맞게 변경하기
    #parameters = {'max_depth':[8,16,25],'n_estimators':[100,200,400],
    #         'min_samples_split':[2,8,16], 'min_samples_leaf':[1,6,12]}
    parameters = {'max_depth':[5,6,7],'n_estimators':[100,130],
             'min_samples_split':[4,6,8], 'min_samples_leaf':[4,6,8]}

    train_x,train_y = make_labels(df, True, level)
    smape_score = make_scorer(SMAPE, greater_is_better=False)
    
    grid_dclf = GridSearchCV(Model(parameters), param_grid=parameters,scoring=smape_score,n_jobs=-1, cv=5)
    grid_dclf.fit(train_x, train_y)
    
    mean_scores = np.mean(grid_dclf.cv_results_['mean_test_score'])
    std_scores = np.std(grid_dclf.cv_results_['mean_test_score'])
   
    
    print('GridSearchCV 최적 하이퍼 파라미터 :',grid_dclf.best_params_)
    print("GridSearchCV 베스트 SMAPE 점수:%0.4f" %((-1)*grid_dclf.best_score_))
    print("GridSearchCV SMAPE 평균:%0.4f" %((-1)*mean_scores))
    print("GridSearchCV SMAPE 표준편차:%0.4f" %((std_scores)))
    
    return grid_dclf.best_params_

ilil_df = preprocessing(train_df, merg=0)
bp_ilsa = best_parameter_ilsa(ilil_df, level=3)

GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 8, 'n_estimators': 100}
GridSearchCV 베스트 SMAPE 점수:10.6037
GridSearchCV SMAPE 평균:10.8029
GridSearchCV SMAPE 표준편차:0.1403


## Regression Model Fit

In [11]:
#1단계 : 테스트 데이터에 일사 예측 후 추가
il_test_df = make_labels(preprocessing(test_df, merg=0), level=0)
merged_test_df = make_labels(preprocessing(test_df), level=0)

train_test_model_index2 = Train_Test_Submit(ilil_df,bp_ilsa,level=3).train()

pred = train_test_model_index2.predict(il_test_df)

il_test_df['일사(MJ/m2)'] = pred
merged_test_df['일사(MJ/m2)'] = pred

il_test_df = il_test_df.sort_index(axis=1)
merged_test_df = merged_test_df.sort_index(axis=1)

In [12]:
#2단계 : 테스트 데이터에 일조 예측 후 추가
train_test_model_index1 = Train_Test_Submit(ilil_df,bp_ilzo,level=2).train()

pred = train_test_model_index1.predict(il_test_df)

merged_test_df['일조(hr)'] = pred

merged_test_df = merged_test_df.sort_index(axis=1)

In [13]:
# #3단계 : 테스트 데이터에 전력소비량 예측 후 추가 
# train_test_model_index0 = Train_Test_Submit(merged_df,bp_final,level=1).train()

# pred = train_test_model_index0.predict(merged_test_df)

test data에 columns추가

In [14]:
train_test_model_index0 = Train_Test_Submit(merged_df,bp_final,level=1).train()

pred = train_test_model_index0.predict(merged_test_df)

submission = pd.read_csv('C:\Workspace/power_consumption_comp\data/sample_submission.csv')

submission['answer'] = pred

submission.to_csv('C:\Workspace/power_consumption_comp\data/baseline_submission_done.csv', index=False)