## Import

In [1]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
data_path = 'C:\Workspace/power_consumption_comp\data'
import warnings
warnings.filterwarnings(action='ignore') 

## Fixed Random-Seed
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Train Data Pre-Processing

In [16]:
## Load Data
train_df = pd.read_csv('C:\Workspace/power_consumption_comp\data/train.csv')
test_df = pd.read_csv('C:\Workspace/power_consumption_comp\data/test.csv')
building_info  = pd.read_csv('C:\Workspace/power_consumption_comp\data/building_info.csv')

#데이터를 building_info와 합치기
def preprocessing(df):
    #결측값을 0으로 채웁니다 안바꿔도 됨
    df = df.fillna(0)
    #시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
    #람다식을 이용해 일시에서 달,일,시간을 잘라낸것
    df['month'] = df['일시'].apply(lambda x : int(x[4:6]))
    df['day'] = df['일시'].apply(lambda x : int(x[6:8]))
    df['time'] = df['일시'].apply(lambda x : int(x[9:11]))

    # 'building_number'를 기준으로 두 데이터프레임 병합 및 전처리
    merged_df = pd.merge(df, building_info, on='건물번호')
    merged_df['태양광용량(kW)'] = merged_df['태양광용량(kW)'].replace('-', 0)
    merged_df['ESS저장용량(kWh)'] = merged_df['ESS저장용량(kWh)'].replace('-', 0)
    merged_df['PCS용량(kW)'] = merged_df['PCS용량(kW)'].replace('-', 0)

    merged_df['태양광용량(kW)'] = merged_df['태양광용량(kW)'].astype('float64')
    merged_df['ESS저장용량(kWh)'] = merged_df['ESS저장용량(kWh)'].astype('float64')
    merged_df['PCS용량(kW)'] = merged_df['PCS용량(kW)'].astype('float64')

    merged_df = merged_df.drop(columns=['num_date_time', '일시','건물번호','건물유형'])
    merged_df.info()
    return merged_df

In [3]:
# 병합된 데이터프레임의 상관계수 계산
# 데이터프레임 생성
corr_df = pd.DataFrame({
    '전소_연면적': [merged_df['전력소비량(kWh)'].corr(merged_df['연면적(m2)'])], 
    '전소_냉방면적': [merged_df['전력소비량(kWh)'].corr(merged_df['냉방면적(m2)'])], 
    '전소_태양광용량': [merged_df['전력소비량(kWh)'].corr(merged_df['태양광용량(kW)'])],
    '전소_일조': [merged_df['전력소비량(kWh)'].corr(merged_df['일조(hr)'])],
    '전소_일사': [merged_df['전력소비량(kWh)'].corr(merged_df['일사(MJ/m2)'])],
    '전소_기온': [merged_df['전력소비량(kWh)'].corr(merged_df['기온(C)'])],
    '전소_습도': [merged_df['전력소비량(kWh)'].corr(merged_df['습도(%)'])],
    '전소_강수량': [merged_df['전력소비량(kWh)'].corr(merged_df['강수량(mm)'])],
    })

# 상관계수 행렬 출력
corr_df

NameError: name 'merged_df' is not defined

In [12]:
# #데이터와 label분리를 위한 함수
# label_list = ['전력소비량(kWh)','일조(hr)','일사(MJ/m2)']
# def make_labels(df,label_name = label_list[0], best_parameter = False,scale = 576):
#     #하이퍼 파라미터 튜닝 할때만 작동함
#     if best_parameter:
#         df = df[0:scale][:]
#     #데이터와 label분리    
#     if '전력소비량(kWh)' in df.columns:
#         train_y = df['전력소비량(kWh)']
#         if label_name != '전력소비량(kWh)': 
#             train_x = df.drop(columns=[label_name,'전력소비량(kWh)'])
#         else: 
#             train_x = df.drop(columns=['전력소비량(kWh)'])
#         return train_x,train_y
#     else:
#         test_x = df
#         return test_x
    

In [18]:
#데이터와 label분리를 위한 함수
label_list = ['전력소비량(kWh)','일조(hr)','일사(MJ/m2)']
def make_labels(df, level, best_parameter = False, scale = 576):
    #하이퍼 파라미터 튜닝 할때만 작동함
    if best_parameter:
        df = df[0:scale][:]
    #데이터와 label분리
    for i in range(level):    
        if label_list[i] in df.columns:
            train_y = df[label_list[i]]
            train_x = df.drop(columns=[label_list[i]])
            return train_x,train_y
        else:
            test_x = df
            return test_x
        
# 테스트 데이터 = 일조x 일사x 전소x -> 모델1 -> 일사pred, 일조x 전소x -> 모델2 -> 일조pred, 전소x -> 모델3 -> 전소pred
# 모델1 = 일조x 일사x 전소x 데이터셋 = level=3
# 모델2 = 일조x 전소x 데이터셋 = level=2
# 모델3 = 전소x 데이터셋 = level=1

In [17]:
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# SMAPE 계산 함수 정의
def SMAPE(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

#모델의 반복선언을 막기위함
def Model(parameters):
    return RandomForestRegressor(**parameters,n_jobs=-1)


In [19]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import os

def best_parameter(df,level):
    
    os.environ['JOBLIB_TEMP_FOLDER'] = '/tmp'
    
    #모델에 맞게 변경하기
    #parameters = {'max_depth':[8,16,25],'n_estimators':[100,200,400],
    #         'min_samples_split':[2,8,16], 'min_samples_leaf':[1,6,12]}
    parameters = {'max_depth':[25],'n_estimators':[200],
             'min_samples_split':[2], 'min_samples_leaf':[1]}

    train_x,train_y = make_labels(df, True, level)
    smape_score = make_scorer(SMAPE, greater_is_better=False)
    
    grid_dclf = GridSearchCV(Model(parameters), param_grid=parameters,scoring=smape_score,n_jobs=-1, cv=5)
    grid_dclf.fit(train_x, train_y)
    
    mean_scores = np.mean(grid_dclf.cv_results_['mean_test_score'])
    std_scores = np.mean(grid_dclf.cv_results_['mean_test_score'])
   
    
    print('GridSearchCV 최적 하이퍼 파라미터 :',grid_dclf.best_params_)
    print("GridSearchCV 베스트 SMAPE 점수:%0.4f" %((-1)*grid_dclf.best_score_))
    print("GridSearchCV SMAPE 평균:%0.4f" %((-1)*mean_scores))
    print("GridSearchCV SMAPE 표준편차:%0.4f" %((std_scores)))
    
    return grid_dclf.best_params_

merged_df = preprocessing(train_df)
bp_final = best_parameter(merged_df, level=1)
bp_ilzo = best_parameter(merged_df, level=2)
bp_ilsa = best_parameter(merged_df, level=3)

# 테스트 데이터 = 일조x 일사x 전소x -> 모델1 -> 일사pred, 일조x 전소x -> 모델2 -> 일조pred, 전소x -> 모델3 -> 전소pred
# 모델1 = 일조x 일사x 전소x 데이터셋 = level=3
# 모델2 = 일조x 전소x 데이터셋 = level=2
# 모델3 = 전소x 데이터셋 = level=1

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204000 entries, 0 to 203999
Data columns (total 15 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   기온(C)         204000 non-null  float64
 1   강수량(mm)       204000 non-null  float64
 2   풍속(m/s)       204000 non-null  float64
 3   습도(%)         204000 non-null  float64
 4   일조(hr)        204000 non-null  float64
 5   일사(MJ/m2)     204000 non-null  float64
 6   전력소비량(kWh)    204000 non-null  float64
 7   month         204000 non-null  int64  
 8   day           204000 non-null  int64  
 9   time          204000 non-null  int64  
 10  연면적(m2)       204000 non-null  float64
 11  냉방면적(m2)      204000 non-null  float64
 12  태양광용량(kW)     204000 non-null  float64
 13  ESS저장용량(kWh)  204000 non-null  float64
 14  PCS용량(kW)     204000 non-null  float64
dtypes: float64(12), int64(3)
memory usage: 23.3 MB
GridSearchCV 최적 하이퍼 파라미터 : {'max_depth': 25, 'min_samples_leaf': 1, 'min_samples_split': 

## Regression Model Fit

In [8]:
#여기서 부터 아직 안함
#제출전 pretest 모델 병합이후 레이블 만들고 나누고 fit, smaple점수까지 출력
from sklearn.model_selection import train_test_split
class Train_Test_Submit:
    def __init__(self, df, best_paraemter, label_name = label_list[0]):
        super(Train_Test_Submit)
        self.df = df
        self.best_paraemter = best_paraemter
        self.label_name = label_name
        self.train_x, self.train_y = make_labels(df,label_name)
        
    def pretest(self):
        X_train, X_test, y_train, y_test=train_test_split(self.train_x, self.train_y, test_size=0.2, random_state=42)
        model = self.train(X_train,y_train)
        preds = model.predict(X_test)
        #점수 산출
        print("SMAPE: %0.2f" % (SMAPE(y_test,preds)))
        
        return model
        
    
    def train(self,X,Y):
        self.X = X
        self.Y = Y
        #모델 선언
        model = Model(self.best_paraemter)
        model.fit(X, Y)
        return model
    
    def make_test_col(self, model):
        self.model = model
        
    def submit():
        pred = []
        return pred

In [9]:
train_test_model_index1 = Train_Test_Submit(merged_df,bp, label_list[1]).pretest()
train_test_model_index2 = Train_Test_Submit(merged_df,bp, label_list[2]).pretest()

KeyboardInterrupt: 

test data에 columns추가

In [None]:
merged_test_df = make_labels(preprocessing(test_df), label_name= label_list[1])
test_model = Train_Test_Submit(merged_test_df,bp, label_list[1]).make_test_col(train_test_model)

In [None]:
submit_model = Train_Test_Submit(merged_df,bp, label_list[1]).submit()

submission = pd.read_csv('C:\Workspace/power_consumption_comp\data/sample_submission.csv')

submission['answer'] = preds

submission.to_csv('C:\Workspace/power_consumption_comp\data/baseline_submission.csv', index=False)