<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#현재-상태" data-toc-modified-id="현재-상태-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>현재 상태</a></span></li></ul></div>

### 현재 상태

In [113]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import pymysql
import warnings
warnings.filterwarnings(action='ignore')

def get_NMAE(true, pred):
    """Return the mean of ``|true - pred| / true``, rounded to 3 decimals.

    Args:
        true: Ground-truth values (array-like supporting element-wise math).
        pred: Predicted values, broadcastable against ``true``.

    Returns:
        The rounded mean relative absolute error.
    """
    relative_error = np.abs(true - pred) / true
    score = np.mean(relative_error)
    return round(score, 3)

def load_datasets(path) -> pd.DataFrame:
    """Load the train and test CSV files under *path* and stack them.

    The test file stores date, hour and type in one space-separated column
    ('일자|시간|구분'); it is expanded into the '연월일', '시간' and '구분'
    columns before the two frames are concatenated.

    Args:
        path: Directory containing both CSV files.

    Returns:
        Train rows followed by test rows, with '연월일' parsed as datetime.
        The original row labels of each file are kept (no index reset).
    """
    train_file = os.path.join(path, '한국가스공사_시간별 공급량_20181231.csv')
    test_file = os.path.join(path, 'test.csv')
    train = pd.read_csv(train_file, encoding='cp949')
    test = pd.read_csv(test_file)

    # Expand the combined key once and reuse the pieces
    key_col = '일자|시간|구분'
    parts = test[key_col].str.split(' ', expand=True)
    test['연월일'] = parts[0]
    test['시간'] = parts[1].astype(int)
    test['구분'] = parts[2]
    test = test.drop(columns=key_col)

    data = pd.concat([train, test], axis=0)
    data['연월일'] = pd.to_datetime(data['연월일'])
    return data

def load_weather() -> pd.DataFrame:
    """Read the ``weather`` table from the local MySQL database.

    Empty-string values of ``sum_rain`` and ``ddmefs`` are turned into NULL
    by the query (``NULLIF``), and every missing value in the result is then
    replaced with 0.

    Returns:
        The weather rows ordered by ``datetime``, with NaN filled by 0.
    """
    # NOTE(review): the connection credentials are hard-coded in source;
    # consider moving them to configuration that is not committed.
    db = pymysql.connect(host='localhost', port=3306, user='younghun', password='watson1259',
                         db='dacon_gas_weather_db', charset='utf8')
    sql = (
        "SELECT datetime, avg_temp, min_temp, max_temp, "
        "NULLIF(sum_rain, '') AS sum_rain, avg_wind, avg_humid, "
        "sum_gsr, NULLIF(ddmefs, '') AS ddmefs, avg_ts "
        "FROM weather ORDER BY datetime"
    )
    try:
        weather = pd.read_sql(sql, db)
    finally:
        # Release the connection even when the query fails
        db.close()

    return weather.fillna(0.)
    
def merge_gas_weather(path):
    """Left-join the weather table onto the gas supply data.

    Args:
        path: Directory passed through to ``load_datasets``.

    Returns:
        The gas frame with the weather columns attached; rows are matched on
        '연월일' == ``datetime`` and the ``datetime`` join key is dropped.
    """
    merged = load_datasets(path).merge(
        load_weather(), how='left', left_on='연월일', right_on='datetime'
    )
    return merged.drop(columns='datetime')

def make_datetime_vars(data) -> pd.DataFrame:
    """Add calendar features derived from the '연월일' column (in place).

    Adds 'year', 'month', 'day', 'dayofweek' and 'weekend_yn'.

    Args:
        data: Frame whose '연월일' column has a datetime dtype.

    Returns:
        The same frame object, with the new columns added.
    """
    dates = data['연월일'].dt
    data['year'] = dates.year
    data['month'] = dates.month
    data['day'] = dates.day
    #data['week_no'] = data['연월일'].dt.strftime("%V").astype(int)
    data['dayofweek'] = dates.dayofweek
    # dt.dayofweek is an integer (Monday=0 ... Sunday=6), so the weekend is
    # 5/6; comparing against day-name strings never matched anything.
    data['weekend_yn'] = np.where(data['dayofweek'].isin([5, 6]), 1, 0)
    #data['dayofyear'] = data['연월일'].dt.dayofyear

    return data


def change_dates(row):
    """Rewrite hour 24 of a day as hour 0 of the following day.

    Args:
        row: A record with '연월일' (timestamp) and '시간' (hour) entries.

    Returns:
        The same row object; it is modified only when '시간' equals 24.
    """
    if row['시간'] != 24:
        return row
    row['연월일'] = row['연월일'] + pd.DateOffset(days=1)
    row['시간'] = 0
    return row


def change_pandas_date_format(df):
    """Convert the date/hour columns into pandas timestamps.

    Each row is first passed through ``change_dates`` (hour 24 -> hour 0 of
    the next day). '시간' then becomes a two-character string, '연월일' a
    string, and two columns are added: 'datetime' (date + hour) and
    'prev_datetime' (the same moment one year earlier).

    Returns:
        A new frame with the converted and added columns.
    """
    df = df.apply(change_dates, axis=1)

    # Left-pad single-digit hours so they parse with '%H'
    df['시간'] = df['시간'].apply(lambda hour: str(hour) if hour >= 10 else f'0{hour}')
    df['연월일'] = df['연월일'].astype(str)

    stamp_text = df['연월일'] + ' ' + df['시간']
    df['datetime'] = pd.to_datetime(stamp_text, format='%Y-%m-%d %H')
    df['prev_datetime'] = df['datetime'] - pd.DateOffset(years=1)

    return df


def _mean_supply_lookup(frame, keys):
    """Return ``{group key: mean '공급량'}`` for *frame* grouped by *keys*."""
    means = frame.groupby(keys)['공급량'].mean()
    return dict(zip(means.index, means.to_numpy()))


def _tuple_keys(index, *arrays):
    """Zip *arrays* row-wise into a Series of tuples labelled by *index*."""
    return pd.Series(list(zip(*arrays)), index=index)


def make_autocorr_vars(df, gubun='A', include_weather=False):
    """Build previous-year supply features for one customer type.

    Args:
        df: Frame with at least '연월일', '시간', '구분', 'datetime',
            'prev_datetime' and '공급량' columns. 'datetime' and
            'prev_datetime' must have a datetime dtype. Rows are expected to
            be in chronological order, because the "previous year" value is
            taken with ``groupby(...).shift()``.
        gubun: Value of '구분' to select.
        include_weather: When True, also add ``prev_year_<col>`` for each
            weather column (the value from the previous row sharing the same
            month/day/hour). This requires the weather columns to be present.
            The default leaves them out, matching the column set this
            function has always returned.

    Returns:
        A new frame holding only the rows of *gubun* with the key columns,
        the previous-year features, the optional weather features, and
        '공급량' as the last column. *df* itself is not modified.
    """
    # Work on an explicit copy so the new columns are never written onto a
    # slice of the caller's frame (avoids SettingWithCopyWarning).
    sub_df = df[df['구분'] == gubun].copy()

    when = sub_df['datetime'].dt
    year, month, day, hour = when.year, when.month, when.day, when.hour
    prev = sub_df['prev_datetime'].dt
    prev_year = prev.year
    prev_year_arr = prev_year.to_numpy()
    prev_month_arr = prev.month.to_numpy()

    # Supply of the previous row sharing the same month/day/hour
    sub_df['prev_year_공급량'] = sub_df.groupby([month, day, hour])['공급량'].shift()

    # Mean supply of the previous year
    year_mean = _mean_supply_lookup(sub_df, [year])
    sub_df['prev_year_avg_공급량'] = prev_year.map(year_mean)

    # Mean supply of the same month in the previous year
    month_mean = _mean_supply_lookup(sub_df, [year, month])
    month_keys = _tuple_keys(sub_df.index, prev_year_arr, prev_month_arr)
    sub_df['prev_month_avg_공급량'] = month_keys.map(month_mean)

    # Mean supply of the same month and day in the previous year
    month_day_mean = _mean_supply_lookup(sub_df, [year, month, day])
    month_day_keys = _tuple_keys(sub_df.index, prev_year_arr, prev_month_arr,
                                 prev.day.to_numpy())
    sub_df['prev_month_day_avg_공급량'] = month_day_keys.map(month_day_mean)

    # Mean supply of the same month and hour in the previous year
    month_hour_mean = _mean_supply_lookup(sub_df, [year, month, hour])
    month_hour_keys = _tuple_keys(sub_df.index, prev_year_arr, prev_month_arr,
                                  prev.hour.to_numpy())
    sub_df['prev_month_hour_avg_공급량'] = month_hour_keys.map(month_hour_mean)

    # Relative deviation of last year's value from each of the averages
    last_year = sub_df['prev_year_공급량']
    for ratio_col, avg_col in [
        ('prev_year_ratio', 'prev_year_avg_공급량'),
        ('prev_month_ratio', 'prev_month_avg_공급량'),
        ('prev_month_day_ratio', 'prev_month_day_avg_공급량'),
        ('prev_month_hour_ratio', 'prev_month_hour_avg_공급량'),
    ]:
        sub_df[ratio_col] = (last_year - sub_df[avg_col]) / sub_df[avg_col]

    # Optional: weather values of the previous row sharing month/day/hour
    prev_weather_cols = []
    if include_weather:
        weather_cols = ['avg_temp', 'min_temp', 'max_temp', 'sum_rain', 'avg_wind',
                        'avg_humid', 'sum_gsr', 'ddmefs', 'avg_ts']
        for col in weather_cols:
            shifted = sub_df.groupby([month, day, hour])[col].shift()
            sub_df[f'prev_year_{col}'] = pd.to_numeric(shifted, downcast="float")
            prev_weather_cols.append(f'prev_year_{col}')

    # Keep only the columns that are used downstream
    used_cols = ['연월일', '시간', '구분', 'datetime',
                 'prev_year_공급량', 'prev_year_avg_공급량', 'prev_month_avg_공급량',
                 'prev_month_day_avg_공급량', 'prev_month_hour_avg_공급량',
                 'prev_year_ratio', 'prev_month_ratio', 'prev_month_day_ratio',
                 'prev_month_hour_ratio']
    used_cols += prev_weather_cols + ['공급량']

    return sub_df[used_cols]


def fe_autocorr_vars(df):
    """Build the previous-year features for every customer type.

    Runs ``make_autocorr_vars`` once per type and stacks the results in the
    order A, B, C, D, E, G, H.

    Returns:
        The stacked frame without the 'prev_year_avg_공급량' and
        'prev_month_hour_ratio' columns.
    """
    gubun_cols = ['A', 'B', 'C', 'D', 'E', 'G', 'H']
    # Collect the per-type frames and concatenate once, instead of growing a
    # DataFrame (seeded with an empty frame) on every iteration.
    frames = [make_autocorr_vars(df, gubun=gubun) for gubun in tqdm(gubun_cols)]
    final_df = pd.concat(frames, axis=0)
    # Columns that are not kept
    return final_df.drop(['prev_year_avg_공급량', 'prev_month_hour_ratio'], axis=1)

def change_dates_adversely(row):
    """Undo ``change_dates``: express hour 0 as hour 24 of the previous day.

    Args:
        row: A record with 'datetime', '연월일' and '시간' entries.

    Returns:
        The same row object; it is modified only when the hour of
        'datetime' is 0.
    """
    is_midnight = row['datetime'].hour == 0
    if is_midnight:
        row['연월일'] = row['연월일'] - pd.DateOffset(days=1)
        row['시간'] = 24
    return row

def change_date_format(df):
    """Restore the original date/hour convention (hours 1..24).

    '시간' is cast to int and '연월일' parsed as datetime on the given frame,
    then each row goes through ``change_dates_adversely`` and the 'datetime'
    helper column is removed.

    Returns:
        A new frame without the 'datetime' column.
    """
    df['시간'] = df['시간'].astype(int)
    df['연월일'] = pd.to_datetime(df['연월일'])

    restored = df.apply(change_dates_adversely, axis=1)
    return restored.drop(columns='datetime')


def _avg_supply_by(frame, keys):
    """Return ``{group key: mean '공급량'}`` for *frame* grouped by *keys*."""
    means = frame.groupby(keys)['공급량'].mean()
    return dict(zip(means.index, means.to_numpy()))


# Extra feature engineering: averages computed from data before 2019 are
# applied to every row, including the 2019 rows.
def fe_avg_vars(final_df):
    """Add historical average-supply features for every customer type.

    For each type the averages are computed only from rows whose '연월일'
    year is before 2019, then mapped onto all rows of that type:

    * 'month_avg'        - mean supply per month
    * 'month_day_avg'    - mean supply per (month, day)
    * 'month_day_hr_avg' - mean supply per (month, day, hour)
    * three ratio columns giving the relative difference between them

    Args:
        final_df: Frame with '구분', '연월일' (datetime dtype), '시간' and
            '공급량' columns.

    Returns:
        The per-type frames stacked in the order A, B, C, D, E, G, H, with
        '공급량' moved to the last column.
    """
    cols = ['A', 'B', 'C', 'D', 'E', 'G', 'H']
    frames = []
    for gubun in tqdm(cols):
        # Explicit copy: the columns below must not be written onto a slice
        sub_df = final_df[final_df['구분'] == gubun].copy()
        # Averages use pre-2019 rows only
        fe_df = sub_df[sub_df['연월일'].dt.year < 2019]

        fe_month = fe_df['연월일'].dt.month
        fe_day = fe_df['연월일'].dt.day
        month = sub_df['연월일'].dt.month
        month_arr = month.to_numpy()
        day_arr = sub_df['연월일'].dt.day.to_numpy()
        hour_arr = sub_df['시간'].to_numpy()

        # 1. Mean supply per month
        month_avg = _avg_supply_by(fe_df, [fe_month])
        sub_df['month_avg'] = month.map(month_avg)

        # 2. Mean supply per (month, day)
        month_day_avg = _avg_supply_by(fe_df, [fe_month, fe_day])
        month_day_keys = pd.Series(list(zip(month_arr, day_arr)), index=sub_df.index)
        sub_df['month_day_avg'] = month_day_keys.map(month_day_avg)

        # 3. Mean supply per (month, day, hour)
        month_day_hr_avg = _avg_supply_by(fe_df, [fe_month, fe_day, fe_df['시간']])
        month_day_hr_keys = pd.Series(list(zip(month_arr, day_arr, hour_arr)),
                                      index=sub_df.index)
        sub_df['month_day_hr_avg'] = month_day_hr_keys.map(month_day_hr_avg)

        # 4. (month, day) average relative to the month average
        sub_df['month_month-day_ratio'] = (
            (sub_df['month_day_avg'] - sub_df['month_avg']) / sub_df['month_avg']
        )
        # 5. (month, day, hour) average relative to the month average
        sub_df['month_month-day-hr_ratio'] = (
            (sub_df['month_day_hr_avg'] - sub_df['month_avg']) / sub_df['month_avg']
        )
        # 6. (month, day, hour) average relative to the (month, day) average
        sub_df['month-day_month-day-hr_ratio'] = (
            (sub_df['month_day_hr_avg'] - sub_df['month_day_avg']) / sub_df['month_day_avg']
        )

        frames.append(sub_df)

    # Concatenate once instead of growing a frame inside the loop
    main_df = pd.concat(frames, axis=0)

    # Move the target column to the end
    cols_order = main_df.columns.tolist()
    cols_order.remove('공급량')
    cols_order.append('공급량')

    return main_df[cols_order]


# Build the modelling table: load + merge, then run each feature step in order
path = '/Users/younghun/Desktop/gitrepo/KaggleStruggle/dacon_gas'
gas_weather = merge_gas_weather(path)
dataset = (
    gas_weather
    .pipe(make_datetime_vars)
    .pipe(change_pandas_date_format)
    .pipe(fe_autocorr_vars)
    .pipe(change_date_format)
)
final_df = fe_avg_vars(dataset).reset_index(drop=True)

print(final_df.shape)




100%|██████████| 7/7 [00:07<00:00,  1.09s/it]
100%|██████████| 7/7 [00:03<00:00,  1.89it/s]

(383208, 17)





In [114]:
def predict_scaling_ftr(final_df, model, submission_csv):
    """Validate and predict per customer type with min-max scaled features.

    For each type (A, B, C, D, E, G, H) the model is:

    1. fitted on rows before 2018-09-01 and scored with ``get_NMAE`` on
       2018-09-01 .. 2018-12-31 (both scores are printed), then
    2. refitted on all rows before 2019-01-01 and used to predict the rows
       from 2019-01-01 onwards.

    The last column of ``final_df`` is treated as the target; every other
    column left after indexing by ('연월일', '시간', '구분') is a feature.

    Args:
        final_df: Feature table with '공급량' as its last column.
        model: Estimator exposing ``fit`` / ``predict``. The same instance is
            refitted for every type.
        submission_csv: Frame that receives the predictions.

    Returns:
        ``submission_csv`` with its '공급량' column overwritten (in place).
        Predictions are written in type order, so the submission rows are
        assumed to be grouped by type in that same order - TODO confirm.

    Note:
        ``MinMaxScaler`` is resolved when the function is called, so it must
        have been imported by then.
    """
    cols = ['A', 'B', 'C', 'D', 'E', 'G', 'H']
    all_pred = np.array([])
    for gubun in cols:
        sub_df = final_df[final_df['구분'] == gubun]
        sub_df = sub_df.set_index(['연월일', '시간', '구분'])
        idx_level = sub_df.index.get_level_values

        # DataFrame.ffill() replaces fillna(method='ffill'), whose `method`
        # argument is deprecated in recent pandas releases.
        train = sub_df[(idx_level(0) < '2018-09-01')].ffill()
        valid = sub_df[(idx_level(0) >= '2018-09-01')&(idx_level(0) <= '2018-12-31')].ffill()

        X_train, y_train = train.iloc[:, :-1].values, train['공급량']
        X_valid, y_valid = valid.iloc[:, :-1].values, valid['공급량']
        # Scale features; the scaler is fitted on the training rows only
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)
        # fit model
        model.fit(X_train, y_train)
        # predict
        train_pred = model.predict(X_train)
        valid_pred = model.predict(X_valid)
        # evaluate
        train_NMAE = get_NMAE(y_train, train_pred)
        valid_NMAE = get_NMAE(y_valid, valid_pred)
        print(f'# 유형({gubun}) - Train NMAE: {train_NMAE: .4f}')
        print(f'# 유형({gubun}) - Valid NMAE: {valid_NMAE: .4f}')
        print()

        # Split again, this time into the full train period and the test period
        train = sub_df[(idx_level(0) < '2019-01-01')].ffill()
        test = sub_df[(idx_level(0) >= '2019-01-01')].ffill()

        X_train, y_train = train.iloc[:, :-1].values, train['공급량']
        X_test = test.iloc[:, :-1].values
        # Scaling
        scaler = MinMaxScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        # fit model
        model.fit(X_train, y_train)
        # predict
        test_pred = model.predict(X_test)
        all_pred = np.append(all_pred, test_pred)
    submission_csv['공급량'] = all_pred
    return submission_csv

In [115]:
import os
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import MinMaxScaler


# Both base learners use the same number of trees and the same seed
shared_params = {'n_estimators': 100, 'random_state': 42}
lgbm_reg = LGBMRegressor(**shared_params)
xgb_reg = XGBRegressor(**shared_params)
hybrid_reg = VotingRegressor(estimators=[('lgbm', lgbm_reg), ('xgb', xgb_reg)])

# Submission template
dirname = '/Users/younghun/Desktop/gitrepo/KaggleStruggle/dacon_gas'
sub_csv = pd.read_csv(os.path.join(dirname, 'sample_submission.csv'))

# Validate per type, then predict the test period
pred_csv = predict_scaling_ftr(final_df, hybrid_reg, sub_csv)

# 유형(A) - Train NMAE:  0.1000
# 유형(A) - Valid NMAE:  0.1400

# 유형(B) - Train NMAE:  0.1050
# 유형(B) - Valid NMAE:  0.1410

# 유형(C) - Train NMAE:  0.9540
# 유형(C) - Valid NMAE:  0.2610

# 유형(D) - Train NMAE:  0.0990
# 유형(D) - Valid NMAE:  0.1500

# 유형(E) - Train NMAE:  0.0890
# 유형(E) - Valid NMAE:  0.1440

# 유형(G) - Train NMAE:  0.1030
# 유형(G) - Valid NMAE:  0.1470

# 유형(H) - Train NMAE:  0.1000
# 유형(H) - Valid NMAE:  0.1290



In [116]:
# Forecast customer type C with Prophet: start from the type-C rows of a copy
df = final_df.copy()
c_df = df.loc[df['구분'] == 'C']

def date_format_pandas(row):
    """Move an hour-24 record to hour 0 of the next day; leave others as is.

    Returns the same row object that was passed in.
    """
    if row['시간'] != 24:
        return row
    row['연월일'] = row['연월일'] + pd.DateOffset(days=1)
    row['시간'] = 0
    return row

# Rebuild an hourly timestamp column ('time') for the type-C rows
c_df = c_df.apply(date_format_pandas, axis=1)
c_df['시간'] = c_df['시간'].map(lambda hour: str(hour) if hour >= 10 else f'0{hour}')
c_df['연월일'] = c_df['연월일'].astype(str)
c_df['time'] = pd.to_datetime(c_df['연월일'] + ' ' + c_df['시간'], format='%Y-%m-%d %H')

# Univariate forecasting with the Kats Prophet wrapper
from kats.consts import TimeSeriesData
from kats.models.prophet import ProphetModel, ProphetParams

# Rows up to and including 2019-01-01 00:00 are history; later rows are forecast
cutoff = '2019-01-01 00:00:00'
c_train = c_df[c_df['time'] <= cutoff]
c_test = c_df[c_df['time'] > cutoff]

uni_c_df = c_train[['time', '공급량']]
uni_c_ts = TimeSeriesData(uni_c_df)

# Configure, fit, and forecast one step per test row
params = ProphetParams(seasonality_mode='additive')
prophet = ProphetModel(uni_c_ts, params=params)
prophet.fit()
test_pred = prophet.predict(steps=c_test.shape[0], freq='H')

In [120]:
# 일변량으로 Kats Prophet 사용
from kats.consts import TimeSeriesData
from kats.models.prophet import ProphetModel, ProphetParams

def date_format_pandas(row):
    """Express hour 24 as hour 0 of the following day.

    Args:
        row: A record with '연월일' (timestamp) and '시간' (hour) entries.

    Returns:
        The same row object, changed only when '시간' is 24.
    """
    rolls_over = row['시간'] == 24
    if rolls_over:
        next_day = row['연월일'] + pd.DateOffset(days=1)
        row['연월일'] = next_day
        row['시간'] = 0
    return row

def prophet(final_df, gubun):
    """Forecast the test period of one customer type with Kats Prophet.

    The rows of *gubun* get an hourly 'time' column; rows up to and including
    2019-01-01 00:00 are used to fit an additive-seasonality Prophet model
    on '공급량', and one hourly step is forecast per remaining row.

    Args:
        final_df: Frame with '구분', '연월일', '시간' and '공급량' columns.
        gubun: Value of '구분' to forecast.

    Returns:
        The forecast object returned by ``ProphetModel.predict``.
    """
    frame = final_df[final_df['구분'] == gubun].apply(date_format_pandas, axis=1)
    frame['시간'] = frame['시간'].map(lambda hour: str(hour) if hour >= 10 else f'0{hour}')
    frame['연월일'] = frame['연월일'].astype(str)
    frame['time'] = pd.to_datetime(frame['연월일'] + ' ' + frame['시간'], format='%Y-%m-%d %H')

    # Split into history and forecast horizon
    boundary = '2019-01-01 00:00:00'
    history = frame[frame['time'] <= boundary]
    horizon = frame[frame['time'] > boundary].shape[0]

    # Wrap the history as a Kats time series, then fit and forecast
    series = TimeSeriesData(history[['time', '공급량']])
    model = ProphetModel(series, params=ProphetParams(seasonality_mode='additive'))
    model.fit()
    test_pred = model.predict(steps=horizon, freq='H')
    print('test_pred shape:', test_pred.shape)
    return test_pred

def merge_pred_csv_prophet_csv(final_df, pred_csv, gubun='C'):
    """Replace one type's predictions with its Prophet forecast.

    Args:
        final_df: Feature table passed on to ``prophet``.
        pred_csv: pandas.DataFrame with the test predictions of the regular
            machine-learning model; must contain the '일자|시간|구분' key
            column and a '공급량' column.
        gubun: Customer type whose predictions are replaced.

    Returns:
        A new frame with the same columns as *pred_csv*, sorted by type,
        date and hour, in which the '공급량' values of *gubun* come from the
        'fcst' column of the Prophet forecast. *pred_csv* is left untouched.

    Raises:
        ValueError: If the forecast length differs from the number of
            *gubun* rows (raised by pandas on assignment).
    """
    test_pred = prophet(final_df, gubun)

    # Work on a copy so the helper columns never leak into the caller's frame
    work = pred_csv.copy()
    key_parts = work['일자|시간|구분'].str.split(' ', expand=True)
    work['일자'] = pd.to_datetime(key_parts[0])
    work['시간'] = key_parts[1].astype(int)
    work['구분'] = key_parts[2]

    final_csv = work[work['구분'] != gubun]
    gubun_csv = work[work['구분'] == gubun].copy()
    # Positional assignment: the forecast steps are assumed to follow the
    # row order of this type in pred_csv - TODO confirm.
    gubun_csv['공급량'] = test_pred['fcst'].values

    final_csv = pd.concat([final_csv, gubun_csv], axis=0)
    columns = ['구분', '일자', '시간']
    final_csv = final_csv.sort_values(by=columns)
    return final_csv.drop(columns=columns)


# Swap the type-C rows of the ensemble predictions for the Prophet forecast
sub_csv = merge_pred_csv_prophet_csv(final_df, pred_csv, gubun='C')

test_pred shape: (2160, 4)


In [123]:
save_name = 'submission/0103False_SimulFalse_xgb+lgbm_cols-autocorr_extra_vars_MinMaxscale_exclude_datetime_C_prophet.csv'
save_path = os.path.join(dirname, save_name)
# to_csv does not create missing folders, so make sure 'submission/' exists
os.makedirs(os.path.dirname(save_path), exist_ok=True)
sub_csv.to_csv(save_path, index=False)

- 오버피팅 발생 → 차원 수를 줄여야 할 듯싶다.
- PLS, PCA 활용해서 차원 감소 후 시도
- 딥러닝 시도
    - MLPRegressor
    - Tensorflow