### import

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pickle
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping




### 데이터 전처리 클래스 및 함수 정의

In [2]:
# 추후 날씨 데이터의 위도 경도를 county로 변환하기 위한 데이터
location = [
    [0.0, 24.2, 59.1],
    [0.0, 25.2, 59.1],
    [0.0, 23.7, 59.4],
    [0.0, 24.2, 59.4],
    [0.0, 24.7, 59.4],
    [0.0, 25.2, 59.4],
    [0.0, 25.7, 59.4],
    [0.0, 24.7, 59.7],
    [0.0, 25.2, 59.7],
    [0.0, 25.7, 59.7],
    [1.0, 21.7, 58.8],
    [1.0, 22.2, 58.8],
    [1.0, 22.7, 58.8],
    [1.0, 23.2, 58.8],
    [1.0, 22.2, 59.1],
    [1.0, 22.7, 59.1],
    [2.0, 27.2, 59.1],
    [2.0, 27.7, 59.1],
    [2.0, 27.2, 59.4],
    [2.0, 27.7, 59.4],
    [3.0, 25.2, 58.8],
    [3.0, 25.7, 58.8],
    [3.0, 25.7, 59.1],
    [4.0, 26.2, 58.5],
    [4.0, 26.2, 58.8],
    [4.0, 26.7, 58.8],
    [4.0, 27.2, 58.8],
    [5.0, 26.2, 59.1],
    [5.0, 26.7, 59.1],
    [5.0, 26.2, 59.4],
    [5.0, 26.7, 59.4],
    [5.0, 26.2, 59.7],
    [5.0, 26.7, 59.7],
    [6.0, 23.7, 58.8],
    [6.0, 23.2, 59.1],
    [6.0, 23.7, 59.1],
    [6.0, 23.2, 59.4],
    [7.0, 23.7, 57.9],
    [7.0, 24.2, 57.9],
    [7.0, 23.7, 58.2],
    [7.0, 24.2, 58.2],
    [7.0, 24.7, 58.2],
    [7.0, 25.2, 58.2],
    [7.0, 23.7, 58.5],
    [7.0, 24.2, 58.5],
    [7.0, 24.7, 58.5],
    [8.0, 27.2, 58.2],
    [9.0, 24.2, 58.8],
    [9.0, 24.7, 58.8],
    [9.0, 24.7, 59.1],
    [10.0, 23.2, 57.6],
    [10.0, 21.7, 57.9],
    [10.0, 22.2, 57.9],
    [10.0, 23.2, 57.9],
    [10.0, 21.7, 58.2],
    [10.0, 22.2, 58.2],
    [10.0, 22.7, 58.2],
    [10.0, 23.2, 58.2],
    [10.0, 21.7, 58.5],
    [10.0, 22.2, 58.5],
    [10.0, 22.7, 58.5],
    [10.0, 23.2, 58.5],
    [11.0, 26.2, 58.2],
    [11.0, 26.7, 58.2],
    [11.0, 26.7, 58.5],
    [11.0, 27.2, 58.5],
    [13.0, 26.2, 57.9],
    [14.0, 25.7, 58.2],
    [14.0, 25.2, 58.5],
    [14.0, 25.7, 58.5],
    [15.0, 26.7, 57.6],
    [15.0, 27.2, 57.6],
    [15.0, 26.7, 57.9],
    [15.0, 27.2, 57.9],
    [15.0, 27.7, 57.9]
]

location = pd.DataFrame(location, columns=['county', 'longitude', 'latitude'])

In [3]:
class FeatureProcessorClass():
    def __init__(self):
        # join 기준
        self.weather_join = ['datetime', 'county', 'data_block_id']
        self.gas_join = ['data_block_id']
        self.electricity_join = ['datetime', 'data_block_id']
        self.client_join = ['county', 'is_business', 'product_type', 'data_block_id']

        # 위도 경도 칼럼
        self.lat_lon_columns = ['latitude', 'longitude']

        # 추가할 Aggregate stats
        self.agg_stats = ['mean'] #, 'min', 'max', 'std', 'median']

    def create_new_column_names(self, df, suffix, columns_no_change):
        # 칼럼명 변경
        df.columns = [col + suffix
                      if col not in columns_no_change
                      else col
                      for col in df.columns
                      ]
        return df

    def flatten_multi_index_columns(self, df):
        df.columns = ['_'.join([col for col in multi_col if len(col)>0])
                      for multi_col in df.columns]
        return df

    def create_data_features(self, data):
        # 날짜 관련 feature
        data['datetime'] = pd.to_datetime(data['datetime'])
        data['date'] = data['datetime'].dt.normalize()
        data['year'] = data['datetime'].dt.year
        data['quarter'] = data['datetime'].dt.quarter
        data['month'] = data['datetime'].dt.month
        data['week'] = data['datetime'].dt.isocalendar().week
        data['hour'] = data['datetime'].dt.hour
        data['day_of_year'] = data['datetime'].dt.day_of_year
        data['day_of_month']  = data['datetime'].dt.day
        data['day_of_week'] = data['datetime'].dt.day_of_week
        return data

    def create_client_features(self, client):
        # client 칼럼명 변경
        client = self.create_new_column_names(client,
                                           suffix='_client',
                                           columns_no_change = self.client_join
                                          )
        return client

    def create_historical_weather_features(self, historical_weather):
        # To datetime
        historical_weather['datetime'] = pd.to_datetime(historical_weather['datetime'])

        # 위도 경도를 county로 변환
        historical_weather[self.lat_lon_columns] = historical_weather[self.lat_lon_columns].astype(float).round(1)
        historical_weather = historical_weather.merge(location, how = 'left', on = self.lat_lon_columns)
        # historical_weather 칼럼명 변경
        historical_weather = self.create_new_column_names(historical_weather,
                                                          suffix='_h',
                                                          columns_no_change = self.lat_lon_columns + self.weather_join
                                                          )

        # Aggregate stats 계산
        agg_columns = [col for col in historical_weather.columns if col not in self.lat_lon_columns + self.weather_join]
        agg_dict = {agg_col: self.agg_stats for agg_col in agg_columns}
        historical_weather = historical_weather.groupby(self.weather_join).agg(agg_dict).reset_index()

        historical_weather = self.flatten_multi_index_columns(historical_weather)

        # 하루씩 밀기
        historical_weather['hour_h'] = historical_weather['datetime'].dt.hour
        historical_weather['datetime'] = (historical_weather
                                               .apply(lambda x:
                                                      x['datetime'] + pd.DateOffset(1)
                                                      if x['hour_h']< 11
                                                      else x['datetime'] + pd.DateOffset(2),
                                                      axis=1)
                                              )

        return historical_weather

    def create_forecast_weather_features(self, forecast_weather):
        # forecast_weather 칼럼명 변경
        forecast_weather = (forecast_weather
                            .rename(columns = {'forecast_datetime': 'datetime'})
                            .drop(columns = 'origin_datetime') # not needed
                           )

        # To datetime
        forecast_weather['datetime'] = (pd.to_datetime(forecast_weather['datetime'])
                                        .dt
                                        .tz_convert('Europe/Brussels') # change to different time zone?
                                        .dt
                                        .tz_localize(None)
                                       )

        # 위도 경도를 county로 변환
        forecast_weather[self.lat_lon_columns] = forecast_weather[self.lat_lon_columns].astype(float).round(1)
        forecast_weather = forecast_weather.merge(location, how = 'left', on = self.lat_lon_columns)
        forecast_weather = forecast_weather.fillna(0)

        # forecast_weather 칼럼명 변경
        forecast_weather = self.create_new_column_names(forecast_weather,
                                                        suffix='_f',
                                                        columns_no_change = self.lat_lon_columns + self.weather_join
                                                        )

        # Aggregate stats 계산
        agg_columns = [col for col in forecast_weather.columns if col not in self.lat_lon_columns + self.weather_join]
        agg_dict = {agg_col: self.agg_stats for agg_col in agg_columns}
        forecast_weather = forecast_weather.groupby(self.weather_join).agg(agg_dict).reset_index()

        forecast_weather = self.flatten_multi_index_columns(forecast_weather)
        return forecast_weather

    def create_electricity_features(self, electricity):
        # To datetime
        electricity['forecast_date'] = pd.to_datetime(electricity['forecast_date'])

        # 하루씩 밀기
        electricity['datetime'] = electricity['forecast_date'] + pd.DateOffset(1)

        # electricity 칼럼명 변경
        electricity = self.create_new_column_names(electricity,
                                                   suffix='_electricity',
                                                   columns_no_change = self.electricity_join
                                                  )
        return electricity

    def create_gas_features(self, gas):
        # Mean 계산
        gas['mean_price_per_mwh'] = (gas['lowest_price_per_mwh'] + gas['highest_price_per_mwh'])/2

        # gas 칼럼명 변경
        gas = self.create_new_column_names(gas,
                                           suffix='_gas',
                                           columns_no_change = self.gas_join
                                          )
        return gas

    def __call__(self, data, client, historical_weather, forecast_weather, electricity, gas):
        # 정의한 메소드들로 feature 생성
        data = self.create_data_features(data)
        client = self.create_client_features(client)
        historical_weather = self.create_historical_weather_features(historical_weather)
        forecast_weather = self.create_forecast_weather_features(forecast_weather)
        electricity = self.create_electricity_features(electricity)
        gas = self.create_gas_features(gas)

        # 하나의 df에 모두 통합
        df = data.merge(client, how='left', on = self.client_join)
        df = df.merge(historical_weather, how='left', on = self.weather_join)
        df = df.merge(forecast_weather, how='left', on = self.weather_join)
        df = df.merge(electricity, how='left', on = self.electricity_join)
        df = df.merge(gas, how='left', on = self.gas_join)

        return df

In [4]:
def create_revealed_targets_train(data):
    # data_block_id 별로 행 개수가 6144개로 동일하므로, 12288개씩 밀어서 target_2_days_ago 생성
    data['target_2_days_ago'] = data['target'].shift(12288)
    return data

In [5]:
def data_preprocessing(train, client, historical_weather, forecast_weather, electricity_prices, gas_prices):
    data = train.copy()
    client = client.copy()
    historical_weather = historical_weather.copy()
    forecast_weather = forecast_weather.copy()
    electricity = electricity_prices.copy()
    gas = gas_prices.copy()
    
    FeatureProcessor = FeatureProcessorClass()
    
    data = FeatureProcessor(data,
                            client,
                            historical_weather,
                            forecast_weather,
                            electricity,
                            gas
                            )
    df = create_revealed_targets_train(data)

    # Remove columns for features
    no_features = ['date',
                    'latitude',
                    'longitude',
                    'hours_ahead',
                    'hour_h',
                    'prediction_unit_id',
                    'data_block_id',
                    'currently_scored',
                    'row_id'
                    ]

    remove_columns = [col for col in df.columns for no_feature in no_features if no_feature in col]
    features = [col for col in df.columns if col not in remove_columns]
    
    df = df[features]
    
    # target이 결측치인 경우 0으로 처리
    df[df['target'].isnull()] = df[df['target'].isnull()].fillna(0)

    # backfill을 통해 결측치 처리
    df = df.bfill()
    
    ## Numerical Feauture Standard Scaling
    features_not_to_scale = ['county',
                              'is_business',
                              'product_type',
                              'is_consumption',
                              'year',
                              'quarter',
                              'month',
                              'week',
                              'hour',
                              'day_of_year',
                              'day_of_month',
                              'day_of_week',
                              'target']
    features_to_scale = [col for col in df.columns if col not in features_not_to_scale]
    
    Scaler = StandardScaler().fit(df[features_to_scale])

    # 추후 모델 활용을 위해 Scaler 저장
    with open('train_scaler_RNN.pkl', 'wb') as file:
        pickle.dump(Scaler, file)

    df[features_to_scale] = Scaler.transform(df[features_to_scale])
    
    
    ## Categoical Feature One-hot Encoding
    df = df.reset_index(drop=True)
    
    columns_to_onehot_encode = df[['county', 'product_type']]

    Encoder = OneHotEncoder(sparse_output=False)
    encoded = Encoder.fit_transform(columns_to_onehot_encode)
    encoded_df = pd.DataFrame(encoded, columns=Encoder.get_feature_names_out(['county', 'product_type']))

    df = pd.concat([df, encoded_df], axis=1)
    
    df.drop(['county', 'product_type'], axis=1, inplace=True)

    return df
    

###  실제 Train 데이터 불러와서 전처리

In [6]:
file_path = "data/"

train = pd.read_csv(file_path + 'train.csv', parse_dates=['datetime'])
client = pd.read_csv(file_path + 'client.csv')
electricity = pd.read_csv(file_path + 'electricity_prices.csv')
forecast_weather = pd.read_csv(file_path + 'forecast_weather.csv')
gas = pd.read_csv(file_path + 'gas_prices.csv')
historical_weather = pd.read_csv(file_path + 'historical_weather.csv')

In [7]:
data_block_id = train['data_block_id'].unique().tolist()

In [8]:
# county, is_business, product_type, is_consumption의 모든 조합으로 형성된 base 데이터프레임 생성
base = []

county = range(16)
is_business = range(2)
product_type = [1, 2, 3, 0]
is_consumption = range(2)

for i in county:
    for j in is_business:
        for k in product_type:
            for l in is_consumption:
                base.append([i, j, k, l])
    
base = pd.DataFrame(base, columns=['county', 'is_business', 'product_type', 'is_consumption'])


In [9]:
base

Unnamed: 0,county,is_business,product_type,is_consumption
0,0,0,1,0
1,0,0,1,1
2,0,0,2,0
3,0,0,2,1
4,0,0,3,0
...,...,...,...,...
251,15,1,2,1
252,15,1,3,0
253,15,1,3,1
254,15,1,0,0


In [10]:
# 기존 train 데이터의 data_block_id 별로, datetime 별로, base에 train 데이터를 merge
train_expanded = pd.DataFrame(columns=['county', 'is_business', 'product_type', 'is_consumption', 'datetime', 'target', 'data_block_id', 'row_id', 'prediction_unit_id'])
for i in data_block_id:
    train_df = train[train['data_block_id'] == i]
    datetime = train_df['datetime'].unique().tolist()

    for i in datetime:
        a = base.copy()
        a['datetime'] = i
        train_expanded = pd.concat([train_expanded, a.merge(train_df[train_df['datetime'] == i], how='left', on=['county', 'is_business', 'product_type', 'is_consumption', 'datetime'])])

train_expanded = train_expanded.reset_index(drop=True)
train_expanded

Unnamed: 0,county,is_business,product_type,is_consumption,datetime,target,data_block_id,row_id,prediction_unit_id
0,0,0,1,0,2021-09-01 00:00:00,0.713,0.0,0.0,0.0
1,0,0,1,1,2021-09-01 00:00:00,96.590,0.0,1.0,0.0
2,0,0,2,0,2021-09-01 00:00:00,0.000,0.0,2.0,1.0
3,0,0,2,1,2021-09-01 00:00:00,17.314,0.0,3.0,1.0
4,0,0,3,0,2021-09-01 00:00:00,2.904,0.0,4.0,2.0
...,...,...,...,...,...,...,...,...,...
3919867,15,1,2,1,2023-05-31 23:00:00,,,,
3919868,15,1,3,0,2023-05-31 23:00:00,0.000,637.0,2018350.0,60.0
3919869,15,1,3,1,2023-05-31 23:00:00,196.240,637.0,2018351.0,60.0
3919870,15,1,0,0,2023-05-31 23:00:00,0.000,637.0,2018346.0,64.0


In [11]:
df = data_preprocessing(train_expanded, client, historical_weather, forecast_weather, electricity, gas)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df['target'].isnull()] = df[df['target'].isnull()].fillna(0)


In [12]:
# 전처리된 데이터 저장
df.to_csv('data_preprocessed_expanded.csv', index=False)

In [13]:
df = pd.read_csv('data_preprocessed_expanded.csv')

In [14]:
df

Unnamed: 0,is_business,is_consumption,target,year,quarter,month,week,hour,day_of_year,day_of_month,day_of_week,eic_count_client,installed_capacity_client,temperature_h_mean,dewpoint_h_mean,rain_h_mean,snowfall_h_mean,surface_pressure_h_mean,cloudcover_total_h_mean,cloudcover_low_h_mean,cloudcover_mid_h_mean,cloudcover_high_h_mean,windspeed_10m_h_mean,winddirection_10m_h_mean,shortwave_radiation_h_mean,direct_solar_radiation_h_mean,diffuse_radiation_h_mean,temperature_f_mean,dewpoint_f_mean,cloudcover_high_f_mean,cloudcover_low_f_mean,cloudcover_mid_f_mean,cloudcover_total_f_mean,10_metre_u_wind_component_f_mean,10_metre_v_wind_component_f_mean,direct_solar_radiation_f_mean,surface_solar_radiation_downwards_f_mean,snowfall_f_mean,total_precipitation_f_mean,euros_per_mwh_electricity,lowest_price_per_mwh_gas,highest_price_per_mwh_gas,mean_price_per_mwh_gas,target_2_days_ago,county_0,county_1,county_2,county_3,county_4,county_5,county_6,county_7,county_8,county_9,county_10,county_11,county_12,county_13,county_14,county_15,product_type_0,product_type_1,product_type_2,product_type_3
0,0,0,0.713,2021,3,9,35,0,244,1,2,-0.344306,-0.396503,-0.421483,-0.177592,-0.183652,-0.16231,-1.011823,-0.775285,-0.655443,-0.557724,-0.539596,-0.845904,-0.864813,-0.388954,-0.324821,-0.436513,-0.430055,-0.193305,-0.562546,-0.609231,-0.552040,-0.802977,-0.227943,-0.140861,-0.393831,-0.392041,-0.16415,-0.233735,-0.689951,-0.836805,-0.832716,-0.835971,-0.211854,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,1,96.590,2021,3,9,35,0,244,1,2,-0.344306,-0.396503,-0.421483,-0.177592,-0.183652,-0.16231,-1.011823,-0.775285,-0.655443,-0.557724,-0.539596,-0.845904,-0.864813,-0.388954,-0.324821,-0.436513,-0.430055,-0.193305,-0.562546,-0.609231,-0.552040,-0.802977,-0.227943,-0.140861,-0.393831,-0.392041,-0.16415,-0.233735,-0.689951,-0.836805,-0.832716,-0.835971,-0.211854,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,0,0.000,2021,3,9,35,0,244,1,2,-0.344306,-0.396503,-0.421483,-0.177592,-0.183652,-0.16231,-1.011823,-0.775285,-0.655443,-0.557724,-0.539596,-0.845904,-0.864813,-0.388954,-0.324821,-0.436513,-0.430055,-0.193305,-0.562546,-0.609231,-0.552040,-0.802977,-0.227943,-0.140861,-0.393831,-0.392041,-0.16415,-0.233735,-0.689951,-0.836805,-0.832716,-0.835971,-0.211854,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,1,17.314,2021,3,9,35,0,244,1,2,-0.344306,-0.396503,-0.421483,-0.177592,-0.183652,-0.16231,-1.011823,-0.775285,-0.655443,-0.557724,-0.539596,-0.845904,-0.864813,-0.388954,-0.324821,-0.436513,-0.430055,-0.193305,-0.562546,-0.609231,-0.552040,-0.802977,-0.227943,-0.140861,-0.393831,-0.392041,-0.16415,-0.233735,-0.689951,-0.836805,-0.832716,-0.835971,-0.211854,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,0,2.904,2021,3,9,35,0,244,1,2,-0.344306,-0.396503,-0.421483,-0.177592,-0.183652,-0.16231,-1.011823,-0.775285,-0.655443,-0.557724,-0.539596,-0.845904,-0.864813,-0.388954,-0.324821,-0.436513,-0.430055,-0.193305,-0.562546,-0.609231,-0.552040,-0.802977,-0.227943,-0.140861,-0.393831,-0.392041,-0.16415,-0.233735,-0.689951,-0.836805,-0.832716,-0.835971,-0.211854,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3919867,1,1,0.000,2023,2,5,22,23,151,31,2,-0.344306,-0.396503,-0.421483,-0.177592,-0.183652,-0.16231,-1.011823,-0.775285,-0.655443,-0.557724,-0.539596,-0.845904,-0.864813,-0.388954,-0.324821,-0.436513,-0.430055,-0.193305,-0.562546,-0.609231,-0.552040,-0.802977,-0.227943,-0.140861,-0.393831,-0.392041,-0.16415,-0.233735,-0.689951,-0.836805,-0.832716,-0.835971,-0.211854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3919868,1,0,0.000,2023,2,5,22,23,151,31,2,0.157289,0.764481,1.538733,1.210213,-0.183652,-0.16231,0.980264,-0.233849,-0.510330,-0.112563,0.299621,-0.311722,0.524849,-0.388954,-0.324821,-0.436513,1.352242,1.088631,0.718283,-0.484891,-0.498909,0.325013,1.215264,-0.253161,-0.395590,-0.392041,-0.16415,-0.233735,-0.700949,-0.342384,-0.324279,-0.333275,-0.211854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3919869,1,1,196.240,2023,2,5,22,23,151,31,2,0.157289,0.764481,1.538733,1.210213,-0.183652,-0.16231,0.980264,-0.233849,-0.510330,-0.112563,0.299621,-0.311722,0.524849,-0.388954,-0.324821,-0.436513,1.352242,1.088631,0.718283,-0.484891,-0.498909,0.325013,1.215264,-0.253161,-0.395590,-0.392041,-0.16415,-0.233735,-0.700949,-0.342384,-0.324279,-0.333275,0.054160,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3919870,1,0,0.000,2023,2,5,22,23,151,31,2,-0.207508,-0.067552,1.538733,1.210213,-0.183652,-0.16231,0.980264,-0.233849,-0.510330,-0.112563,0.299621,-0.311722,0.524849,-0.388954,-0.324821,-0.436513,1.352242,1.088631,0.718283,-0.484891,-0.498909,0.325013,1.215264,-0.253161,-0.395590,-0.392041,-0.16415,-0.233735,-0.700949,-0.342384,-0.324279,-0.333275,-0.211854,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [15]:
# dataset을 window_size 만큼씩 잘라서 batch로 만들어주는 함수
def make_dataset_batch(df, window_size = 24):
    df = df.reset_index(drop=True)
    label = df.pop('target')
    df_list = []
    label_list = []
    for i in range(int(len(df) - window_size)):
        df_list.append(np.array(df.iloc[i:i+window_size]))
        label_list.append(np.array(label.iloc[i+window_size-1]))
    return np.array(df_list), np.array(label_list)

In [16]:
df.shape[0] # 4 * 2 * 16 * 2 * 24 * 638

3919872

### 모델 학습

In [17]:
# 각 county, is_business, product_type, is_consumption 별로 모델을 학습하는 함수
def train_LSTM(county, is_business, product_type, is_consumption):
    global df
    test_evaluation_result = {}
    data = df[(df['is_business'] == is_business)
                & (df['is_consumption'] == is_consumption)
                & (df['county_' + str(county)] == 1)
                & (df['product_type_' + str(product_type)] == 1)]
    data = data.drop(columns=['year', 'quarter', 'month', 'week', 'hour', 'day_of_year', 'day_of_month', 'day_of_week',
                              'is_business',
                              'is_consumption',
                              'county_0',
                              'county_1',
                              'county_2',
                              'county_3',
                              'county_4',
                              'county_5',
                              'county_6',
                              'county_7',
                              'county_8',
                              'county_9',
                              'county_10',
                              'county_11',
                              'county_12',
                              'county_13',
                              'county_14',
                              'county_15',
                              'product_type_0',
                              'product_type_1',
                              'product_type_2',
                              'product_type_3'
                              ])
    df_list, label_list = make_dataset_batch(data)
    
    # 스플릿
    X_train, X_test = train_test_split(df_list, test_size=0.2, shuffle=False)
    y_train, y_test = train_test_split(label_list, test_size=0.2, shuffle=False)

    X_train, X_val = train_test_split(X_train, test_size=0.2, shuffle=False)
    y_train, y_val = train_test_split(y_train, test_size=0.2, shuffle=False)

    # Model Design
    global models_LSTM, rlr, ely
    idx = str(county) + '_' + str(is_business) + '_' + str(product_type) + '_' + str(is_consumption)
    models_LSTM[idx] = Sequential([
        LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True),
        LSTM(20, return_sequences=False),
        Dense(1, name='output')
    ])
    models_LSTM[idx].compile(optimizer='adam', loss='mae')

    # 학습
    history = models_LSTM[idx].fit(x=X_train, y=y_train, batch_size=64, epochs=100, validation_data=(X_val, y_val), callbacks=[rlr, ely], shuffle=False, verbose=0)
    
    # Model Test Evaluation
    print(idx)
    test_evaluation_result[idx] = models_LSTM[idx].evaluate(X_test, y_test)
    
    return test_evaluation_result

In [18]:
models_LSTM = {}
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=5, mode='min', verbose=0)
ely = EarlyStopping(monitor='val_loss', patience=10, mode='min', verbose=0)

In [19]:
# 실제 조합별 모델 학습
for county in range(16):
    for is_business in range(2):
        for product_type in range(4):
            for is_consumption in range(2):
                test_evaluation_result = train_LSTM(county, is_business, product_type, is_consumption)




0_0_0_0
0_0_0_1
0_0_1_0
0_0_1_1
0_0_2_0
0_0_2_1
0_0_3_0
0_0_3_1
0_1_0_0
0_1_0_1
0_1_1_0
0_1_1_1
0_1_2_0
0_1_2_1
0_1_3_0
0_1_3_1
1_0_0_0
1_0_0_1
1_0_1_0
1_0_1_1
1_0_2_0
1_0_2_1
1_0_3_0
1_0_3_1
1_1_0_0
1_1_0_1
1_1_1_0
1_1_1_1
1_1_2_0
1_1_2_1
1_1_3_0
1_1_3_1
2_0_0_0
2_0_0_1
2_0_1_0
2_0_1_1
2_0_2_0
2_0_2_1
2_0_3_0
2_0_3_1
2_1_0_0
2_1_0_1
2_1_1_0
2_1_1_1
2_1_2_0
2_1_2_1
2_1_3_0
2_1_3_1
3_0_0_0
3_0_0_1
3_0_1_0
3_0_1_1
3_0_2_0
3_0_2_1
3_0_3_0
3_0_3_1
3_1_0_0
3_1_0_1
3_1_1_0
3_1_1_1
3_1_2_0
3_1_2_1
3_1_3_0
3_1_3_1
4_0_0_0
4_0_0_1
4_0_1_0
4_0_1_1
4_0_2_0
4_0_2_1
4_0_3_0
4_0_3_1
4_1_0_0
4_1_0_1
4_1_1_0
4_1_1_1
4_1_2_0
4_1_2_1
4_1_3_0
4_1_3_1
5_0_0_0
5_0_0_1
5_0_1_0
5_0_1_1
5_0_2_0
5_0_2_1
5_0_3_0
5_0_3_1
5_1_0_0
5_1_0_1
5_1_1_0
5_1_1_1
5_1_2_0
5_1_2_1
5_1_3_0
5_1_3_1
6_0_0_0
6_0_0_1
6_0_1_0
6_0_1_1
6_0_2_0
6_0_2_1
6_0_3_0
6_0_3_1
6_1_0_0
6_1_0_1
6_1_1_0
6_1_1_1
6_1_2_0
6_1_2_1
6_1_3_0
6_1_3_1
7_0_0_0
7_0_0_1
7_0_1_0
7_0_1_1
7_0_2_0
7_0_2_1
7_0_3_0
7_0_3_1
7_1_0_0
7_1_0_1
7_1_1_0
7_1_1_1
7_1_2

### 결과 및 모델 세이브

In [20]:
pd.DataFrame.from_dict(data = test_evaluation_result, orient='index')[0].mean()

173.31607055664062

In [21]:
# 모델 저장
def save_models(models_LSTM, file_path):
    for i in models_LSTM:
        models_LSTM[i].save_weights(file_path + 'LSTM_50_20_' + i + '.h5')

In [22]:
save_models(models_LSTM, 'LSTM_50_20/')