In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import font_manager, rc
import seaborn as sns
from sklearn.model_selection import train_test_split
from datetime import datetime as dt
import warnings
import matplotlib.font_manager as fm
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

  import pandas.util.testing as tm


In [3]:
#!sudo apt-get install -y fonts-nanum
#!sudo fc-cache -fv
#!rm ~/.cache/matplotlib -rf

plt.rc('font', family='NanumBarunGothic') 

# **0.데이터 전처리**

In [4]:
final_data = pd.read_csv('./gdrive/My Drive/빅콘 대상팀/data/model_data.csv')

In [5]:
gu = pd.read_excel('./gdrive/My Drive/빅콘 대상팀/data/지역데이터/구_동.xlsx')
gs = pd.read_csv('./gdrive/My Drive/빅콘 대상팀/data/all_amt.csv',parse_dates=['STD_YMD'])
#gs = gs.drop(['Unnamed: 0'],axis=1)
gs = pd.concat([gs.iloc[:,[0,1]],gs.filter(like='GS')],axis=1)
gs = pd.merge(gs,gu,on='HDONG_NM')
gs['CITY'] = gs['HDONG_GU'].apply(lambda x: x[0:2])
gs_seoul = gs.query('CITY == "서울" & STD_YMD > "2020"')

gs_eat = gs_seoul.iloc[:,[0,1,3]].sort_values(['HDONG_NM','STD_YMD'])
gs_snack = gs_seoul.iloc[:,[0,1,4]].sort_values(['HDONG_NM','STD_YMD'])
gs_drink = gs_seoul.iloc[:,[0,1,5]].sort_values(['HDONG_NM','STD_YMD'])

In [6]:
final_data4 = final_data.drop(['COVID_CNT','covid_p1','sc_m1','cj_m1','covid_p1','최저기온','최고기온','일강수량'],axis=1) #최종사용데이터

In [7]:
def build_data(data,dong,cat):

  X = data.query('HDONG_NM==@dong').reset_index(drop=True)
  
  if cat == "식사":
    eat = gs_eat.query('HDONG_NM==@dong').reset_index(drop=True)
    X['self_m7'] = eat['GS_식사'].shift(7)
    X['y'] = eat['GS_식사']
  elif cat == "간식":
    snack = gs_snack.query('HDONG_NM==@dong').reset_index(drop=True)
    X['self_m7'] = snack['GS_간식'].shift(7)
    X['y'] = snack['GS_간식']
  elif cat == "마실거리":
    drink = gs_drink.query('HDONG_NM==@dong').reset_index(drop=True)
    X['self_m7'] = drink['GS_마실거리'].shift(7)
    X['y'] = drink['GS_마실거리']
  
  X.index = X['STD_YMD']
  del X['STD_YMD'],X['HDONG_NM']

  return X

In [8]:
from sklearn.preprocessing import MinMaxScaler 

def minmax_scalar(X):
  idx = X.index
  col = X.columns

  scalar = MinMaxScaler()
  scaled_X = pd.DataFrame(scalar.fit_transform(X))
  scaled_X.index = idx
  scaled_X.columns = col

  return scaled_X

In [9]:
def split_xy(dataset, time_steps, y_column):

  x, y = list(), list()
  for i in range(len(dataset)):
    x_end_number = i + time_steps
    y_end_number = x_end_number + y_column

    if y_end_number > len(dataset):
      break
    tmp_x = np.array(dataset)[i:x_end_number, :]
    tmp_y = np.array(dataset)[x_end_number:y_end_number, -1]
    x.append(tmp_x)
    y.append(tmp_y)
  return np.array(x), np.array(y)

In [10]:
import numpy as np
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error

def RMSLE_fun(origin,pred):
  rmsle = np.sqrt(mean_squared_log_error(origin+1, pred+1))
  return rmsle

In [11]:
def train_test_split(n,X,y):
  if isinstance(X, pd.DataFrame):
    total = X.shape[0]
    X_train,X_test = X.iloc[:total-n, :],X.iloc[-n:, :]
    y_train,y_test = y[:total-n],y[-n:]
  else :
    total = X.shape[0]
    X_train,X_test = X[:total-n, :],X[-n:, :]
    y_train,y_test = y[:total-n],y[-n:]
  return X_train,X_test,y_train,y_test

In [12]:
import keras
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, LSTM, GRU
from keras.callbacks import EarlyStopping

In [13]:
def data_pipeline(data, dong, cat, time_steps, y_columns):
  data = build_data(data,dong,cat)
  
  #y = data['y']
  #del data['y']
  min = data['y'].min()
  max = data['y'].max()

  X = minmax_scalar(data)
  #Xy = pd.concat([X,y],axis=1)
  Xy = X.dropna()
  
  X,y = split_xy(Xy,time_steps,y_columns)

  X_train, y_train = X[:-7],y[:-7]
  X_test, y_test = X[-7:],y[-7:]

  X_test=X_test.reshape(-1,time_steps,X_train.shape[2])
  y_test=y_test.reshape(-1,y_columns)



  return X_train,y_train,X_test,y_test,min,max

# **1. LSTM**

In [14]:
def LSTM_fun(data, dong, cat):
  
  X_train,y_train,X_test,y_test,min,max = data_pipeline(data, dong, cat,7,1)

  model = Sequential()
  model.add(LSTM(100, input_shape = (None, X_train.shape[2])))
  model.add(Dense(10))
  model.add(Dense(1))

  model.compile(optimizer='adam', loss='mse')
  early_stopping = EarlyStopping(monitor='val_loss', patience=30, mode='min', restore_best_weights=True)
  model.fit(X_train, y_train, epochs=5000, batch_size=32, verbose=0, callbacks=[early_stopping], validation_data = (X_test, y_test))

  y_pred = model.predict(X_test, batch_size=1)
  #y_pred = y_pred.reshape(-1, 1) *(max-min)+min
  #y_test = y_test *(max-min)+min

  mse = np.mean((y_test-y_pred)**2)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(y_test, y_pred)
  rmsle = RMSLE_fun(np.array(y_test), np.array(y_pred))

  return mse, rmse, mae, rmsle


# **2. GRU**

In [15]:
def GRU_fun(data, dong, cat):
  
  X_train,y_train,X_test,y_test,min,max = data_pipeline(data, dong, cat,7,1)

  model = Sequential()
  model.add(GRU(100, input_shape = (None, X_train.shape[2])))
  model.add(Dense(10))
  #model.add(Dense(7))
  model.add(Dense(1))

  model.compile(optimizer='adam', loss='mse')
  early_stopping = EarlyStopping(monitor='val_loss', patience=50, mode='min', restore_best_weights=True)
  model.fit(X_train, y_train, epochs=5000, batch_size=32, verbose=0, callbacks=[early_stopping], validation_data = (X_test, y_test))

  
  y_pred = model.predict(X_test, batch_size=1)
  #y_pred = y_pred.reshape(-1, 1) *(max-min)+min
  #y_test = y_test *(max-min)+min

  mse = np.mean((y_test-y_pred)**2)
  rmse = np.sqrt(mse)
  mae = mean_absolute_error(y_test, y_pred)
  rmsle = RMSLE_fun(np.array(y_test), np.array(y_pred))

  return mse, rmse, mae, rmsle


# **3. LSTM & GRU results**

In [16]:
dong_list = list(final_data['HDONG_NM'].unique())
dong_list.remove('상계8동')

In [None]:
# LSTM
MSE_eat = []
RMSE_eat = []
MAE_eat = []
RMSLE_eat = []

MSE_snack = []
RMSE_snack = []
MAE_snack = []
RMSLE_snack = []

MSE_drink = []
RMSE_drink = []
MAE_drink = []
RMSLE_drink = []

for cat in ['식사', '간식', '마실거리']:
  for dong in dong_list:
    mse, rmse, mae, rmsle = LSTM_fun(final_data4, dong, cat)
    #print('{0}, {1}, {2}, {3}, {4}, {5}'.format(dong, cat, mse, rmse, mae, rmsle))

    if cat == '식사':
      MSE_eat.append(mse)
      RMSE_eat.append(rmse)
      MAE_eat.append(mae)
      RMSLE_eat.append(rmsle)
      
    elif cat == '간식':
      MSE_snack.append(mse)
      RMSE_snack.append(rmse)
      MAE_snack.append(mae)
      RMSLE_snack.append(rmsle)
      
    else: 
      MSE_drink.append(mse)
      RMSE_drink.append(rmse)
      MAE_drink.append(mae)
      RMSLE_drink.append(rmsle)


In [None]:
lstm_result = pd.DataFrame({'동':dong_list,
              '식사_MSE':MSE_eat,
              '식사_RMSE':RMSE_eat,
              '식사_MAE':MAE_eat,
              '식사_RMSLE':RMSLE_eat,
              '간식_MSE':MSE_snack,
              '간식_RMSE':RMSE_snack,
              '간식_MAE':MAE_snack,
              '간식_RMSLE':RMSLE_snack,
              '마실거리_MSE':MSE_drink,
              '마실거리_RMSE':RMSE_drink,
              '마실거리_MAE':MAE_drink,
              '마실거리_RMSLE':RMSLE_drink})

In [None]:
lstm_result.mean()

식사_MSE        0.015842
식사_RMSE       0.115357
식사_MAE        0.096041
식사_RMSLE      0.045043
간식_MSE        0.006963
간식_RMSE       0.072412
간식_MAE        0.060080
간식_RMSLE      0.031221
마실거리_MSE      0.019499
마실거리_RMSE     0.126972
마실거리_MAE      0.107811
마실거리_RMSLE    0.047043
dtype: float64

In [None]:
# GRU
MSE_eat = []
RMSE_eat = []
MAE_eat = []
RMSLE_eat = []

MSE_snack = []
RMSE_snack = []
MAE_snack = []
RMSLE_snack = []

MSE_drink = []
RMSE_drink = []
MAE_drink = []
RMSLE_drink = []

for cat in ['식사', '간식', '마실거리']:
  for dong in dong_list:
    mse, rmse, mae, rmsle = GRU_fun(final_data4, dong, cat)
    print('{0}, {1}, {2}, {3}, {4}, {5}'.format(dong, cat, mse, rmse, mae, rmsle))

    if cat == '식사':
      MSE_eat.append(mse)
      RMSE_eat.append(rmse)
      MAE_eat.append(mae)
      RMSLE_eat.append(rmsle)
      
    elif cat == '간식':
      MSE_snack.append(mse)
      RMSE_snack.append(rmse)
      MAE_snack.append(mae)
      RMSLE_snack.append(rmsle)
      
    else: 
      MSE_drink.append(mse)
      RMSE_drink.append(rmse)
      MAE_drink.append(mae)
      RMSLE_drink.append(rmsle)


In [None]:
gru_result = pd.DataFrame({'동':dong_list,
              '식사_MSE':MSE_eat,
              '식사_RMSE':RMSE_eat,
              '식사_MAE':MAE_eat,
              '식사_RMSLE':RMSLE_eat,
              '간식_MSE':MSE_snack,
              '간식_RMSE':RMSE_snack,
              '간식_MAE':MAE_snack,
              '간식_RMSLE':RMSLE_snack,
              '마실거리_MSE':MSE_drink,
              '마실거리_RMSE':RMSE_drink,
              '마실거리_MAE':MAE_drink,
              '마실거리_RMSLE':RMSLE_drink})

In [None]:
gru_result.mean()

식사_MSE        0.015914
식사_RMSE       0.116324
식사_MAE        0.097095
식사_RMSLE      0.045337
간식_MSE        0.006268
간식_RMSE       0.068403
간식_MAE        0.056125
간식_RMSLE      0.029493
마실거리_MSE      0.014624
마실거리_RMSE     0.111393
마실거리_MAE      0.091851
마실거리_RMSLE    0.041372
dtype: float64

# **4. GRU Bayesian Optimization**

In [18]:
warnings.filterwarnings("ignore")

In [19]:
!pip install scikit-optimize
import skopt

from skopt import gbrt_minimize, gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer  

import keras
from keras.models import Sequential
from keras.layers import Dense
import tensorflow
from tensorflow.python.keras import backend as K

dim_num_dense_layers = Integer(low=1, high=5, name='num_dense_layers')
dim_num_input_nodes = Integer(low=100, high=512, name='num_input_nodes')
dim_num_dense_nodes = Integer(low=10, high=50, name='num_dense_nodes')
dim_activation = Categorical(categories=['relu', 'sigmoid'],
                             name='activation')
dim_batch_size = Integer(low=1, high=32, name='batch_size')

dimensions = [dim_num_dense_layers,
              dim_num_input_nodes,
              dim_num_dense_nodes,
              dim_activation,
              dim_batch_size
             ]
default_parameters = [1, 512, 13, 'relu',32]


Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/8b/03/be33e89f55866065a02e515c5b319304a801a9f1027a9b311a9b1d1f8dc7/scikit_optimize-0.8.1-py2.py3-none-any.whl (101kB)
[K     |███▎                            | 10kB 15.6MB/s eta 0:00:01[K     |██████▌                         | 20kB 2.3MB/s eta 0:00:01[K     |█████████▊                      | 30kB 3.0MB/s eta 0:00:01[K     |█████████████                   | 40kB 3.4MB/s eta 0:00:01[K     |████████████████▏               | 51kB 3.0MB/s eta 0:00:01[K     |███████████████████▍            | 61kB 3.3MB/s eta 0:00:01[K     |██████████████████████▊         | 71kB 3.7MB/s eta 0:00:01[K     |██████████████████████████      | 81kB 3.6MB/s eta 0:00:01[K     |█████████████████████████████▏  | 92kB 3.8MB/s eta 0:00:01[K     |████████████████████████████████| 102kB 3.4MB/s 
Collecting pyaml>=16.9
  Downloading https://files.pythonhosted.org/packages/15/c4/1310a054d33abc318426a956e7d6df0df76a6ddf

In [20]:
from keras.optimizers import Adam
def create_model(num_dense_layers,num_input_nodes,
                 num_dense_nodes, activation):
    #start the model making process and create our first layer

    X_train, y_train, X_test, y_test,_,_ = data_pipeline(final_data4, d, c, 7, 1) 

    model = Sequential()
    model.add(GRU(num_input_nodes, input_shape=(None, X_train.shape[2]), activation=activation
                   ))
    #create a loop making a new dense layer for the amount passed to this model.
    #naming the layers helps avoid tensorflow error deep in the stack trace.
    for i in range(num_dense_layers):
        name = 'layer_dense_{0}'.format(i+1)
        model.add(Dense(num_dense_nodes,
                 activation=activation,
                        name=name
                 ))
    #add our classification layer.
    model.add(Dense(1))
    
    #setup our optimizer and compile
    model.compile(optimizer='adam', loss='mse')
    return model

In [21]:
from keras.models import load_model

@use_named_args(dimensions=dimensions)
def fitness(num_dense_layers, num_input_nodes, 
            num_dense_nodes,activation, batch_size):
  
  model = create_model(num_dense_layers=num_dense_layers,
                        num_input_nodes=num_input_nodes,
                        num_dense_nodes=num_dense_nodes,
                        activation=activation
                      )
  
  X_train,y_train,X_test,y_test,min,max = data_pipeline(final_data4, d, c,7,1) 
  
  early_stopping = EarlyStopping(monitor='val_loss', patience=30, mode='min', restore_best_weights=True)
  model.fit(X_train, y_train, epochs=5000, batch_size=32, verbose=0, callbacks=[early_stopping], validation_data = (X_test, y_test))

  y_pred = model.predict(X_test, batch_size=1)
  #y_pred = y_pred.reshape(-1,1) *(max-min) + min
  #y_test = y_test *(max-min) + min

  mse = np.mean((y_test-y_pred)**2)
  rmse = np.sqrt(mse)

  print()
  print("MSE: {}, RMSE: {}".format(mse,rmse))
  print()

  global best_mse
  
  if mse <= best_mse:
      model.save('./gdrive/My Drive/빅콘 대상팀/분석 code/GRU/models/{}{} GRU.hdf5'.format(d,c))
      best_mse = mse

  del model
  
  return mse

In [22]:
for c in ['식사', '간식', '마실거리']:
  for d in dong_list:
    best_mse=100

    gp_result = gp_minimize(func=fitness,
                                dimensions=dimensions,
                                n_calls=12,
                                noise= 0.01,
                                n_jobs=-1,
                                kappa = 5,
                                x0=default_parameters)
    
    
    from skopt import dump, load
    dump(gp_result, 'result.pkl')
    gp_result_loaded = load('result.pkl')

    print('{0} {1}, Best MSE={2}'.format(d, c, gp_result_loaded.fun))

    print("""Best parameters:
    - num_dense_layers=%d
    - num_input_nodes=%d
    - num_dense_nodes=%d
    - activation=%s
    - batch_size=%d"""  % (gp_result_loaded.x[0], gp_result_loaded.x[1],
                            gp_result_loaded.x[2], gp_result_loaded.x[3],
                            gp_result_loaded.x[4]))
    
    



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
MSE: 0.021035586215733006, RMSE: 0.14503649959831838


MSE: 0.01596083280152513, RMSE: 0.1263361895955594


MSE: 0.017330417278759773, RMSE: 0.1316450427428233


MSE: 0.021581685683177698, RMSE: 0.14690706478307194


MSE: 0.017319463303067155, RMSE: 0.13160343195778426


MSE: 0.020819076838704726, RMSE: 0.14428817289959953


MSE: 0.020812035443669365, RMSE: 0.14426377037797591


MSE: 0.014549418676214083, RMSE: 0.12062097112946024


MSE: 0.025124680720049947, RMSE: 0.15850766770112398


MSE: 0.011633168039167873, RMSE: 0.10785716498762553


MSE: 0.017477710978975786, RMSE: 0.13220329413057674

상계2동 식사, Best MSE=0.011633168039167873
Best parameters:
    - num_dense_layers=3
    - num_input_nodes=489
    - num_dense_nodes=48
    - activation=relu
    - batch_size=24

MSE: 0.0534245983334849, RMSE: 0.23113761773775576


MSE: 0.061649717326895906, RMSE: 0.24829361112782566


MSE: 0.05648070450378121, RMSE: 0.23765669463278583


MSE: 0.05848