## 1. 향후 7일 입도객 예측 데이터 추출 / 전처리 / 피쳐 엔지니어링

#### 패키지 임포트

In [5]:
# Ignore the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# Data manipulation, visualization and useful functions
import argparse
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import date, timedelta
import missingno as msno

# gcp functions
from google.cloud import bigquery
from google.cloud import storage
from google.cloud import aiplatform

# Keras and tensorflow
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Dense, Activation, Flatten, Dropout
from keras.layers import SimpleRNN, LSTM, GRU
from keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import layers
import keras_tuner

#### 데이터 쿼리 from 빅쿼리

In [6]:
# Set up BigQuery clients
bqclient = bigquery.Client(project='charged-genre-350106')

# Query for base dataset

query_AP = """
    SELECT
      CONCAT(CAST(FORMAT_DATE("%E4Y", CAST(DT as date)) AS string),
             CAST(FORMAT_DATE("%m", CAST(DT as date)) AS string),
             CAST(FORMAT_DATE("%d", CAST(DT as date)) AS string)) AS DT
      , ARRIVE_PPL as ARRIVE_PPL
    FROM `charged-genre-350106.kaflix.AIRLINE_PASSENGER`
    """

query_NS = """
    SELECT *
    FROM `charged-genre-350106.kaflix.NAVER_SEARCH`
    """

query_AT = """
    WITH MAIN AS (
        SELECT *
        FROM kaflix.AIRLINE_TICKET A
        WHERE 1=1
            -- AND A.DEPART = 'GMP'
            -- AND A.ARRIVE = 'CJU'
        AND A.SEARCH_DATE NOT IN ('2022-07-08', '2022-07-09', '2022-07-10',  '2022-07-12')
    )
    SELECT
        A.*
        , B.B
        , B.F
        , B.D
        , B.S
        , B.SD
    FROM(
        SELECT
           CONCAT(CAST(FORMAT_DATE("%E4Y", CAST(LEFT(DEPART_DATE,10) as date)) AS string),
                  CAST(FORMAT_DATE("%m", CAST(LEFT(DEPART_DATE,10) as date)) AS string),
                  CAST(FORMAT_DATE("%d", CAST(LEFT(DEPART_DATE,10) as date)) AS string)) AS DEPART_DT
           , CONCAT(CAST(FORMAT_DATE("%E4Y", CAST(LEFT(SEARCH_DATE, 10) as date)) AS string),
                    CAST(FORMAT_DATE("%m", CAST(LEFT(SEARCH_DATE,10) as date)) AS string),
                    CAST(FORMAT_DATE("%d", CAST(LEFT(SEARCH_DATE,10) as date)) AS string)) AS SEARCH_DT
           , CAST(FORMAT_DATE("%a", CAST(LEFT(DEPART_DATE, 10) as date)) AS string) AS DOW
           , DATE_DIFF(CAST(LEFT(DEPART_DATE,10) AS date), CAST(LEFT(SEARCH_DATE,10) as date), DAY) AS LEAD_TM
        , COUNT(*) AS TICKET
        , MIN(FARE) AS FARE_MIN
        , MAX(FARE) AS FARE_MAX
        , AVG(FARE) AS FARE_AVG
        , STDDEV(FARE) AS FARE_STD
        , MIN(AVAIL_SEAT) AS SEAT_MIN
        , MAX(AVAIL_SEAT) AS SEAT_MAX
        , AVG(AVAIL_SEAT) AS SEAT_AVG
        , STDDEV(AVAIL_SEAT) AS SEAT_STD
        FROM MAIN
        WHERE 1=1
        GROUP BY 
        DEPART_DATE, SEARCH_DATE
    ) A
    LEFT JOIN (
        SELECT *
        FROM(
            SELECT
            CONCAT(CAST(FORMAT_DATE("%E4Y", CAST(LEFT(DEPART_DATE,10) as date)) AS string), CAST(FORMAT_DATE("%m", CAST(LEFT(DEPART_DATE,10) as date)) AS string), CAST(FORMAT_DATE("%d", CAST(LEFT(DEPART_DATE,10) as date)) AS string)) AS DEPART_DT
            , CONCAT(CAST(FORMAT_DATE("%E4Y", CAST(LEFT(SEARCH_DATE, 10) as date)) AS string), CAST(FORMAT_DATE("%m", CAST(LEFT(SEARCH_DATE,10) as date)) AS string), CAST(FORMAT_DATE("%d", CAST(LEFT(SEARCH_DATE,10) as date)) AS string)) AS SEARCH_DT 
                , CLASS_DESC
            FROM MAIN
            WHERE 1=1
        ) A
        PIVOT (
            COUNT(*) 
            FOR CLASS_DESC IN ('비즈니스석' AS B, '일반석' AS F, '할인석' AS D, '특가석' AS S, '단독특가' AS SD)
        )
    ) B ON A.DEPART_DT = B.DEPART_DT AND A.SEARCH_DT = B.SEARCH_DT
    WHERE 1=1
        -- AND A.DEPART_DT < '20220725'
    ORDER BY 
      A.DEPART_DT DESC
      , A.LEAD_TM
    """

query_HD = """
    SELECT
      CONCAT(CAST(FORMAT_DATE("%E4Y", CAST(DT as date)) AS string),
             CAST(FORMAT_DATE("%m", CAST(DT as date)) AS string),
             CAST(FORMAT_DATE("%d", CAST(DT as date)) AS string)) AS DT
      , TOURIST AS TOURIST
      , TEMPERTURE AS TEMPERTURE
      , RAIN AS RAIN 
      , HOLIDAY_NAME AS HOLIDAY_NAME
    FROM `charged-genre-350106.kaflix.TOURIST_WEATHER`
    """

query_RC = """
    SELECT
      PARSE_DATE('%Y%m%d', PURCHASE_DATE) AS PURCHASE_DT
      , PARSE_DATE('%Y%m%d', BEGIN_DATE) AS BEGIN_DT
      , DATE_DIFF (PARSE_DATE('%Y%m%d', BEGIN_DATE),PARSE_DATE('%Y%m%d', PURCHASE_DATE), DAY) AS LEAD_TM
    FROM `charged-genre-350106.kaflix.ERP_MERGING`
    """

query_TW = """
    SELECT *
    FROM `charged-genre-350106.kaflix.TWAY_KAFLIX`
    """

#### 데이터 로드 from 빅쿼리

In [7]:
## Data Load
def data_load(AP, AT, HD, RC, TW, start_date, end_date):
    psg = bqclient.query(AP).to_dataframe()
    air = bqclient.query(AT).to_dataframe()
    wth = bqclient.query(HD).to_dataframe()
    rc = bqclient.query(RC).to_dataframe()
    tw = bqclient.query(TW).to_dataframe()
    
    # converting datetime
    psg['DT']=pd.to_datetime(psg['DT'])
    air['DEPART_DT']=pd.to_datetime(air['DEPART_DT'])
    wth['DT']=pd.to_datetime(wth['DT'])
    rc['BEGIN_DT']=pd.to_datetime(rc['BEGIN_DT'])
    tw['SEARCH_DATE']=pd.to_datetime(tw['SEARCH_DATE'])
    tw['DEPART_DATE']=pd.to_datetime(tw['DEPART_DATE'])

    # air ticket 7days lagged to predict next 7days
    air = air.query("1<=LEAD_TM <= 21")
    rc=rc.query('1<=LEAD_TM<=21')

    # Indexing pgs date & left join with other data
    psg.set_index("DT", inplace=True)
    psg = psg.loc[(psg.index >= start_date) & (psg.index<= end_date),:]  # 0513까지 삭제
    psg = psg.groupby("DT")['ARRIVE_PPL'].sum() # 일별 입도객 합계
    psg = pd.DataFrame(psg)                     # 데이터프레임만들고
    
    # column rename to DT
    air.rename(columns={'DEPART_DT':'DT'}, inplace=True)
    rc.rename(columns={'BEGIN_DT':'DT'}, inplace=True)
    #tw.rename(columns={'DEPART_DATE':'DT'}, inplace=True)

    wth = wth[['DT','TEMPERTURE','RAIN','HOLIDAY_NAME']]
    air = air[['DT','LEAD_TM','TICKET','FARE_MIN','FARE_MAX','FARE_AVG', 'FARE_STD', 'SEAT_MIN', 'SEAT_MAX', 'SEAT_AVG', 'SEAT_STD','B','F','D','S','SD']]
    return psg, air, wth, rc, tw

### 렌타카 데이터 전처리

In [8]:
def rentacar_processing(df):
    df=df.groupby(['PURCHASE_DT','DT','LEAD_TM']).size().reset_index()
    df.columns=['purchase_dt','DT', 'lead_time','count']
    df=df.pivot_table(index=['DT'], columns='lead_time', values='count')
    df=df.reset_index()
    return df

### 티웨이 데이터 전처리

In [9]:
def tway_processing(df):
    df1=df[['SEARCH_DATE','DEPART_DATE','FLIGHT_MODEL']]
    df1=df1.groupby(['SEARCH_DATE','DEPART_DATE','FLIGHT_MODEL']).size().reset_index()
    df1=df1.pivot_table(index=['SEARCH_DATE','DEPART_DATE'], columns='FLIGHT_MODEL', aggfunc=['sum']).reset_index()
    df1['lead_time']=df1['DEPART_DATE']-df1['SEARCH_DATE']
    df1['lead_time']=df1['lead_time'].dt.days
    df1.columns=df1.columns.to_flat_index()
    df1.columns=['search_date','depart_date','330','737','lead_time']
    df1=df1.fillna(0)

    
    df2=df[['SEARCH_DATE','DEPART_DATE','GROUP_SOLD','TOTAL_SOLD']]
    df2=df2.groupby(['SEARCH_DATE','DEPART_DATE']).sum().reset_index()
    df2['lead_time']=df2['DEPART_DATE'] - df2['SEARCH_DATE']
    df2['lead_time']=df2['lead_time'].dt.days
    df2.columns=['search_date','depart_date','group_sold','total_sold','lead_time']
    
    df3=pd.merge(df1, df2, how='left', on=['search_date','depart_date'])
    df3=df3.drop('lead_time_y', axis=1)
    df3.columns=['search_date','depart_date','group_sold','total_sold','lead_time','330','737']
    df3=df3.query('1<=lead_time<=21')
    
    df3=df3.pivot_table(index=['depart_date'], columns='lead_time', values=['group_sold','total_sold','330','737'])
    df3.columns=df3.columns.to_flat_index()
    df3=df3.reset_index()
    df3.columns=['DT','330_1','330_2','330_3','330_4','330_5','330_6','330_7',
                      '330_8','330_9','330_10','330_11','330_12','330_13','330_14',
                      '330_15','330_16','330_17','330_18','330_19','330_20','330_21',
                      '737_1','737_2','737_3','737_4','737_5','737_6','737_7',
                      '737_8','737_9','737_10','737_11','737_12','737_13','737_14',
                      '737_15','737_16','737_17','737_18','737_19','737_20','737_21',
                      'group_sold_1','group_sold_2','group_sold_3','group_sold_4','group_sold_5','group_sold_6','group_sold_7',
                      'group_sold_8','group_sold_9','group_sold_10','group_sold_11','group_sold_12','group_sold_13','group_sold_14',
                      'group_sold_15','group_sold_16','group_sold_17','group_sold_18','group_sold_19','group_sold_20','group_sold_21',
                      'total_sold_1','total_sold_2','total_sold_3','total_sold_4','total_sold_5','total_sold_6','total_sold_7',
                      'total_sold_8','total_sold_9','total_sold_10','total_sold_11','total_sold_12','total_sold_13','total_sold_14',
                      'total_sold_15','total_sold_16','total_sold_17','total_sold_18','total_sold_19','total_sold_20','total_sold_21']
    return df3

#### 휴일 데이터 전처리

In [10]:
## Holiday Data Preprocessing
def holiday_data_pre(df):
    wth_h = df[['DT', 'HOLIDAY_NAME']]

    # 1 for all holidays , 0 for others
    wth_h.HOLIDAY_NAME.loc[~wth_h.HOLIDAY_NAME.isnull()] = 1
    wth_h = wth_h.sort_values('DT')
    wth_h.HOLIDAY_NAME.loc[wth_h.HOLIDAY_NAME.isnull()] = 0
    wth_h.HOLIDAY_NAME.unique()

    # [long holidays] and [Holidays Interspersed with Workdays]
    wth_h['dayofweek'] = wth_h.DT.dt.dayofweek

    # 공휴일인데 금요일(4)이면 그주 금요일(4)을 연휴시작으로
    # 공휴일인데 월요일(0)이면 전주 토요일(5)을 연휴시작으로
    wth_h['flong']=np.where((wth_h['HOLIDAY_NAME'] == 1) & (wth_h['dayofweek'] == 4), 1, 0)
    wth_h['mlong']=np.where((wth_h['HOLIDAY_NAME'] == 1) & (wth_h['dayofweek'] == 0), 1, 0)

    #월요일 연휴는 그 전 토요일에 연휴 시작 표시
    wth_h['mlong']= wth_h['mlong'].shift(-2)
    wth_h['mlong'].fillna(method='ffill', inplace = True)

    #금요일 연휴, 월요일 연휴 컬럼 합치기
    wth_h['long_h']=wth_h['flong'] + wth_h['mlong'] 

    #연휴시작 하루 전날 표시
    wth_h['b_long_h']=wth_h['long_h'].shift(-1)
    wth_h['b_long_h'].fillna(method='ffill', inplace = True)

    # 징검다리 로직
    # 휴일인데 목요일(3)이면 그주 목요일(3)을 징검다리 연휴시작
    # 휴일인데 화요일(1)이면 전주 토요일(5)을 징검다리 연휴시작
    wth_h['thinter'] = np.where((wth_h['HOLIDAY_NAME'] == 1) & (wth_h['dayofweek'] == 3), 1, 0)
    wth_h['tuinter'] = np.where((wth_h['HOLIDAY_NAME'] == 1) & (wth_h['dayofweek'] == 1), 1, 0)

    # 화요일 징검다리는 전주 토요일에 연휴 시작 표시
    wth_h['tuinter'] = wth_h['tuinter'].shift(-3)
    wth_h['tuinter'].fillna(method='ffill', inplace = True)
    wth_h['inter_h'] = wth_h['thinter'] + wth_h['tuinter'] 

    # 징검다리 시작 하루 전날 표시
    wth_h['b_inter_h'] = wth_h['inter_h'].shift(-1)
    wth_h['b_inter_h'].fillna(method='ffill', inplace = True)

    # 명절 연휴 시작일 및 시작일 전날 표시 
    #holiday    : 설날/추석 명절 -> 1
    #first_m    : 설날/추석 명절연휴 시작일,단 명절 연휴시작이 (일) 또는 (월)이면 (토)가 시작일 -> 1 표시
    #b_first_m  : 설날/추석 명절연휴 시작일 전날, 단 명절 연휴시작이 (일) 또는 (월)이면 (금)이 시작일 전날 -> 1로 표시

    #first_m    : 설날/추석 명절연휴 시작일,단 명절 연휴시작이 (일) 또는 (월)이면 (토)가 시작일 -> 1 표시
    #2017-01-27, 2017-10-03, 2018-02-15, 2018-09-23, 2019-02-04, 2019-09-12, 2020-01-24, 2020-09-30, 2021-02-11, 2021-09-30, 2022-02-01

    #first_m_nosm   : 설날/추석 명절연휴 시작일,단 명절 연휴시작이 (화) 또는 (수) 또는 (목) 또는 (금) 또는 (토) 이면 1
    wth_h['first_m_nosm']=np.where(((wth_h['DT'] == '2017-01-27') |
                              (wth_h['DT'] == '2017-10-03') |
                              (wth_h['DT'] == '2018-02-15') |
                              (wth_h['DT'] == '2018-09-23') |
                              (wth_h['DT'] == '2019-02-04') |
                              (wth_h['DT'] == '2019-09-12') |
                              (wth_h['DT'] == '2020-01-24') |
                              (wth_h['DT'] == '2020-09-30') |
                              (wth_h['DT'] == '2021-02-11') |
                              (wth_h['DT'] == '2021-09-20') |
                              (wth_h['DT'] == '2022-01-31') |
                              (wth_h['DT'] == '2022-09-09') )
                              & ((wth_h['dayofweek'] != 6) & (wth_h['dayofweek'] !=0)), 1,0)

    #first_m_s   : 설날/추석 명절연휴 시작일,단 명절 연휴시작이 (일)이면 1로 표시하고 shift -1
    wth_h['first_m_s']=np.where(((wth_h['DT'] == '2017-01-27') |
                              (wth_h['DT'] == '2017-10-03') |
                              (wth_h['DT'] == '2018-02-15') |
                              (wth_h['DT'] == '2018-09-23') |
                              (wth_h['DT'] == '2019-02-04') |
                              (wth_h['DT'] == '2019-09-12') |
                              (wth_h['DT'] == '2020-01-24') |
                              (wth_h['DT'] == '2020-09-30') |
                              (wth_h['DT'] == '2021-02-11') |
                              (wth_h['DT'] == '2021-09-20') |
                              (wth_h['DT'] == '2022-01-31') |
                              (wth_h['DT'] == '2022-09-09') )
                              & (wth_h['dayofweek'] == 6), 1,0)
    wth_h['first_m_s'] = wth_h['first_m_s'].shift(-1)
    wth_h['first_m_s'].fillna(method='ffill', inplace = True)

    #first_m_m   : 설날/추석 명절연휴 시작일,단 명절 연휴시작이 (월)이면 1로 표시하고 shift -2
    wth_h['first_m_m']=np.where(((wth_h['DT'] == '2017-01-27') |
                              (wth_h['DT'] == '2017-10-03') |
                              (wth_h['DT'] == '2018-02-15') |
                              (wth_h['DT'] == '2018-09-23') |
                              (wth_h['DT'] == '2019-02-04') |
                              (wth_h['DT'] == '2019-09-12') |
                              (wth_h['DT'] == '2020-01-24') |
                              (wth_h['DT'] == '2020-09-30') |
                              (wth_h['DT'] == '2021-02-11') |
                              (wth_h['DT'] == '2021-09-20') |
                              (wth_h['DT'] == '2022-01-31') |
                              (wth_h['DT'] == '2022-09-09') )
                              & (wth_h['dayofweek'] == 0), 1,0)
    wth_h['first_m_m'] = wth_h['first_m_m'].shift(-2)
    wth_h['first_m_m'].fillna(method='ffill', inplace = True)

    wth_h['first_m'] = wth_h['first_m_nosm'] + wth_h['first_m_s'] + wth_h['first_m_m']

    wth_h['b_first_m'] = wth_h['first_m'].shift(-1)
    wth_h['b_first_m'].fillna(method='ffill', inplace=True)

    wth_h.drop(['flong', 'mlong', 'thinter','tuinter', 'first_m_nosm','first_m_s', 'first_m_m'], axis=1, inplace=True)
    return wth_h

#### 항공티켓 전처리

In [11]:
# LOUIE'S CELL
def air_dataprocessing(air):
    rename = ['DT', 'ticket_1', 'ticket_2', 'ticket_3','ticket_4','ticket_5','ticket_6','ticket_7',
                    'ticket_8', 'ticket_9', 'ticket_10','ticket_11','ticket_12','ticket_13','ticket_14',
                    'ticket_15','ticket_16', 'ticket_17','ticket_18','ticket_19','ticket_20','ticket_21',
                    'fare_min_1','fare_min_2','fare_min_3','fare_min_4','fare_min_5','fare_min_6','fare_min_7',
                    'fare_min_8','fare_min_9','fare_min_10','fare_min_11','fare_min_12','fare_min_13','fare_min_14',
                    'fare_min_15','fare_min_16','fare_min_17','fare_min_18','fare_min_19','fare_min_20','fare_min_21', 
                    'fare_max_1','fare_max_2','fare_max_3','fare_max_4','fare_max_5','fare_max_6','fare_max_7',
                    'fare_max_8','fare_max_9','fare_max_10','fare_max_11','fare_max_12','fare_max_13','fare_max_14',
                    'fare_max_15','fare_max_16','fare_max_17','fare_max_18','fare_max_19','fare_max_20','fare_max_21',
                    'fare_avg_1','fare_avg_2','fare_avg_3','fare_avg_4','fare_avg_5','fare_avg_6','fare_avg_7',
                    'fare_avg_8','fare_avg_9','fare_avg_10','fare_avg_11','fare_avg_12','fare_avg_13','fare_avg_14',
                    'fare_avg_15','fare_avg_16','fare_avg_17','fare_avg_18','fare_avg_19','fare_avg_20','fare_avg_21',
                    'fare_std_1','fare_std_2','fare_std_3','fare_std_4','fare_std_5','fare_std_6','fare_std_7',
                    'fare_std_8','fare_std_9','fare_std_10','fare_std_11','fare_std_12','fare_std_13','fare_std_14',
                    'fare_std_15','fare_std_16','fare_std_17','fare_std_18','fare_std_19','fare_std_20','fare_std_21',
                    'seat_min_1','seat_min_2','seat_min_3','seat_min_4','seat_min_5','seat_min_6','seat_min_7',
                    'seat_min_8','seat_min_9','seat_min_10','seat_min_11','seat_min_12','seat_min_13','seat_min_14',
                    'seat_min_15','seat_min_16','seat_min_17','seat_min_18','seat_min_19','seat_min_20','seat_min_21',
                    'seat_max_1','seat_max_2','seat_max_3','seat_max_4','seat_max_5','seat_max_6','seat_max_7',
                    'seat_max_8','seat_max_9','seat_max_10','seat_max_11','seat_max_12','seat_max_13','seat_max_14',
                    'seat_max_15','seat_max_16','seat_max_17','seat_max_18','seat_max_19','seat_max_20','seat_max_21',
                    'seat_avg_1','seat_avg_2','seat_avg_3','seat_avg_4','seat_avg_5','seat_avg_6','seat_avg_7',
                    'seat_avg_8','seat_avg_9','seat_avg_10','seat_avg_11','seat_avg_12','seat_avg_13','seat_avg_14',
                    'seat_avg_15','seat_avg_16','seat_avg_17','seat_avg_18','seat_avg_19','seat_avg_20','seat_avg_21',
                    'seat_std_1','seat_std_2','seat_std_3','seat_std_4','seat_std_5','seat_std_6','seat_std_7',
                    'seat_std_8','seat_std_9','seat_std_10','seat_std_11','seat_std_12','seat_std_13','seat_std_14',
                    'seat_std_15','seat_std_16','seat_std_17','seat_std_18','seat_std_19','seat_std_20','seat_std_21',
                    'b_1','b_2','b_3','b_4','b_5','b_6','b_7','b_8','b_9','b_10','b_11','b_12','b_13','b_14','b_15','b_16','b_17','b_18','b_19','b_20','b_21',
                    'f_1','f_2','f_3','f_4','f_5','f_6','f_7','f_8','f_9','f_10','f_11','f_12','f_13','f_14','f_15','f_16','f_17','f_18','f_19','f_20','f_21',
                    'd_1','d_2','d_3','d_4','d_5','d_6','d_7','d_8','d_9','d_10','d_11','d_12','d_13','d_14','d_15','d_16','d_17','d_18','d_19','d_20','d_21',
                    's_1','s_2','s_3','s_4','s_5','s_6','s_7','s_8','s_9','s_10','s_11','s_12','s_13','s_14','s_15','s_16','s_17','s_18','s_19','s_20','s_21',
                    'sd_1','sd_2','sd_3','sd_4','sd_5','sd_6','sd_7','sd_8','sd_9','sd_10','sd_11','sd_12','sd_13','sd_14','sd_15','sd_16','sd_17','sd_18','sd_19','sd_20','sd_21']

    air.columns = rename
    
    datetime_df = air[['DT']]
    ticket_df = air[['ticket_1', 'ticket_2', 'ticket_3','ticket_4','ticket_5','ticket_6','ticket_7',
                    'ticket_8', 'ticket_9', 'ticket_10','ticket_11','ticket_12','ticket_13','ticket_14',
                    'ticket_15','ticket_16', 'ticket_17','ticket_18','ticket_19','ticket_20','ticket_21']]
    faremin_df = air[['fare_min_1','fare_min_2','fare_min_3','fare_min_4','fare_min_5','fare_min_6','fare_min_7',
                    'fare_min_8','fare_min_9','fare_min_10','fare_min_11','fare_min_12','fare_min_13','fare_min_14',
                    'fare_min_15','fare_min_16','fare_min_17','fare_min_18','fare_min_19','fare_min_20','fare_min_21']]
    faremax_df = air[['fare_max_1','fare_max_2','fare_max_3','fare_max_4','fare_max_5','fare_max_6','fare_max_7',
                    'fare_max_8','fare_max_9','fare_max_10','fare_max_11','fare_max_12','fare_max_13','fare_max_14',
                    'fare_max_15','fare_max_16','fare_max_17','fare_max_18','fare_max_19','fare_max_20','fare_max_21']]
    fareavg_df = air[['fare_avg_1','fare_avg_2','fare_avg_3','fare_avg_4','fare_avg_5','fare_avg_6','fare_avg_7',
                    'fare_avg_8','fare_avg_9','fare_avg_10','fare_avg_11','fare_avg_12','fare_avg_13','fare_avg_14',
                    'fare_avg_15','fare_avg_16','fare_avg_17','fare_avg_18','fare_avg_19','fare_avg_20','fare_avg_21']]
    farestd_df = air[['fare_std_1','fare_std_2','fare_std_3','fare_std_4','fare_std_5','fare_std_6','fare_std_7',
                    'fare_std_8','fare_std_9','fare_std_10','fare_std_11','fare_std_12','fare_std_13','fare_std_14',
                    'fare_std_15','fare_std_16','fare_std_17','fare_std_18','fare_std_19','fare_std_20','fare_std_21']]
    seatmin_df = air[['seat_min_1','seat_min_2','seat_min_3','seat_min_4','seat_min_5','seat_min_6','seat_min_7',
                    'seat_min_8','seat_min_9','seat_min_10','seat_min_11','seat_min_12','seat_min_13','seat_min_14',
                    'seat_min_15','seat_min_16','seat_min_17','seat_min_18','seat_min_19','seat_min_20','seat_min_21']]
    seatmax_df = air[['seat_max_1','seat_max_2','seat_max_3','seat_max_4','seat_max_5','seat_max_6','seat_max_7',
                    'seat_max_8','seat_max_9','seat_max_10','seat_max_11','seat_max_12','seat_max_13','seat_max_14',
                    'seat_max_15','seat_max_16','seat_max_17','seat_max_18','seat_max_19','seat_max_20','seat_max_21']]
    seatavg_df = air[['seat_avg_1','seat_avg_2','seat_avg_3','seat_avg_4','seat_avg_5','seat_avg_6','seat_avg_7',
                    'seat_avg_8','seat_avg_9','seat_avg_10','seat_avg_11','seat_avg_12','seat_avg_13','seat_avg_14',
                    'seat_avg_15','seat_avg_16','seat_avg_17','seat_avg_18','seat_avg_19','seat_avg_20','seat_avg_21']]
    seatstd_df = air[['seat_std_1','seat_std_2','seat_std_3','seat_std_4','seat_std_5','seat_std_6','seat_std_7',
                    'seat_std_8','seat_std_9','seat_std_10','seat_std_11','seat_std_12','seat_std_13','seat_std_14',
                    'seat_std_15','seat_std_16','seat_std_17','seat_std_18','seat_std_19','seat_std_20','seat_std_21']]
    business_df = air[['b_1','b_2','b_3','b_4','b_5','b_6','b_7','b_8','b_9','b_10','b_11','b_12','b_13','b_14','b_15','b_16','b_17','b_18','b_19','b_20','b_21']]
    first_df =    air[['f_1','f_2','f_3','f_4','f_5','f_6','f_7','f_8','f_9','f_10','f_11','f_12','f_13','f_14','f_15','f_16','f_17','f_18','f_19','f_20','f_21']]
    discount_df = air[['d_1','d_2','d_3','d_4','d_5','d_6','d_7','d_8','d_9','d_10','d_11','d_12','d_13','d_14','d_15','d_16','d_17','d_18','d_19','d_20','d_21']]
    special_df =  air[['s_1','s_2','s_3','s_4','s_5','s_6','s_7','s_8','s_9','s_10','s_11','s_12','s_13','s_14','s_15','s_16','s_17','s_18','s_19','s_20','s_21']]
    speciald_df = air[['sd_1','sd_2','sd_3','sd_4','sd_5','sd_6','sd_7','sd_8','sd_9','sd_10','sd_11','sd_12','sd_13','sd_14','sd_15','sd_16','sd_17','sd_18','sd_19','sd_20','sd_21']]


    ticket_df = ticket_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    faremin_df = faremin_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    faremax_df = faremax_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    fareavg_df = fareavg_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    farestd_df = farestd_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    seatmin_df = seatmin_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    seatmax_df = seatmax_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    seatavg_df = seatavg_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    seatstd_df = seatstd_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    business_df = business_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    fitst_df = first_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    discount_df = discount_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    special_df = special_df.apply(lambda row: row.fillna(row.mean()), axis=1)
    speciald_df = speciald_df.apply(lambda row: row.fillna(row.mean()), axis=1)

    air2 = pd.concat([datetime_df,ticket_df, faremin_df, faremax_df, fareavg_df, farestd_df, seatmin_df, seatmax_df, seatavg_df, seatstd_df, business_df, fitst_df, discount_df, special_df, speciald_df], axis = 1)
    
    return air2

#### 데이터 통합 및 피쳐 엔지니어링

In [12]:
## Data Merge
def data_merge(psg_df, wth_h_df, air_df, rc_df, tw_df, start_date):
    merged = pd.merge(psg_df, wth_h_df, on="DT", how="left")
    merged = pd.merge(merged, air_df, on='DT', how='left')
    merged = pd.merge(merged, rc_df, on='DT', how='left')
    merged = pd.merge(merged, tw_df, on='DT', how='left')
    
    
    # columns rename
    merged.columns = ['datetime' , 'count', 'holiday', 'dayofweek',
                      'long_h','b_long_h','inter_h','b_inter_h','first_m','b_first_m',
                    'ticket_1', 'ticket_2', 'ticket_3','ticket_4','ticket_5','ticket_6','ticket_7',
                    'ticket_8', 'ticket_9', 'ticket_10','ticket_11','ticket_12','ticket_13','ticket_14',
                    'ticket_15','ticket_16', 'ticket_17','ticket_18','ticket_19','ticket_20','ticket_21',
                    'fare_min_1','fare_min_2','fare_min_3','fare_min_4','fare_min_5','fare_min_6','fare_min_7',
                    'fare_min_8','fare_min_9','fare_min_10','fare_min_11','fare_min_12','fare_min_13','fare_min_14',
                    'fare_min_15','fare_min_16','fare_min_17','fare_min_18','fare_min_19','fare_min_20','fare_min_21', 
                    'fare_max_1','fare_max_2','fare_max_3','fare_max_4','fare_max_5','fare_max_6','fare_max_7',
                    'fare_max_8','fare_max_9','fare_max_10','fare_max_11','fare_max_12','fare_max_13','fare_max_14',
                    'fare_max_15','fare_max_16','fare_max_17','fare_max_18','fare_max_19','fare_max_20','fare_max_21',
                    'fare_avg_1','fare_avg_2','fare_avg_3','fare_avg_4','fare_avg_5','fare_avg_6','fare_avg_7',
                    'fare_avg_8','fare_avg_9','fare_avg_10','fare_avg_11','fare_avg_12','fare_avg_13','fare_avg_14',
                    'fare_avg_15','fare_avg_16','fare_avg_17','fare_avg_18','fare_avg_19','fare_avg_20','fare_avg_21',
                    'fare_std_1','fare_std_2','fare_std_3','fare_std_4','fare_std_5','fare_std_6','fare_std_7',
                    'fare_std_8','fare_std_9','fare_std_10','fare_std_11','fare_std_12','fare_std_13','fare_std_14',
                    'fare_std_15','fare_std_16','fare_std_17','fare_std_18','fare_std_19','fare_std_20','fare_std_21',
                    'seat_min_1','seat_min_2','seat_min_3','seat_min_4','seat_min_5','seat_min_6','seat_min_7',
                    'seat_min_8','seat_min_9','seat_min_10','seat_min_11','seat_min_12','seat_min_13','seat_min_14',
                    'seat_min_15','seat_min_16','seat_min_17','seat_min_18','seat_min_19','seat_min_20','seat_min_21',
                    'seat_max_1','seat_max_2','seat_max_3','seat_max_4','seat_max_5','seat_max_6','seat_max_7',
                    'seat_max_8','seat_max_9','seat_max_10','seat_max_11','seat_max_12','seat_max_13','seat_max_14',
                    'seat_max_15','seat_max_16','seat_max_17','seat_max_18','seat_max_19','seat_max_20','seat_max_21',
                    'seat_avg_1','seat_avg_2','seat_avg_3','seat_avg_4','seat_avg_5','seat_avg_6','seat_avg_7',
                    'seat_avg_8','seat_avg_9','seat_avg_10','seat_avg_11','seat_avg_12','seat_avg_13','seat_avg_14',
                    'seat_avg_15','seat_avg_16','seat_avg_17','seat_avg_18','seat_avg_19','seat_avg_20','seat_avg_21',
                    'seat_std_1','seat_std_2','seat_std_3','seat_std_4','seat_std_5','seat_std_6','seat_std_7',
                    'seat_std_8','seat_std_9','seat_std_10','seat_std_11','seat_std_12','seat_std_13','seat_std_14',
                    'seat_std_15','seat_std_16','seat_std_17','seat_std_18','seat_std_19','seat_std_20','seat_std_21',
                    'b_1','b_2','b_3','b_4','b_5','b_6','b_7','b_8','b_9','b_10','b_11','b_12','b_13','b_14','b_15','b_16','b_17','b_18','b_19','b_20','b_21',
                    'f_1','f_2','f_3','f_4','f_5','f_6','f_7','f_8','f_9','f_10','f_11','f_12','f_13','f_14','f_15','f_16','f_17','f_18','f_19','f_20','f_21',
                    'd_1','d_2','d_3','d_4','d_5','d_6','d_7','d_8','d_9','d_10','d_11','d_12','d_13','d_14','d_15','d_16','d_17','d_18','d_19','d_20','d_21',
                    's_1','s_2','s_3','s_4','s_5','s_6','s_7','s_8','s_9','s_10','s_11','s_12','s_13','s_14','s_15','s_16','s_17','s_18','s_19','s_20','s_21',
                    'sd_1','sd_2','sd_3','sd_4','sd_5','sd_6','sd_7','sd_8','sd_9','sd_10','sd_11','sd_12','sd_13','sd_14','sd_15','sd_16','sd_17','sd_18','sd_19','sd_20','sd_21',
                      'lt_1',   'lt_2',   'lt_3',   'lt_4',   'lt_5',   'lt_6' , 'lt_7', 
                      'lt_8',   'lt_9',   'lt_10',  'lt_11',  'lt_12',  'lt_13', 'lt_14',
                      'lt_15',   'lt_16',   'lt_17',  'lt_18',  'lt_19',  'lt_20', 'lt_21',
                      'f330_1',   'f330_2',   'f330_3',   'f330_4',   'f330_5',   'f330_6' , 'f330_7',  
                      'f330_8',   'f330_9',   'f330_10',  'f330_11',  'f330_12',  'f330_13', 'f330_14',
                      'f330_15',  'f330_16',  'f330_17',  'f330_18',  'f330_19',  'f330_20', 'f330_21',
                      'f737_1',   'f737_2',   'f737_3',   'f737_4',   'f737_5',   'f737_6' , 'f737_7',  
                      'f737_8',   'f737_9',   'f737_10',  'f737_11',  'f737_12',  'f737_13', 'f737_14',
                      'f737_15',  'f737_16',  'f737_17',  'f737_18',  'f737_19',  'f737_20', 'f737_21',
                      'group_1',   'group_2',   'group_3',   'group_4',   'group_5',   'group_6' , 'group_7',  
                      'group_8',   'group_9',   'group_10',  'group_11',  'group_12',  'group_13', 'group_14',
                      'group_15',   'group_16',   'group_17',  'group_18',  'group_19',  'group_20', 'group_21',
                      'total_1',   'total_2',   'total_3',   'total_4',   'total_5',   'total_6' , 'total_7',  
                      'total_8',   'total_9',   'total_10',  'total_11',  'total_12',  'total_13', 'total_14',
                      'total_15',   'total_16',   'total_17',  'total_18',  'total_19',  'total_20', 'total_21']

    
    # for visitor count lag
    merged_count=merged[['datetime','count']].sort_values(by='datetime')
    merged = merged.loc[merged.datetime >= start_date,:].sort_values(by='datetime')
    return merged, merged_count



## Feature Engineering
def feature_engineering(raw):
    if 'datetime' in raw.columns:
        raw['DateTime'] = pd.to_datetime(raw['datetime'])
    if raw.index.dtype == 'int64':
        raw.set_index('DateTime', inplace=True)
    
    # add time series data 
    #raw['month'] = raw.datetime.dt.month
    
    # dummy 변수(카테고리 변수) => holiday(0,1) / dayofweek(0,1,2,3,4,5,6) / long_h(0,1) / b_long_h(0,1) / inter_h(0,1) / b_inter_h(0,1) 
    #                         /first_m(0,1)/ b_first_m(0,1) / year(2019,2020,2021,2022) / month(1~12) / quarter(1~4)

    #raw = pd.concat([raw, pd.get_dummies(raw['holiday'],   prefix='holiday'   + '_dummy')], axis=1)
    raw = pd.concat([raw, pd.get_dummies(raw['dayofweek'], prefix='dayofweek' + '_dummy')], axis=1)
    #raw = pd.concat([raw, pd.get_dummies(raw['long_h'],    prefix='long_h'    + '_dummy')], axis=1)
    #raw = pd.concat([raw, pd.get_dummies(raw['b_long_h'],  prefix='b_long_h'  + '_dummy')], axis=1)
    #raw = pd.concat([raw, pd.get_dummies(raw['inter_h'],   prefix='inter_h'   + '_dummy')], axis=1)
    #raw = pd.concat([raw, pd.get_dummies(raw['b_inter_h'], prefix='b_inter_h' + '_dummy')], axis=1)
    #raw = pd.concat([raw, pd.get_dummies(raw['first_m'],   prefix='first_m'   + '_dummy')], axis=1)
    #raw = pd.concat([raw, pd.get_dummies(raw['b_first_m'], prefix='b_first_m' + '_dummy')], axis=1)
    #raw = pd.concat([raw, pd.get_dummies(raw['month'],     prefix='month'     + '_dummy')], axis=1)

    raw = raw.drop(columns=['dayofweek'])
    
    raw_fe = raw.copy() 
    return raw_fe


# Count lagged values of X_test
def feature_engineering_lag_modified(Y, X, target):
    X_lm = X.copy()
    i = 0
    for col in target:
        X_lm[col] = Y.shift(i).values
        X_lm[col].fillna(method='bfill', inplace=True)
        i = i + 1
    return X_lm

# Data Split for Y & X
def datasplit_X_Y(data, Y_colname, X_colname):
    X_colname = [x for x in data.columns if x not in Y_colname + X_remove]
    Y_data = data[Y_colname]
    X_data = data[X_colname]
    print('X:', X_data.shape, 'Y:', Y_data.shape)
    return X_data, Y_data

# Data Split for time series to train & test
def datasplit_ts(raw, train_ratio):
    size = int(len(raw) * train_ratio)
    raw_train, raw_val = raw[0:size].copy(deep=True), raw[size:len(raw)].copy(deep=True)
    print('Train_size:', raw_train.shape, 'Validation_size:', raw_val.shape)
    return raw_train, raw_val

# def datasplit_ts(raw, criteria):
#     raw_train = raw.loc[merged.index < criteria, :]
#     raw_validation  = raw.loc[merged.index >= criteria, :]
#     print('Train_size:', raw_train.shape, 'Validation_size:', raw_validation.shape)
#     return raw_train, raw_validation

#### 데이터 추출 기간 설정 및 불러오기

In [13]:
today = date.today()
before31 = date.today() - timedelta(31)
before1 = date.today() - timedelta(1)
 
today=today.strftime('%Y-%m-%d')
before31=before31.strftime('%Y-%m-%d')
before1=before1.strftime('%Y-%m-%d')

In [14]:
before1

'2022-08-25'

In [15]:
psg, air, wth, rc, tw = data_load(query_AP, query_AT, query_HD, query_RC, query_TW, '2022-01-01', before1)

In [16]:
air = air.pivot(index='DT', columns='LEAD_TM', values=['TICKET', 'FARE_MIN', 'FARE_MAX', 'FARE_AVG', 'FARE_STD', 'SEAT_MIN', 'SEAT_MAX', 'SEAT_AVG', 'SEAT_STD', 'B', 'F', 'D', 'S', 'SD'])
air.columns = air.columns.to_flat_index()
air=air.reset_index()
air2=air_dataprocessing(air)

#### function 실행

In [17]:
## Holiday Data Preprocessing
wth_h = holiday_data_pre(wth)

## Rentacar & Tway Preprocessing
rc = rentacar_processing(rc)
tw = tway_processing(tw)

## Data Merge
merged, merged_count = data_merge(psg, wth_h, air2, rc, tw, '2022-02-02')

# Feature Engineering
merged_fe  = feature_engineering(merged)

# Modify count lagged values of X_train
target_l = ['count_lag0','count_lag1','count_lag2','count_lag3','count_lag4','count_lag5','count_lag6']
count_lag = feature_engineering_lag_modified(pd.DataFrame(merged_count['count']), pd.DataFrame(merged_count['datetime']), target_l)
merged_fe=pd.merge(merged_fe, count_lag, on='datetime', how='left')

predict_data = merged_fe

In [18]:
predict_data=predict_data.iloc[-14:,:]

In [19]:
predict_data.fillna(method = 'ffill',inplace=True) 

## 2. 예측값 인버스 스케일링을 위한 학습 데이터 스케일링

In [20]:
# Query for base dataset
query_tr = """
    SELECT *
    FROM `charged-genre-350106.demand_forecasting.airport_passenger_tr_nm_ns_7_lt21_tr_fm`
    """

query_ts = """
    SELECT *
    FROM `charged-genre-350106.demand_forecasting.airport_passenger_ts_nm_ns_7_lt21_tr_fm`
    """

In [21]:
trds = bqclient.query(query_tr).to_dataframe()
tsds = bqclient.query(query_ts).to_dataframe()

In [22]:
trds=trds.sort_values(by='datetime')
tsds=tsds.sort_values(by='datetime')

In [23]:
def datasplit_X_Y(data, Y_colname, X_colname):
    X_colname = [x for x in data.columns if x not in Y_colname + X_remove]
    Y_data = data[Y_colname]
    X_data = data[X_colname]
    print('X:', X_data.shape, 'Y:', Y_data.shape)
    return X_data, Y_data

In [24]:
# Train Data Split to X and Y
Y_colname = ['count']
X_remove = ['date_time','data_split']
X_colname = [x for x in trds.columns if x not in Y_colname + X_remove]
X_train, Y_train = datasplit_X_Y(trds, Y_colname, X_colname)

X: (183, 421) Y: (183, 1)


In [25]:
X_train

Unnamed: 0,datetime,holiday,long_h,b_long_h,inter_h,b_inter_h,first_m,b_first_m,ticket_1,ticket_2,...,dayofweek_dummy_4,dayofweek_dummy_5,dayofweek_dummy_6,count_lag7,count_lag8,count_lag9,count_lag10,count_lag11,count_lag12,count_lag13
81,2022-02-02 00:00:00+00:00,1,0.0,0.0,0.0,0.0,0.0,0.0,284.0,287.0,...,0,0,0,35281.0,35294.0,39424.0,39697.0,37454.0,40211.0,41666.0
92,2022-02-03 00:00:00+00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,328.0,326.0,...,0,0,0,39268.0,35281.0,35294.0,39424.0,39697.0,37454.0,40211.0
86,2022-02-04 00:00:00+00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,1018.0,330.0,...,1,0,0,39773.0,39268.0,35281.0,35294.0,39424.0,39697.0,37454.0
93,2022-02-05 00:00:00+00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,333.0,949.0,...,0,1,0,48156.0,39773.0,39268.0,35281.0,35294.0,39424.0,39697.0
1,2022-02-06 00:00:00+00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,319.0,324.0,...,0,0,1,44912.0,48156.0,39773.0,39268.0,35281.0,35294.0,39424.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,2022-07-30 00:00:00+00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,381.0,398.0,...,0,1,0,43051.0,42535.0,43719.0,37746.0,40260.0,40416.0,40065.0
63,2022-07-31 00:00:00+00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,283.0,285.0,...,0,0,1,43956.0,43051.0,42535.0,43719.0,37746.0,40260.0,40416.0
35,2022-08-01 00:00:00+00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,586.0,622.0,...,0,0,0,41174.0,43956.0,43051.0,42535.0,43719.0,37746.0,40260.0
59,2022-08-02 00:00:00+00:00,0,0.0,0.0,0.0,0.0,0.0,0.0,580.0,608.0,...,0,0,0,41202.0,41174.0,43956.0,43051.0,42535.0,43719.0,37746.0


In [26]:
# Predict Data Split to X and Y
X_predict, Y_predict = datasplit_X_Y(predict_data, Y_colname, X_colname)

X: (14, 421) Y: (14, 1)


In [27]:
X_predict

Unnamed: 0,datetime,holiday,long_h,b_long_h,inter_h,b_inter_h,first_m,b_first_m,ticket_1,ticket_2,...,dayofweek_dummy_4,dayofweek_dummy_5,dayofweek_dummy_6,count_lag0,count_lag1,count_lag2,count_lag3,count_lag4,count_lag5,count_lag6
191,2022-08-12,0,0.0,1.0,0.0,0.0,0.0,0.0,582.0,624.0,...,1,0,0,46218,44547.0,42938.0,39051.0,40279.0,42128.0,40099.0
192,2022-08-13,0,1.0,0.0,0.0,0.0,0.0,0.0,452.0,508.0,...,0,1,0,47420,46218.0,44547.0,42938.0,39051.0,40279.0,42128.0
193,2022-08-14,0,0.0,0.0,0.0,0.0,0.0,0.0,517.0,596.0,...,0,0,1,43780,47420.0,46218.0,44547.0,42938.0,39051.0,40279.0
194,2022-08-15,1,0.0,0.0,0.0,0.0,0.0,0.0,436.0,467.0,...,0,0,0,41193,43780.0,47420.0,46218.0,44547.0,42938.0,39051.0
195,2022-08-16,0,0.0,0.0,0.0,0.0,0.0,0.0,685.0,713.0,...,0,0,0,39262,41193.0,43780.0,47420.0,46218.0,44547.0,42938.0
196,2022-08-17,0,0.0,0.0,0.0,0.0,0.0,0.0,772.0,827.0,...,0,0,0,37247,39262.0,41193.0,43780.0,47420.0,46218.0,44547.0
197,2022-08-18,0,0.0,0.0,0.0,0.0,0.0,0.0,926.0,943.0,...,0,0,0,38866,37247.0,39262.0,41193.0,43780.0,47420.0,46218.0
198,2022-08-19,0,0.0,0.0,0.0,0.0,0.0,0.0,870.0,902.0,...,1,0,0,40935,38866.0,37247.0,39262.0,41193.0,43780.0,47420.0
199,2022-08-20,0,0.0,0.0,0.0,0.0,0.0,0.0,874.0,845.0,...,0,1,0,40362,40935.0,38866.0,37247.0,39262.0,41193.0,43780.0
200,2022-08-21,0,0.0,0.0,0.0,0.0,0.0,0.0,649.0,654.0,...,0,0,1,42797,40362.0,40935.0,38866.0,37247.0,39262.0,41193.0


In [28]:
continous_feature=[
'ticket_1', 'ticket_2', 'ticket_3','ticket_4','ticket_5','ticket_6','ticket_7',
                    'ticket_8', 'ticket_9', 'ticket_10','ticket_11','ticket_12','ticket_13','ticket_14',
                    'ticket_15','ticket_16', 'ticket_17','ticket_18','ticket_19','ticket_20','ticket_21',
                    'fare_min_1','fare_min_2','fare_min_3','fare_min_4','fare_min_5','fare_min_6','fare_min_7',
                    'fare_min_8','fare_min_9','fare_min_10','fare_min_11','fare_min_12','fare_min_13','fare_min_14',
                    'fare_min_15','fare_min_16','fare_min_17','fare_min_18','fare_min_19','fare_min_20','fare_min_21', 
                    'fare_max_1','fare_max_2','fare_max_3','fare_max_4','fare_max_5','fare_max_6','fare_max_7',
                    'fare_max_8','fare_max_9','fare_max_10','fare_max_11','fare_max_12','fare_max_13','fare_max_14',
                    'fare_max_15','fare_max_16','fare_max_17','fare_max_18','fare_max_19','fare_max_20','fare_max_21',
                    'fare_avg_1','fare_avg_2','fare_avg_3','fare_avg_4','fare_avg_5','fare_avg_6','fare_avg_7',
                    'fare_avg_8','fare_avg_9','fare_avg_10','fare_avg_11','fare_avg_12','fare_avg_13','fare_avg_14',
                    'fare_avg_15','fare_avg_16','fare_avg_17','fare_avg_18','fare_avg_19','fare_avg_20','fare_avg_21',
                    'fare_std_1','fare_std_2','fare_std_3','fare_std_4','fare_std_5','fare_std_6','fare_std_7',
                    'fare_std_8','fare_std_9','fare_std_10','fare_std_11','fare_std_12','fare_std_13','fare_std_14',
                    'fare_std_15','fare_std_16','fare_std_17','fare_std_18','fare_std_19','fare_std_20','fare_std_21',
                    'seat_min_1','seat_min_2','seat_min_3','seat_min_4','seat_min_5','seat_min_6','seat_min_7',
                    'seat_min_8','seat_min_9','seat_min_10','seat_min_11','seat_min_12','seat_min_13','seat_min_14',
                    'seat_min_15','seat_min_16','seat_min_17','seat_min_18','seat_min_19','seat_min_20','seat_min_21',
                    'seat_max_1','seat_max_2','seat_max_3','seat_max_4','seat_max_5','seat_max_6','seat_max_7',
                    'seat_max_8','seat_max_9','seat_max_10','seat_max_11','seat_max_12','seat_max_13','seat_max_14',
                    'seat_max_15','seat_max_16','seat_max_17','seat_max_18','seat_max_19','seat_max_20','seat_max_21',
                    'seat_avg_1','seat_avg_2','seat_avg_3','seat_avg_4','seat_avg_5','seat_avg_6','seat_avg_7',
                    'seat_avg_8','seat_avg_9','seat_avg_10','seat_avg_11','seat_avg_12','seat_avg_13','seat_avg_14',
                    'seat_avg_15','seat_avg_16','seat_avg_17','seat_avg_18','seat_avg_19','seat_avg_20','seat_avg_21',
                    'seat_std_1','seat_std_2','seat_std_3','seat_std_4','seat_std_5','seat_std_6','seat_std_7',
                    'seat_std_8','seat_std_9','seat_std_10','seat_std_11','seat_std_12','seat_std_13','seat_std_14',
                    'seat_std_15','seat_std_16','seat_std_17','seat_std_18','seat_std_19','seat_std_20','seat_std_21',
                    'b_1','b_2','b_3','b_4','b_5','b_6','b_7','b_8','b_9','b_10','b_11','b_12','b_13','b_14','b_15','b_16','b_17','b_18','b_19','b_20','b_21',
                    'f_1','f_2','f_3','f_4','f_5','f_6','f_7','f_8','f_9','f_10','f_11','f_12','f_13','f_14','f_15','f_16','f_17','f_18','f_19','f_20','f_21',
                    'd_1','d_2','d_3','d_4','d_5','d_6','d_7','d_8','d_9','d_10','d_11','d_12','d_13','d_14','d_15','d_16','d_17','d_18','d_19','d_20','d_21',
                    's_1','s_2','s_3','s_4','s_5','s_6','s_7','s_8','s_9','s_10','s_11','s_12','s_13','s_14','s_15','s_16','s_17','s_18','s_19','s_20','s_21',
                    'sd_1','sd_2','sd_3','sd_4','sd_5','sd_6','sd_7','sd_8','sd_9','sd_10','sd_11','sd_12','sd_13','sd_14','sd_15','sd_16','sd_17','sd_18','sd_19','sd_20','sd_21',
                      'lt_1',   'lt_2',   'lt_3',   'lt_4',   'lt_5',   'lt_6' , 'lt_7', 
                      'lt_8',   'lt_9',   'lt_10',  'lt_11',  'lt_12',  'lt_13', 'lt_14',
                      'lt_15',   'lt_16',   'lt_17',  'lt_18',  'lt_19',  'lt_20', 'lt_21',
                      'f330_1',   'f330_2',   'f330_3',   'f330_4',   'f330_5',   'f330_6' , 'f330_7',  
                      'f330_8',   'f330_9',   'f330_10',  'f330_11',  'f330_12',  'f330_13', 'f330_14',
                      'f330_15',  'f330_16',  'f330_17',  'f330_18',  'f330_19',  'f330_20', 'f330_21',
                      'f737_1',   'f737_2',   'f737_3',   'f737_4',   'f737_5',   'f737_6' , 'f737_7',  
                      'f737_8',   'f737_9',   'f737_10',  'f737_11',  'f737_12',  'f737_13', 'f737_14',
                      'f737_15',  'f737_16',  'f737_17',  'f737_18',  'f737_19',  'f737_20', 'f737_21',
                      'group_1',   'group_2',   'group_3',   'group_4',   'group_5',   'group_6' , 'group_7',  
                      'group_8',   'group_9',   'group_10',  'group_11',  'group_12',  'group_13', 'group_14',
                      'group_15',   'group_16',   'group_17',  'group_18',  'group_19',  'group_20', 'group_21',
                      'total_1',   'total_2',   'total_3',   'total_4',   'total_5',   'total_6' , 'total_7',  
                      'total_8',   'total_9',   'total_10',  'total_11',  'total_12',  'total_13', 'total_14',
                      'total_15',   'total_16',   'total_17',  'total_18',  'total_19',  'total_20', 'total_21',
                      'count_lag0','count_lag1','count_lag2','count_lag3','count_lag4','count_lag5','count_lag6'
]

category_feature=[
'holiday', 'long_h', 'b_long_h', 'inter_h', 'b_inter_h','first_m','b_first_m',
'dayofweek_dummy_0','dayofweek_dummy_1','dayofweek_dummy_2','dayofweek_dummy_3','dayofweek_dummy_4','dayofweek_dummy_5','dayofweek_dummy_6'
]

In [29]:
## Train data scaling & Predict data scaling 
scaler_X_tr = preprocessing.MinMaxScaler()
scaler_Y_tr = preprocessing.MinMaxScaler()

def scaling(x_train, y_train, x_predict, y_predict, scaler_X_tr=scaler_X_tr, scaler_Y_tr=scaler_Y_tr):
    
    ## Scaling
    X_train_con = scaler_X_tr.fit_transform(x_train[continous_feature])
    X_train_cat = x_train[category_feature].to_numpy()
    X_train_scaled = np.concatenate((X_train_con, X_train_cat), axis=1)
    Y_train_scaled = scaler_Y_tr.fit_transform(y_train)
    
    x_predict_con = scaler_X_tr.transform(x_predict[continous_feature])
    x_predict_cat = x_predict[category_feature].to_numpy()
    x_predict_scaled = np.concatenate((x_predict_con, x_predict_cat), axis=1)
    y_predict_scaled = scaler_Y_tr.transform(y_predict)
    
    print('X_train_scaled:', X_train_scaled.shape, 'Y_train_scaled:', Y_train_scaled.shape, 'X_predict_scaled:', x_predict_scaled.shape, 'Y_predict_scaled:', y_predict_scaled.shape)
    return X_train_scaled, Y_train_scaled, x_predict_scaled, y_predict_scaled

In [30]:
def inverse_scaling(data, scaler_Y_tr=scaler_Y_tr):
    data_scaled = np.empty([data.shape[1], data.shape[0]])
    for i in range(data.shape[1]):
        data_scaled[i] = scaler_Y_tr.inverse_transform([data[:,i]])
    return data_scaled.transpose()

In [31]:
## make [traing columns name] same with [predict columns name]
X_train.columns=['datetime','holiday','long_h','b_long_h','inter_h','b_inter_h','first_m','b_first_m',
                    'ticket_1', 'ticket_2', 'ticket_3','ticket_4','ticket_5','ticket_6','ticket_7',
                    'ticket_8', 'ticket_9', 'ticket_10','ticket_11','ticket_12','ticket_13','ticket_14',
                    'ticket_15','ticket_16', 'ticket_17','ticket_18','ticket_19','ticket_20','ticket_21',
                    'fare_min_1','fare_min_2','fare_min_3','fare_min_4','fare_min_5','fare_min_6','fare_min_7',
                    'fare_min_8','fare_min_9','fare_min_10','fare_min_11','fare_min_12','fare_min_13','fare_min_14',
                    'fare_min_15','fare_min_16','fare_min_17','fare_min_18','fare_min_19','fare_min_20','fare_min_21', 
                    'fare_max_1','fare_max_2','fare_max_3','fare_max_4','fare_max_5','fare_max_6','fare_max_7',
                    'fare_max_8','fare_max_9','fare_max_10','fare_max_11','fare_max_12','fare_max_13','fare_max_14',
                    'fare_max_15','fare_max_16','fare_max_17','fare_max_18','fare_max_19','fare_max_20','fare_max_21',
                    'fare_avg_1','fare_avg_2','fare_avg_3','fare_avg_4','fare_avg_5','fare_avg_6','fare_avg_7',
                    'fare_avg_8','fare_avg_9','fare_avg_10','fare_avg_11','fare_avg_12','fare_avg_13','fare_avg_14',
                    'fare_avg_15','fare_avg_16','fare_avg_17','fare_avg_18','fare_avg_19','fare_avg_20','fare_avg_21',
                    'fare_std_1','fare_std_2','fare_std_3','fare_std_4','fare_std_5','fare_std_6','fare_std_7',
                    'fare_std_8','fare_std_9','fare_std_10','fare_std_11','fare_std_12','fare_std_13','fare_std_14',
                    'fare_std_15','fare_std_16','fare_std_17','fare_std_18','fare_std_19','fare_std_20','fare_std_21',
                    'seat_min_1','seat_min_2','seat_min_3','seat_min_4','seat_min_5','seat_min_6','seat_min_7',
                    'seat_min_8','seat_min_9','seat_min_10','seat_min_11','seat_min_12','seat_min_13','seat_min_14',
                    'seat_min_15','seat_min_16','seat_min_17','seat_min_18','seat_min_19','seat_min_20','seat_min_21',
                    'seat_max_1','seat_max_2','seat_max_3','seat_max_4','seat_max_5','seat_max_6','seat_max_7',
                    'seat_max_8','seat_max_9','seat_max_10','seat_max_11','seat_max_12','seat_max_13','seat_max_14',
                    'seat_max_15','seat_max_16','seat_max_17','seat_max_18','seat_max_19','seat_max_20','seat_max_21',
                    'seat_avg_1','seat_avg_2','seat_avg_3','seat_avg_4','seat_avg_5','seat_avg_6','seat_avg_7',
                    'seat_avg_8','seat_avg_9','seat_avg_10','seat_avg_11','seat_avg_12','seat_avg_13','seat_avg_14',
                    'seat_avg_15','seat_avg_16','seat_avg_17','seat_avg_18','seat_avg_19','seat_avg_20','seat_avg_21',
                    'seat_std_1','seat_std_2','seat_std_3','seat_std_4','seat_std_5','seat_std_6','seat_std_7',
                    'seat_std_8','seat_std_9','seat_std_10','seat_std_11','seat_std_12','seat_std_13','seat_std_14',
                    'seat_std_15','seat_std_16','seat_std_17','seat_std_18','seat_std_19','seat_std_20','seat_std_21',
                    'b_1','b_2','b_3','b_4','b_5','b_6','b_7','b_8','b_9','b_10','b_11','b_12','b_13','b_14','b_15','b_16','b_17','b_18','b_19','b_20','b_21',
                    'f_1','f_2','f_3','f_4','f_5','f_6','f_7','f_8','f_9','f_10','f_11','f_12','f_13','f_14','f_15','f_16','f_17','f_18','f_19','f_20','f_21',
                    'd_1','d_2','d_3','d_4','d_5','d_6','d_7','d_8','d_9','d_10','d_11','d_12','d_13','d_14','d_15','d_16','d_17','d_18','d_19','d_20','d_21',
                    's_1','s_2','s_3','s_4','s_5','s_6','s_7','s_8','s_9','s_10','s_11','s_12','s_13','s_14','s_15','s_16','s_17','s_18','s_19','s_20','s_21',
                    'sd_1','sd_2','sd_3','sd_4','sd_5','sd_6','sd_7','sd_8','sd_9','sd_10','sd_11','sd_12','sd_13','sd_14','sd_15','sd_16','sd_17','sd_18','sd_19','sd_20','sd_21',
                  'lt_1',   'lt_2',   'lt_3',   'lt_4',   'lt_5',   'lt_6' , 'lt_7', 
                  'lt_8',   'lt_9',   'lt_10',  'lt_11',  'lt_12',  'lt_13', 'lt_14',
                  'lt_15',   'lt_16',   'lt_17',  'lt_18',  'lt_19',  'lt_20', 'lt_21',
                  'f330_1',   'f330_2',   'f330_3',   'f330_4',   'f330_5',   'f330_6' , 'f330_7',  
                  'f330_8',   'f330_9',   'f330_10',  'f330_11',  'f330_12',  'f330_13', 'f330_14',
                  'f330_15',  'f330_16',  'f330_17',  'f330_18',  'f330_19',  'f330_20', 'f330_21',
                  'f737_1',   'f737_2',   'f737_3',   'f737_4',   'f737_5',   'f737_6' , 'f737_7',  
                  'f737_8',   'f737_9',   'f737_10',  'f737_11',  'f737_12',  'f737_13', 'f737_14',
                  'f737_15',  'f737_16',  'f737_17',  'f737_18',  'f737_19',  'f737_20', 'f737_21',
                  'group_1',   'group_2',   'group_3',   'group_4',   'group_5',   'group_6' , 'group_7',  
                  'group_8',   'group_9',   'group_10',  'group_11',  'group_12',  'group_13', 'group_14',
                  'group_15',   'group_16',   'group_17',  'group_18',  'group_19',  'group_20', 'group_21',
                  'total_1',   'total_2',   'total_3',   'total_4',   'total_5',   'total_6' , 'total_7',  
                  'total_8',   'total_9',   'total_10',  'total_11',  'total_12',  'total_13', 'total_14',
                  'total_15',   'total_16',   'total_17',  'total_18',  'total_19',  'total_20', 'total_21',
                 'dayofweek_dummy_0','dayofweek_dummy_1','dayofweek_dummy_2','dayofweek_dummy_3','dayofweek_dummy_4','dayofweek_dummy_5','dayofweek_dummy_6',
                 'count_lag0','count_lag1','count_lag2','count_lag3','count_lag4','count_lag5','count_lag6']

In [32]:
X_train_scaled, Y_train_scaled, X_predict_scaled, Y_predict_scaled = scaling(X_train, Y_train, X_predict, Y_predict)

X_train_scaled: (183, 420) Y_train_scaled: (183, 1) X_predict_scaled: (14, 420) Y_predict_scaled: (14, 1)


In [33]:
#X_predict.to_csv('X_predict.csv')

#### Data reframing for multi ahead forecasting (multi input / multi output)

In [34]:
def multi_input(x_test_scaled):
    ## X / Y Split for test
    X_test_multi_scaled = []
    for index in range(n_steps, len(x_test_scaled)+1):
        # find the end of this pattern
        end_ix = index 
        out_end_ix = end_ix -1
        # check if we are beyoind the dataset
        if out_end_ix > len(x_test_scaled):
            break
        # gather input and output part of the pattern
        seq_x= x_test_scaled[index - n_steps : index, :]
        X_test_multi_scaled.append(seq_x)
    X_test_multi_scaled = np.array(X_test_multi_scaled)
    print('X_predict_multi_scaled:', X_test_multi_scaled.shape)
    return X_test_multi_scaled

In [35]:
n_steps=7

In [36]:
X_predict_multi_scaled = multi_input(X_predict_scaled)

X_predict_multi_scaled: (8, 7, 420)


In [37]:
aiplatform.init(project='charged-genre-350106', location='asia-northeast3')
endpoint = aiplatform.Endpoint('3003443554618966016')
raw_predictions=endpoint.predict(instances=X_predict_multi_scaled.tolist()).predictions[:]
predicted_values = inverse_scaling(np.array(raw_predictions)).round()
predicted_values=predicted_values.tolist()

In [38]:
predicted_values

[[45728.0, 46178.0, 44406.0, 41045.0, 40799.0, 39641.0, 41190.0],
 [45867.0, 46106.0, 42365.0, 38422.0, 39142.0, 40278.0, 39888.0],
 [44756.0, 44109.0, 40088.0, 37435.0, 39801.0, 39338.0, 39525.0],
 [42779.0, 42312.0, 38360.0, 37560.0, 39823.0, 39033.0, 41095.0],
 [40979.0, 39978.0, 38004.0, 36790.0, 39585.0, 40549.0, 41865.0],
 [40369.0, 39501.0, 38447.0, 37874.0, 40077.0, 41816.0, 40427.0],
 [40545.0, 38366.0, 39914.0, 38913.0, 42026.0, 39417.0, 40562.0],
 [39564.0, 39158.0, 40722.0, 41244.0, 39752.0, 40342.0, 41130.0]]

In [39]:
predicted_values[-1]

[39564.0, 39158.0, 40722.0, 41244.0, 39752.0, 40342.0, 41130.0]