# Library

In [1]:
# Ignore the warnings
import warnings
# warnings.filterwarnings('always')
# Silence every warning for the rest of the session (switch to the line above to see them).
warnings.filterwarnings('ignore')

# System related and data input controls
import os
import numpy as np
import pandas as pd
import math
from tqdm import tqdm, tqdm_pandas # execution time
# Enable tqdm's pandas integration (progress-bar variants of apply).
tqdm.pandas()
# Korean public-holiday helpers used when building the holiday features.
from holidayskr import year_holidays, is_holiday
# COVID-19 Data Hub client used to pull the Korean COVID time series.
from covid19dh import covid19


# Auto reload of library
%reload_ext autoreload
%autoreload 2


# Data Preprocessing

In [2]:
def preprocessing_KTX() -> pd.DataFrame:
    """Build the monthly KTX demand/operation table from the raw Excel exports.

    Steps, in order:
      1. Load two demand workbooks and two train-operation workbooks from
         ``<cwd>/Data`` and stack each pair.
      2. Keep rows whose '역무열차종' starts with 'KTX' and whose '주운행선' is one
         of the five listed lines.
      3. Aggregate to one row per ('주운행선', '운행일자') and derive ratio features.
      4. Add calendar features (month key, weekday/weekend, public holidays,
         Seollal/Chuseok).
      5. Download Korean COVID-19 data, keep the float columns whose absolute
         correlation with '승차인원수' exceeds 0.5 for every line, and left-merge
         them onto the daily demand table.
      6. Aggregate to months, once over all days ('전체') and once split by
         '주말'/'주중', and join the demand and operation tables.

    Side effects: reads Excel files, calls ``covid19('KOR')``, and calls
    ``display`` on the daily demand and operation frames (``display`` is not
    imported in the visible source — presumably the IPython builtin).

    Returns:
        The monthly frame keyed by '전체주중주말', '주운행선', '운행년월'.
    """
    # Load data: the demand sheets skip 5 leading rows, the operation sheets skip 8.
    df_demand1 = pd.read_excel(os.path.join(os.getcwd(), 'Data', '(간선)수송-운행일-주운행(201501-202305).xlsx'), skiprows=5)
    df_demand2 = pd.read_excel(os.path.join(os.getcwd(), 'Data', '(간선)수송-운행일-주운행(202305-202403).xlsx'), skiprows=5)
    df_info1 = pd.read_excel(os.path.join(os.getcwd(), 'Data', '(간선)시종착역별 열차운행(201501-202305).xlsx'), skiprows=8)
    df_info2 = pd.read_excel(os.path.join(os.getcwd(), 'Data', '(간선)시종착역별 열차운행(202305-202403).xlsx'), skiprows=8)
    # NOTE(review): both file pairs mention 202305 in their names; if the month
    # overlaps, the daily sums below would double count it — confirm.
    df_demand = pd.concat([df_demand1, df_demand2], axis=0)
    df_info = pd.concat([df_info1, df_info2], axis=0)
                
    # Filter the analysis targets
    ## Train service type: KTX (prefix match on the first three characters)
    ## Main line: one of the five lines listed below
    # `.reset_index().iloc[:,1:]` renumbers the rows and drops the old index column.
    df_demand = df_demand[df_demand['역무열차종'].apply(lambda x: x[:3] == 'KTX')].reset_index().iloc[:,1:]
    df_info = df_info[df_info['역무열차종'].apply(lambda x: x[:3] == 'KTX')].reset_index().iloc[:,1:]
    df_demand = df_demand[df_demand['주운행선'].isin(['경부선', '경전선', '동해선', '전라선', '호남선'])].reset_index().iloc[:,1:]
    df_info = df_info[df_info['주운행선'].isin(['경부선', '경전선', '동해선', '전라선', '호남선'])].reset_index().iloc[:,1:]

    # Drop unnecessary variables
    df_demand.drop(columns=['Unnamed: 1', '운행년도', '운행년월', '운행요일구분', '역무열차종', '메트릭'], inplace=True)
    df_info.drop(columns=['상행하행구분', '역무열차종', '운행요일구분', '메트릭'], inplace=True)
    df_demand = df_demand.reset_index().iloc[:,1:]
    df_info = df_info.reset_index().iloc[:,1:]
    
    # Daily aggregation and feature creation
    df_demand = df_demand.groupby(['주운행선', '운행일자']).sum().reset_index()
    # Ratio features computed from the daily totals.
    df_demand['1인당수입율'] = df_demand['승차수입금액']/df_demand['승차인원수']
    df_demand['공급대비승차율'] = df_demand['승차인원수']/df_demand['공급좌석합계수']
    df_demand['운행대비고객이동'] = df_demand['좌석거리']/df_demand['승차연인거리']
    # Operation features per (line, date), glued together column-wise:
    #   - counts of each '열차속성' value (keeps the two key columns),
    #   - counts of each '열차구분' value (last three columns only),
    #   - number of distinct origin / terminal stations,
    #   - summed seat supply and number of train runs.
    # The pieces are aligned by row position, which relies on every groupby
    # producing the same (line, date) ordering.
    # NOTE(review): `.iloc[:,-3:]` assumes '열차구분' has exactly three distinct
    # values; with fewer, a key column would leak in — confirm against the data.
    df_info = pd.concat([df_info.groupby(['주운행선', '운행일자'])['열차속성'].value_counts().unstack().fillna(0).reset_index(),
                         df_info.groupby(['주운행선', '운행일자'])['열차구분'].value_counts().unstack().fillna(0).reset_index().iloc[:,-3:],
                         df_info.groupby(['주운행선', '운행일자'])['시발역'].nunique().fillna(0).reset_index().iloc[:,-1],
                         df_info.groupby(['주운행선', '운행일자'])['종착역'].nunique().fillna(0).reset_index().iloc[:,-1],
                         df_info.groupby(['주운행선', '운행일자'])[['공급좌석수', '열차운행횟수']].sum().reset_index().iloc[:,-2:]], axis=1)
    
    # Define and extract time variables
    # The raw date strings are parsed with the '%Y년 %m월 %d일' format.
    df_demand['운행일자'] = pd.to_datetime(df_demand['운행일자'], format='%Y년 %m월 %d일')
    df_info['운행일자'] = pd.to_datetime(df_info['운행일자'], format='%Y년 %m월 %d일')
    ## Month key for monthly aggregation: 'YYYY-MM' parsed back to a timestamp
    df_demand['운행년월'] = pd.to_datetime(df_demand['운행일자'].apply(lambda x: str(x)[:7]))
    df_info['운행년월'] = pd.to_datetime(df_info['운행일자'].apply(lambda x: str(x)[:7]))
    ## Extract the day of week (0 = Monday) and map it to its Korean label
    df_demand['요일'] = df_demand['운행일자'].dt.weekday
    df_info['요일'] = df_info['운행일자'].dt.weekday
    weekday_list = ['월', '화', '수', '목', '금', '토', '일']
    df_demand['요일'] = df_demand.apply(lambda x: weekday_list[x['요일']], axis=1)
    df_info['요일'] = df_info.apply(lambda x: weekday_list[x['요일']], axis=1)
    ## Extract weekend/weekday; note that Friday is counted as weekend here
    # '일수' is a per-row 1 so that monthly sums yield the number of days.
    df_demand['일수'] = 1
    df_demand['전체주중주말'] = df_demand['요일'].apply(lambda x: '주말' if x in ['금', '토', '일'] else '주중')
    df_info['전체주중주말'] = df_info['요일'].apply(lambda x: '주말' if x in ['금', '토', '일'] else '주중')
    # 0/1 indicators; summed per month they become weekend/weekday day counts.
    df_demand['주말수'] = df_demand['요일'].isin(['금', '토', '일'])*1
    df_demand['주중수'] = df_demand['요일'].isin(['월', '화', '수', '목'])*1
    del df_demand['요일']
    del df_info['요일']
    ## Extract public holidays: 0/1 flag from is_holiday on the 'YYYY-MM-DD' string
    df_demand['공휴일수'] = df_demand['운행일자'].apply(lambda x: is_holiday(str(x)[:10]))*1
    ## Extract traditional holidays: entries whose name contains '설날' or '추석'
    traditional_holidays = []
    for year in df_demand['운행일자'].dt.year.unique():
        for holiday, holiday_name in year_holidays(str(year)):
            if ('설날' in holiday_name) or ('추석' in holiday_name):
                traditional_holidays.append(holiday)
    # NOTE(review): the format string only matters if year_holidays yields
    # strings in this layout — confirm what type it returns.
    traditional_holidays = pd.to_datetime(traditional_holidays, format='%Y년 %m월 %d일')
#     traditional_holidays = [t.strftime("%Y년 %m월 %d일") for t in traditional_holidays]
    df_demand['명절수'] = df_demand['운행일자'].apply(lambda x: 1 if x in traditional_holidays else 0)
    
    # Merge Covid data
    ## Preprocess the Covid data
    # `src` (second return value) is not used afterwards.
    df_covid, src = covid19('KOR', verbose=False) 
    df_covid.date = pd.to_datetime(df_covid.date)
    # Keep only dates with a reported 'confirmed' value; the dates are saved
    # separately because the next step keeps float64 columns only.
    time_covid = df_covid[~df_covid.confirmed.isnull()].date
    df_covid = df_covid[~df_covid.confirmed.isnull()]
    df_covid = df_covid[df_covid.columns[df_covid.dtypes == 'float64']].reset_index().iloc[:,1:]
    df_covid.dropna(axis=1, how='all', inplace=True)
    df_covid.fillna(0, inplace=True)
    ## Filter the variables highly related to the dependent variable
    feature_Yrelated = []
    # Demand rows restricted to the dates covered by the Covid series.
    df_Y = df_demand[df_demand['운행일자'].apply(lambda x: x in time_covid.values)]
    for line in df_demand['주운행선'].unique():
        # NOTE(review): Y and df_covid are paired by row position, not by date;
        # this is only correct if each line has exactly one row per Covid date,
        # in the same order — confirm.
        Y = df_Y[df_Y['주운행선'] == line]['승차인원수'].reset_index().iloc[:,1:]
        # Absolute correlation of every column with '승차인원수' (first column).
        corr = abs(pd.concat([Y, df_covid], axis=1).corr().iloc[:,[0]]).dropna()
        corr = corr.sort_values(by='승차인원수', ascending=False)
        # Keep names with |corr| > 0.5, excluding the target itself.
        # `i != corr.columns` compares against a one-element Index, so its
        # truth value is well defined only because corr has a single column.
        feature_Yrelated.extend([i for i in corr[corr>0.5].dropna().index if i != corr.columns])
    # Keep only the features that passed the threshold for every line.
    feature_Yrelated = [x for x in set(feature_Yrelated) if feature_Yrelated.count(x) == len(df_demand['주운행선'].unique())]
    df_covid = pd.concat([time_covid.reset_index().iloc[:,1:], df_covid[feature_Yrelated]], axis=1)
    ## Merge: left join on date; dates without Covid data get 0
    df_demand = pd.merge(df_demand, df_covid, left_on='운행일자', right_on='date', how='left').fillna(0)
    
    # Tidy up: set the daily date columns aside (re-attached after the monthly
    # aggregation) so they do not take part in the groupby sums.
    time_demand, time_info = df_demand['운행일자'], df_info['운행일자']
    del df_demand['date']
    del df_demand['운행일자']
    del df_info['운행일자']
    
    # Monthly aggregation
    # First the all-days total per (line, month), labelled '전체'. The column
    # filter drops '전체주중주말' if the sum kept it (whether a string column
    # survives `sum()` depends on the pandas version — TODO confirm).
    # NOTE(review): the daily ratio columns are summed here, not recomputed
    # from monthly totals — confirm this is intended.
    df_demand_month = df_demand.groupby(['주운행선', '운행년월']).sum()
    df_demand_month = df_demand_month[[col for col in df_demand_month.columns if col != '전체주중주말']].reset_index()
    df_demand_month['전체주중주말'] = '전체'
    # Then the same aggregation split by weekend/weekday.
    df_demand_temp = df_demand.groupby(['전체주중주말', '주운행선', '운행년월']).sum().reset_index()
    # Reorder the total frame's columns to match the split frame before stacking.
    df_demand_month = df_demand_month[df_demand_temp.columns]
    df_info_month = df_info.groupby(['주운행선', '운행년월']).sum()
    df_info_month = df_info_month[[col for col in df_info_month.columns if col != '전체주중주말']].reset_index()
    df_info_month['전체주중주말'] = '전체'
    df_info_temp = df_info.groupby(['전체주중주말', '주운행선', '운행년월']).sum().reset_index()
    df_info_month = df_info_month[df_info_temp.columns]
          
    # Combine data: stack total + split rows, then join demand and operation
    # frames column-wise on the shared three-level key.
    df_demand_month = pd.concat([df_demand_month, df_demand_temp], axis=0)
    df_info_month = pd.concat([df_info_month, df_info_temp], axis=0).fillna(0)
    # NOTE(review): presumably dropped because the demand frame already has
    # '공급좌석합계수' — confirm.
    del df_info_month['공급좌석수']
    df = pd.concat([df_demand_month.set_index(['전체주중주말','주운행선','운행년월']),
                    df_info_month.set_index(['전체주중주말','주운행선','운행년월'])], axis=1).reset_index()
    
    # Tidy up: re-attach the daily dates and move key/calendar columns to the front.
    df_demand = pd.concat([time_demand, df_demand], axis=1)
    df_info = pd.concat([time_info, df_info], axis=1)
    df_demand = df_demand[['주운행선', '운행일자'] + [col for col in df_demand.columns if col not in ['주운행선', '운행일자']]]
    df_info = df_info[['주운행선', '운행일자'] + [col for col in df_info.columns if col not in ['주운행선', '운행일자']]]
    df = df[['전체주중주말', '주운행선', '운행년월', '일수', '주말수', '주중수', '공휴일수', '명절수'] + [col for col in df.columns if col not in ['전체주중주말', '주운행선', '운행년월', '일수', '주말수', '주중수', '공휴일수', '명절수']]]
    # The daily frames are only displayed; the monthly frame is the return value.
    display(df_demand, df_info)
    return df

# Run the full preprocessing pipeline and keep the monthly table.
df = preprocessing_KTX()
# Bare expression: renders the frame when it is the last statement of a notebook cell.
df

Unnamed: 0,주운행선,운행일자,공급차량수,공급좌석합계수,승차수입금액,승차인원수,승차연인거리,좌석거리,1인당수입율,공급대비승차율,운행대비고객이동,운행년월,일수,전체주중주말,주말수,주중수,공휴일수,명절수
0,경부선,2015-01-01,2128,109234,3901453300,106667,302014334,459911062,36576.010387,0.976500,1.522812,2015-01-01,1,주중,0,1,1,0
1,경부선,2015-01-02,2478,126462,4472526300,129421,347154015,527981447,34557.964318,1.023398,1.520885,2015-01-01,1,주말,1,0,0,0
2,경부선,2015-01-03,2518,129317,4369216800,121001,336253020,543745231,36108.931331,0.935693,1.617072,2015-01-01,1,주말,1,0,0,0
3,경부선,2015-01-04,2482,127431,4952360800,133931,381175290,535753619,36976.956791,1.051008,1.405531,2015-01-01,1,주말,1,0,0,0
4,경부선,2015-01-05,2130,108647,3626327700,105700,279048954,453211414,34307.736045,0.972875,1.624129,2015-01-01,1,주중,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14332,호남선,2024-03-27,712,37028,826367722,26895,57389748,124204475,30725.700762,0.726342,2.164228,2024-03-01,1,주중,0,1,0,0
14333,호남선,2024-03-28,712,37028,919487296,30047,64204189,124204475,30601.633973,0.811467,1.934523,2024-03-01,1,주중,0,1,0,0
14334,호남선,2024-03-29,748,38934,1239954946,41337,87122903,130249506,29996.249026,1.061720,1.495009,2024-03-01,1,주말,1,0,0,0
14335,호남선,2024-03-30,748,38965,1331433128,40826,91603399,130349357,32612.382501,1.047761,1.422975,2024-03-01,1,주말,1,0,0,0


Unnamed: 0,주운행선,운행일자,관광,일반,일반/관광,대수송,임시,확정,시발역,종착역,공급좌석수,열차운행횟수,운행년월,전체주중주말
0,경부선,2015-01-01,0.0,19.0,0.0,0.0,4.0,15.0,6,6,109234,124,2015-01-01,주중
1,경부선,2015-01-02,0.0,17.0,0.0,0.0,2.0,15.0,6,6,126462,144,2015-01-01,주말
2,경부선,2015-01-03,0.0,17.0,0.0,0.0,2.0,15.0,6,6,129317,147,2015-01-01,주말
3,경부선,2015-01-04,0.0,17.0,0.0,0.0,2.0,15.0,6,6,127431,145,2015-01-01,주말
4,경부선,2015-01-05,0.0,15.0,0.0,0.0,0.0,15.0,6,6,108647,123,2015-01-01,주중
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14332,호남선,2024-03-27,0.0,1.0,27.0,0.0,0.0,28.0,7,7,37028,54,2024-03-01,주중
14333,호남선,2024-03-28,0.0,2.0,27.0,0.0,0.0,29.0,7,7,37028,54,2024-03-01,주중
14334,호남선,2024-03-29,0.0,3.0,28.0,0.0,0.0,31.0,7,7,38934,56,2024-03-01,주말
14335,호남선,2024-03-30,0.0,3.0,27.0,0.0,0.0,30.0,7,7,38965,56,2024-03-01,주말
