# Library

In [252]:
# Ignore the warnings
import warnings
# warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# System related and data input controls
import os
import numpy as np
import pandas as pd
import math
from tqdm import tqdm, tqdm_pandas # execution time
tqdm.pandas()
from holidayskr import year_holidays, is_holiday


# Auto reload of library
%reload_ext autoreload
%autoreload 2


# Data Preprocessing

In [280]:
def preprocessing_KTX():
    """Load the raw KTX Excel exports under ``./Data`` and build a monthly table.

    Every file found below ``<cwd>/Data`` is classified by the first 10
    characters of its file name:

    * ``'(간선)수송-운행일'`` -> demand export, read with ``skiprows=5``
    * ``'(간선)시종착역별 '`` -> train-info export, read with ``skiprows=8``

    Files matching neither prefix are skipped. Both tables are filtered to five
    main lines, aggregated per day, enriched with calendar features (weekday /
    weekend counts, public holidays, Seollal / Chuseok days) and finally summed
    per month, once over all days ('전체') and once split into '주말' / '주중'.

    Returns:
        pandas.DataFrame: one row per ('전체주중주말', '주운행선', '운행년월')
        holding the monthly demand columns next to the monthly train-info
        columns.

    NOTE(review): '1인당수입율', '공급대비승차율' and '운행대비고객이동' are daily
    ratios that are *summed* by the monthly aggregation below, so the monthly
    values are sums of daily ratios rather than ratios of monthly totals.
    Left unchanged here because downstream code may rely on it; confirm intent.
    """
    # The first 10 characters of the file name identify the kind of export.
    prefix_demand = '(간선)수송-운행일'
    prefix_info = '(간선)시종착역별 '

    df_demand, df_info = pd.DataFrame(), pd.DataFrame()
    for (path, _dirs, files) in os.walk(os.path.join(os.getcwd(), 'Data')):
        for file in tqdm(files):
            # Classify on the bare file name so the check does not depend on
            # the platform's path separator.
            file_prefix = file[:10]
            path_file = os.path.join(path, file)

            # Load the file and append it to the matching table
            if file_prefix == prefix_info:
                df_sub = pd.read_excel(path_file, skiprows=8)
                df_info = pd.concat([df_info, df_sub], axis=0)
            elif file_prefix == prefix_demand:
                df_sub = pd.read_excel(path_file, skiprows=5)
                df_demand = pd.concat([df_demand, df_sub], axis=0)
            # Anything else in the folder is not one of the two exports: skip it.

    # Drop columns that are not needed
    df_demand.drop(columns=['Unnamed: 1', '운행년도', '운행년월', '운행요일구분', '역무열차종', '메트릭'], inplace=True)
    df_info.drop(columns=['상행하행구분', '역무열차종', '운행요일구분', '메트릭'], inplace=True)

    # Keep only the selected main lines
    df_demand = df_demand[df_demand['주운행선'].isin(['경부선', '경전선', '동해선', '호남선', '전라선'])]
    df_info = df_info[df_info['주운행선'].isin(['경부선', '경전선', '동해선', '호남선', '전라선'])]

    # Rebuild a clean positional index (the old index column is discarded)
    df_demand = df_demand.reset_index().iloc[:,1:]
    df_info = df_info.reset_index().iloc[:,1:]

    # Daily aggregation and derived variables
    df_demand = df_demand.groupby(['주운행선', '운행일자']).sum().reset_index()
    df_demand['1인당수입율'] = df_demand['승차수입금액']/df_demand['승차인원수']
    df_demand['공급대비승차율'] = df_demand['승차인원수']/df_demand['공급좌석합계수']
    df_demand['운행대비고객이동'] = df_demand['좌석거리']/df_demand['승차연인거리']
    # The five pieces below share the same (주운행선, 운행일자) grouping and are
    # glued side by side by position; only the first piece keeps the key columns.
    # NOTE(review): `.iloc[:,-3:]` assumes '열차구분' has exactly three categories - confirm.
    df_info = pd.concat([df_info.groupby(['주운행선', '운행일자'])['열차속성'].value_counts().unstack().fillna(0).reset_index(),
                         df_info.groupby(['주운행선', '운행일자'])['열차구분'].value_counts().unstack().fillna(0).reset_index().iloc[:,-3:],
                         df_info.groupby(['주운행선', '운행일자'])['시발역'].nunique().fillna(0).reset_index().iloc[:,-1],
                         df_info.groupby(['주운행선', '운행일자'])['종착역'].nunique().fillna(0).reset_index().iloc[:,-1],
                         df_info.groupby(['주운행선', '운행일자'])[['공급좌석수', '열차운행횟수']].sum().reset_index().iloc[:,-2:]], axis=1)

    # Parse the date column and extract time variables
    df_demand['운행일자'] = pd.to_datetime(df_demand['운행일자'], format='%Y년 %m월 %d일')
    df_info['운행일자'] = pd.to_datetime(df_info['운행일자'], format='%Y년 %m월 %d일')
    ## Month key used for the monthly aggregation ('YYYY-MM' parsed back to a date)
    df_demand['운행년월'] = pd.to_datetime(df_demand['운행일자'].apply(lambda x: str(x)[:7]))
    df_info['운행년월'] = pd.to_datetime(df_info['운행일자'].apply(lambda x: str(x)[:7]))
    ## Day of week (0 = Monday) mapped to its Korean label
    df_demand['요일'] = df_demand['운행일자'].dt.weekday
    df_info['요일'] = df_info['운행일자'].dt.weekday
    weekday_list = ['월', '화', '수', '목', '금', '토', '일']
    df_demand['요일'] = df_demand.apply(lambda x: weekday_list[x['요일']], axis=1)
    df_info['요일'] = df_info.apply(lambda x: weekday_list[x['요일']], axis=1)
    ## Weekend / weekday flags (Friday is counted as weekend here)
    df_demand['일수'] = 1
    df_demand['전체주중주말'] = df_demand['요일'].apply(lambda x: '주말' if x in ['금', '토', '일'] else '주중')
    df_info['전체주중주말'] = df_info['요일'].apply(lambda x: '주말' if x in ['금', '토', '일'] else '주중')
    df_demand['주말수'] = df_demand['요일'].isin(['금', '토', '일'])*1
    df_demand['주중수'] = df_demand['요일'].isin(['월', '화', '수', '목'])*1
    del df_demand['요일']
    del df_info['요일']
    ## Public-holiday flag (is_holiday receives the 'YYYY-MM-DD' part of the timestamp)
    df_demand['공휴일수'] = df_demand['운행일자'].apply(lambda x: is_holiday(str(x)[:10]))*1
    ## Traditional-holiday flag: holidays whose name contains '설날' or '추석'
    traditional_holidays = []
    for year in df_demand['운행일자'].dt.year.unique():
        for holiday, holiday_name in year_holidays(str(year)):
            if ('설날' in holiday_name) or ('추석' in holiday_name):
                traditional_holidays.append(holiday)
    traditional_holidays = pd.to_datetime(traditional_holidays, format='%Y년 %m월 %d일')
    df_demand['명절수'] = df_demand['운행일자'].apply(lambda x: 1 if x in traditional_holidays else 0)
    del df_demand['운행일자']
    del df_info['운행일자']

    # Monthly aggregation: an overall ('전체') total plus the 주말/주중 split
    df_demand_month = df_demand.groupby(['주운행선', '운행년월']).sum()
    df_demand_month = df_demand_month[[col for col in df_demand_month.columns if col != '전체주중주말']].reset_index()
    df_demand_month['전체주중주말'] = '전체'
    df_demand_temp = df_demand.groupby(['전체주중주말', '주운행선', '운행년월']).sum().reset_index()
    # Align column order with the split table so the two can be stacked
    df_demand_month = df_demand_month[df_demand_temp.columns]
    df_info_month = df_info.groupby(['주운행선', '운행년월']).sum()
    df_info_month = df_info_month[[col for col in df_info_month.columns if col != '전체주중주말']].reset_index()
    df_info_month['전체주중주말'] = '전체'
    df_info_temp = df_info.groupby(['전체주중주말', '주운행선', '운행년월']).sum().reset_index()
    df_info_month = df_info_month[df_info_temp.columns]

    # Stack overall + split rows, then join demand and info on the shared keys
    df_demand_month = pd.concat([df_demand_month, df_demand_temp], axis=0)
    df_info_month = pd.concat([df_info_month, df_info_temp], axis=0).fillna(0)
    del df_info_month['공급좌석수']
    df = pd.concat([df_demand_month.set_index(['전체주중주말','주운행선','운행년월']),
                    df_info_month.set_index(['전체주중주말','주운행선','운행년월'])], axis=1).reset_index()

    return df

# Build the monthly KTX table and show it as the cell output.
df = preprocessing_KTX()
df

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:38<00:00,  9.52s/it]


Unnamed: 0,전체주중주말,주운행선,운행년월,공급차량수,공급좌석합계수,승차수입금액,승차인원수,승차연인거리,좌석거리,1인당수입율,...,명절수,관광,일반,일반/관광,대수송,임시,확정,시발역,종착역,열차운행횟수
0,전체,경부선,2015-01-01,86203,4690099,148156008100,6462183,12887623030,18920136968,708718.959449,...,0,0.0,475.0,2.0,0.0,10.0,467.0,186.0,186.0,4035.0
1,전체,경부선,2015-02-01,78266,4275701,141350620300,6069139,12230649006,17328292972,649204.506673,...,3,0.0,432.0,3.0,91.0,12.0,332.0,168.0,168.0,3700.0
2,전체,경부선,2015-03-01,85116,4643846,139089651200,6565945,12032460501,18723639573,652070.909077,...,0,0.0,465.0,4.0,0.0,0.0,469.0,186.0,186.0,3999.0
3,전체,경부선,2015-04-01,81857,4467804,138289030400,6416662,11984070035,17902864800,641734.866258,...,0,0.0,479.0,9.0,0.0,0.0,488.0,180.0,180.0,3810.0
4,전체,경부선,2015-05-01,85949,4687943,155654821600,7082084,13565809400,18795445014,677207.704445,...,0,0.0,515.0,10.0,0.0,21.0,504.0,186.0,186.0,4022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1660,주중,호남선,2023-11-01,17771,965160,18623222778,842921,1434589983,2932074424,397519.125540,...,0,0.0,41.0,490.0,0.0,0.0,531.0,126.0,126.0,972.0
1661,주중,호남선,2023-12-01,15049,816892,17082610290,755287,1316460567,2596316700,361438.869176,...,0,0.0,44.0,434.0,0.0,2.0,476.0,112.0,112.0,866.0
1662,주중,호남선,2024-01-01,17002,924758,18941788738,818426,1463235828,3084568104,439023.373651,...,0,0.0,37.0,527.0,0.0,2.0,562.0,133.0,133.0,1028.0
1663,주중,호남선,2024-02-01,15315,833568,20152469898,863410,1570912551,2782345837,396371.388318,...,1,0.0,86.0,412.0,55.0,2.0,441.0,119.0,119.0,922.0


In [401]:
# Install, then upgrade, the covid19dh package from inside the notebook.
!pip install covid19dh
!pip install --upgrade covid19dh
from covid19dh import covid19
# Fetch the data for 'KOR'; covid19() returns two values, the second is kept as `src`.
df_covid, src = covid19('KOR') 
# Keep the date plus the selected restriction / index columns only.
df_covid = df_covid[['date', 'internal_movement_restrictions', 'international_movement_restrictions', 'elderly_people_protection',
                     'government_response_index', 'stringency_index', 'containment_health_index']]




[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


We have invested a lot of time and effort in creating COVID-19 Data Hub, please cite the following when using it:

	[1mGuidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open Source Software 5(51):2376, doi: 10.21105/joss.02376.[0m

A BibTeX entry for LaTeX users is

	@Article{,
		title = {COVID-19 Data Hub},
		year = {2020},
		doi = {10.21105/joss.02376},
		author = {Emanuele Guidotti and David Ardia},
		journal = {Journal of Open Source Software},
		volume = {5},
		number = {51},
		pages = {2376},
	}

[33mTo hide this message use 'verbose = False'.[0m



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [494]:
# Display the dates kept in time_temp (dates whose `confirmed` value is not null).
time_temp

156560   2020-01-22
156561   2020-01-23
156562   2020-01-24
156563   2020-01-25
156564   2020-01-26
156565   2020-01-27
156566   2020-01-28
            ...    
157696   2023-03-03
157697   2023-03-04
157698   2023-03-05
157699   2023-03-06
157700   2023-03-07
157701   2023-03-08
157702   2023-03-09
Name: date, Length: 1143, dtype: datetime64[ns]

In [492]:
# Rank the float-typed COVID-19 Data Hub columns by absolute correlation with
# the 경부선 ridership, then show the eight highest-ranked columns.
from covid19dh import covid19
df_covid, src = covid19('KOR')
df_covid['date'] = pd.to_datetime(df_covid['date'])

# Dates for which a confirmed-case value is present
time_temp = df_covid.loc[df_covid['confirmed'].notna(), 'date']

# Ridership of 경부선 restricted to those dates, as a one-column frame
Y = df_temp[df_temp['운행일자'].apply(lambda day: day in time_temp.values)]
Y = Y.loc[Y['주운행선'] == '경부선', '승차인원수'].reset_index(drop=True).to_frame()

# Float columns of the rows with a confirmed-case value; all-NaN columns dropped, gaps set to 0
df_covid = df_covid[df_covid['confirmed'].notna()]
df_covid = df_covid.loc[:, df_covid.dtypes == 'float64'].reset_index(drop=True)
df_covid = df_covid.dropna(axis=1, how='all').fillna(0)

# NOTE(review): Y and df_covid are paired by row position, not by date - confirm both cover the same days.
corr = pd.concat([Y, df_covid], axis=1).corr().iloc[:, [0]].abs().dropna()
corr = corr.sort_values(by='승차인원수', ascending=False)
display(corr)
df_covid[corr.index[1:9].tolist()]

We have invested a lot of time and effort in creating COVID-19 Data Hub, please cite the following when using it:

	[1mGuidotti, E., Ardia, D., (2020), "COVID-19 Data Hub", Journal of Open Source Software 5(51):2376, doi: 10.21105/joss.02376.[0m

A BibTeX entry for LaTeX users is

	@Article{,
		title = {COVID-19 Data Hub},
		year = {2020},
		doi = {10.21105/joss.02376},
		author = {Emanuele Guidotti and David Ardia},
		journal = {Journal of Open Source Software},
		volume = {5},
		number = {51},
		pages = {2376},
	}

[33mTo hide this message use 'verbose = False'.[0m


Unnamed: 0,승차인원수
승차인원수,1.0
stringency_index,0.729233
deaths,0.680164
confirmed,0.664734
international_movement_restrictions,0.639557
people_fully_vaccinated,0.606718
people_vaccinated,0.58688
containment_health_index,0.565051
government_response_index,0.543156
vaccines,0.456541


Unnamed: 0,stringency_index,deaths,confirmed,international_movement_restrictions,people_fully_vaccinated,people_vaccinated,containment_health_index,government_response_index
0,0.0,0.0,1.0,0.0,0.0,0.0,2.38,2.08
1,0.0,0.0,1.0,0.0,0.0,0.0,2.38,2.08
2,0.0,0.0,2.0,0.0,0.0,0.0,2.38,2.08
3,0.0,0.0,2.0,0.0,0.0,0.0,2.38,2.08
4,0.0,0.0,3.0,0.0,0.0,0.0,2.38,2.08
5,0.0,0.0,4.0,0.0,0.0,0.0,2.38,2.08
6,0.0,0.0,4.0,0.0,0.0,0.0,2.38,2.08
...,...,...,...,...,...,...,...,...
1136,0.0,34020.0,30555102.0,0.0,44364235.0,44779393.0,0.00,0.00
1137,0.0,34020.0,30555102.0,0.0,44364382.0,44779473.0,0.00,0.00


In [403]:
# Parse the date column and use it as the index, then display the frame.
df_covid['date'] = pd.to_datetime(df_covid['date'])
df_covid = df_covid.set_index('date')
df_covid

Unnamed: 0_level_0,internal_movement_restrictions,international_movement_restrictions,elderly_people_protection,government_response_index,stringency_index,containment_health_index
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-05,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-06,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-07,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-08,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-09,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
2023-05-20,,,,,,
2023-05-21,,,,,,


In [418]:
# First 100 rows from 2021-03-01 onward of supply, revenue and demand,
# rescaled (demand and supply /100, revenue /10,000,000 rounded to 2 decimals).
pd.set_option("display.min_rows", 15)
temp = df_temp[['운행일자', '승차인원수', '공급차량수', '승차수입금액']].copy()
temp = temp.rename(columns={'운행일자': 'DateTime',
                            '승차인원수': '수요',
                            '공급차량수': '공급',
                            '승차수입금액': '금액'})
temp = temp.set_index('DateTime')
temp['수요'] = temp['수요'].div(100)
temp['공급'] = temp['공급'].div(100)
temp['금액'] = (temp['금액'] / 10000000).round(2)
temp = temp.loc[temp.index >= '2021-03-01', ['공급', '금액', '수요']].iloc[:100]
display(temp)

Unnamed: 0_level_0,공급,금액,수요
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-03-01,22.36,297.69,1147.82
2021-03-02,22.22,179.28,979.78
2021-03-03,22.22,168.96,942.13
2021-03-04,22.22,185.09,1000.96
2021-03-05,25.68,272.02,1400.18
2021-03-06,25.94,210.82,1097.79
2021-03-07,25.58,227.44,1104.96
...,...,...,...
2021-06-02,22.24,181.72,1010.95
2021-06-03,22.24,207.69,1070.82


In [314]:
# First 100 rows from 2021-03-01 onward of the ridership series, divided by 100.
pd.set_option("display.min_rows", 15)
temp = df_temp.copy()
# Keep only the date and the ridership: exactly two column names are assigned
# on the next line, and the sliding-window step further down needs a single
# value column (it labels 5 lags + 1 target).
temp = temp[['운행일자', '승차인원수']]
temp.columns = ['DateTime', '수요']
temp['수요'] = temp['수요']/100
temp.set_index('DateTime', inplace=True)
temp = temp[temp.index>='2021-03-01'].iloc[:100]
display(temp)

def reshape_1Dto2Dseq(df, seq_length):
    """Turn a row-ordered frame into sliding-window features and next-row targets.

    For every position ``i`` with ``0 <= i < len(df) - seq_length`` the rows
    ``i .. i + seq_length - 1`` are flattened row by row into one feature row,
    and row ``i + seq_length`` becomes the matching target row. Both outputs
    are indexed by ``df.index[seq_length:]``, i.e. by the label of the target.

    Args:
        df: DataFrame whose rows are already in sequence order.
        seq_length: number of consecutive rows that form one feature window.

    Returns:
        tuple ``(X_reshape, Y_reshape)`` of DataFrames:
        ``X_reshape`` has ``seq_length * df.shape[1]`` columns named
        ``'0', '1', ...``; ``Y_reshape`` has the columns of ``df``.
        When ``len(df) <= seq_length`` both frames are empty (zero rows) but
        keep those columns, instead of failing inside the reshape.
    """
    values = df.to_numpy()
    n_rows, n_cols = values.shape
    # No complete window + target pair exists when the frame is too short.
    n_windows = max(n_rows - seq_length, 0)
    target_index = df.index[seq_length:]

    # One flattened window per target row (row-major: all columns of the
    # oldest row first, then the next row, ...).
    windows = [values[start:start + seq_length].ravel() for start in range(n_windows)]
    X_values = np.array(windows, dtype=values.dtype).reshape(n_windows, seq_length * n_cols)
    X_reshape = pd.DataFrame(X_values,
                             index=target_index,
                             columns=[str(i) for i in range(seq_length * n_cols)])

    # Copy so the returned targets never share memory with the input frame.
    Y_reshape = pd.DataFrame(values[seq_length:].copy(), columns=df.columns, index=target_index)

    return X_reshape, Y_reshape

# Window the demand series with 5 lags and show lags + target side by side.
X_reshape, Y_reshape = reshape_1Dto2Dseq(temp, seq_length=5)
sample = pd.concat([X_reshape, Y_reshape], axis=1)
# Columns T0..T4 are the five lagged values, T5 is the target.
sample.columns = [f'수요_T{step}' for step in range(6)]
sample

Unnamed: 0_level_0,수요
DateTime,Unnamed: 1_level_1
2021-03-01,1147.82
2021-03-02,979.78
2021-03-03,942.13
2021-03-04,1000.96
2021-03-05,1400.18
2021-03-06,1097.79
2021-03-07,1104.96
...,...
2021-06-02,1010.95
2021-06-03,1070.82


Unnamed: 0_level_0,수요_T0,수요_T1,수요_T2,수요_T3,수요_T4,수요_T5
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-03-06,1147.82,979.78,942.13,1000.96,1400.18,1097.79
2021-03-07,979.78,942.13,1000.96,1400.18,1097.79,1104.96
2021-03-08,942.13,1000.96,1400.18,1097.79,1104.96,1030.79
2021-03-09,1000.96,1400.18,1097.79,1104.96,1030.79,959.76
2021-03-10,1400.18,1097.79,1104.96,1030.79,959.76,995.73
2021-03-11,1097.79,1104.96,1030.79,959.76,995.73,1074.65
2021-03-12,1104.96,1030.79,959.76,995.73,1074.65,1474.14
...,...,...,...,...,...,...
2021-06-02,1571.78,1341.67,1337.66,1113.44,992.21,1010.95
2021-06-03,1341.67,1337.66,1113.44,992.21,1010.95,1070.82
