# Library

In [125]:
# Ignore the warnings
import warnings
# warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# System-related utilities and data input
import os
import numpy as np
import pandas as pd
import math
from tqdm import tqdm, tqdm_pandas # execution time
tqdm.pandas()

# Automatically reload imported libraries when their source changes
%reload_ext autoreload
%autoreload 2


# Data Preprocessing

In [126]:
def preprocessing_KTX():
    """Load the KTX Excel reports under ``./Data`` and return cleaned tables.

    Every file found below ``<cwd>/Data`` is classified by the first 10
    characters of its file name:

    * ``'(간선)수송-운행일'`` -> seat table   (read with ``skiprows=5``)
    * ``'(간선)수송_운행월'`` -> demand table (read with ``skiprows=5``)
    * ``'(간선)시종착역별 '`` -> frequency table (read with ``skiprows=8``)

    Files whose name matches none of these prefixes are skipped without
    being opened.

    Each table then has a fixed set of columns dropped, is restricted to the
    five main lines listed in ``main_lines``, and gets a fresh 0..n-1 index.
    For the seat and frequency tables, ``'운행일자'`` is parsed as a date and
    ``'운행요일구분'`` is replaced by a 0/1 ``'주중여부'`` flag (1 for 월-금).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_demand, df_seat, df_freq)``.

    Raises
    ------
    KeyError
        If a table ends up without the expected columns, e.g. because no
        matching file was found under ``Data``.
    """
    prefix_seat = '(간선)수송-운행일'
    prefix_demand = '(간선)수송_운행월'
    prefix_freq = '(간선)시종착역별 '
    main_lines = ['경부선', '경전선', '동해선', '호남선', '전라선']
    weekdays = ['월', '화', '수', '목', '금']

    # Collect the per-file frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    loaded = {prefix_seat: [], prefix_demand: [], prefix_freq: []}
    for (path, _dirs, files) in os.walk(os.path.join(os.getcwd(), 'Data')):
        for file in tqdm(files):
            # os.walk already yields bare file names, so the prefix test does
            # not depend on the platform's path separator.
            prefix = file[:10]
            if prefix not in loaded:
                continue

            # Data loading: the frequency report has a longer header.
            skiprows = 8 if prefix == prefix_freq else 5
            df_sub = pd.read_excel(os.path.join(path, file), skiprows=skiprows)
            loaded[prefix].append(df_sub)

    def _stack(parts):
        # An empty placeholder keeps the "no files found" path failing on the
        # column drop below, as it did before.
        return pd.concat(parts, axis=0) if parts else pd.DataFrame()

    df_seat = _stack(loaded[prefix_seat])
    df_demand = _stack(loaded[prefix_demand])
    df_freq = _stack(loaded[prefix_freq])

    # Drop unneeded columns
    df_seat = df_seat.drop(columns=['Unnamed: 1', '운행년도', '운행년월', '역무열차종', '메트릭'])
    df_demand = df_demand.drop(columns=['운행년도', 'Unnamed: 4', '객실등급', '거리구분', '역무열차종',
                                        '승차연인거리', '메트릭', 'Unnamed: 1'])
    df_freq = df_freq.drop(columns=['상행하행구분', '역무열차종', '메트릭'])

    # Keep only the main lines, then rebuild the index
    df_demand = df_demand[df_demand['주운행선'].isin(main_lines)].reset_index(drop=True)
    df_seat = df_seat[df_seat['주운행선'].isin(main_lines)].reset_index(drop=True)
    df_freq = df_freq[df_freq['주운행선'].isin(main_lines)].reset_index(drop=True)

    # Parse the date column and derive the weekday flag
    for df in (df_seat, df_freq):
        df['운행일자'] = pd.to_datetime(df['운행일자'], format='%Y년 %m월 %d일')
        df['주중여부'] = df['운행요일구분'].isin(weekdays) * 1
        df.drop(columns=['운행요일구분'], inplace=True)

    # Data merging (not implemented yet)

    # Feature creation (not implemented yet)

    return df_demand, df_seat, df_freq

# Run the preprocessing once, then preview each resulting table in turn.
df_demand, df_seat, df_freq = preprocessing_KTX()
for _table in (df_demand, df_seat, df_freq):
    display(_table)

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:41<00:00,  6.87s/it]


Unnamed: 0,주운행선,운행년월,승차수입금액,승차인원수
0,경부선,2010년 01월,76800,1
1,경부선,2010년 01월,48392700,7803
2,경부선,2010년 01월,1717656300,198390
3,경부선,2010년 01월,3652194400,279378
4,경부선,2010년 01월,6879423100,361865
...,...,...,...,...
42449,동해선,2024년 03월,1762986140,59617
42450,경부선,2024년 03월,1444207586,110393
42451,호남선,2024년 03월,642863038,49948
42452,전라선,2024년 03월,646827214,44304


Unnamed: 0,운행일자,주운행선,공급차량수,공급좌석합계수,승차수입금액,승차인원수,승차연인거리,좌석거리,주중여부
0,2015-01-01,경부선,2016,104152,3725143600,101525,288685921,440665165,1
1,2015-01-01,경부선,38,2432,66411200,3563,8543090,10746496,1
2,2015-01-01,경부선,366,25812,518790400,65797,86492073,96079902,1
3,2015-01-01,경부선,40,2630,17088900,3988,2818725,3050800,1
4,2015-01-01,경부선,112,5082,176309700,5142,13328413,19245897,1
...,...,...,...,...,...,...,...,...,...
79536,2024-03-31,동해선,8,528,4456526,742,406147,895488,0
79537,2024-03-31,경전선,216,11460,491723566,15533,36739684,48564615,0
79538,2024-03-31,경전선,10,436,4436700,679,820348,1418308,0
79539,2024-03-31,경전선,70,5008,31754080,7698,5292241,8450016,0


Unnamed: 0,운행일자,열차속성,열차구분,주운행선,시발역,종착역,공급좌석수,설비좌석수,열차운행횟수,주중여부
0,2015-01-01,일반,확정,경부선,서울,대전,931,931,1,1
1,2015-01-01,일반,확정,경부선,서울,대전,363,363,1,1
2,2015-01-01,일반,확정,경부선,서울,부산,35318,35318,38,1
3,2015-01-01,일반,확정,경부선,서울,부산,1089,1089,2,1
4,2015-01-01,일반,확정,경부선,대전,서울,931,931,1,1
...,...,...,...,...,...,...,...,...,...,...
281135,2024-03-31,일반/관광,확정,전라선,여수엑스포,용산,820,826,2,0
281136,2024-03-31,일반/관광,확정,전라선,여수엑스포,행신,1230,1239,3,0
281137,2024-03-31,일반/관광,확정,전라선,행신,여수엑스포,1640,1652,4,0
281138,2024-03-31,일반/관광,확정,동해선,서울,포항,820,826,2,0
