In [2]:
# Korean (Hangul) font setup for matplotlib.
# Original author's note: matplotlib does not handle Korean fonts out of the
# box, so an OS-specific Korean font family is selected below.
import platform  # OS detection
import warnings

import matplotlib.pyplot as plt
# font_manager: font lookup helpers
# rc: sets a group of rcParams (used here for the 'font' group)
from matplotlib import font_manager, rc

# Render the minus sign as an ASCII hyphen rather than the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

os_name = platform.system()
if os_name == 'Darwin':
    # macOS
    rc('font', family='AppleGothic')
elif os_name == 'Windows':
    # Windows: read the family name out of the Malgun Gothic font file.
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # Any other OS: no font is configured, only a notice is printed.
    print("Unknown System")

# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd

# Load the passenger-count CSV from the sibling Data directory into `df`.
df = pd.read_csv(
    "../Data/강남 일별 시간대별 승객유형별 승하차인원 정리.csv"
)

In [5]:
# Show the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,수송일자,요일,주말,공휴일,시간대,어린이,외국인,우대권,일반
0,2022-06-01,2,0,1,5,0.0,6.0,116.0,939.0
1,2022-06-01,2,0,1,6,0.0,0.0,349.0,1085.0
2,2022-06-01,2,0,1,7,2.0,0.0,232.0,1280.0
3,2022-06-01,2,0,1,8,7.0,0.0,205.0,2402.0
4,2022-06-01,2,0,1,9,17.0,1.0,249.0,3740.0


In [6]:
# Show the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,수송일자,요일,주말,공휴일,시간대,어린이,외국인,우대권,일반
14615,2024-05-31,4,0,0,20,9.0,32.0,318.0,8041.0
14616,2024-05-31,4,0,0,21,6.0,19.0,272.0,8724.0
14617,2024-05-31,4,0,0,22,7.0,22.0,165.0,8866.0
14618,2024-05-31,4,0,0,23,5.0,4.0,54.0,5221.0
14619,2024-05-31,4,0,0,24,0.0,0.0,19.0,1467.0


In [7]:
# Print a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14620 entries, 0 to 14619
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   수송일자    14620 non-null  object 
 1   요일      14620 non-null  int64  
 2   주말      14620 non-null  int64  
 3   공휴일     14620 non-null  int64  
 4   시간대     14620 non-null  int64  
 5   어린이     14620 non-null  float64
 6   외국인     14620 non-null  float64
 7   우대권     14620 non-null  float64
 8   일반      14620 non-null  float64
dtypes: float64(4), int64(4), object(1)
memory usage: 1.0+ MB


In [8]:
# Share of children among all riders, as a percentage.
# Per-column totals over the four passenger-type columns ...
column_totals = df[['어린이', '외국인', '우대권', '일반']].sum()
# ... collapsed into one grand total.
total_passengers = column_totals.sum()
children_passengers = df['어린이'].sum()
children_ratio = (children_passengers / total_passengers) * 100
print(f"어린이 데이터 비율: {children_ratio:.2f}%")

어린이 데이터 비율: 0.21%


In [9]:
# Remove the children column from the working frame.
df = df.drop('어린이', axis='columns')

In [10]:
# Count missing values per column (isna is the same check as isnull).
print(df.isna().sum())

수송일자    0
요일      0
주말      0
공휴일     0
시간대     0
외국인     0
우대권     0
일반      0
dtype: int64


In [11]:
# Combine the date column with the hour-of-day column into one timestamp and
# use it as the index. An hour value of 24 lands on 00:00 of the following day.
timestamps = pd.to_datetime(df['수송일자']) + pd.to_timedelta(df['시간대'], unit='h')
df = (
    df.assign(datetime=timestamps)
      .set_index('datetime')
      # The source columns are now encoded in the index, so drop them.
      .drop(columns=['수송일자', '시간대'])
)

In [12]:
# Show the first 21 rows of the re-indexed frame.
df.head(n=21)

Unnamed: 0_level_0,요일,주말,공휴일,외국인,우대권,일반
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-06-01 05:00:00,2,0,1,6.0,116.0,939.0
2022-06-01 06:00:00,2,0,1,0.0,349.0,1085.0
2022-06-01 07:00:00,2,0,1,0.0,232.0,1280.0
2022-06-01 08:00:00,2,0,1,0.0,205.0,2402.0
2022-06-01 09:00:00,2,0,1,1.0,249.0,3740.0
2022-06-01 10:00:00,2,0,1,3.0,289.0,3452.0
2022-06-01 11:00:00,2,0,1,4.0,395.0,3942.0
2022-06-01 12:00:00,2,0,1,5.0,353.0,4875.0
2022-06-01 13:00:00,2,0,1,0.0,436.0,5214.0
2022-06-01 14:00:00,2,0,1,9.0,441.0,4985.0


In [13]:
# Print the frame summary again: index type and range, remaining columns,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14620 entries, 2022-06-01 05:00:00 to 2024-06-01 00:00:00
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   요일      14620 non-null  int64  
 1   주말      14620 non-null  int64  
 2   공휴일     14620 non-null  int64  
 3   외국인     14620 non-null  float64
 4   우대권     14620 non-null  float64
 5   일반      14620 non-null  float64
dtypes: float64(3), int64(3)
memory usage: 799.5 KB


In [14]:
# Remove the day-of-week, weekend and holiday columns, keeping only the
# passenger-count columns, then preview the first five rows.
df = df.drop(['요일', '주말', '공휴일'], axis='columns')
df.head(n=5)

Unnamed: 0_level_0,외국인,우대권,일반
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-06-01 05:00:00,6.0,116.0,939.0
2022-06-01 06:00:00,0.0,349.0,1085.0
2022-06-01 07:00:00,0.0,232.0,1280.0
2022-06-01 08:00:00,0.0,205.0,2402.0
2022-06-01 09:00:00,1.0,249.0,3740.0


In [15]:
# Save the processed frame, keeping the datetime index as the first column.
# The original cell wrote to the very file this notebook reads at the top,
# which destroyed the raw data and made a second run fail (the columns
# '어린이', '수송일자' and '시간대' no longer exist in the overwritten file).
# Writing to a separate file keeps the raw input intact and the notebook
# re-runnable.
# NOTE(review): any downstream notebook that read the old file name expecting
# the processed layout must be pointed at this new file.
df.to_csv('../Data/강남 일별 시간대별 승객유형별 승하차인원 정리_processed.csv', index=True)