In [1]:
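# Fix broken Korean (Hangul) text in plots.
# matplotlib's default font has no Hangul glyphs, so a Korean-capable font must be selected explicitly.
# platform: used to detect the operating system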
import platform
import matplotlib.pyplot as plt

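# font_manager: matplotlib's font lookup/management module
# rc: function that updates matplotlib's runtime configuration (used here to set the font family)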
from matplotlib import font_manager, rc
# Draw the minus sign as a plain hyphen instead of the Unicode minus,
# which the Korean fonts selected below may not contain.
plt.rcParams['axes.unicode_minus'] = False

# Pick a Korean-capable font family based on the operating system.
os_name = platform.system()
if os_name == 'Darwin':  # macOS
    rc('font', family='AppleGothic')
elif os_name == 'Windows':
    # Resolve the family name from the Malgun Gothic font file shipped with Windows.
    path = 'c:/Windows/Fonts/malgun.ttf'
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
else:
    # Any other OS: no font is configured, only a notice is printed.
    print("Unknown System")

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and data-quality warnings from
# pandas / scikit-learn; consider filtering only specific categories.
warnings.filterwarnings('ignore')

In [2]:
import tensorflow as tf

# Report how many GPUs TensorFlow can see.
# Uses the stable `tf.config.list_physical_devices` API rather than the
# older `tf.config.experimental` alias of the same function.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
import pandas as pd
# Load the ridership CSV (path is relative to the notebook's working directory).
# Per the previews below, it holds one row per timestamp with three passenger-type count columns.
df = pd.read_csv("../Data/강남 일별 시간대별 승객유형별 승하차인원 정리.csv")

In [4]:
# Preview the first rows of the freshly loaded data.
df.head()

Unnamed: 0,datetime,외국인,우대권,일반
0,2022-06-01 05:00:00,6.0,116.0,939.0
1,2022-06-01 06:00:00,0.0,349.0,1085.0
2,2022-06-01 07:00:00,0.0,232.0,1280.0
3,2022-06-01 08:00:00,0.0,205.0,2402.0
4,2022-06-01 09:00:00,1.0,249.0,3740.0


In [5]:
# Inspect column dtypes and non-null counts; `datetime` is still a plain object column here.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14620 entries, 0 to 14619
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   datetime  14620 non-null  object 
 1   외국인       14620 non-null  float64
 2   우대권       14620 non-null  float64
 3   일반        14620 non-null  float64
dtypes: float64(3), object(1)
memory usage: 457.0+ KB


In [6]:
# Parse the `datetime` strings into pandas timestamps so the `.dt` accessor can be used below.
df['datetime'] = pd.to_datetime(df['datetime'])

In [7]:
# Confirm that `datetime` is now a datetime64 column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14620 entries, 0 to 14619
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  14620 non-null  datetime64[ns]
 1   외국인       14620 non-null  float64       
 2   우대권       14620 non-null  float64       
 3   일반        14620 non-null  float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 457.0 KB


In [8]:
import holidays

# South Korean public-holiday calendar from the `holidays` package.
kr_holidays = holidays.KR()

# Derive calendar features from the timestamp column.
timestamps = df['datetime']
df['시간'] = timestamps.dt.hour  # hour of day
df['요일'] = timestamps.dt.dayofweek  # day of week (0 = Monday, ..., 6 = Sunday)
df['주말'] = (df['요일'] >= 5).astype('int64')  # 1 on Saturday/Sunday, otherwise 0
df['공휴일'] = timestamps.apply(lambda ts: int(ts in kr_holidays))  # 1 when the timestamp is in the holiday calendar

# Preview the result.
df.head()

Unnamed: 0,datetime,외국인,우대권,일반,시간,요일,주말,공휴일
0,2022-06-01 05:00:00,6.0,116.0,939.0,5,2,0,1
1,2022-06-01 06:00:00,0.0,349.0,1085.0,6,2,0,1
2,2022-06-01 07:00:00,0.0,232.0,1280.0,7,2,0,1
3,2022-06-01 08:00:00,0.0,205.0,2402.0,8,2,0,1
4,2022-06-01 09:00:00,1.0,249.0,3740.0,9,2,0,1


In [9]:
# Look at a longer preview (first 20 rows) of the engineered features.
df.head(20)

Unnamed: 0,datetime,외국인,우대권,일반,시간,요일,주말,공휴일
0,2022-06-01 05:00:00,6.0,116.0,939.0,5,2,0,1
1,2022-06-01 06:00:00,0.0,349.0,1085.0,6,2,0,1
2,2022-06-01 07:00:00,0.0,232.0,1280.0,7,2,0,1
3,2022-06-01 08:00:00,0.0,205.0,2402.0,8,2,0,1
4,2022-06-01 09:00:00,1.0,249.0,3740.0,9,2,0,1
5,2022-06-01 10:00:00,3.0,289.0,3452.0,10,2,0,1
6,2022-06-01 11:00:00,4.0,395.0,3942.0,11,2,0,1
7,2022-06-01 12:00:00,5.0,353.0,4875.0,12,2,0,1
8,2022-06-01 13:00:00,0.0,436.0,5214.0,13,2,0,1
9,2022-06-01 14:00:00,9.0,441.0,4985.0,14,2,0,1


In [10]:
# Check the dtypes of the newly added hour / weekday / weekend / holiday columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14620 entries, 0 to 14619
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  14620 non-null  datetime64[ns]
 1   외국인       14620 non-null  float64       
 2   우대권       14620 non-null  float64       
 3   일반        14620 non-null  float64       
 4   시간        14620 non-null  int32         
 5   요일        14620 non-null  int32         
 6   주말        14620 non-null  int64         
 7   공휴일       14620 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int32(2), int64(2)
memory usage: 799.7 KB


In [11]:
# Add year ('연'), month ('월') and day ('일') columns from the timestamp's .dt accessor.
for column_name, dt_attribute in (('연', 'year'), ('월', 'month'), ('일', 'day')):
    df[column_name] = getattr(df['datetime'].dt, dt_attribute)

# Preview the result.
df.head()

Unnamed: 0,datetime,외국인,우대권,일반,시간,요일,주말,공휴일,연,월,일
0,2022-06-01 05:00:00,6.0,116.0,939.0,5,2,0,1,2022,6,1
1,2022-06-01 06:00:00,0.0,349.0,1085.0,6,2,0,1,2022,6,1
2,2022-06-01 07:00:00,0.0,232.0,1280.0,7,2,0,1,2022,6,1
3,2022-06-01 08:00:00,0.0,205.0,2402.0,8,2,0,1,2022,6,1
4,2022-06-01 09:00:00,1.0,249.0,3740.0,9,2,0,1,2022,6,1


In [12]:
# Verify the frame now has 11 columns including year / month / day.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14620 entries, 0 to 14619
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  14620 non-null  datetime64[ns]
 1   외국인       14620 non-null  float64       
 2   우대권       14620 non-null  float64       
 3   일반        14620 non-null  float64       
 4   시간        14620 non-null  int32         
 5   요일        14620 non-null  int32         
 6   주말        14620 non-null  int64         
 7   공휴일       14620 non-null  int64         
 8   연         14620 non-null  int32         
 9   월         14620 non-null  int32         
 10  일         14620 non-null  int32         
dtypes: datetime64[ns](1), float64(3), int32(5), int64(2)
memory usage: 971.0 KB


In [13]:
# Desired layout: timestamp first, then the calendar features, then the three passenger counts.
columns = [
    'datetime',
    '연', '월', '일', '시간', '요일', '주말', '공휴일',
    '외국인', '우대권', '일반',
]

# Re-select the columns in that order.
df = df.loc[:, columns]

# Preview the result.
df.head()

Unnamed: 0,datetime,연,월,일,시간,요일,주말,공휴일,외국인,우대권,일반
0,2022-06-01 05:00:00,2022,6,1,5,2,0,1,6.0,116.0,939.0
1,2022-06-01 06:00:00,2022,6,1,6,2,0,1,0.0,349.0,1085.0
2,2022-06-01 07:00:00,2022,6,1,7,2,0,1,0.0,232.0,1280.0
3,2022-06-01 08:00:00,2022,6,1,8,2,0,1,0.0,205.0,2402.0
4,2022-06-01 09:00:00,2022,6,1,9,2,0,1,1.0,249.0,3740.0


In [33]:
# Re-check column order and dtypes after the reorder.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14620 entries, 0 to 14619
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   datetime  14620 non-null  datetime64[ns]
 1   연         14620 non-null  int32         
 2   월         14620 non-null  int32         
 3   일         14620 non-null  int32         
 4   시간        14620 non-null  int32         
 5   요일        14620 non-null  int32         
 6   주말        14620 non-null  int64         
 7   공휴일       14620 non-null  int64         
 8   외국인       14620 non-null  float64       
 9   우대권       14620 non-null  float64       
 10  일반        14620 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int32(5), int64(2)
memory usage: 971.0 KB


In [14]:
# Model inputs (calendar features) and outputs (one count per passenger type).
features = ['연', '월', '일', '시간', '요일', '주말', '공휴일']
targets = ['외국인', '우대권', '일반']

# Feature matrix and multi-output target frame.
X, y = df[features], df[targets]

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set, using a fixed seed for reproducibility.
# NOTE(review): rows are hourly observations; a random split mixes past and future
# hours, so the test score may overstate accuracy on genuinely future dates. Confirm
# whether a chronological split is more appropriate for the forecasting use case.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

for label, subset in (("학습 데이터 크기:", X_train), ("테스트 데이터 크기:", X_test)):
    print(label, subset.shape)

학습 데이터 크기: (11696, 7)
테스트 데이터 크기: (2924, 7)


In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Random-forest regressor with a fixed seed; depth and split size are capped.
model = RandomForestRegressor(
    n_estimators=200,
    max_depth=12,
    min_samples_split=5,
    random_state=42,
)

# Fit on the training split (multi-output: one column per passenger type).
model.fit(X_train, y_train)

# R² on the data the model was trained on.
train_score = model.score(X_train, y_train)
print(f"Train Score (R²): {train_score}")

# R² on the held-out split.
test_score = model.score(X_test, y_test)
print(f"Test Score (R²): {test_score}")

# 5-fold cross-validation over the full data set.
# An integer `cv` uses contiguous, unshuffled folds for a regressor; in the
# recorded output the first fold scores far below the other four.
scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"Cross-Validation Scores: {scores}")
print(f"Mean CV Score: {scores.mean()}")

Train Score (R²): 0.8676604386973267
Test Score (R²): 0.8474833969102428
Cross-Validation Scores: [0.00596679 0.79843111 0.84022086 0.82518692 0.8441219 ]
Mean CV Score: 0.662785515920872


In [25]:
from sklearn.model_selection import KFold, cross_val_score

# Repeat cross-validation with shuffled folds (fixed seed), so each fold
# draws rows from across the whole data set rather than one contiguous block.
kfold_cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    model,
    X,
    y,
    cv=kfold_cv,
    scoring='r2',
)
print(f"K-Fold Cross-Validation Scores: {scores}")
print(f"Mean K-Fold CV Score: {scores.mean()}")

K-Fold Cross-Validation Scores: [0.84786793 0.85686431 0.84979931 0.85183775 0.8492346 ]
Mean K-Fold CV Score: 0.8511207815343893


In [35]:
import pandas as pd
import holidays
from datetime import datetime

# South Korean public-holiday calendar, constructed the same way as in the
# feature-engineering cell; read by create_features_for_datetime below.
kr_holidays = holidays.KR()

# Feature construction helper
def create_features_for_datetime(dt, holiday_calendar=None):
    """Build the calendar-feature dict for a single timestamp.

    Args:
        dt: A ``datetime.datetime``-like object exposing ``year``, ``month``,
            ``day``, ``hour`` and ``weekday()``.
        holiday_calendar: Optional container used for the holiday test
            (``dt in holiday_calendar``). When omitted, the notebook-level
            ``kr_holidays`` calendar is used, which matches the previous
            behavior of this function.

    Returns:
        dict with the keys ``'datetime'``, ``'연'`` (year), ``'월'`` (month),
        ``'일'`` (day), ``'시간'`` (hour), ``'요일'`` (weekday, 0 = Monday),
        ``'주말'`` (1 on Saturday/Sunday, else 0) and ``'공휴일'`` (1 when
        ``dt`` is in the holiday calendar, else 0).
    """
    if holiday_calendar is None:
        # Fall back to the calendar defined at notebook level.
        holiday_calendar = kr_holidays

    # Compute once; used for both the weekday and the weekend flag.
    day_of_week = dt.weekday()

    return {
        'datetime': dt,
        '연': dt.year,
        '월': dt.month,
        '일': dt.day,
        '시간': dt.hour,
        '요일': day_of_week,
        '주말': 1 if day_of_week >= 5 else 0,
        '공휴일': 1 if dt in holiday_calendar else 0,
    }

# Forecast helper
def predict_passenger_count(model, future_datetime):
    """Predict passenger counts for one timestamp.

    Args:
        model: A fitted estimator exposing ``predict``.
        future_datetime: Timestamp to predict for; it is turned into calendar
            features by ``create_features_for_datetime``.

    Returns:
        Whatever ``model.predict`` returns for the single-row feature frame.
    """
    # Columns the model consumes, in this order; 'datetime' itself is dropped.
    feature_columns = ['연', '월', '일', '시간', '요일', '주말', '공휴일']

    # One-row frame built from the generated feature dict.
    row = create_features_for_datetime(future_datetime)
    feature_input = pd.DataFrame([row])[feature_columns]

    return model.predict(feature_input)

# Predict for the current moment.
current_datetime = datetime.now()  # local wall-clock time, no timezone attached
predicted_count_today = predict_passenger_count(model, current_datetime)

timestamp_label = current_datetime.strftime('%Y-%m-%d %H:%M:%S')
print(f"오늘 {timestamp_label}의 예측 승객 수: {predicted_count_today}")

오늘 2024-12-20 14:46:40의 예측 승객 수: [[  23.59841332 1050.62586828 7120.23240571]]
