In [1]:
import pandas as pd
df = pd.read_parquet(
    'data.parquet',
    engine='pyarrow'         # 저장 시 사용한 엔진과 동일하게 지정
)
test_loaded = pd.read_parquet(
    'test.parquet',
    engine='pyarrow'         # 저장 시 사용한 엔진과 동일하게 지정
)

In [2]:
# feature / target 정의
ordered_cols = ['Direction', 'time_period']
cat_cols     = [
                'station_number'
                , 'address'
               # , 'station_name'
               ] + ordered_cols
num_cols = [
    'HM','RN_DAY','RN_HR1',
    #'SI',
    'TA','WD','WS'
    ,'STN'
    ,'sin_dom','cos_dom','sin_dow','cos_dow','sin_hod','cos_hod'
    ,'sin_wom','cos_wom','sin_woy','cos_woy','sin_doy','cos_doy'
    ,'day','day_of_year','hour'
    ,'is_day_before_holiday','is_day_after_holiday','is_holiday','is_weekend'
    ,'month','transfer','week_of_month','week_of_year','weekday','year'
    ,'신설역', '신규관측소'
]
feature_cols = num_cols + ordered_cols + cat_cols
target_col   = 'Congestion'

# 모델 선택

In [3]:
def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    t0 = time.time()
    model.fit(X_train, y_train)
    y_pred  = model.predict(X_val)
    elapsed = time.time() - t0
    
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    r2   = r2_score(y_val, y_pred)
    
    return {'Model': name, 'Time(s)': elapsed, 'RMSE': rmse, 'R2': r2}

In [4]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

# 전처리·평가용
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics      import mean_squared_error, r2_score

# ── 선형 계열 회귀 모델 ──
from sklearn.linear_model import ARDRegression

# ── 트리 & 앙상블 ──
from sklearn.ensemble      import (
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
    GradientBoostingRegressor
)

# ── 신경망 & 부스팅 ──
from sklearn.neural_network import MLPRegressor
from xgboost                 import XGBRegressor
from lightgbm                import LGBMRegressor
from catboost                import CatBoostRegressor

# ------------------------------------------------------------------------------
# 미리 정의해야 할 변수
# df: 학습용 DataFrame (컬럼에 'Line', 'TM', STN, address, feature_cols, target_col 포함)
# test: 테스트용 DataFrame (컬럼 구조 동일)
# feature_cols: predictor로 사용할 컬럼 리스트
# target_col: 예측 대상 컬럼 이름 (문자열)
# cat_cols: 범주형으로 one-hot encoding 할 컬럼 리스트 (예: ['STN','address'])
# ------------------------------------------------------------------------------

def evaluate_model(name, model, X_train, y_train, X_val, y_val):
    t0 = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    return {
        'Model': name,
        'Time(s)': time.time() - t0,
        'RMSE': np.sqrt(mean_squared_error(y_val, y_pred)),
        'R2': r2_score(y_val, y_pred)
    }

all_results = []

for line in range(1, 9):
    # 1) subset & sort
    df_line   = df [df['Line']==line].sort_values('TM').copy()
    test_line = test_loaded[test_loaded['Line']==line].copy()

    # 2) 카테고리 지정
    for col in cat_cols:
        df_line[col]   = df_line[col].astype('category')
        test_line[col] = test_line[col].astype('category')

    # 3) feature & target
    X      = df_line[feature_cols]
    y      = df_line[target_col].astype(int)
    X_test = test_line[feature_cols]

    # 4) 원-핫 인코딩
    X_enc      = pd.get_dummies(X,      columns=cat_cols, drop_first=False)
    X_test_enc = pd.get_dummies(X_test, columns=cat_cols, drop_first=False)

    # 5) 중복 컬럼 제거 & 정렬, 누락 채움
    X_enc      = X_enc.loc[:, ~X_enc.columns.duplicated()]
    X_test_enc = X_test_enc.loc[:, ~X_test_enc.columns.duplicated()]
    X_test_enc = X_test_enc.reindex(columns=X_enc.columns, fill_value=0)

    # 6) 정규화
    mm             = MinMaxScaler()
    X_scaled       = mm.fit_transform(X_enc)
    X_test_scaled  = mm.transform(X_test_enc)

    # 7) 시간 순 분할 (train:val = 8:2)
    split_idx = int(len(X_scaled) * 0.8)
    X_train, X_val = X_scaled[:split_idx], X_scaled[split_idx:]
    y_train, y_val = y.values[:split_idx],    y.values[split_idx:]

    # 8) 모델별 평가
    for name, model in [

        ('LGBM', LGBMRegressor(n_jobs=-1, random_state=42)),
        ('CAT',  CatBoostRegressor(verbose=0, random_state=42))
    ]:
        res = evaluate_model(name, model, X_train, y_train, X_val, y_val)
        res['Line'] = line
        all_results.append(res)
        print(f"[Line {line}] {name}: RMSE={res['RMSE']:.3f}, R2={res['R2']:.3f}, Time={res['Time(s)']:.1f}s")

# 9) 종합 결과 DataFrame 생성 및 저장
results_df = pd.DataFrame(all_results)
print("\n=== 전체 라인·모델별 실행 시간·성능 비교 ===")
print(results_df)

# CSV로 저장 (필요시)
os.makedirs('results', exist_ok=True)
results_df.to_csv('results/model_performance_all_lines.csv', index=False)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.100724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2612
[LightGBM] [Info] Number of data points in the train set: 2251468, number of used features: 112
[LightGBM] [Info] Start training from score 18.002817
[Line 1] LGBM: RMSE=7.877, R2=0.852, Time=9.6s
[Line 1] CAT: RMSE=5.479, R2=0.929, Time=255.3s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.107521 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2542
[LightGBM] [Info] Number of data points in the train set: 1596672, number of used features: 95
[LightGBM] [Info] Start training from score 28.565433
[Line 2] LGBM: RMSE=10.431, R2=0.762, Time=10.3s
[Line 2] CAT: RMSE=11.665, R2=0.703, 

In [8]:
import os
import time
import numpy as np
import pandas as pd
from tqdm import tqdm

# 전처리·평가용
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics      import mean_squared_error, r2_score

# ── 선형 계열 회귀 모델 ──
from sklearn.linear_model import (
    ARDRegression,       # ard
    TheilSenRegressor,   # tr
    RANSACRegressor      # ransac
)

# ── 커널 & 거리 기반 ── 용량&시간으로 제외
# from sklearn.kernel_ridge import KernelRidge   # kr
# from sklearn.svm           import SVR           # svm
# from sklearn.neighbors     import KNeighborsRegressor  # knn

# ── 트리 & 앙상블 ──
from sklearn.tree          import DecisionTreeRegressor     # dt
from sklearn.ensemble      import (
    RandomForestRegressor,  # rf
    ExtraTreesRegressor,    # et
    AdaBoostRegressor,      # ada
    GradientBoostingRegressor  # gbr
)

# ── 신경망 & 부스팅 ──
from sklearn.neural_network import MLPRegressor     # mlp
from xgboost                 import XGBRegressor    # xgboost
from lightgbm                import LGBMRegressor   # lightgbm
from catboost                import CatBoostRegressor  # catboost

results = []

# ARD
ard = ARDRegression()
res = evaluate_model('ARD', ard, X_train, y_train, X_val, y_val)
results.append(res)
print('ARD: ', res)

# RandomForest
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
res = evaluate_model('RF', rf, X_train, y_train, X_val, y_val)
results.append(res)
print('RandomForest: ', res)

# ExtraTrees
et = ExtraTreesRegressor(n_jobs=-1, random_state=42)
res = evaluate_model('ET', et, X_train, y_train, X_val, y_val)
results.append(res)
print('ExtraTrees: ', res)

# AdaBoost
ada = AdaBoostRegressor(random_state=42)
res = evaluate_model('ADA', ada, X_train, y_train, X_val, y_val)
results.append(res)
print('AdaBoost: ', res)

# GradientBoosting
gbr = GradientBoostingRegressor(random_state=42)
res = evaluate_model('GBR', gbr, X_train, y_train, X_val, y_val)
results.append(res)
print('GradientBoosting: ', res)

# MLP
mlp = MLPRegressor(random_state=42)
res = evaluate_model('MLP', mlp, X_train, y_train, X_val, y_val)
results.append(res)
print('MLP: ', res)

# XGBoost
xgb = XGBRegressor(tree_method='hist', n_jobs=-1, random_state=42)
res = evaluate_model('XGB', xgb, X_train, y_train, X_val, y_val)
results.append(res)
print('XGBoost: ', res)

# LightGBM
lgbm = LGBMRegressor(n_jobs=-1, random_state=42)
res = evaluate_model('LGBM', lgbm, X_train, y_train, X_val, y_val)
results.append(res)
print('LightGBM: ', res)

# CatBoost
cat = CatBoostRegressor(verbose=0, random_state=42)
res = evaluate_model('CAT', cat, X_train, y_train, X_val, y_val)
results.append(res)
print('CatBoost: ', res)

# —————————————————————————
# 3) 요약 출력
# —————————————————————————
results_df = pd.DataFrame(results)
print("\n=== 모델별 실행 시간·성능 비교 ===")
print(results_df)

# (원하면 CSV로 저장)
os.makedirs('./test', exist_ok=True)
results_df.to_csv('./test/model_time_performance_line7.csv', index=False)

{'Model': 'ARD', 'Time(s)': 112.8433210849762, 'RMSE': 17.95454833434004, 'R2': 0.38365889038325696}
{'Model': 'RF', 'Time(s)': 786.7618379592896, 'RMSE': 6.940728451896612, 'R2': 0.9078951978696133}
{'Model': 'ET', 'Time(s)': 737.5762186050415, 'RMSE': 6.671313294042149, 'R2': 0.9149068032512901}
{'Model': 'ADA', 'Time(s)': 479.0085141658783, 'RMSE': 20.69933365484366, 'R2': 0.18080951919407573}
{'Model': 'GBR', 'Time(s)': 654.2590267658234, 'RMSE': 16.13645052678724, 'R2': 0.5021618334231359}
{'Model': 'MLP', 'Time(s)': 3093.824917078018, 'RMSE': 23.175210809774192, 'R2': -0.02687966476099124}
{'Model': 'XGB', 'Time(s)': 16.0857515335083, 'RMSE': 8.440437858983694, 'R2': 0.8637921553844169}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.220275 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2534
[LightGBM] [Info] Number of data points in the train set: 1978368, number of used features: 114
[Light

In [5]:
df23 = pd.read_csv('./data/train_subway23.csv', encoding='CP949')
df22 = pd.read_csv('./data/train_subway22.csv', encoding='CP949')
df21 = pd.read_csv('./data/train_subway21.csv', encoding='CP949')
df = pd.concat([df21, df22, df23], axis=0, ignore_index=True)

del df23
del df22
del df21

# 프로파일 리포트 생성
from pycaret.regression import *
from ydata_profiling import ProfileReport

profile = ProfileReport(
    df,
    title="My Data Profiling Report",  # 리포트 제목
    explorative=True,                  # 자세한 분석 모드
    minimal=False                       # 최소 리포트 모드 해제
)

# 3) 결과를 HTML 파일로 저장
profile.to_file("data_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                           | 0/15 [00:00<?, ?it/s][A
  7%|█████▌                                                                             | 1/15 [00:39<09:11, 39.36s/it][A
 47%|██████████████████████████████████████▋                                            | 7/15 [00:40<00:34,  4.28s/it][A
 53%|████████████████████████████████████████████▎                                      | 8/15 [00:40<00:25,  3.60s/it][A
 60%|█████████████████████████████████████████████████▊                                 | 9/15 [00:42<00:19,  3.23s/it][A
 80%|█████████████████████████████████████████████████████████████████▌                | 12/15 [00:42<00:05,  1.78s/it][A
 87%|███████████████████████████████████████████████████████████████████████           | 13/15 [00:43<00:03,  1.52s/it][A
 93%|████████████████████████████████████████████████████████████████████████████▌     | 14/15 [00:43<00:01,  1.27s/it][A
100%|██████████

In [12]:
import os
import pandas as pd
import os
os.environ['LIGHTGBM_VERBOSE'] = '0'
import io, contextlib

# 1) 빈 스트림 생성
_silencer = io.StringIO()
from pycaret.regression import (
    setup,
    compare_models,
    finalize_model,
    predict_model,
    save_model
)

# ────────────────── 설정 ──────────────────
os.makedirs('./models_pycaret', exist_ok=True)

all_predictions = []
metrics = []

# 비교할 모델 목록 정의
include_models = ['rf'       # RandomForestRegressor
                  #'xgboost',  # XGBRegressor
                  #'catboost', # CatBoostRegressor
                  #'et'
                 ]       # ExtraTreesRegressor

line = 7
print(f"\n🎯 [Line {line}] AutoML 시작")

# 1) 데이터 분리
train = df[df['Line'] == line].copy()
test_line = test[test['Line'] == line].copy()

# 2) PyCaret 세션 설정
exp = setup(
    data=train,
    target='Congestion',
    session_id=42,
    train_size=0.8,
    use_gpu=True,
    verbose=False,
    html=False,
    ignore_features=['TM', 'station_name'],
    feature_selection=False,
    feature_selection_method='univariate',
    n_features_to_select=30
)

# 3) 모델 비교 및 선택 (네 가지 모델만)
with contextlib.redirect_stdout(_silencer), contextlib.redirect_stderr(_silencer):
    best = compare_models(
        include=include_models,
        n_select=1,
        verbose=False
    )
# 4) 최종 모델 고정
final_model = finalize_model(best)

# 5) 저장
model_name = best.__class__.__name__.lower()
save_model(final_model, f'./models_pycaret/{model_name}_line{line}')

# 6) 검증 성능 산출 (간결 출력)
val_pred = predict_model(final_model)

# ① 원본 train DataFrame의 컬럼 목록
orig_cols = set(train.columns)

# ② 예측값 컬럼 자동 탐색
#    train에 없고, 타깃('Congestion')도 아닌 첫 번째 숫자형 컬럼을 예측값으로 간주
cand = [
    c for c in val_pred.columns 
    if c not in orig_cols 
    and c != 'Congestion' 
    and pd.api.types.is_numeric_dtype(val_pred[c])
]
pred_col = cand[0]  # 보통 한 개만 나옵니다.

# ③ RMSE와 R² 계산
from sklearn.metrics import mean_squared_error, r2_score

rmse = mean_squared_error(
    val_pred['Congestion'], 
    val_pred[pred_col], 
    squared=False
)
r2   = r2_score(
    val_pred['Congestion'], 
    val_pred[pred_col]
)

# ④ 결과 저장 및 출력
metrics.append({
    'Line':      line,
    'Model':     model_name,
    'RMSE':      rmse,
    'R2':        r2
})
print(f"  → Line {line} | 모델: {model_name} | RMSE: {rmse:.4f} | R²: {r2:.4f}")


🎯 [Line 7] AutoML 시작
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Using GPU Device: Intel(R) Iris(R) Xe Graphics, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 0.500000
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of 

KeyboardInterrupt: 


KeyboardInterrupt

