In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# 1. Data preparation
# Load the training and test tables from the current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# The same predictor columns are selected from both tables, so the list is
# written once; 'Calories' is the regression target and is read from train only.
feature_columns = ['Sex', 'Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

X = train[feature_columns]
y = train['Calories']
X_test = test[feature_columns]

# 2. Preprocessing: one-hot encode the categorical column and standardize the
#    numeric columns, combined in a single ColumnTransformer.
categorical_features = ['Sex']
numeric_features = ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp']

# drop='first' omits the indicator column of the first category;
# handle_unknown='ignore' asks the encoder not to raise on categories that were
# not seen during fit.
# NOTE(review): some scikit-learn releases reject combining `drop` with
# handle_unknown='ignore' — confirm against the installed version.
sex_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
numeric_scaler = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', sex_encoder, categorical_features),
        ('num', numeric_scaler, numeric_features),
    ]
)

# 3. Model definition (starting with CatBoostRegressor)
# iterations, learning_rate and depth are only starting values: the
# hyperparameter grid defined further down overrides all three for every
# candidate it evaluates.
model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    eval_metric='RMSE',
    random_seed=42,
    # This estimator is cloned into a grid search that runs its fits in parallel
    # worker processes (n_jobs=-1). CatBoost's default is to use every core for
    # each model, so pin each model to one thread to keep the workers from
    # oversubscribing the CPU.
    thread_count=1,
    # Stop CatBoost from writing log/snapshot files, so concurrent workers do
    # not all write into the same default training directory.
    allow_writing_files=False,
    # Silence per-iteration training logs; with many concurrent fits they only
    # flood the notebook output.
    verbose=0
)

# 4. Pipeline assembly
# Step names matter: 'regressor' is the prefix used by the 'regressor__*' keys
# of the hyperparameter grid, so the names must stay exactly as they are.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', model),
]
pipeline = Pipeline(steps=pipeline_steps)

# 5. Cross-validation and hyperparameter tuning
# 3 depths x 3 learning rates x 3 iteration counts = 27 candidates, each key
# addressing a parameter of the pipeline's 'regressor' step.
param_grid = dict(
    regressor__depth=[4, 6, 8],
    regressor__learning_rate=[0.01, 0.05, 0.1],
    regressor__iterations=[300, 500, 700],
)


def _rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# greater_is_better=False makes the scorer report the negated RMSE, so that a
# larger score still means a better model from the search's point of view.
scorer = make_scorer(_rmse, greater_is_better=False)

# Shuffled 5-fold split with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X, y)

# best_score_ holds the negated RMSE, so flip the sign back before printing.
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score (RMSE):", -grid_search.best_score_)

# 6. Predict on the test data and write the submission file
# predict() on the fitted search delegates to the estimator refit with the
# best parameters found.
preds = grid_search.predict(X_test)

# Attach the predictions to the test table, then export only the identifier
# and the predicted target, without the DataFrame index.
test['Calories'] = preds
submission_columns = ['id', 'Calories']
test.loc[:, submission_columns].to_csv('submission.csv', index=False)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


KeyboardInterrupt: 