# ランダムフォレストモデル（時系列分割・特徴量選択・ベイズ最適化・提出フォーマット厳守版）

In [ ]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import optuna
import matplotlib.pyplot as plt
import japanize_matplotlib

# Locate the processed datasets relative to the current working directory.
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT / 'data'
train = pd.read_csv(DATA_DIR / 'train_processed.csv')
test = pd.read_csv(DATA_DIR / 'test_processed.csv')
target_col = 'price_actual'
# Resolve the column that actually holds the labels: the named target when it
# exists, otherwise the last column of the training frame.
label_col = target_col if target_col in train.columns else train.columns[-1]
# Exclude the resolved label column (not only the named target) so that the
# last-column fallback can never leak the labels into the feature matrix.
drop_cols = ['time', label_col]
feature_cols = [col for col in train.columns if col not in drop_cols]
X = train[feature_cols]
y = train[label_col]


In [ ]:
# Seasonality of the target: summary statistics of the price per calendar month.
# NOTE(review): `.dt` requires a datetime-like column; confirm that
# pd.to_datetime produces one for the 'time' strings in this dataset.
train['time'] = pd.to_datetime(train['time'])
train['month'] = train['time'].dt.month
price_by_month = train.groupby('month')[target_col]
monthly_price = price_by_month.agg(['mean', 'std', 'min', 'max'])
print('月別価格統計:')
print(monthly_price)
# Report the months with the highest and lowest mean price.
monthly_mean = monthly_price['mean']
print('\n最も価格が高い月:', monthly_mean.idxmax(), f'(平均価格: {monthly_mean.max():.2f})')
print('最も価格が低い月:', monthly_mean.idxmin(), f'(平均価格: {monthly_mean.min():.2f})')


In [ ]:
# Hyperparameter search via Bayesian optimization: CV splitter and objective.
# NOTE(review): TimeSeriesSplit splits by row position, so this presumably
# relies on the rows of X being in chronological order — confirm upstream.
tscv = TimeSeriesSplit(n_splits=5)


def objective(trial):
    """Return the mean validation RMSE of a random forest over the CV folds.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``max_depth``,
            ``min_samples_split`` and ``min_samples_leaf``.

    Returns:
        The RMSE averaged over every fold produced by ``tscv.split(X)``.
    """
    # Keyword arguments are evaluated left to right, so the parameters are
    # suggested in the same order as they are listed here.
    params = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 300),
        max_depth=trial.suggest_int('max_depth', 5, 30),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 5),
        random_state=42,
    )

    def _fold_rmse(fit_idx, eval_idx):
        # Fit a fresh forest on the training part of the fold and score it
        # on the held-out part.
        forest = RandomForestRegressor(**params)
        forest.fit(X.iloc[fit_idx], y.iloc[fit_idx])
        predicted = forest.predict(X.iloc[eval_idx])
        return root_mean_squared_error(y.iloc[eval_idx], predicted)

    fold_scores = [_fold_rmse(fit_idx, eval_idx) for fit_idx, eval_idx in tscv.split(X)]
    return np.mean(fold_scores)

# Run the hyperparameter search, then fit the final model on all training data.
# The sampler is seeded so that repeated runs explore the same trials; the
# forests already use random_state=42, so without a sampler seed the search
# would be the only non-reproducible step.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)
print('Best params:', study.best_params)
print('Best CV RMSE:', study.best_value)
# best_params only contains the values suggested inside `objective`, so the
# fixed random_state has to be added back before building the final model.
best_params = {**study.best_params, 'random_state': 42}
best_model = RandomForestRegressor(**best_params)
best_model.fit(X, y)


In [ ]:
# Drop the features whose importance is at or below the 20th percentile,
# then refit the model on the remaining ones.
importances = best_model.feature_importances_
threshold = np.percentile(importances, 20)
selected_features = [f for f, imp in zip(feature_cols, importances) if imp > threshold]
if not selected_features:
    # The strict '>' removes every feature when none exceeds the percentile
    # (a single feature, or all importances tied). Keep the full feature set
    # rather than refitting on an empty frame.
    selected_features = list(feature_cols)
best_model.fit(X[selected_features], y)


In [ ]:
# Predict on the test set and write the submission file (strict format:
# two columns, no header row, no index column).
X_test = test.loc[:, selected_features]
test_pred = best_model.predict(X_test)
submission = test[['time']].assign(price_actual_pred=test_pred)
# Sanity check: the first cell must be the expected first timestamp string.
first_cell = submission.iat[0, 0]
assert first_cell == '2018-01-01 00:00:00+01:00', '1行1列目が要件を満たしません'
output_path = DATA_DIR / 'submission_random_forest_v2.csv'
submission.to_csv(output_path, index=False, header=False)
print('Saved: submission_random_forest_v2.csv')
