# お弁当販売数予測 - bento9_opus_talk

このノートブックは marimo から変換されました。

## 初期化

In [None]:
# 全モジュールのインポート
import polars as pl
import pandas as pd
import altair as alt
from pathlib import Path
import numpy as np
import lightgbm as lgb
import re
import jpholiday
from datetime import datetime
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Locations of the training and test CSV files
data_path_train = Path("data/bento_train.csv")
data_path_test = Path("data/bento_test.csv")

# Read both files the same way; the "--" placeholder is parsed as null
df_train, df_test = (
    pl.read_csv(csv_path, null_values=["--"])
    for csv_path in (data_path_train, data_path_test)
)

# Report the dataset sizes and the evaluation metric
print(f"訓練データ: {df_train.shape[0]} 行 × {df_train.shape[1]} 列")
print(f"テストデータ: {df_test.shape[0]} 行 × {df_test.shape[1]} 列")
print(f"評価指標: RMSE（Root Mean Squared Error）")

## 1. 基本統計とデータ理解

In [None]:
# Summary statistics of the training data (displayed as the cell output)
train_stats = df_train.describe()
train_stats

### 欠損値確認

In [None]:
# Missing-value overview: one row per column that has at least one null
null_counts = df_train.null_count()
total_rows = df_train.shape[0]

# null_count() yields a single-row frame; read that row once, in column order
_missing_per_column = null_counts.row(0)

null_info = pl.DataFrame({
    "カラム": null_counts.columns,
    "欠損数": list(_missing_per_column),
    "欠損率(%)": [
        round(n_missing * 100 / total_rows, 2)
        for n_missing in _missing_per_column
    ],
}).filter(pl.col("欠損数") > 0)

null_info

In [None]:
# Histogram of the target variable y (number of lunches sold)
chart_y = (
    alt.Chart(df_train.to_pandas())
    .mark_bar()
    .encode(
        x=alt.X("y:Q", bin=alt.Bin(maxbins=30), title="販売数"),
        y=alt.Y("count()", title="頻度"),
        tooltip=["count()"],
    )
    .properties(width=600, height=300, title="販売数（y）の分布")
)
chart_y

In [None]:
# Box plot of sales per day of the week, shown in calendar order
week_order = ["月", "火", "水", "木", "金", "土", "日"]

chart_week = (
    alt.Chart(df_train.to_pandas())
    .mark_boxplot()
    .encode(
        x=alt.X("week:N", title="曜日", sort=week_order),
        y=alt.Y("y:Q", title="販売数"),
        tooltip=["week", "y"],
    )
    .properties(width=600, height=300, title="曜日別販売数の分布")
)
chart_week

## 2. 特徴量エンジニアリング

In [None]:
# Date features
def add_date_features(df):
    """Parse the ``datetime`` column and derive calendar features from it.

    Args:
        df: polars DataFrame with a string ``datetime`` column that can be
            parsed with the ``%Y-%m-%d`` format.

    Returns:
        A new DataFrame with the added columns ``date`` (``pl.Date``),
        ``year``, ``month``, ``day`` and ``weekday``.  ``weekday`` is
        0-based: 0 = Monday ... 6 = Sunday.
    """
    parsed = df.with_columns(
        pl.col("datetime").str.strptime(pl.Date, "%Y-%m-%d").alias("date"),
    )
    return parsed.with_columns([
        pl.col("date").dt.year().alias("year"),
        pl.col("date").dt.month().alias("month"),
        pl.col("date").dt.day().alias("day"),
        # polars' dt.weekday() is ISO-based (1 = Monday ... 7 = Sunday);
        # shift by one so the values follow the 0 = Monday convention.
        (pl.col("date").dt.weekday() - 1).alias("weekday"),
    ])

# Derive the same date features for both the training and the test set
df_train_fe = add_date_features(df_train)
df_test_fe = add_date_features(df_test)

In [None]:
# Holiday flag
def is_holiday(date_str):
    """Return 1 if ``date_str`` is a Japanese public holiday, otherwise 0.

    Args:
        date_str: Date as a ``%Y-%m-%d`` string.

    Returns:
        1 when ``jpholiday`` reports the date as a holiday, 0 otherwise.
        Values that cannot be parsed as a date (wrong format, ``None``)
        are treated as non-holidays and also yield 0.

    Errors raised by the holiday lookup itself are deliberately not
    swallowed, so a broken or missing ``jpholiday`` cannot silently turn
    the whole feature into zeros.
    """
    try:
        dt = datetime.strptime(date_str, "%Y-%m-%d")
    except (TypeError, ValueError):
        # Best effort: a missing or malformed date counts as "not a holiday"
        return 0
    return 1 if jpholiday.is_holiday(dt) else 0

# One shared expression: evaluate is_holiday on every "datetime" string and
# store the 0/1 result as an Int64 column named "is_holiday"
_holiday_flag = (
    pl.col("datetime")
    .map_elements(is_holiday, return_dtype=pl.Int64)
    .alias("is_holiday")
)

df_train_fe2 = df_train_fe.with_columns(_holiday_flag)
df_test_fe2 = df_test_fe.with_columns(_holiday_flag)

In [None]:
# Integer encoding of the categorical columns
week_map = {"月": 0, "火": 1, "水": 2, "木": 3, "金": 4, "土": 5, "日": 6}
weather_map = {"快晴": 0, "晴れ": 1, "薄曇": 2, "曇": 3, "雨": 4, "雪": 5, "雷電": 6}

# The same two expressions are applied to the training and the test frame
_category_encoders = [
    pl.col("week").replace(week_map).cast(pl.Int64).alias("week_encoded"),
    pl.col("weather").replace(weather_map).cast(pl.Int64).alias("weather_encoded"),
]

df_train_fe3 = df_train_fe2.with_columns(_category_encoders)
df_test_fe3 = df_test_fe2.with_columns(_category_encoders)

In [None]:
# Convert soldout and payday to integer columns; a missing payday becomes 0
_flag_casts = [
    pl.col("soldout").cast(pl.Int64),
    pl.col("payday").fill_null(value=0).cast(pl.Int64),
]

df_train_fe4 = df_train_fe3.with_columns(_flag_casts)
df_test_fe4 = df_test_fe3.with_columns(_flag_casts)

In [None]:
# Missing-value imputation using training-set statistics only.
# Leak prevention: the test set is filled with the TRAINING medians.

# Medians computed on the training data
train_kcal_median = df_train_fe4.get_column("kcal").median()
train_precipitation_median = df_train_fe4.get_column("precipitation").median()
train_temp_median = df_train_fe4.get_column("temperature").median()

# NOTE(review): "--" is read as null when the CSVs are loaded. If that
# placeholder means "no precipitation" in this dataset, 0 may be a better
# fill for precipitation than the median -- confirm against the data spec.
_median_fills = [
    pl.col("kcal").fill_null(value=train_kcal_median),
    pl.col("precipitation").fill_null(value=train_precipitation_median),
    pl.col("temperature").fill_null(value=train_temp_median),
]

# Apply the identical fills to the training and the test data
df_train_filled = df_train_fe4.with_columns(_median_fills)
df_test_filled = df_test_fe4.with_columns(_median_fills)

### 欠損値処理後の確認

In [None]:
# Verify the imputation: total number of nulls left across all columns
train_nulls_after = sum(df_train_filled.null_count().row(0))
test_nulls_after = sum(df_test_filled.null_count().row(0))

print(f"訓練データ欠損数: {train_nulls_after}")
print(f"テストデータ欠損数: {test_nulls_after}")

In [None]:
# Feature selection: the columns handed to the models, in this order
feature_cols = [
    # calendar
    "year",
    "month",
    "day",
    "weekday",
    "is_holiday",
    # encoded categoricals
    "week_encoded",
    "weather_encoded",
    # flags
    "soldout",
    "payday",
    # numeric measurements
    "kcal",
    "precipitation",
    "temperature",
]

# Convert to pandas for scikit-learn / LightGBM
X_train = df_train_filled[feature_cols].to_pandas()
y_train = df_train_filled["y"].to_pandas()
X_test = df_test_filled[feature_cols].to_pandas()

## 3. モデリング準備完了

In [None]:
# Report the shapes of the model inputs and the names of the features in use
print(f"訓練データ: {X_train.shape[0]} samples × {X_train.shape[1]} features")
print(f"テストデータ: {X_test.shape[0]} samples × {X_test.shape[1]} features")
print(f"特徴量リスト: {', '.join(feature_cols)}")

## 4. モデリング - Ridge回帰

In [None]:
# Ridge regression, evaluated with a 5-fold time-series split
tscv = TimeSeriesSplit(n_splits=5)
ridge = Ridge(alpha=1.0)

ridge_cv_scores = []
for fit_rows, eval_rows in tscv.split(X_train):
    # Fit on the earlier rows of the fold, score on the later ones
    ridge.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    fold_pred = ridge.predict(X_train.iloc[eval_rows])
    fold_mse = mean_squared_error(y_train.iloc[eval_rows], fold_pred)
    ridge_cv_scores.append(np.sqrt(fold_mse))

# Refit on the complete training set
ridge.fit(X_train, y_train)

### Ridge回帰の交差検証結果

In [None]:
# Mean and spread of the per-fold Ridge RMSE values
_ridge_scores = np.asarray(ridge_cv_scores)
ridge_mean_rmse = _ridge_scores.mean()
ridge_std_rmse = _ridge_scores.std()

print(f"CV RMSE (平均): {ridge_mean_rmse:.2f}")
print(f"CV RMSE (標準偏差): {ridge_std_rmse:.2f}")
print(f"各Fold: {[f'{s:.2f}' for s in ridge_cv_scores]}")

## 5. モデリング - LightGBM

In [None]:
# Hyperparameter tuning with Optuna
def objective(trial):
    """Optuna objective: mean time-series CV RMSE of a LightGBM regressor.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean RMSE over the 5 ``TimeSeriesSplit`` folds of the module-level
        ``X_train`` / ``y_train`` (lower is better).
    """
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM only performs bagging when subsample_freq (bagging_freq)
        # is > 0; without it the tuned 'subsample' value has no effect.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }

    # NOTE(review): TimeSeriesSplit assumes the rows of X_train are in
    # chronological order -- confirm the CSV is sorted by date.
    tscv_inner = TimeSeriesSplit(n_splits=5)
    cv_scores = []

    for train_idx, val_idx in tscv_inner.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        train_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        # Up to 1000 rounds; stop once the validation RMSE has not improved
        # for 50 consecutive rounds.
        model = lgb.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
        )

        y_pred = model.predict(X_val, num_iteration=model.best_iteration)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        cv_scores.append(rmse)

    return np.mean(cv_scores)

# Run the optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

# Combine the tuned values with the fixed settings used during the search so
# that later training runs use exactly the configuration that was evaluated.
best_params = study.best_params
best_params.update({
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    # Keep bagging enabled, otherwise the tuned 'subsample' would be ignored
    'subsample_freq': 1
})

In [None]:
# Re-evaluate with the best parameters
tscv = TimeSeriesSplit(n_splits=5)
lgb_cv_scores = []
# Early-stopped round count of every fold, reused for the final model below
lgb_best_iterations = []

for train_idx, val_idx in tscv.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

    model = lgb.train(
        best_params,
        train_data,
        num_boost_round=1000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

    y_pred = model.predict(X_val, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    lgb_cv_scores.append(rmse)
    lgb_best_iterations.append(model.best_iteration)

# Train the final model on all training data.
# There is no validation set here, so early stopping cannot be applied.
# Always running the full 1000 rounds would produce a different (typically
# overfitted) model than the early-stopped ones the CV score describes, so
# the average round count found during CV is used instead.
_usable_iterations = [it for it in lgb_best_iterations if it and it > 0]
if _usable_iterations:
    final_num_boost_round = max(1, int(round(np.mean(_usable_iterations))))
else:
    # No usable early-stopping information: fall back to the round limit
    final_num_boost_round = 1000

train_data_full = lgb.Dataset(X_train, label=y_train)
lgb_model = lgb.train(
    best_params,
    train_data_full,
    num_boost_round=final_num_boost_round
)

### LightGBMの交差検証結果

In [None]:
# Mean and spread of the per-fold LightGBM RMSE values
_lgb_scores = np.asarray(lgb_cv_scores)
lgb_mean_rmse = _lgb_scores.mean()
lgb_std_rmse = _lgb_scores.std()

print(f"CV RMSE (平均): {lgb_mean_rmse:.2f}")
print(f"CV RMSE (標準偏差): {lgb_std_rmse:.2f}")
print(f"各Fold: {[f'{s:.2f}' for s in lgb_cv_scores]}")
print()
print("最適化されたハイパーパラメータ:")

# Fixed settings are skipped; only the remaining entries are listed
_fixed_settings = {'objective', 'metric', 'verbosity', 'boosting_type'}
for k, v in best_params.items():
    if k in _fixed_settings:
        continue
    print(f"  - {k}: {v}")

## 6. モデル比較

In [None]:
# Table with the mean CV RMSE of each model
comparison_df = pl.DataFrame(
    {
        "Model": ["Ridge", "LightGBM"],
        "CV RMSE": [ridge_mean_rmse, lgb_mean_rmse],
    }
)

# Bar chart comparing the two models
comparison_chart = (
    alt.Chart(comparison_df.to_pandas())
    .mark_bar()
    .encode(
        x=alt.X("Model:N", title="モデル"),
        y=alt.Y("CV RMSE:Q", title="RMSE"),
        color=alt.Color("Model:N", legend=None),
        tooltip=["Model", alt.Tooltip("CV RMSE:Q", format=".2f")],
    )
    .properties(width=400, height=300, title="モデル性能比較（CV RMSE）")
)

comparison_chart

In [None]:
# Gain-based feature importance of the final LightGBM model
importance = lgb_model.feature_importance(importance_type='gain')
importance_df = pl.DataFrame(
    {
        "Feature": feature_cols,
        "Importance": importance,
    }
).sort("Importance", descending=True)

# Horizontal bar chart, most important feature first
importance_chart = (
    alt.Chart(importance_df.to_pandas())
    .mark_bar()
    .encode(
        x=alt.X("Importance:Q", title="重要度"),
        y=alt.Y("Feature:N", title="特徴量", sort="-x"),
        color=alt.Color("Importance:Q", legend=None, scale=alt.Scale(scheme="viridis")),
        tooltip=["Feature", alt.Tooltip("Importance:Q", format=".2f")],
    )
    .properties(width=500, height=400, title="LightGBM特徴量重要度")
)

importance_chart

## 7. 予測とSubmission生成

In [None]:
# Predict with LightGBM and floor the result at 0
# (a sales count cannot be negative)
y_pred = np.clip(lgb_model.predict(X_test), 0, None)

In [None]:
# Build submission.csv
# Date format: yyyy-m-d (single-digit values are not zero-padded); the
# date strings of the test data are written out unchanged
dates = df_test_filled["datetime"].to_list()

# Round every prediction to an integer
predictions_int = list(map(int, map(round, y_pred)))

submission_df = pd.DataFrame({"datetime": dates, "y": predictions_int})

# Save without a header row and without the index
submission_path = Path("../submission.csv")
submission_df.to_csv(submission_path, index=False, header=False)

print(f"Submission生成完了")
print(f"ファイルパス: {submission_path}")

### Submission内容のプレビュー

In [None]:
# Show the first 10 rows of the generated submission
submission_df.head(10)