In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [63]:
# Load the filtered slip records and parse the slip-date column in one chain.
# Values that do not match the YYYY-MM-DD format are coerced to NaT instead of raising.
df = pd.read_csv("/work/app/data/input/filtered_result.csv").assign(
    **{
        "伝票日付": lambda frame: pd.to_datetime(
            frame["伝票日付"], format="%Y-%m-%d", errors="coerce"
        )
    }
)
df

Unnamed: 0,伝票日付,正味重量,品名
0,2020-01-06,470.0,混合廃棄物B
1,2020-01-06,390.0,その他
2,2020-01-06,120.0,混合廃棄物A
3,2020-01-06,1720.0,混合廃棄物B
4,2020-01-06,320.0,選別
...,...,...,...
210744,2025-05-26,350.0,混合廃棄物A
210745,2025-05-26,20.0,ﾀｲﾔ(ﾎｲﾙ無)
210746,2025-05-26,450.0,混合廃棄物A
210747,2025-05-26,430.0,混合廃棄物A


In [64]:
# === 必要なライブラリ ===
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import jpholiday
from lightgbm import early_stopping, log_evaluation

# === 補助関数 ===

# --- 祝日フラグ判定関数 ---
def make_holiday_flag(date):
    """Return 1 if ``date`` counts as a day off, otherwise 0.

    A date is a day off when ``jpholiday.is_holiday(date)`` is truthy or when
    ``date.weekday()`` is 5 or greater (Saturday/Sunday for datetime-like
    objects).
    """
    if jpholiday.is_holiday(date):
        return 1
    return 1 if date.weekday() >= 5 else 0

# --- 品目分類関数 ---
def classify_item(item: str) -> str:
    """Map a raw product name onto one of three coarse categories.

    Args:
        item: Product name. Matching is by substring, so any name that
            contains "混合廃棄物A" / "混合廃棄物B" is classified accordingly.

    Returns:
        "混合A", "混合B", or "その他" for everything else.
    """
    # A missing product name reaches this function as float NaN (or None) when
    # it is applied over a pandas column; `in` would raise TypeError on those,
    # so treat any non-string value as "other".
    if not isinstance(item, str):
        return "その他"
    if "混合廃棄物A" in item:
        return "混合A"
    if "混合廃棄物B" in item:
        return "混合B"
    return "その他"


## 特徴量生成

In [65]:

def add_previous_year_feature(df_pivot: pd.DataFrame) -> pd.DataFrame:
    """Build the "前年同期重量" feature: the total weight one year earlier.

    For every date in ``df_pivot`` the row-wise sum (over all columns) of the
    row dated exactly one year before is looked up. Dates with no
    counterpart one year earlier get 0.

    Args:
        df_pivot: Frame indexed by date (DatetimeIndex) whose columns are all
            weights to be summed.

    Returns:
        A single-column frame ("前年同期重量") with one row per row of
        ``df_pivot``, sorted by date.
    """
    # Row totals in chronological order; sorting first makes the de-duplication
    # below deterministic.
    daily_total = df_pivot.sum(axis=1).sort_index()

    # Re-label each total with the date one year later, i.e. the date of the
    # row it should be attached to.
    shifted = daily_total.copy()
    shifted.index = shifted.index + pd.DateOffset(years=1)

    # Adding one year maps both Feb 28 and Feb 29 of a leap year onto Feb 28,
    # which would give two candidates for that date and duplicate the row when
    # aligned. Keep the first (the true Feb 28 value) so the output has exactly
    # one row per input row.
    shifted = shifted[~shifted.index.duplicated(keep="first")]

    # Align onto the current dates; dates without a previous-year value are 0.
    aligned = shifted.reindex(df_pivot.index.sort_values()).fillna(0)

    return aligned.to_frame(name="前年同期重量")


def compute_previous_year_ratio(df_pivot: pd.DataFrame) -> pd.DataFrame:
    """Express the previous-year weight of each date as a ratio to a baseline.

    The baseline is the mean row total over the rows of ``df_pivot`` whose
    date is exactly one year before some date present in the index. Ratios
    that come out as NaN are replaced by 0.

    Returns:
        A single-column frame ("前年同期比率") indexed like the output of
        ``add_previous_year_feature``.
    """
    # Previous-year total weight per date.
    prev_weight = add_previous_year_feature(df_pivot)["前年同期重量"]

    # Select the rows that act as "previous year" for some other row.
    one_year_back = df_pivot.index - pd.DateOffset(years=1)
    is_prev_year_row = df_pivot.index.isin(one_year_back)
    baseline = df_pivot.loc[is_prev_year_row].sum(axis=1).mean()

    # ratio = previous-year weight / mean previous-year weight
    ratio = (prev_weight / baseline).fillna(0)

    return ratio.to_frame(name="前年同期比率")


def add_holiday_adjacent_flag(dates: pd.Index, holiday_func=None) -> pd.Series:
    """Flag dates that are a day off or sit directly next to one.

    Adjacency is positional: the "previous" and "next" day are the neighbouring
    entries of ``dates``, not necessarily the neighbouring calendar days.

    Args:
        dates: Index of datetime-like values, in the order they should be
            compared.
        holiday_func: Optional callable mapping one date to a truthy/falsy
            (or 1/0) day-off flag. Defaults to ``make_holiday_flag``.

    Returns:
        Integer Series named "連休前後フラグ", indexed by ``dates``; 1 where the
        date itself, its predecessor or its successor is a day off, else 0.
    """
    if holiday_func is None:
        holiday_func = make_holiday_flag

    day_off = np.array([int(holiday_func(d)) for d in dates], dtype=int)

    # Neighbour flags, padded with 0 at the ends. Unlike a loop over
    # range(1, n - 1), this also evaluates the first and last positions, which
    # simply have one neighbour fewer, and it copes with empty or
    # single-element input.
    prev_off = np.zeros_like(day_off)
    prev_off[1:] = day_off[:-1]
    next_off = np.zeros_like(day_off)
    next_off[:-1] = day_off[1:]

    adjacent_flags = (
        (day_off == 1) | (prev_off == 1) | (next_off == 1)
    ).astype(int)

    return pd.Series(adjacent_flags, index=dates, name="連休前後フラグ")

def add_month_start_end_flags(dates: pd.Index) -> pd.DataFrame:
    """Return start-of-month and end-of-month indicator columns for ``dates``.

    "月初フラグ" is 1 for the 1st-3rd day of a month; "月末フラグ" is 1 when at
    most two days remain in the month (the last three days). Both are 0
    otherwise. The result is indexed by ``dates``.
    """
    day_of_month = dates.day
    days_remaining = dates.days_in_month - day_of_month

    return pd.DataFrame(
        {
            "月初フラグ": (day_of_month <= 3).astype(int),
            "月末フラグ": (days_remaining <= 2).astype(int),
        },
        index=dates,
    )


In [66]:
def prepare_features(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Turn raw slip records into a feature matrix and a next-row target.

    Records are aggregated to one row per date with one weight column per
    item category, then calendar, lag, previous-year and holiday-adjacency
    features are appended. The target is the total weight of the following
    row (the next date present in the data).

    Args:
        df: Raw records with the columns "伝票日付", "品名" and "正味重量".
            The input frame is not modified.

    Returns:
        ``(X, y)`` where ``X`` holds the feature columns and ``y`` is the
        "翌日合計" target; rows containing any NaN (such as the first and last
        date) are dropped.
    """
    records = df.copy()
    records["伝票日付"] = pd.to_datetime(records["伝票日付"])
    records["品目分類"] = records["品名"].apply(classify_item)

    # One row per date, one column per category; absent combinations become 0.
    df_pivot = (
        records.groupby(["伝票日付", "品目分類"])["正味重量"]
        .sum()
        .unstack("品目分類")
        .fillna(0)
        .sort_index()
    )
    daily_total = df_pivot.sum(axis=1)

    # Calendar features plus lag-1 of the total and of each category.
    df_feat = df_pivot.copy()
    df_feat["曜日"] = df_feat.index.dayofweek
    df_feat["月"] = df_feat.index.month
    df_feat["祝日フラグ"] = df_feat.index.map(make_holiday_flag)
    df_feat["前日合計"] = daily_total.shift(1)
    for category in df_pivot.columns:
        df_feat[f"前日_{category}"] = df_pivot[category].shift(1)

    # Total weight of the same date one year earlier.
    previous_year = add_previous_year_feature(df_pivot)
    df_feat = df_feat.join(previous_year, how="left")
    df_feat["前年同期重量"] = df_feat["前年同期重量"].fillna(0)

    # Start-of-month / end-of-month indicators.
    month_flags = add_month_start_end_flags(df_feat.index)
    df_feat = df_feat.join(month_flags, how="left")

    # Holiday-adjacency flag; must be attached while the date is still the index.
    df_feat["連休前後フラグ"] = add_holiday_adjacent_flag(df_feat.index)

    # Target: total weight of the following row.
    df_feat["翌日合計"] = daily_total.shift(-1)

    # Drop incomplete rows last, then move the date out of the index.
    df_feat = df_feat.dropna().reset_index()

    y = df_feat["翌日合計"]
    X = df_feat.drop(columns=["伝票日付", "翌日合計"])

    return X, y

## モデル処理

In [67]:

# --- Fold単位の学習・評価 ---
def train_and_evaluate_fold(fold, X_train, X_test, y_train, y_test):
    """Train a LightGBM regressor on one fold and report its metrics.

    The test split is also passed to LightGBM as the validation set that
    drives early stopping.

    Args:
        fold: Zero-based fold number (printed as ``fold + 1``).
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target.

    Returns:
        ``(mae, r2_train, r2_test)``: test MAE, R² on the training split and
        R² on the test split.
    """
    # Hyper-parameters.
    params = {
        "objective": "regression",
        "metric": "mae",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 31,
        "max_depth": 7,
        "min_data_in_leaf": 20,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
    }

    # LightGBM datasets; the validation set reuses the training set's binning.
    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_test, y_test, reference=train_set)

    # Train for up to 2000 rounds, stopping after 100 rounds without improvement.
    training_callbacks = [
        early_stopping(stopping_rounds=100),
        log_evaluation(period=100),
    ]
    booster = lgb.train(
        params,
        train_set,
        num_boost_round=2000,
        valid_sets=[train_set, valid_set],
        callbacks=training_callbacks,
    )

    # Fit quality on the training split.
    r2_train = r2_score(y_train, booster.predict(X_train))

    # Error and fit quality on the held-out split.
    test_pred = booster.predict(X_test)
    mae = mean_absolute_error(y_test, test_pred)
    r2_test = r2_score(y_test, test_pred)

    # Per-fold report.
    report_lines = (
        f"\n📂 Fold {fold+1}",
        f"📊 MAE:       {mae:.2f} kg",
        f"📊 Train R²:  {r2_train:.4f}",
        f"📊 Test  R²:  {r2_test:.4f}",
    )
    for line in report_lines:
        print(line)

    return mae, r2_train, r2_test

# --- Fold平均結果出力 ---
def print_fold_summary(mae_list, r2_train_list, r2_test_list):
    """Print the across-fold mean of MAE, train R² and test R².

    Each argument is a sequence holding one value per fold. A trailing blank
    line is printed after the three summary lines.
    """
    mean_mae = np.mean(mae_list)
    mean_r2_train = np.mean(r2_train_list)
    mean_r2_test = np.mean(r2_test_list)

    print(f"\n✅ 全Fold平均MAE: {mean_mae:.2f} kg")
    print(f"✅ 全Fold平均Train R²: {mean_r2_train:.4f}")
    print(f"✅ 全Fold平均Test  R²: {mean_r2_test:.4f}")
    print("")


## メイン処理

In [68]:

# === Main driver ===
# Builds the feature matrix, runs time-ordered cross-validation and prints
# per-fold and averaged metrics.

# --- Data loading (the raw `df` must already exist in the kernel) ---
# Example: df = pd.read_csv("your_data.csv")
# This cell assumes `df` was defined by an earlier cell.

X, y = prepare_features(df)

# --- TimeSeriesSplit configuration: 5 folds ---
tscv = TimeSeriesSplit(n_splits=5)

# --- Per-fold metric accumulators ---
fold_mae_list = []
fold_r2_train_list = []
fold_r2_test_list = []

# --- Train and evaluate one model per fold ---
for fold, (train_index, test_index) in enumerate(tscv.split(X)):
    # Positional row selection for this fold's train/test split.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    mae, r2_train, r2_test = train_and_evaluate_fold(fold, X_train, X_test, y_train, y_test)
    
    fold_mae_list.append(mae)
    fold_r2_train_list.append(r2_train)
    fold_r2_test_list.append(r2_test)

# --- Print the across-fold averages ---
print_fold_summary(fold_mae_list, fold_r2_train_list, fold_r2_test_list)


Training until validation scores don't improve for 100 rounds
[100]	training's l1: 8653.35	valid_1's l1: 13314.2
Early stopping, best iteration is:
[44]	training's l1: 10689.8	valid_1's l1: 12647.7

📂 Fold 1
📊 MAE:       12647.65 kg
📊 Train R²:  0.6807
📊 Test  R²:  0.5018
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 8255.34	valid_1's l1: 15280.9
Early stopping, best iteration is:
[54]	training's l1: 9989.23	valid_1's l1: 14981.4

📂 Fold 2
📊 MAE:       14981.43 kg
📊 Train R²:  0.6985
📊 Test  R²:  0.5151
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 8700.21	valid_1's l1: 13070
Early stopping, best iteration is:
[55]	training's l1: 10201.8	valid_1's l1: 12705.3

📂 Fold 3
📊 MAE:       12705.32 kg
📊 Train R²:  0.7097
📊 Test  R²:  0.5767
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 8670.91	valid_1's l1: 11660.4
Early stopping, best iteration is:
[60]	training's l1: 9906.24	valid_1