In [39]:
import pandas as pd
import numpy as np
from pathlib import Path
import mlflow
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import optuna

# 既存と揃える
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("packet")

DATA_DIR = Path("data")
hist_file = DATA_DIR / "/Users/hiraokatatsuru/Library/Mobile Documents/com~apple~CloudDocs/postal-operation-shift-management-system/db/init/csv/postal_datas.csv"   # あなたの既存構成に合わせて修正


In [40]:
df = pd.read_csv(hist_file, parse_dates=["日付"])
df = df.sort_values("日付").reset_index(drop=True)

# ゆうパケット列が "ゆうパケット" である前提
df["target"] = df["ゆうパケット"]


In [41]:
import jpholiday

df["weekday"] = df["日付"].dt.weekday

# holiday 列がない場合は作成する
if "holiday" not in df.columns:
    df["holiday"] = df["日付"].apply(
        lambda d: 1 if (jpholiday.is_holiday(d) or d.weekday() >= 5) else 0
    )
else:
    df["holiday"] = df["holiday"].astype(int)
    
# 季節（Spring, Summer, Fall, Winter を 0-3 のカテゴリに）
df["month"] = df["日付"].dt.month
df["season"] = df["month"] % 12 // 3  # 0=冬,1=春,2=夏,3=秋
    
# ここで必ず数値化しておく
df["target"] = pd.to_numeric(df["ゆうパケット"], errors="coerce")

# そのあとでラグ・移動平均を作る
df["yupacket_lag_1"] = df["target"].shift(1)
df["yupacket_lag_7"] = df["target"].shift(7)
df["yupacket_rm7"]   = df["target"].rolling(7).mean()
def is_gw(d):
    """ゴールデンウィーク (4/29〜5/5)"""
    return (d.month == 4 and d.day >= 29) or (d.month == 5 and d.day <= 5)

def is_obon(d):
    """お盆 (8/13〜8/16)"""
    return (d.month == 8 and 13 <= d.day <= 16)

def is_nenmatsu(d):
    """年末年始 (12/29〜1/3)"""
    return (
        (d.month == 12 and d.day >= 29) or
        (d.month == 1  and d.day <= 3)
    )

df["is_gw"] = df["日付"].apply(lambda d: 1 if is_gw(d) else 0)
df["is_obon"] = df["日付"].apply(lambda d: 1 if is_obon(d) else 0)
df["is_nenmatsu"] = df["日付"].apply(lambda d: 1 if is_nenmatsu(d) else 0)
df = df.dropna().reset_index(drop=True)


In [42]:
n = len(df)
split_idx = int(n * 0.8)

train = df.iloc[:split_idx].copy()
valid = df.iloc[split_idx:].copy()

print("train:", train["日付"].min(), "〜", train["日付"].max(), ", len =", len(train))
print("valid:", valid["日付"].min(), "〜", valid["日付"].max(), ", len =", len(valid))


train: 2021-10-08 00:00:00 〜 2024-05-20 00:00:00 , len = 956
valid: 2024-05-21 00:00:00 〜 2025-01-15 00:00:00 , len = 240


In [43]:
df.head()

Unnamed: 0,日付,通常郵便,書留,ゆうパケット,レターパックライト,レターパックプラス,特定記録,ゆうパック,eパケット,EMS,...,weekday,holiday,month,season,yupacket_lag_1,yupacket_lag_7,yupacket_rm7,is_gw,is_obon,is_nenmatsu
0,2021-10-08,71000.0,1587.0,874.0,557.0,239.0,459.0,1806.0,170.0,258.0,...,4.0,0,10.0,3.0,1100.0,882.0,890.857143,0,0,0
1,2021-10-09,0.0,1849.0,726.0,344.0,148.0,0.0,1607.0,0.0,184.0,...,5.0,1,10.0,3.0,874.0,673.0,898.428571,0,0,0
2,2021-10-10,0.0,987.0,713.0,141.0,60.0,0.0,1284.0,0.0,160.0,...,6.0,1,10.0,3.0,726.0,898.0,872.0,0,0,0
3,2021-10-11,96000.0,428.0,719.0,352.0,151.0,542.0,1213.0,397.0,353.0,...,0.0,0,10.0,3.0,713.0,688.0,876.428571,0,0,0
4,2021-10-12,51000.0,740.0,924.0,438.0,188.0,374.0,1124.0,147.0,174.0,...,1.0,0,10.0,3.0,719.0,877.0,883.142857,0,0,0


In [44]:
# === セルC: モデル評価用関数 ===

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

def evaluate_features(feature_list, model_params=None, verbose=True):
    """
    feature_list: 使用する特徴量のカラム名リスト
    """
    if model_params is None:
        model_params = dict(
            n_estimators=300,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            n_jobs=-1,
        )
    
    X_train = train[feature_list]
    y_train = train["target"]
    X_valid = valid[feature_list]
    y_valid = valid["target"]
    
    model = XGBRegressor(**model_params)
    model.fit(X_train, y_train)
    
    pred_valid = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, pred_valid)
    
    if verbose:
        print("features:", feature_list)
        print("MAE(valid):", mae)
    
    return mae, model


In [45]:
# === セルD: 書留モデル ベースライン ===

BASE_FEATURES = ["weekday", "holiday"]  # まずは最低限

baseline_mae, baseline_model = evaluate_features(BASE_FEATURES)

print("===== BASELINE (ゆうパケ) =====")
print("features:", BASE_FEATURES)
print("MAE(valid):", baseline_mae)


features: ['weekday', 'holiday']
MAE(valid): 306.01219889322914
===== BASELINE (ゆうパケ) =====
features: ['weekday', 'holiday']
MAE(valid): 306.01219889322914


In [46]:
# === ゆうパケット用: 特徴量を1つずつ追加して比較 ===

CANDIDATE_FEATURES_PACKET = [
    "yupacket_lag_1",
    "yupacket_lag_7",
    "yupacket_rm7",
    "season",
    "is_gw",
    "is_obon",
    "is_nenmatsu",
]

results_packet = []

for f in CANDIDATE_FEATURES_PACKET:
    feats = ["weekday", "holiday"] + [f]
    mae, _ = evaluate_features(feats, verbose=False)
    results_packet.append({"features": feats, "MAE": mae})

results_packet_df = pd.DataFrame(results_packet)
display(results_packet_df.sort_values("MAE"))

Unnamed: 0,features,MAE
0,"[weekday, holiday, yupacket_lag_1]",171.565573
2,"[weekday, holiday, yupacket_rm7]",176.928601
1,"[weekday, holiday, yupacket_lag_7]",235.786456
5,"[weekday, holiday, is_obon]",301.063845
4,"[weekday, holiday, is_gw]",303.416307
6,"[weekday, holiday, is_nenmatsu]",305.232749
3,"[weekday, holiday, season]",323.098049


In [47]:
BEST_FEATURES_YUPACKET = ["weekday", "holiday", "yupacket_lag_1"]
print("暫定 BEST_FEATURES_YUPACKET:", BEST_FEATURES_YUPACKET)

暫定 BEST_FEATURES_YUPACKET: ['weekday', 'holiday', 'yupacket_lag_1']


In [48]:
# === セルE: ゆうパケット 複数特徴パターン比較 ===

FEATURE_SETS_PACKET = [
    ["weekday", "holiday", "yupacket_lag_1"],
    ["weekday", "holiday", "yupacket_lag_1", "yupacket_rm7"],
    ["weekday", "holiday", "yupacket_lag_1", "yupacket_rm7", "yupacket_lag_7"],
    ["weekday", "holiday", "yupacket_lag_1", "is_obon", "is_nenmatsu"],
    [
        "weekday", "holiday",
        "yupacket_lag_1", "yupacket_rm7",
        "is_obon", "is_nenmatsu",
    ],
]

results_packet2 = []

for feats in FEATURE_SETS_PACKET:
    mae, _ = evaluate_features(feats, verbose=False)
    results_packet2.append({"features": feats, "MAE": mae})

results_packet2_df = pd.DataFrame(results_packet2)
display(results_packet2_df.sort_values("MAE"))


Unnamed: 0,features,MAE
1,"[weekday, holiday, yupacket_lag_1, yupacket_rm7]",159.296628
4,"[weekday, holiday, yupacket_lag_1, yupacket_rm...",159.509804
2,"[weekday, holiday, yupacket_lag_1, yupacket_rm...",167.167689
3,"[weekday, holiday, yupacket_lag_1, is_obon, is...",168.752497
0,"[weekday, holiday, yupacket_lag_1]",171.565573


In [49]:
BEST_FEATURES_YUPACKET = [
    "weekday",
    "holiday",
    "yupacket_lag_1",
    "yupacket_rm7",
]

print(BEST_FEATURES_YUPACKET)

['weekday', 'holiday', 'yupacket_lag_1', 'yupacket_rm7']


In [56]:
# === レタパックライト用の特徴量作成 ===

df_lpl = df.sort_values("日付").reset_index(drop=True).copy()

# target = レタパライト
df_lpl["target"] = pd.to_numeric(df_lpl["レターパックライト"], errors="coerce")

# カレンダー
df_lpl["weekday"] = df_lpl["日付"].dt.weekday

import jpholiday
df_lpl["holiday"] = df_lpl["日付"].apply(
    lambda d: 1 if (jpholiday.is_holiday(d) or d.weekday() >= 5) else 0
).astype(int)

# ラグ・移動平均
df_lpl["lp_light_lag_1"] = df_lpl["target"].shift(1)
df_lpl["lp_light_lag_7"] = df_lpl["target"].shift(7)
df_lpl["lp_light_rm7"]   = df_lpl["target"].rolling(7).mean()

# シーズン系（必要なら）
df_lpl["is_obon"] = ((df_lpl["日付"].dt.month == 8) & (df_lpl["日付"].dt.day.between(10, 16))).astype(int)
df_lpl["is_nenmatsu"] = ((df_lpl["日付"].dt.month == 12) & (df_lpl["日付"].dt.day >= 25)).astype(int)
df_lpl["season"] = df_lpl["日付"].dt.month

# 学習に使う部分だけ
df_lpl_model = df_lpl.dropna().reset_index(drop=True)

print(df_lpl_model.columns)


Index(['日付', '通常郵便', '書留', 'ゆうパケット', 'レターパックライト', 'レターパックプラス', '特定記録', 'ゆうパック',
       'eパケット', 'EMS', '年賀組立', '年賀配達', 'target', 'weekday', 'holiday', 'month',
       'season', 'yupacket_lag_1', 'yupacket_lag_7', 'yupacket_rm7', 'is_gw',
       'is_obon', 'is_nenmatsu', 'lp_light_lag_1', 'lp_light_lag_7',
       'lp_light_rm7'],
      dtype='object')


In [57]:
# === セルB: レタパライト ベースライン ===

BASE_FEATURES_LPL = ["weekday", "holiday"]

baseline_mae_lpl, _ = evaluate_features(BASE_FEATURES_LPL)

print("===== BASELINE (レタパライト) =====")
print("features:", BASE_FEATURES_LPL)
print("MAE(valid):", baseline_mae_lpl)


features: ['weekday', 'holiday']
MAE(valid): 306.01219889322914
===== BASELINE (レタパライト) =====
features: ['weekday', 'holiday']
MAE(valid): 306.01219889322914


In [59]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

def evaluate_features_with_df(feature_list, df_source, model_params=None, verbose=True):
    if model_params is None:
        model_params = dict(
            n_estimators=300,
            max_depth=5,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            n_jobs=-1,
        )

    valid_days = 42  # 書留・ゆうパケと合わせる

    data = df_source[["target"] + feature_list].copy().dropna().reset_index(drop=True)

    train = data.iloc[:-valid_days]
    valid = data.iloc[-valid_days:]

    X_train = train[feature_list]
    y_train = train["target"]
    X_valid = valid[feature_list]
    y_valid = valid["target"]

    model = XGBRegressor(**model_params)
    model.fit(X_train, y_train)

    pred_valid = model.predict(X_valid)
    mae = mean_absolute_error(y_valid, pred_valid)

    if verbose:
        print("features:", feature_list)
        print("MAE(valid):", mae)

    return mae, model

In [61]:
CANDIDATE_FEATURES_LPL = [
    "lp_light_lag_1",
    "lp_light_lag_7",
    "lp_light_rm7",
    "season",
    "is_obon",
    "is_nenmatsu",
]

results_lpl = []

for f in CANDIDATE_FEATURES_LPL:
    feats = ["weekday", "holiday"] + [f]
    mae, _ = evaluate_features_with_df(feats, df_lpl_model, verbose=False)
    results_lpl.append({"features": feats, "MAE": mae})

results_lpl_df = pd.DataFrame(results_lpl)
display(results_lpl_df.sort_values("MAE"))


Unnamed: 0,features,MAE
0,"[weekday, holiday, lp_light_lag_1]",85.970699
2,"[weekday, holiday, lp_light_rm7]",88.489708
5,"[weekday, holiday, is_nenmatsu]",101.440667
1,"[weekday, holiday, lp_light_lag_7]",120.417858
4,"[weekday, holiday, is_obon]",133.271374
3,"[weekday, holiday, season]",134.710119


In [62]:
BEST_FEATURES_LPL = ["weekday", "holiday", "lp_light_lag_1"]

In [63]:
df_grp = df.sort_values("日付").reset_index(drop=True).copy()

df_grp["target"] = (
    pd.to_numeric(df_grp["ゆうパケット"], errors="coerce") +
    pd.to_numeric(df_grp["レターパックライト"], errors="coerce")
)

In [64]:
df_grp["weekday"] = df_grp["日付"].dt.weekday

import jpholiday
df_grp["holiday"] = df_grp["日付"].apply(
    lambda d: 1 if (jpholiday.is_holiday(d) or d.weekday() >= 5) else 0
).astype(int)

df_grp["grp_lag_1"] = df_grp["target"].shift(1)
df_grp["grp_rm7"]   = df_grp["target"].rolling(7).mean()
df_grp["grp_lag_7"] = df_grp["target"].shift(7)

df_grp_model = df_grp.dropna().reset_index(drop=True)

In [65]:
CANDIDATE_FEATURES_GRP = [
    "grp_lag_1",
    "grp_rm7",
    "grp_lag_7",
]

results_grp = []
for f in CANDIDATE_FEATURES_GRP:
    feats = ["weekday", "holiday"] + [f]
    mae, _ = evaluate_features_with_df(feats, df_grp_model, verbose=False)
    results_grp.append({"features": feats, "MAE": mae})

results_grp_df = pd.DataFrame(results_grp)
display(results_grp_df.sort_values("MAE"))

Unnamed: 0,features,MAE
0,"[weekday, holiday, grp_lag_1]",239.363403
1,"[weekday, holiday, grp_rm7]",243.353203
2,"[weekday, holiday, grp_lag_7]",367.463662
