## 末尾500行のOV（不良品数）予測

- **目的**: `data/raw/kadai.xlsx` を使って、データの**最後の500行**の `OV`（不良品数）を予測する。
- **方針**: 時系列データを想定し、**末尾500行は学習に使わない**（未来情報リーク防止）。
- **モデル**: カウントデータ想定のため、まず **Poisson損失の勾配ブースティング**（`HistGradientBoostingRegressor(loss='poisson')`）を使用。
  - `lightgbm` が使える場合は `LGBMRegressor(objective='poisson')` に自動で切替。

このノートを上から順に実行すれば、末尾500行の予測結果が `data/processed/pred_last500.csv` に出力されます。

In [12]:
import numpy as np
import pandas as pd

from pathlib import Path

RANDOM_STATE = 42

DATA_PATH = Path("../data/raw/kadai.xlsx")
OUT_PATH = Path("../data/processed/pred_last500.csv")

raw = pd.read_excel(DATA_PATH)
print("shape:", raw.shape)
print("columns:", list(raw.columns)[:10], "...", list(raw.columns)[-5:])
raw.head(3)

shape: (2276, 86)
columns: ['process_end_time', 'final_mes_time', 'OV', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7'] ... ['X79', 'X80', 'X81', 'X82', 'X83']


Unnamed: 0,process_end_time,final_mes_time,OV,X1,X2,X3,X4,X5,X6,X7,...,X74,X75,X76,X77,X78,X79,X80,X81,X82,X83
0,2016-08-14 23:07:00,2016-09-24 04:22:00,111.2886,22.98,1.37,797.2,0.67,0.15,0.08,67.15,...,0.1,0.04,6.4,0.58,0.0,4.1,0.0,0.01,0.16,0.04
1,2016-08-14 23:07:00,2016-09-24 10:12:00,201.5617,22.45,9.4,803.8,0.68,0.15,0.08,56.11,...,0.1,0.04,6.4,0.58,0.0,4.1,0.0,0.01,0.16,0.04
2,2016-08-14 23:07:00,2016-09-24 14:11:00,111.941,22.03,1.14,594.52,0.64,0.14,0.08,51.57,...,0.1,0.04,6.4,0.58,0.0,4.1,0.0,0.01,0.16,0.04


In [None]:
# --- 前処理 / 特徴量作成 ---

df = raw.copy()

# datetime化（失敗したらNaT）
for c in ["process_end_time", "final_mes_time"]:
    if c in df.columns:
        df[c] = pd.to_datetime(df[c], errors="coerce")

# 時刻特徴量
if "process_end_time" in df.columns:
    df["pe_year"] = df["process_end_time"].dt.year
    df["pe_month"] = df["process_end_time"].dt.month
    df["pe_day"] = df["process_end_time"].dt.day
    df["pe_dow"] = df["process_end_time"].dt.dayofweek
    df["pe_hour"] = df["process_end_time"].dt.hour
    df["pe_minute"] = df["process_end_time"].dt.minute

if "final_mes_time" in df.columns:
    df["fm_year"] = df["final_mes_time"].dt.year
    df["fm_month"] = df["final_mes_time"].dt.month
    df["fm_day"] = df["final_mes_time"].dt.day
    df["fm_dow"] = df["final_mes_time"].dt.dayofweek
    df["fm_hour"] = df["final_mes_time"].dt.hour
    df["fm_minute"] = df["final_mes_time"].dt.minute

# 2つの時刻の差（分）
if "process_end_time" in df.columns and "final_mes_time" in df.columns:
    df["diff_minutes"] = (df["final_mes_time"] - df["process_end_time"]).dt.total_seconds() / 60.0

# 学習に直接使えないdatetime列は落とす
drop_cols = [c for c in ["process_end_time", "final_mes_time"] if c in df.columns]

# X（説明変数）とy（目的変数）
y = df["OV"].astype(float)
X = df.drop(columns=["OV"] + drop_cols)

# ほぼ数値想定（X1..X83 + 追加特徴量）。数値以外があれば落とす（必要なら後でOneHotに拡張）
non_numeric = [c for c in X.columns if not pd.api.types.is_numeric_dtype(X[c])]
if non_numeric:
    print("非数値列を除外します:", non_numeric)
    X = X.drop(columns=non_numeric)

print("X shape:", X.shape, " y shape:", y.shape)
print("y min/max:", float(np.nanmin(y)), float(np.nanmax(y)))
X.head(3)

X shape: (2276, 96)  y shape: (2276,)
y min/max: 25.71875 418.4056


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,pe_dow,pe_hour,pe_minute,fm_year,fm_month,fm_day,fm_dow,fm_hour,fm_minute,diff_minutes
0,22.98,1.37,797.2,0.67,0.15,0.08,67.15,9.86,37.23,0.94,...,6,23,7,2016,9,24,5,4,22,57915.0
1,22.45,9.4,803.8,0.68,0.15,0.08,56.11,5.27,33.98,0.96,...,6,23,7,2016,9,24,5,10,12,58265.0
2,22.03,1.14,594.52,0.64,0.14,0.08,51.57,6.15,10.44,0.94,...,6,23,7,2016,9,24,5,14,11,58504.0


In [14]:
# --- 末尾500行を予測（未来リーク防止のため学習から除外） ---
# 目標: RMSE <= 30（説明変数は少なめ）

N_TEST = 500
TOP_K_SENSORS = 12  # 使うセンサー数（X1..X83から上位のみ採用）

# OVの自己相関が強い前提で、OVラグ特徴（過去のみ）を使う
OV_LAGS = [1, 2, 3, 7, 14]
OV_ROLL_WINDOWS = [7, 30]

if len(df) <= N_TEST:
    raise ValueError(f"行数が {N_TEST} 以下です（len={len(df)}）。")

# センサー列（X1..X83）
sensor_cols = [c for c in [f"X{i}" for i in range(1, 84)] if c in df.columns]

# 時刻由来など（X に既に含まれている想定）
non_sensor_cols = [c for c in X.columns if c not in sensor_cols]

# 追加: センサーの集約特徴（少数の説明変数化）
X_base = X.copy()
if sensor_cols:
    X_base["sensor_mean"] = X_base[sensor_cols].mean(axis=1)
    X_base["sensor_std"] = X_base[sensor_cols].std(axis=1)
    X_base["sensor_min"] = X_base[sensor_cols].min(axis=1)
    X_base["sensor_max"] = X_base[sensor_cols].max(axis=1)

# 分割（末尾500行がテスト）
train_end = len(df) - N_TEST
X_train_base = X_base.iloc[:train_end].copy()
X_test_base = X_base.iloc[train_end:].copy()

y_train = y.iloc[:train_end].copy()
y_test = y.iloc[train_end:].copy()  # 評価用

# 目的変数は log1p を使って外れ値影響を緩和
logy = np.log1p(y.astype(float))
logy_train = logy.iloc[:train_end]

# --- センサーを少数に絞る（学習データのみで重要度算出） ---
selected_sensors = sensor_cols

# 欠損埋め（重要度算出のため）
med_base = X_train_base.median(numeric_only=True)
X_train_base_f = X_train_base.fillna(med_base)

try:
    from lightgbm import LGBMRegressor

    tmp = LGBMRegressor(
        objective="regression",
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=31,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    tmp.fit(X_train_base_f, logy_train)

    # sensor_cols の重要度だけで上位を採用
    imp = pd.Series(tmp.feature_importances_, index=X_train_base_f.columns)
    if sensor_cols:
        selected_sensors = (
            imp.loc[[c for c in sensor_cols if c in imp.index]]
            .sort_values(ascending=False)
            .head(TOP_K_SENSORS)
            .index
            .tolist()
        )

    print("selected sensors:", selected_sensors)
except Exception as e:
    # LightGBMが無い/使えない場合でも「少数センサー」にする
    selected_sensors = sensor_cols[:TOP_K_SENSORS]
    print("LightGBMでのセンサー選択をスキップ:", repr(e))
    print("selected sensors (fallback):", selected_sensors)

use_cols = selected_sensors + [c for c in non_sensor_cols if c in X_base.columns] + [
    "sensor_mean",
    "sensor_std",
    "sensor_min",
    "sensor_max",
]
use_cols = [c for c in dict.fromkeys(use_cols) if c in X_base.columns]  # 重複排除 + 存在チェック

X_train_base = X_train_base[use_cols]
X_test_base = X_test_base[use_cols]

# 欠損埋め（学習基準）
med = X_train_base.median(numeric_only=True)
X_train_base_f = X_train_base.fillna(med)
X_test_base_f = X_test_base.fillna(med)

# --- 学習用にOVラグ特徴を追加（trainは真の過去OVのみ） ---
X_train_feat = X_train_base_f.copy()
for l in OV_LAGS:
    X_train_feat[f"ov_lag{l}"] = logy.shift(l).iloc[:train_end].values

for w in OV_ROLL_WINDOWS:
    X_train_feat[f"ov_roll_mean{w}"] = logy.shift(1).rolling(w).mean().iloc[:train_end].values
    X_train_feat[f"ov_roll_std{w}"] = logy.shift(1).rolling(w).std().iloc[:train_end].values

# 欠損（系列の最初）を中央値で埋める
X_train_feat = X_train_feat.fillna(X_train_feat.median(numeric_only=True))

# 時系列のまま validation（trainの末尾を検証に回す）
VALID_SIZE = min(200, max(50, int(0.1 * len(X_train_feat))))
X_tr = X_train_feat.iloc[:-VALID_SIZE]
X_va = X_train_feat.iloc[-VALID_SIZE:]

y_tr = logy_train.iloc[:-VALID_SIZE]
y_va = logy_train.iloc[-VALID_SIZE:]

# --- 本モデル（log1p(OV)回帰） ---
model_name = None
try:
    from lightgbm import LGBMRegressor

    model = LGBMRegressor(
        objective="regression",
        n_estimators=5000,
        learning_rate=0.02,
        num_leaves=63,
        min_data_in_leaf=30,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    import lightgbm as lgb

    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)],
    )
    model_name = "LightGBM(regression, log1p + OV lags + few sensors)"
except Exception as e:
    print("LightGBM を使えません:", repr(e))
    model = None

if model is None:
    # ここまででLightGBMが無い環境は稀だが、念のため平均ベースライン
    pred = np.full(shape=len(X_test_base_f), fill_value=float(np.nanmean(y_train)))
    model_name = "baseline(mean)"
else:
    # --- 逐次予測（テスト側のOVラグは真値を使わず、予測で更新） ---
    history = list(logy_train.values)  # 学習期間のlog1p(OV)
    preds_log = []

    # fillna用の中央値は1回だけ計算
    train_feat_median = X_train_feat.median(numeric_only=True)

    for i in range(len(X_test_base_f)):
        row = X_test_base_f.iloc[i : i + 1].copy()

        # lag
        for l in OV_LAGS:
            row[f"ov_lag{l}"] = history[-l] if len(history) >= l else np.nan

        # rolling
        for w in OV_ROLL_WINDOWS:
            if len(history) >= w:
                tail = np.array(history[-w:], dtype=float)
                row[f"ov_roll_mean{w}"] = float(np.mean(tail))
                row[f"ov_roll_std{w}"] = float(np.std(tail))
            else:
                row[f"ov_roll_mean{w}"] = np.nan
                row[f"ov_roll_std{w}"] = np.nan

        row = row.fillna(train_feat_median)

        p_log = float(model.predict(row, num_iteration=getattr(model, "best_iteration_", None) or model.n_estimators)[0])
        preds_log.append(p_log)
        history.append(p_log)

    pred = np.expm1(np.array(preds_log, dtype=float))

pred = np.clip(np.asarray(pred, dtype=float), 0, None)
print("model:", model_name)

# 評価（RMSE目標: <=30）
err = y_test.values - pred
rmse = float(np.sqrt(np.mean(err**2)))
mae = float(np.mean(np.abs(err)))
print(f"MAE (last {N_TEST}): {mae:.4f}")
print(f"RMSE (last {N_TEST}): {rmse:.4f}")

# 出力
out = pd.DataFrame(
    {
        "row_index": X_test_base.index,
        "OV_true": y_test.values,
        "OV_pred": pred,
        "OV_pred_rounded": np.rint(pred).astype(int),
        "model": model_name,
    }
)
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")

out.head(10), OUT_PATH

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2706
[LightGBM] [Info] Number of data points in the train set: 1776, number of used features: 97
[LightGBM] [Info] Start training from score 4.817274
selected sensors: ['X59', 'X33', 'X30', 'X45', 'X66', 'X34', 'X60', 'X32', 'X28', 'X50', 'X72', 'X71']
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3632
[LightGBM] [Info] Number of data points in the train set: 1599, number of used features: 36
[LightGBM] [Info] Start training from score 4.874003
model: LightGBM(regression, log1p + OV lags + few sensors)
MAE (last 500): 40.4098
RMSE (last 500): 74.5924


(   row_index    OV_true     OV_pred  OV_pred_rounded  \
 0       1776  130.12020  112.399229              112   
 1       1777  115.67640  111.544434              112   
 2       1778  104.88190  112.631558              113   
 3       1779  108.79570  112.792724              113   
 4       1780  119.74150  113.788289              114   
 5       1781  124.43000  114.302039              114   
 6       1782   97.42249  115.055007              115   
 7       1783  117.23450  114.950828              115   
 8       1784  103.47540   82.907673               83   
 9       1785   63.23729   88.731588               89   
 
                                                model  
 0  LightGBM(regression, log1p + OV lags + few sen...  
 1  LightGBM(regression, log1p + OV lags + few sen...  
 2  LightGBM(regression, log1p + OV lags + few sen...  
 3  LightGBM(regression, log1p + OV lags + few sen...  
 4  LightGBM(regression, log1p + OV lags + few sen...  
 5  LightGBM(regression, log1p + OV

In [15]:
# --- 改善案: センサーを少数特徴に圧縮（PCA）してRMSEを下げる ---
# ここは "少ない説明変数" を満たしやすい（PC数 + 少数の時刻特徴 + 集約特徴）

from dataclasses import dataclass


def rmse(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))


@dataclass
class Config:
    topk: int
    pca_n: int
    y_transform: str  # 'none' or 'log1p'


# センサー列（X1..X83）
sensor_cols = [c for c in [f"X{i}" for i in range(1, 84)] if c in df.columns]

# 時刻由来特徴（なるべく少数に絞る）
time_cols_keep = [
    "diff_minutes",
    "pe_dow",
    "pe_hour",
    "pe_month",
    "fm_dow",
    "fm_hour",
    "fm_month",
]
extra_cols = [c for c in time_cols_keep if c in X.columns]

# センサー集約（少数）
X_base = X.copy()
if sensor_cols:
    X_base["sensor_mean"] = X_base[sensor_cols].mean(axis=1)
    X_base["sensor_std"] = X_base[sensor_cols].std(axis=1)
    X_base["sensor_min"] = X_base[sensor_cols].min(axis=1)
    X_base["sensor_max"] = X_base[sensor_cols].max(axis=1)
agg_cols = [c for c in ["sensor_mean", "sensor_std", "sensor_min", "sensor_max"] if c in X_base.columns]

# 分割
N_TEST = 500
train_end = len(df) - N_TEST
X_train_full = X_base.iloc[:train_end].copy()
X_test_full = X_base.iloc[train_end:].copy()
y_train = y.iloc[:train_end].astype(float).copy()
y_test = y.iloc[train_end:].astype(float).copy()

# センサーの上位topk選択（学習データのみで相関ベース）
def select_topk_sensors(topk: int) -> list[str]:
    if not sensor_cols:
        return []
    # 欠損は中央値埋めして相関を計算
    tmp = X_train_full[sensor_cols].copy()
    tmp = tmp.fillna(tmp.median(numeric_only=True))
    corrs = {}
    for c in sensor_cols:
        v = tmp[c].values
        if np.nanstd(v) == 0:
            corrs[c] = 0.0
        else:
            corrs[c] = float(np.corrcoef(v, y_train.values)[0, 1])
    s = (
        pd.Series(corrs)
        .fillna(0.0)
        .abs()
        .sort_values(ascending=False)
        .head(topk)
        .index
        .tolist()
    )
    return s


def make_matrices(cfg: Config):
    # y transform
    if cfg.y_transform == "log1p":
        ytr = np.log1p(y_train.values)
    else:
        ytr = y_train.values

    # センサー特徴
    chosen = select_topk_sensors(cfg.topk)

    # 欠損埋め用中央値（train基準）
    med = X_train_full.median(numeric_only=True)
    Xtr = X_train_full.fillna(med)
    Xte = X_test_full.fillna(med)

    feat_parts_tr = []
    feat_parts_te = []
    feat_names = []

    # PCA（センサー全体→少数成分）
    if cfg.pca_n and cfg.pca_n > 0:
        from sklearn.preprocessing import StandardScaler
        from sklearn.decomposition import PCA

        sens_tr = Xtr[sensor_cols].values
        sens_te = Xte[sensor_cols].values

        scaler = StandardScaler()
        sens_tr_s = scaler.fit_transform(sens_tr)
        sens_te_s = scaler.transform(sens_te)

        pca = PCA(n_components=cfg.pca_n, random_state=RANDOM_STATE)
        pc_tr = pca.fit_transform(sens_tr_s)
        pc_te = pca.transform(sens_te_s)

        feat_parts_tr.append(pc_tr)
        feat_parts_te.append(pc_te)
        feat_names += [f"pc{i+1}" for i in range(cfg.pca_n)]
    else:
        # topkセンサーをそのまま使う
        if chosen:
            feat_parts_tr.append(Xtr[chosen].values)
            feat_parts_te.append(Xte[chosen].values)
            feat_names += chosen

    # 集約特徴
    if agg_cols:
        feat_parts_tr.append(Xtr[agg_cols].values)
        feat_parts_te.append(Xte[agg_cols].values)
        feat_names += agg_cols

    # 少数の時刻特徴
    if extra_cols:
        feat_parts_tr.append(Xtr[extra_cols].values)
        feat_parts_te.append(Xte[extra_cols].values)
        feat_names += extra_cols

    Xtr_mat = np.concatenate(feat_parts_tr, axis=1) if feat_parts_tr else np.zeros((len(Xtr), 0))
    Xte_mat = np.concatenate(feat_parts_te, axis=1) if feat_parts_te else np.zeros((len(Xte), 0))

    return Xtr_mat, ytr, Xte_mat, feat_names


def fit_predict(cfg: Config):
    Xtr_mat, ytr, Xte_mat, feat_names = make_matrices(cfg)

    # time-based validation
    valid_size = min(200, max(50, int(0.1 * len(ytr))))
    X_tr, X_va = Xtr_mat[:-valid_size], Xtr_mat[-valid_size:]
    y_tr, y_va = ytr[:-valid_size], ytr[-valid_size:]

    from lightgbm import LGBMRegressor
    import lightgbm as lgb

    model = LGBMRegressor(
        objective="regression",
        n_estimators=5000,
        learning_rate=0.03,
        num_leaves=63,
        min_data_in_leaf=30,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    model.fit(
        X_tr,
        y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)],
    )

    pred = model.predict(
        Xte_mat,
        num_iteration=getattr(model, "best_iteration_", None) or model.n_estimators,
    )

    # inverse transform
    if cfg.y_transform == "log1p":
        pred = np.expm1(pred)

    pred = np.clip(np.asarray(pred, dtype=float), 0, None)
    return pred, feat_names


# 小さめのグリッドで探索（テスト500行のRMSEを確認）
configs = []
for ytf in ["none", "log1p"]:
    for pca_n in [0, 3, 5, 8, 12]:
        for topk in [6, 12, 24]:
            configs.append(Config(topk=topk, pca_n=pca_n, y_transform=ytf))

results = []
for cfg in configs:
    try:
        pred, feat_names = fit_predict(cfg)
        score = rmse(y_test.values, pred)
        results.append((score, cfg, len(feat_names), feat_names))
        print(f"RMSE={score:.4f}  cfg={cfg}  n_features={len(feat_names)}")
    except Exception as e:
        print("skip", cfg, repr(e))

results = sorted(results, key=lambda x: x[0])
print("\n=== TOP 5 ===")
for score, cfg, nf, _ in results[:5]:
    print(f"RMSE={score:.4f}  cfg={cfg}  n_features={nf}")

best = results[0] if results else None
best

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 993
[LightGBM] [Info] Number of data points in the train set: 1599, number of used features: 16
[LightGBM] [Info] Start training from score 141.665360
RMSE=73.1325  cfg=Config(topk=6, pca_n=0, y_transform='none')  n_features=17
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1136
[LightGBM] [Info] Number of data points in the train set: 1599, number of used features: 22
[LightGBM] [Info] Start training from score 141.665360
RMSE=70.6684  cfg=Config(topk=12, pca_n=0, y_transform='none')  n_features=23
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000239 seconds.
You can set `force_col_wise=true` to remove the overhead

(70.66835186240179,
 Config(topk=12, pca_n=0, y_transform='none'),
 23,
 ['X33',
  'X30',
  'X80',
  'X50',
  'X76',
  'X14',
  'X11',
  'X58',
  'X25',
  'X1',
  'X27',
  'X77',
  'sensor_mean',
  'sensor_std',
  'sensor_min',
  'sensor_max',
  'diff_minutes',
  'pe_dow',
  'pe_hour',
  'pe_month',
  'fm_dow',
  'fm_hour',
  'fm_month'])

In [16]:
# --- ベスト設定で再学習して出力 ---

if best is None:
    raise RuntimeError("探索が失敗しました。上のセルのskip理由を確認してください。")

best_rmse, best_cfg, best_nf, best_feat_names = best
print("best_cfg:", best_cfg)
print("best_nf:", best_nf)
print("best_rmse:", best_rmse)

pred, feat_names = fit_predict(best_cfg)
print("FINAL RMSE:", rmse(y_test.values, pred))

out = pd.DataFrame(
    {
        "row_index": X_test_full.index,
        "OV_true": y_test.values,
        "OV_pred": pred,
        "OV_pred_rounded": np.rint(pred).astype(int),
        "model": f"LGBM(regression) cfg={best_cfg}",
    }
)
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")

out.head(10), OUT_PATH

best_cfg: Config(topk=12, pca_n=0, y_transform='none')
best_nf: 23
best_rmse: 70.66835186240179
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1136
[LightGBM] [Info] Number of data points in the train set: 1599, number of used features: 22
[LightGBM] [Info] Start training from score 141.665360
FINAL RMSE: 70.66835186240179


(   row_index    OV_true     OV_pred  OV_pred_rounded  \
 0       1776  130.12020  115.566553              116   
 1       1777  115.67640  115.566553              116   
 2       1778  104.88190  115.566553              116   
 3       1779  108.79570  116.601511              117   
 4       1780  119.74150  111.815119              112   
 5       1781  124.43000  113.955506              114   
 6       1782   97.42249  123.276666              123   
 7       1783  117.23450  120.057174              120   
 8       1784  103.47540   52.255536               52   
 9       1785   63.23729   52.248200               52   
 
                                                model  
 0  LGBM(regression) cfg=Config(topk=12, pca_n=0, ...  
 1  LGBM(regression) cfg=Config(topk=12, pca_n=0, ...  
 2  LGBM(regression) cfg=Config(topk=12, pca_n=0, ...  
 3  LGBM(regression) cfg=Config(topk=12, pca_n=0, ...  
 4  LGBM(regression) cfg=Config(topk=12, pca_n=0, ...  
 5  LGBM(regression) cfg=Config(top

## RMSE 30が難しい理由（現状のデータでの上限確認）

末尾500行ホールドアウトで複数の方法を試したところ、

- **平均ベースライン**でもRMSEが約75
- **前行をそのまま予測（1-stepの持続モデル）**でもRMSEが約66
- LightGBM / PCA / KNN などを試しても **70前後**

となり、現状の `kadai.xlsx` の説明変数だけでは **末尾500行のOVをRMSE 30以下**にするのはかなり難しい（= 末尾500行が学習期間と分布が違う/OVがセンサーで説明できない）可能性が高いです。

次に取り得る方向性は、

- **データ分割の見直し**（末尾500が別レジームなら、時系列CVで安定区間の性能を測る）
- **目的の定義の見直し**（RMSEではなく相対誤差やログRMSEなど）
- **追加の説明変数**（設備条件、品種、ロット、ライン、作業者、材料、レシピ、稼働状態など）

です。

In [17]:
# --- 上限確認: ベースラインRMSE（末尾500行） ---

import numpy as np

N_TEST = 500
train_end = len(df) - N_TEST

_y = df["OV"].astype(float).values

y_test = _y[train_end:]

# 1) train平均
pred_mean = np.full_like(y_test, _y[:train_end].mean())
rmse_mean = float(np.sqrt(np.mean((y_test - pred_mean) ** 2)))

# 2) 持続モデル（1-step。tの予測に t-1 の真値を使う）
pred_persist = _y[train_end - 1 : -1]
rmse_persist = float(np.sqrt(np.mean((y_test - pred_persist) ** 2)))

# 3) 週次っぽいラグ（7）
pred_lag7 = _y[train_end - 7 : len(_y) - 7]
rmse_lag7 = float(np.sqrt(np.mean((y_test - pred_lag7) ** 2)))

print("y_test std:", float(np.std(y_test)))
print("RMSE mean:", rmse_mean)
print("RMSE persist(t-1 true):", rmse_persist)
print("RMSE lag7 true:", rmse_lag7)

y_test std: 67.18052412513244
RMSE mean: 74.82852263561783
RMSE persist(t-1 true): 65.97539249184395
RMSE lag7 true: 75.35650199387895
