In [1]:
# ZELLE 1 â€“ Imports & Pfade

import os
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from lightgbm import LGBMRegressor
import joblib

# Pfade anpassen, falls dein Projekt anders strukturiert ist
HIST_PATH          = "../data/processed/data_cleaned_3.csv"
EVAL_PUBLIC_PATH   = "../data/raw/df_eval_public_2025-11-03.csv"
EVAL_PRIVATE_PATH  = "../data/raw/df_eval_private_2025-11-03.csv"
IDS_PATH           = "../data/raw/df_IDs_for_eval_2025-11-03.csv"

MODEL_DIR          = "../models"
SUBMISSION_DIR     = "../submissions"
MODEL_PATH         = os.path.join(MODEL_DIR, "lightgbm_pipeline.pkl")
SUBMISSION_PATH    = os.path.join(SUBMISSION_DIR, "lgbm_tuned_submission.csv")

os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)

RANDOM_STATE = 42

In [2]:
# ZELLE 2 â€“ Daten laden (robust, ohne parse_dates-Fehler)

# 1) CSVs normal laden (ohne parse_dates, damit keine Fehler auftreten)
df_hist         = pd.read_csv(HIST_PATH, low_memory=False)
df_eval_public  = pd.read_csv(EVAL_PUBLIC_PATH, low_memory=False)
df_eval_private = pd.read_csv(EVAL_PRIVATE_PATH, low_memory=False)
df_ids          = pd.read_csv(IDS_PATH)

print("History shape:", df_hist.shape)
print("Eval public shape:", df_eval_public.shape)
print("Eval private shape:", df_eval_private.shape)
print("IDs shape:", df_ids.shape)

print("\nSpalten df_hist:")
print(df_hist.columns.tolist())
print("\nSpalten df_eval_public:")
print(df_eval_public.columns.tolist())

# 2) Datums-Spalten nachtrÃ¤glich in datetime umwandeln (nur wenn vorhanden)
DATE_COLS = [
    "Auftragseingang",
    "Auftragsende_SOLL",
    "AFO_Start_SOLL",
    "AFO_Ende_SOLL",
    "AFO_Start_IST",
    "AFO_Ende_IST",
    "Auftragsende_IST"
]

for col in DATE_COLS:
    if col in df_hist.columns:
        df_hist[col] = pd.to_datetime(df_hist[col], errors="coerce")
    if col in df_eval_public.columns:
        df_eval_public[col] = pd.to_datetime(df_eval_public[col], errors="coerce")
    if col in df_eval_private.columns:
        df_eval_private[col] = pd.to_datetime(df_eval_private[col], errors="coerce")

df_hist.head()

History shape: (1393700, 15)
Eval public shape: (4273, 13)
Eval private shape: (4273, 13)
IDs shape: (8546, 1)

Spalten df_hist:
['AuftragsID', 'BauteilID', 'Bauteilbezeichnung', 'Auftragseingang', 'PrioritÃ¤t', 'Auftragsende_SOLL', 'Arbeitsschritt', 'Arbeitsschrittbezeichnung', 'AFO_Start_SOLL', 'AFO_Ende_SOLL', 'AFO_Start_IST', 'AFO_Ende_IST', 'MaschinenID', 'Maschinenbezeichnung', 'Auftragsende_IST']

Spalten df_eval_public:
['AuftragsID', 'BauteilID', 'Bauteilbezeichnung', 'Auftragseingang', 'PrioritÃ¤t', 'Auftragsende_SOLL', 'Arbeitsschritt', 'Arbeitsschrittbezeichnung', 'AFO_Start_SOLL', 'AFO_Ende_SOLL', 'AFO_Start_IST', 'MaschinenID', 'Maschinenbezeichnung']


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Auftragseingang,PrioritÃ¤t,Auftragsende_SOLL,Arbeitsschritt,Arbeitsschrittbezeichnung,AFO_Start_SOLL,AFO_Ende_SOLL,AFO_Start_IST,AFO_Ende_IST,MaschinenID,Maschinenbezeichnung,Auftragsende_IST
0,1,1,Steuerventilmodul,2013-10-29,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 08:39:00
1,2,1,Steuerventilmodul,2013-08-16,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 09:04:00
2,3,1,Steuerventilmodul,2013-08-05,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 11:25:00
3,4,1,Steuerventilmodul,2013-10-12,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 09:19:00
4,5,1,Steuerventilmodul,2013-10-03,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,2014-01-01 07:00:00,2014-01-01 07:01:00,,,2014-05-07 09:07:00


In [3]:
# ZELLE 3 â€“ Feature-Engineering-Funktionen

def make_order_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregiere AFO-Ebene auf Auftrags-Ebene (eine Zeile pro AuftragsID).
    Robust: verwendet nur Spalten, die existieren.
    """
    if "AuftragsID" not in df.columns:
        raise ValueError("Spalte 'AuftragsID' fehlt in df!")

    rows = []
    grouped = df.groupby("AuftragsID", sort=False)

    for aid, g in grouped:
        row = {"AuftragsID": aid}

        # Basis-Metadaten
        if "BauteilID" in g:
            row["BauteilID"] = g["BauteilID"].iloc[0]
        if "Bauteilbezeichnung" in g:
            row["Bauteilbezeichnung"] = g["Bauteilbezeichnung"].iloc[0]
        if "PrioritÃ¤t" in g:
            row["PrioritÃ¤t"] = g["PrioritÃ¤t"].iloc[0]

        # Auftragseingang: frÃ¼hestes Datum
        if "Auftragseingang" in g:
            row["Auftragseingang"] = g["Auftragseingang"].min()

        # Auftragsende_SOLL: spÃ¤testes SOLL
        if "Auftragsende_SOLL" in g:
            row["Auftragsende_SOLL"] = g["Auftragsende_SOLL"].max()

        # echtes Auftragsende_IST (fÃ¼r Training)
        if "Auftragsende_IST" in g:
            row["Auftragsende_IST"] = g["Auftragsende_IST"].max()

        # Arbeitsschritt-Infos
        if "Arbeitsschritt" in g:
            row["Arbeitsschritt_min"] = g["Arbeitsschritt"].min()
            row["Arbeitsschritt_max"] = g["Arbeitsschritt"].max()
            row["Arbeitsschritt_nunique"] = g["Arbeitsschritt"].nunique()

        # AFO IST-Daten (sofern vorhanden)
        if "AFO_Start_IST" in g:
            row["AFO_Start_IST_min"] = g["AFO_Start_IST"].min()
        if "AFO_Ende_IST" in g:
            row["AFO_Ende_IST_max"] = g["AFO_Ende_IST"].max()

        # AFO Dauer (optional, falls vorhanden)
        if "AFO_Dauer_IST_Stunde" in g:
            row["AFO_Dauer_sum"] = g["AFO_Dauer_IST_Stunde"].sum()
            row["AFO_Dauer_mean"] = g["AFO_Dauer_IST_Stunde"].mean()

        # Anzahl AFOs
        row["n_AFO"] = len(g)

        rows.append(row)

    out = pd.DataFrame(rows)
    return out


def add_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    FÃ¼ge einfache Datums-Features hinzu (Wochentag, Monat, etc.).
    Wichtig: verwendet KEIN Auftragsende_IST als Feature, um Leakage zu vermeiden.
    """
    df = df.copy()

    date_source_cols = []
    for col in ["Auftragseingang", "Auftragsende_SOLL", "AFO_Start_IST_min", "AFO_Ende_IST_max"]:
        if col in df.columns:
            date_source_cols.append(col)

    for col in date_source_cols:
        d = pd.to_datetime(df[col], errors="coerce")
        df[f"{col}_dow"] = d.dt.dayofweek
        df[f"{col}_month"] = d.dt.month
        df[f"{col}_day"] = d.dt.day
        df[f"{col}_weekofyear"] = d.dt.isocalendar().week.astype("Int64")

    return df

In [4]:
# ZELLE 4 â€“ Trainingsdaten vorbereiten

# 1. Aggregation auf Auftrags-Ebene
train_orders = make_order_features(df_hist)
train_orders = add_date_features(train_orders)

print("Train orders shape:", train_orders.shape)
train_orders.head()

Train orders shape: (138068, 29)


Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,PrioritÃ¤t,Auftragseingang,Auftragsende_SOLL,Auftragsende_IST,Arbeitsschritt_min,Arbeitsschritt_max,Arbeitsschritt_nunique,...,Auftragsende_SOLL_day,Auftragsende_SOLL_weekofyear,AFO_Start_IST_min_dow,AFO_Start_IST_min_month,AFO_Start_IST_min_day,AFO_Start_IST_min_weekofyear,AFO_Ende_IST_max_dow,AFO_Ende_IST_max_month,AFO_Ende_IST_max_day,AFO_Ende_IST_max_weekofyear
0,1,1,Steuerventilmodul,1,2013-10-29,2014-01-01 11:32:00,2014-05-07 08:39:00,1,999,6,...,1,1,2,1,1,1,2,5,7,19
1,2,1,Steuerventilmodul,1,2013-08-16,2014-01-01 11:32:00,2014-05-07 09:04:00,1,999,6,...,1,1,2,1,1,1,2,5,7,19
2,3,1,Steuerventilmodul,1,2013-08-05,2014-01-01 11:32:00,2014-05-07 11:25:00,1,999,7,...,1,1,2,1,1,1,2,5,7,19
3,4,1,Steuerventilmodul,1,2013-10-12,2014-01-01 11:32:00,2014-05-07 09:19:00,1,999,6,...,1,1,2,1,1,1,2,5,7,19
4,5,1,Steuerventilmodul,1,2013-10-03,2014-01-01 11:32:00,2014-05-07 09:07:00,1,999,6,...,1,1,2,1,1,1,2,5,7,19


In [5]:
# ZELLE 4b â€“ Target berechnen (Dauer in Tagen)

# PrÃ¼fen, ob echtes Auftragsende_IST vorhanden ist
if "Auftragsende_IST" in train_orders.columns:
    print("â†’ Nutze echtes Auftragsende_IST als Target")
    target_end = pd.to_datetime(train_orders["Auftragsende_IST"], errors="coerce")
elif "AFO_Ende_IST_max" in train_orders.columns:
    print("â†’ Fallback: nutze AFO_Ende_IST_max als Target")
    target_end = pd.to_datetime(train_orders["AFO_Ende_IST_max"], errors="coerce")
else:
    raise ValueError("Kein passendes Enddatum (Auftragsende_IST oder AFO_Ende_IST_max) im Training gefunden!")

start = pd.to_datetime(train_orders["Auftragseingang"], errors="coerce")

train_orders["target_days"] = (target_end - start).dt.total_seconds() / 86400

# nur gÃ¼ltige Targets behalten
df_train = train_orders.dropna(subset=["target_days", "Auftragseingang"]).copy()
print("Train usable shape:", df_train.shape)

# Optional: extreme AusreiÃŸer filtern (kann Score verbessern)
# df_train = df_train[(df_train["target_days"] >= 0) & (df_train["target_days"] <= 365)]

target_col = "target_days"

# Spalten, die NICHT als Features ins Modell sollen
drop_cols_for_X = [
    target_col,
    "AuftragsID",
    "Auftragsende_IST",     # echtes Ende (Target-Info)
    # interne Aggregatsfelder, die evtl. nicht nÃ¶tig sind
    # (nur weglassen, wenn sie wirklich nicht als Feature gewÃ¼nscht sind)
]

# Datetime-Spalten nicht direkt verwenden (wir nutzen nur die abgeleiteten _dow/_month/... )
datetime_cols = df_train.select_dtypes(include="datetime64[ns]").columns.tolist()
drop_cols_for_X.extend(datetime_cols)

# unique + nur Spalten entfernen, die es wirklich gibt
drop_cols_for_X = list(set(c for c in drop_cols_for_X if c in df_train.columns))

X = df_train.drop(columns=drop_cols_for_X)
y = df_train[target_col]

print("X shape:", X.shape)
print("y shape:", y.shape)

X.dtypes.head()

â†’ Nutze echtes Auftragsende_IST als Target
Train usable shape: (138068, 30)
X shape: (138068, 23)
y shape: (138068,)


BauteilID              int64
Bauteilbezeichnung    object
PrioritÃ¤t              int64
Arbeitsschritt_min     int64
Arbeitsschritt_max     int64
dtype: object

In [6]:
# ZELLE 5 â€“ Train/Val Split + ColumnTransformer

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE
)

print("X_train:", X_train.shape, "X_val:", X_val.shape)

# Spalten-Typen bestimmen
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number, "Int64", "Float64"]).columns.tolist()

print("\nKategoriale Spalten:", cat_cols)
print("Numerische Spalten:", num_cols)

# Pipelines
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, cat_cols),
        ("num", num_pipeline, num_cols)
    ]
)

X_train: (110454, 23) X_val: (27614, 23)

Kategoriale Spalten: ['Bauteilbezeichnung']
Numerische Spalten: ['BauteilID', 'PrioritÃ¤t', 'Arbeitsschritt_min', 'Arbeitsschritt_max', 'Arbeitsschritt_nunique', 'n_AFO', 'Auftragseingang_dow', 'Auftragseingang_month', 'Auftragseingang_day', 'Auftragseingang_weekofyear', 'Auftragsende_SOLL_dow', 'Auftragsende_SOLL_month', 'Auftragsende_SOLL_day', 'Auftragsende_SOLL_weekofyear', 'AFO_Start_IST_min_dow', 'AFO_Start_IST_min_month', 'AFO_Start_IST_min_day', 'AFO_Start_IST_min_weekofyear', 'AFO_Ende_IST_max_dow', 'AFO_Ende_IST_max_month', 'AFO_Ende_IST_max_day', 'AFO_Ende_IST_max_weekofyear']


In [7]:
# ZELLE 6 â€“ Basis-LightGBM-Pipeline (ohne Tuning, zum Vergleich)

base_lgbm = LGBMRegressor(
    n_estimators=400,
    learning_rate=0.05,
    num_leaves=63,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

base_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", base_lgbm)
])

print("ðŸ”µ Fitte Basis-LightGBM...")
base_pipe.fit(X_train, y_train)

val_pred_base = base_pipe.predict(X_val)

mae_base = mean_absolute_error(y_val, val_pred_base)
rmse_base = mean_squared_error(y_val, val_pred_base) ** 0.5  # FIX
r2_base = r2_score(y_val, val_pred_base)

print("\nðŸ“Š Basis-Modell (LightGBM) â€“ Validation:")
print(f"MAE (Tage):  {mae_base:.3f}")
print(f"RMSE (Tage): {rmse_base:.3f}")
print(f"RÂ²:          {r2_base:.5f}")

ðŸ”µ Fitte Basis-LightGBM...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000777 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 110454, number of used features: 23
[LightGBM] [Info] Start training from score 263.227709

ðŸ“Š Basis-Modell (LightGBM) â€“ Validation:
MAE (Tage):  15.722
RMSE (Tage): 39.458
RÂ²:          0.93437




In [9]:
# ZELLE 7 â€“ Hyperparameter-Tuning fÃ¼r LightGBM (RandomizedSearchCV)

param_distributions = {
    "model__num_leaves":        [31, 63, 127],
    "model__max_depth":         [-1, 8, 12, 16],
    "model__learning_rate":     [0.01, 0.03, 0.05],
    "model__n_estimators":      [300, 500, 800],
    "model__subsample":         [0.7, 0.9, 1.0],
    "model__colsample_bytree":  [0.7, 0.9, 1.0],
    "model__min_child_samples": [20, 50, 100],
    "model__reg_alpha":         [0.0, 0.1, 1.0],
    "model__reg_lambda":        [0.0, 0.1, 1.0],
}

tuned_lgbm = LGBMRegressor(
    objective="regression",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", tuned_lgbm)
])

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_distributions,
    n_iter=10,                 
    cv=2,
    scoring="neg_mean_absolute_error",
    verbose=2,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

print("ðŸ”µ Starte Hyperparameter-Suche (RandomizedSearchCV)...")
search.fit(X_train, y_train)
print("âœ… Suche fertig!")

print("\nBeste Parameter:")
print(search.best_params_)
print(f"Best CV MAE: {-search.best_score_:.4f} Tage")

best_pipe = search.best_estimator_

# Evaluation auf dem Val-Set
val_pred = best_pipe.predict(X_val)

mae = mean_absolute_error(y_val, val_pred)
rmse = np.sqrt(mean_squared_error(y_val, val_pred))  # <<< FIX HERE
r2 = r2_score(y_val, val_pred)

print("\nðŸ“Š Getuntes LightGBM â€“ Validation:")
print(f"MAE (Tage):  {mae:.3f}")
print(f"RMSE (Tage): {rmse:.3f}")
print(f"RÂ²:          {r2:.5f}")

ðŸ”µ Starte Hyperparameter-Suche (RandomizedSearchCV)...
Fitting 2 folds for each of 10 candidates, totalling 20 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006339 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.367603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000980 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.367603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001103 seconds.
You can set `force_row_wise=



[CV] END model__colsample_bytree=0.9, model__learning_rate=0.05, model__max_depth=8, model__min_child_samples=100, model__n_estimators=300, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=0.7; total time=   8.7s
[CV] END model__colsample_bytree=0.9, model__learning_rate=0.05, model__max_depth=8, model__min_child_samples=100, model__n_estimators=300, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=0.7; total time=   8.8s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000851 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.367603
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000613 se



[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=-1, model__min_child_samples=50, model__n_estimators=500, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  13.6s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000617 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.367603




[CV] END model__colsample_bytree=1.0, model__learning_rate=0.05, model__max_depth=-1, model__min_child_samples=50, model__n_estimators=500, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  15.5s




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003797 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.087815
[CV] END model__colsample_bytree=0.7, model__learning_rate=0.03, model__max_depth=16, model__min_child_samples=50, model__n_estimators=300, model__num_leaves=63, model__reg_alpha=0.1, model__reg_lambda=1.0, model__subsample=1.0; total time=  16.4s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004895 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.367603




[CV] END model__colsample_bytree=0.7, model__learning_rate=0.03, model__max_depth=16, model__min_child_samples=50, model__n_estimators=300, model__num_leaves=63, model__reg_alpha=0.1, model__reg_lambda=1.0, model__subsample=1.0; total time=  18.6s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.087815





[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003253 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.367603




[CV] END model__colsample_bytree=1.0, model__learning_rate=0.03, model__max_depth=12, model__min_child_samples=50, model__n_estimators=300, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.1, model__subsample=1.0; total time=   9.3s
[CV] END model__colsample_bytree=0.7, model__learning_rate=0.01, model__max_depth=8, model__min_child_samples=20, model__n_estimators=500, model__num_leaves=63, model__reg_alpha=1.0, model__reg_lambda=0.1, model__subsample=1.0; total time=  26.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.087815
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001907 seconds.
You can set `force_row_wise=true` to remove the overhead.




[CV] END model__colsample_bytree=1.0, model__learning_rate=0.03, model__max_depth=12, model__min_child_samples=50, model__n_estimators=300, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.1, model__subsample=1.0; total time=   9.9s




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 55227, number of used features: 23
[LightGBM] [Info] Start training from score 263.087815
[CV] END model__colsample_bytree=0.9, model__learning_rate=0.01, model__max_depth=16, model__min_child_samples=50, model__n_estimators=500, model__num_leaves=31, model__reg_alpha=1.0, model__reg_lambda=1.0, model__subsample=0.9; total time=  16.0s




[CV] END model__colsample_bytree=0.9, model__learning_rate=0.01, model__max_depth=16, model__min_child_samples=50, model__n_estimators=500, model__num_leaves=31, model__reg_alpha=1.0, model__reg_lambda=1.0, model__subsample=0.9; total time=  15.7s




[CV] END model__colsample_bytree=1.0, model__learning_rate=0.03, model__max_depth=8, model__min_child_samples=50, model__n_estimators=800, model__num_leaves=63, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  36.1s




[CV] END model__colsample_bytree=1.0, model__learning_rate=0.03, model__max_depth=8, model__min_child_samples=50, model__n_estimators=800, model__num_leaves=63, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  37.8s




[CV] END model__colsample_bytree=0.9, model__learning_rate=0.05, model__max_depth=16, model__min_child_samples=20, model__n_estimators=500, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  12.6s




[CV] END model__colsample_bytree=0.9, model__learning_rate=0.03, model__max_depth=-1, model__min_child_samples=50, model__n_estimators=300, model__num_leaves=127, model__reg_alpha=0.0, model__reg_lambda=0.1, model__subsample=0.7; total time=  30.8s
[CV] END model__colsample_bytree=0.9, model__learning_rate=0.03, model__max_depth=-1, model__min_child_samples=50, model__n_estimators=300, model__num_leaves=127, model__reg_alpha=0.0, model__reg_lambda=0.1, model__subsample=0.7; total time=  30.9s




[CV] END model__colsample_bytree=0.9, model__learning_rate=0.05, model__max_depth=16, model__min_child_samples=20, model__n_estimators=500, model__num_leaves=31, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=1.0; total time=  11.5s




[CV] END model__colsample_bytree=0.7, model__learning_rate=0.05, model__max_depth=-1, model__min_child_samples=50, model__n_estimators=800, model__num_leaves=63, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=0.9; total time=  20.5s
[CV] END model__colsample_bytree=0.7, model__learning_rate=0.05, model__max_depth=-1, model__min_child_samples=50, model__n_estimators=800, model__num_leaves=63, model__reg_alpha=0.0, model__reg_lambda=0.0, model__subsample=0.9; total time=  19.2s
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000714 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 110454, number of used features: 23
[LightGBM] [Info] Start training from score 263.227709
âœ… Suche fertig!

Beste Parameter:
{'model__subsample': 0.9, 'model__reg_lambda': 0.0, 'model__



In [10]:
# ZELLE 8 â€“ Bestes Modell auf allen Trainingsdaten fitten & speichern

print("ðŸ”µ Trainiere bestes Modell auf ALLEN Trainingsdaten (X, y)...")
best_pipe.fit(X, y)

joblib.dump(best_pipe, MODEL_PATH)

print(f"ðŸ“¦ Modell gespeichert unter: {MODEL_PATH}")

ðŸ”µ Trainiere bestes Modell auf ALLEN Trainingsdaten (X, y)...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001287 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 442
[LightGBM] [Info] Number of data points in the train set: 138068, number of used features: 23
[LightGBM] [Info] Start training from score 263.378911
ðŸ“¦ Modell gespeichert unter: ../models/lightgbm_pipeline.pkl


In [11]:
# ZELLE 9 â€“ Eval-Daten vorbereiten (Public + Private) fÃ¼r Submission

# 1. Eval-Daten zusammenfÃ¼hren
df_eval_all = pd.concat([df_eval_public, df_eval_private], axis=0, ignore_index=True)
print("Eval all shape (AFO-Ebene):", df_eval_all.shape)

# 2. Gleiche Aggregation wie beim Training
eval_orders = make_order_features(df_eval_all)
eval_orders = add_date_features(eval_orders)

print("Eval orders shape (Auftrags-Ebene):", eval_orders.shape)

# 3. Mit IDs mergen, um Reihenfolge wie Kaggle-IDs sicherzustellen
#   df_ids hat AuftragsID; eval_orders hat AuftragsID (aggregiert)
df_eval_merged = df_ids.merge(eval_orders, on="AuftragsID", how="left")

print("Eval merged shape:", df_eval_merged.shape)

# 4. Sicherstellen, dass alle Feature-Spalten (X.columns) existieren
for col in X.columns:
    if col not in df_eval_merged.columns:
        # Spalte existiert im Training, aber nicht in Eval â†’ mit NaN auffÃ¼llen
        df_eval_merged[col] = np.nan

# 5. Feature-Matrix fÃ¼r Submission
X_submit = df_eval_merged[X.columns]

X_submit.head()

Eval all shape (AFO-Ebene): (8546, 13)
Eval orders shape (Auftrags-Ebene): (8546, 23)
Eval merged shape: (8546, 23)


Unnamed: 0,BauteilID,Bauteilbezeichnung,PrioritÃ¤t,Arbeitsschritt_min,Arbeitsschritt_max,Arbeitsschritt_nunique,n_AFO,Auftragseingang_dow,Auftragseingang_month,Auftragseingang_day,...,Auftragsende_SOLL_day,Auftragsende_SOLL_weekofyear,AFO_Start_IST_min_dow,AFO_Start_IST_min_month,AFO_Start_IST_min_day,AFO_Start_IST_min_weekofyear,AFO_Ende_IST_max_dow,AFO_Ende_IST_max_month,AFO_Ende_IST_max_day,AFO_Ende_IST_max_weekofyear
0,1,Steuerventilmodul,1,31,31,1,1,6,9,10,...,2,40,,,,,,,,
1,1,Steuerventilmodul,1,31,31,1,1,2,12,6,...,27,52,,,,,,,,
2,1,Steuerventilmodul,1,999,999,1,1,2,2,22,...,1,9,,,,,,,,
3,2,Schwenkzylinder,1,42,42,1,1,6,10,30,...,28,9,,,,,,,,
4,2,Schwenkzylinder,1,42,42,1,1,2,8,23,...,1,48,,,,,,,,


In [None]:
# ZELLE 10 â€“ Finale Submission erstellen

print("ðŸ”µ Lade bestes Modell...")
model = joblib.load(MODEL_PATH)

print("ðŸ”µ Starte Prediction fÃ¼r Submission...")
pred_days = model.predict(X_submit)

print("ðŸ”µ Berechne Auftragsende...")

df_eval_merged["Auftragsende_PREDICTED"] = (
    df_eval_merged["Auftragseingang"] + pd.to_timedelta(pred_days, unit="D")
).dt.strftime("%Y-%m-%d")

# Kaggle-Finalform: AuftragsID + Auftragsende
submission = df_ids.merge(
    df_eval_merged[["AuftragsID", "Auftragsende_PREDICTED"]],
    on="AuftragsID",
    how="left"
)

submission.rename(columns={"Auftragsende_PREDICTED": "Auftragsende_PREDICTED"}, inplace=True)

# ðŸ”¥ HIER: ID-Spalte einfÃ¼gen (1, 2, 3, ...)
submission.insert(0, "ID", range(1, len(submission) + 1))

submission.to_csv("LightGBM_Please_submission.csv", index=False)

print("âœ… LightGBM_Please_submission.csv wurde erfolgreich erstellt!")
submission.head()

ðŸ”µ Lade bestes Modell...
ðŸ”µ Starte Prediction fÃ¼r Submission...
ðŸ”µ Berechne Auftragsende...
âœ… LightGBM_Please_submission.csv wurde erfolgreich erstellt!




Unnamed: 0,ID,AuftragsID,Auftragsende
0,1,144502,2024-05-29
1,2,147886,2024-12-26
2,3,135024,2023-11-21
3,4,135000,2023-08-13
4,5,146714,2024-08-16
