In [21]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

from lightgbm import LGBMRegressor
import joblib

In [22]:
df_hist = pd.read_csv(
    "../data/processed/data_cleaned_3.csv",
    parse_dates=[
        "Auftragseingang","Auftragsende_SOLL","AFO_Start_SOLL","AFO_Ende_SOLL",
        "AFO_Start_IST","AFO_Ende_IST","Auftragsende_IST"
    ],
    low_memory=False
)

df_orders = (
    df_hist.sort_values(["AuftragsID", "AFO_Ende_IST"])
    .groupby("AuftragsID")
    .agg({
        "BauteilID": "first",
        "Bauteilbezeichnung": "first",
        "Priorit√§t": "first",
        "Auftragseingang": "first",
        "Auftragsende_SOLL": "first",
        "Auftragsende_IST": "max",
        "Arbeitsschritt": "max",
        "AFO_Start_IST": "min",
        "AFO_Ende_IST": "max"
    })
    .reset_index()
)

df_orders["target_days"] = (
    df_orders["Auftragsende_IST"] - df_orders["Auftragseingang"]
).dt.total_seconds() / 86400

In [23]:
df_train = df_orders.dropna(subset=["target_days"])

X = df_train.drop(columns=["target_days", "Auftragsende_IST"])
y = df_train["target_days"]

num_cols = X.select_dtypes(include=["number"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

print("Numerisch:", list(num_cols))
print("Kategorial:", list(cat_cols))

Numerisch: ['AuftragsID', 'BauteilID', 'Priorit√§t', 'Arbeitsschritt']
Kategorial: ['Bauteilbezeichnung']


In [24]:
preprocess = ColumnTransformer(
    [
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols)
    ]
)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [26]:
lgbm = LGBMRegressor(
    n_estimators=600,
    learning_rate=0.03,
    num_leaves=63,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)

pipe = Pipeline([
    ("prep", preprocess),
    ("model", lgbm)
])

In [27]:
pipe.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000471 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 269
[LightGBM] [Info] Number of data points in the train set: 103551, number of used features: 6
[LightGBM] [Info] Start training from score 263.156516


In [28]:
preds_test = pipe.predict(X_test)
mae = mean_absolute_error(y_test, preds_test)

print("MAE TEST:", mae)



MAE TEST: 54.45138682727123


In [29]:
import os

os.makedirs("models", exist_ok=True)

joblib.dump(pipe, "models/lightgbm_pipeline.pkl")
print("Modell gespeichert.")

Modell gespeichert.


In [30]:
df_public = pd.read_csv("../data/raw/df_eval_public_2025-11-03.csv")
df_private = pd.read_csv("../data/raw/df_eval_private_2025-11-03.csv")
df_ids = pd.read_csv("../data/raw/df_IDs_for_eval_2025-11-03.csv")

df_submit = pd.concat([df_public, df_private], ignore_index=True)
df_submit = df_ids.merge(df_submit, on="AuftragsID", how="left")
print(df_submit.shape)

(8546, 13)


In [31]:
model = joblib.load("models/lightgbm_pipeline.pkl")

feature_cols = model.feature_names_in_

# fehlende Spalten in df_submit erg√§nzen
for col in feature_cols:
    if col not in df_submit.columns:
        df_submit[col] = np.nan

pred_days = model.predict(df_submit[feature_cols])



In [32]:
# 0. Auftragseingang als echtes Datum casten
df_submit["Auftragseingang"] = pd.to_datetime(df_submit["Auftragseingang"], errors="coerce")

# 1. Modell laden
model = joblib.load("models/lightgbm_pipeline.pkl")

# 2. Feature-Liste aus Pipeline
feature_cols = model.feature_names_in_

# 3. Fehlende Spalten erg√§nzen
for col in feature_cols:
    if col not in df_submit.columns:
        print(f"‚ö†Ô∏è Erg√§nze fehlende Spalte: {col}")
        df_submit[col] = np.nan

# 4. Predict
X_submit = df_submit[feature_cols]
pred_days = model.predict(X_submit)

# 5. Vorhersage-Datum berechnen (JETZT sicher)
df_submit["Auftragsende_PREDICTED"] = (
    df_submit["Auftragseingang"] + pd.to_timedelta(pred_days, unit="D")
).dt.strftime("%Y-%m-%d")

df_submit[["AuftragsID", "Auftragsende_PREDICTED"]].head()



Unnamed: 0,AuftragsID,Auftragsende_PREDICTED
0,144502,2024-08-26
1,147886,2024-11-30
2,135024,2024-02-26
3,135000,2023-11-04
4,146714,2024-08-22


In [34]:
submission = pd.DataFrame({
    "ID": np.arange(1, len(df_submit) + 1),
    "AuftragsID": df_submit["AuftragsID"],
    "Auftragsende_PREDICTED": df_submit["Auftragsende_PREDICTED"]
})

submission.to_csv("submissions/lightgbm_submission_tunedNew.csv", index=False)

print("FERTIG! üéâ SUBMISSION gespeichert.")

FERTIG! üéâ SUBMISSION gespeichert.
