In [26]:
import pandas as pd
import numpy as np
import os
import json
from datetime import datetime
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [27]:
# Timestamp columns of the processed history file that pandas should parse as
# datetimes, so that durations can be computed from them in later cells.
HIST_DATE_COLUMNS = [
    "Auftragseingang",
    "Auftragsende_SOLL",
    "AFO_Start_SOLL",
    "AFO_Ende_SOLL",
    "AFO_Start_IST",
    "AFO_Ende_IST",
    "Auftragsende_IST",
]

# Processed production history. low_memory=False makes pandas read the file in
# one pass instead of chunk-wise, which avoids mixed-dtype inference per column.
df_hist = pd.read_csv(
    "../data/processed/data_feature_zeit_3_gesamt.csv",
    parse_dates=HIST_DATE_COLUMNS,
    low_memory=False,
)

# AuftragsIDs to predict for; a later cell merges the evaluation data onto this
# frame so the submission keeps this row order.
df_ids = pd.read_csv("../data/raw/df_IDs_for_eval_2025-11-03.csv")

# Quick preview of the history table.
df_hist.head()

Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Auftragseingang,Priorit√§t,Auftragsende_SOLL,Arbeitsschritt,Arbeitsschrittbezeichnung,AFO_Start_SOLL,AFO_Ende_SOLL,...,Auftrags_Laufzeit_Abweichung_Tage,Wartezeit_vor_Beginn_Tage,Pufferzeit_geplant_Tage,AFO_Start_Wochentag_Num,AFO_Start_Stunde,AFO_Kalenderwoche,AFO_Jahr,AFO_Ende_Stunde,AFO_Schicht,Lieferabweichung_Stunden
0,1,1,Steuerventilmodul,2013-10-29,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.879861,64.291667,64.291667,2,7,1,2014,7,Fr√ºh,-4.516667
1,2,1,Steuerventilmodul,2013-08-16,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.897222,138.291667,138.291667,2,7,1,2014,7,Fr√ºh,-4.516667
2,3,1,Steuerventilmodul,2013-08-05,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.995139,149.291667,149.291667,2,7,1,2014,7,Fr√ºh,-4.516667
3,4,1,Steuerventilmodul,2013-10-12,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.907639,81.291667,81.291667,2,7,1,2014,7,Fr√ºh,-4.516667
4,5,1,Steuerventilmodul,2013-10-03,1,2014-01-01 11:32:00,1,Info,2014-01-01 07:00:00,2014-01-01 07:01:00,...,125.899306,90.291667,90.291667,2,7,1,2014,7,Fr√ºh,-4.516667


In [28]:
# How each column of the row-level history is collapsed into one value per
# AuftragsID.
order_agg_spec = {
    "BauteilID": "first",
    "Bauteilbezeichnung": "first",
    "Priorit√§t": "first",
    "Auftragseingang": "first",
    "Auftragsende_SOLL": "first",
    "Auftragsende_IST": "max",
    "Arbeitsschritt": "max",
    "AFO_Start_IST": "min",
    "AFO_Ende_IST": "max",
    "AFO_Dauer_IST_Stunde": "sum",
}

SECONDS_PER_DAY = 86400

# Sort within each order by actual end time so that "first" picks the row of
# the earliest finished step, then aggregate to one row per order.
hist_sorted = df_hist.sort_values(["AuftragsID", "AFO_Ende_IST"])
df_orders = hist_sorted.groupby("AuftragsID").agg(order_agg_spec).reset_index()

# Regression target: time from order entry to actual order end, in
# (fractional) days.
order_lead_time = df_orders["Auftragsende_IST"] - df_orders["Auftragseingang"]
df_orders["target_days"] = order_lead_time.dt.total_seconds() / SECONDS_PER_DAY

df_orders.head()

Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Priorit√§t,Auftragseingang,Auftragsende_SOLL,Auftragsende_IST,Arbeitsschritt,AFO_Start_IST,AFO_Ende_IST,AFO_Dauer_IST_Stunde,target_days
0,1,1,Steuerventilmodul,1,2013-10-29,2014-01-01 11:32:00,2014-05-07 08:39:00,999,2014-01-01 07:00:00,2014-05-07 08:39:00,4.89,190.360417
1,2,1,Steuerventilmodul,1,2013-08-16,2014-01-01 11:32:00,2014-05-07 09:04:00,999,2014-01-01 07:00:00,2014-05-07 09:04:00,5.01,264.377778
2,3,1,Steuerventilmodul,1,2013-08-05,2014-01-01 11:32:00,2014-05-07 11:25:00,999,2014-01-01 07:00:00,2014-05-07 11:25:00,7.82,275.475694
3,4,1,Steuerventilmodul,1,2013-10-12,2014-01-01 11:32:00,2014-05-07 09:19:00,999,2014-01-01 07:00:00,2014-05-07 09:19:00,5.12,207.388194
4,5,1,Steuerventilmodul,1,2013-10-03,2014-01-01 11:32:00,2014-05-07 09:07:00,999,2014-01-01 07:00:00,2014-05-07 09:07:00,5.41,216.379861


In [29]:
TARGET_COL = "target_days"

# Only orders with a known target can be used for supervised training.
df_train = df_orders.dropna(subset=[TARGET_COL])

# Target and feature frame. Auftragsende_IST is removed from the features
# because the target is computed directly from it.
y = df_train[TARGET_COL]
X = df_train.drop(columns=[TARGET_COL, "Auftragsende_IST"])

# Column groups handled by the preprocessing step.
# NOTE(review): the datetime columns left in X match neither selector, and
# ColumnTransformer drops unlisted columns by default (remainder="drop"), so
# the model never sees them - confirm this is intended.
# NOTE(review): AuftragsID is numeric and therefore ends up in num_cols, i.e.
# it is used as a model feature - confirm this is intended.
num_cols = X.select_dtypes(include=["number"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are encoded as all zeros.
categorical_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: median imputation only.
preprocess = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", categorical_steps, cat_cols),
    ]
)

# Random 75/25 hold-out split with a fixed seed for reproducibility.
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [30]:
# Candidate values for the random forest; the "model__" prefix routes each
# parameter to the "model" step of the pipeline below.
param_dist = {
    "model__n_estimators": [80, 120, 180],
    "model__max_depth": [8, 12, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", 0.5],
}

# NOTE(review): both the forest and the search use n_jobs=-1; nested
# parallelism may oversubscribe the CPU - consider limiting one of them.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

# Preprocessing and model in one estimator so that the imputers and the
# encoder are re-fitted inside every CV fold.
pipe = Pipeline(steps=[("prep", preprocess), ("model", rf)])

# 10 randomly sampled parameter combinations, each scored with 3-fold CV on
# the negative mean absolute error.
search_settings = {
    "n_iter": 10,
    "cv": 3,
    "scoring": "neg_mean_absolute_error",
    "n_jobs": -1,
    "verbose": 2,
    "random_state": 42,
}
search = RandomizedSearchCV(pipe, param_dist, **search_settings)

print("üîµ Starte Hyperparameter-Suche ‚Ä¶")
search.fit(X_train, y_train)
print("‚úÖ Hyperparameter-Suche abgeschlossen!")

üîµ Starte Hyperparameter-Suche ‚Ä¶
Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
# Pipeline refitted on the full training split with the best parameters found.
best_model = search.best_estimator_

# Predictions on both splits, to compare the in-sample and the hold-out error.
pred_train, pred_test = (best_model.predict(part) for part in (X_train, X_test))

mae_train = mean_absolute_error(y_train, pred_train)
mae_test = mean_absolute_error(y_test, pred_test)

for label, value in (("MAE Train:", mae_train), ("MAE Test:", mae_test)):
    print(label, value)

# 5-fold CV over all labelled orders. The scorer returns the negated MAE, so
# the sign is flipped to report positive errors.
neg_mae_per_fold = cross_val_score(
    best_model,
    X,
    y,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
)
cv_scores = -neg_mae_per_fold
print("CV MAE mean:", cv_scores.mean())
print("CV MAE std:", cv_scores.std())

MAE Train: 49.90854810021755
MAE Test: 52.06895290141735
CV MAE mean: 57.72546883415704
CV MAE std: 5.218972741534108


In [None]:
# Target locations for the fitted pipeline and its metrics report.
output_dir = "../models/random_forest/pipeline"
model_path = os.path.join(output_dir, "rf_pipeline.pkl")
metrics_path = os.path.join(output_dir, "rf_metrics.json")

os.makedirs(output_dir, exist_ok=True)

# Persist the complete pipeline (preprocessing + forest) in a single file.
joblib.dump(best_model, model_path)

# Cast the scores to plain floats so that they are JSON-serialisable.
metrics = {
    "MAE_train": float(mae_train),
    "MAE_test": float(mae_test),
    "CV_MAE_mean": float(cv_scores.mean()),
    "CV_MAE_std": float(cv_scores.std()),
}

results = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model_path": model_path,
    "best_params": search.best_params_,
    "metrics": metrics,
}

with open(metrics_path, "w") as report_file:
    report_file.write(json.dumps(results, indent=4))

print("üì¶ Modell gespeichert:", model_path)

üì¶ Modell gespeichert: ../models/random_forest/pipeline/rf_pipeline.pkl


In [None]:
# Public and private evaluation files, read in this order.
eval_files = (
    "../data/raw/df_eval_public_2025-11-03.csv",
    "../data/raw/df_eval_private_2025-11-03.csv",
)
df_public, df_private = (pd.read_csv(path) for path in eval_files)

# Stack both parts into one frame with a fresh 0..n-1 index.
df_eval = pd.concat([df_public, df_private], ignore_index=True)

# Left-merge onto the ID list so that the result follows the row order of
# df_ids; IDs without a match in df_eval keep NaN in the merged columns.
df_eval_sorted = pd.merge(df_ids, df_eval, on="AuftragsID", how="left")

In [None]:
# Training features that the evaluation frame does not provide are added as
# all-NaN columns so that the frame can be indexed with X.columns. For the
# columns the model actually uses, the gaps are filled by the pipeline's
# imputers.
absent_features = [col for col in X.columns if col not in df_eval_sorted.columns]
for col in absent_features:
    df_eval_sorted[col] = np.nan

In [None]:
# Parse the order-entry date (required for the date arithmetic below).
# errors="coerce" turns values that cannot be parsed into NaT instead of raising.
df_eval_sorted["Auftragseingang"] = pd.to_datetime(
    df_eval_sorted["Auftragseingang"], errors="coerce"
)

# Predicted lead time in days, one value per row of the evaluation frame.
pred_days = best_model.predict(df_eval_sorted[X.columns])

# Predicted completion timestamp = order entry + predicted lead time.
predicted_end = (
    df_eval_sorted["Auftragseingang"] + pd.to_timedelta(pred_days, unit="D")
)

# A missing or unparseable Auftragseingang propagates as NaT and would end up
# as an empty value in the submission. Report it instead of failing silently.
n_missing = int(predicted_end.isna().sum())
if n_missing:
    print(f"Warnung: {n_missing} Zeilen ohne Vorhersagedatum (Auftragseingang fehlt)")

# Submission format: calendar date only.
df_eval_sorted["Auftragsende_PREDICTED"] = predicted_end.dt.strftime("%Y-%m-%d")

In [None]:
# Target file of the submission. DataFrame.to_csv does not create missing
# parent directories, so the folder is created first; otherwise this cell
# fails on a fresh checkout.
submission_path = "../submissions/submission_randomforest.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

# Running row number 1..n in the order of df_eval_sorted.
df_eval_sorted["ID"] = np.arange(1, len(df_eval_sorted) + 1)

# Keep only the columns that go into the submission file.
df_submit = df_eval_sorted[["ID", "AuftragsID", "Auftragsende_PREDICTED"]]
df_submit.to_csv(submission_path, index=False)

df_submit.head()

Unnamed: 0,ID,AuftragsID,Auftragsende_PREDICTED
0,1,144502,2024-08-25
1,2,147886,2024-11-23
2,3,135024,2024-03-02
3,4,135000,2023-11-09
4,5,146714,2024-08-09


In [None]:
# End-to-end submission build: reloads the evaluation data, predicts with
# best_model and writes the result to a separate submission file.
# `os` is already imported in the import cell at the top of the notebook.

# Output file; its folder is created if it does not exist yet.
save_path = "../submissions/submission_random_forest_optimized.csv"
os.makedirs(os.path.dirname(save_path), exist_ok=True)

print("üîÑ Starte Submission-Erstellung‚Ä¶")

# -----------------------------
# Load evaluation data
# -----------------------------
df_public = pd.read_csv("../data/raw/df_eval_public_2025-11-03.csv")
df_private = pd.read_csv("../data/raw/df_eval_private_2025-11-03.csv")
df_ids = pd.read_csv("../data/raw/df_IDs_for_eval_2025-11-03.csv")

df_eval = pd.concat([df_public, df_private], ignore_index=True)

# Left-merge onto the ID list so that the rows follow the order of df_ids.
df_eval_sorted = df_ids.merge(df_eval, on="AuftragsID", how="left")

# -----------------------------
# Convert date columns
# -----------------------------
# The order-entry date is mandatory; unparseable values become NaT.
df_eval_sorted["Auftragseingang"] = pd.to_datetime(
    df_eval_sorted["Auftragseingang"], errors="coerce"
)

# Further date columns are converted only if the evaluation data contains them.
date_cols_eval = [
    "Auftragsende_SOLL", "AFO_Start_SOLL", "AFO_Ende_SOLL",
    "AFO_Start_IST", "AFO_Ende_IST"
]
for col in date_cols_eval:
    if col in df_eval_sorted.columns:
        df_eval_sorted[col] = pd.to_datetime(df_eval_sorted[col], errors="coerce")

# -----------------------------
# Create model features that are missing in the evaluation data
# -----------------------------
for col in X.columns:
    if col not in df_eval_sorted.columns:
        df_eval_sorted[col] = np.nan

# -----------------------------
# Predictions
# -----------------------------
# Predicted lead time in days, one value per row.
pred_days = best_model.predict(df_eval_sorted[X.columns])

# Predicted completion timestamp = order entry + predicted lead time.
predicted_end = (
    df_eval_sorted["Auftragseingang"] + pd.to_timedelta(pred_days, unit="D")
)

# A missing or unparseable Auftragseingang propagates as NaT and would end up
# as an empty value in the submission. Report it instead of failing silently.
n_missing = int(predicted_end.isna().sum())
if n_missing:
    print(f"Warnung: {n_missing} Zeilen ohne Vorhersagedatum (Auftragseingang fehlt)")

df_eval_sorted["Auftragsende_PREDICTED"] = predicted_end.dt.strftime("%Y-%m-%d")

# -----------------------------
# Build and write the submission
# -----------------------------
df_submit = pd.DataFrame({
    "ID": np.arange(1, len(df_eval_sorted) + 1),
    "AuftragsID": df_eval_sorted["AuftragsID"],
    "Auftragsende_PREDICTED": df_eval_sorted["Auftragsende_PREDICTED"]
})

df_submit.to_csv(save_path, index=False)

print(f"‚úÖ Submission gespeichert unter: {save_path}")
df_submit.head()

üîÑ Starte Submission-Erstellung‚Ä¶
‚úÖ Submission gespeichert unter: ../submissions/submission_random_forest_optimized.csv


Unnamed: 0,ID,AuftragsID,Auftragsende_PREDICTED
0,1,144502,2024-08-25
1,2,147886,2024-11-23
2,3,135024,2024-03-02
3,4,135000,2023-11-09
4,5,146714,2024-08-09
