In [19]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import RandomForestRegressor

import joblib


In [20]:
df_hist = pd.read_csv(
    "../data/processed/data_feature_zeit_3_gesamt.csv",
    parse_dates=[
        "Auftragseingang","Auftragsende_SOLL","AFO_Start_SOLL","AFO_Ende_SOLL",
        "AFO_Start_IST","AFO_Ende_IST","Auftragsende_IST"
    ],
    low_memory=False
)

df_ids = pd.read_csv("../data/raw/df_IDs_for_eval_2025-11-03.csv")

df_hist.head()

KeyboardInterrupt: 

In [None]:
df_orders = (
    df_hist.sort_values(["AuftragsID", "AFO_Ende_IST"])
    .groupby("AuftragsID")
    .agg({
        "BauteilID": "first",
        "Bauteilbezeichnung": "first",
        "Priorit√§t": "first",
        "Auftragseingang": "first",
        "Auftragsende_SOLL": "first",
        "Auftragsende_IST": "max",
        "Arbeitsschritt": "max",
        "AFO_Start_IST": "min",
        "AFO_Ende_IST": "max",
        "AFO_Dauer_IST_Stunde": "sum"
    })
    .reset_index()
)

# Target: Dauer in Tagen
df_orders["target_days"] = (
    df_orders["Auftragsende_IST"] - df_orders["Auftragseingang"]
).dt.total_seconds() / 86400

df_orders.head()

Unnamed: 0,AuftragsID,BauteilID,Bauteilbezeichnung,Priorit√§t,Auftragseingang,Auftragsende_SOLL,Auftragsende_IST,Arbeitsschritt,AFO_Start_IST,AFO_Ende_IST,AFO_Dauer_IST_Stunde,target_days
0,1,1,Steuerventilmodul,1,2013-10-29,2014-01-01 11:32:00,2014-05-07 08:39:00,999,2014-01-01 07:00:00,2014-05-07 08:39:00,4.89,190.360417
1,2,1,Steuerventilmodul,1,2013-08-16,2014-01-01 11:32:00,2014-05-07 09:04:00,999,2014-01-01 07:00:00,2014-05-07 09:04:00,5.01,264.377778
2,3,1,Steuerventilmodul,1,2013-08-05,2014-01-01 11:32:00,2014-05-07 11:25:00,999,2014-01-01 07:00:00,2014-05-07 11:25:00,7.82,275.475694
3,4,1,Steuerventilmodul,1,2013-10-12,2014-01-01 11:32:00,2014-05-07 09:19:00,999,2014-01-01 07:00:00,2014-05-07 09:19:00,5.12,207.388194
4,5,1,Steuerventilmodul,1,2013-10-03,2014-01-01 11:32:00,2014-05-07 09:07:00,999,2014-01-01 07:00:00,2014-05-07 09:07:00,5.41,216.379861


In [None]:
df_train = df_orders.dropna(subset=["target_days"])

X = df_train.drop(columns=["target_days", "Auftragsende_IST"])
y = df_train["target_days"]

num_cols = X.select_dtypes(include=["number"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

num_cols, cat_cols

(Index(['AuftragsID', 'BauteilID', 'Priorit√§t', 'Arbeitsschritt',
        'AFO_Dauer_IST_Stunde'],
       dtype='object'),
 Index(['Bauteilbezeichnung'], dtype='object'))

In [None]:
preprocess = ColumnTransformer(
    [
        ("num", SimpleImputer(strategy="median"), num_cols),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", OneHotEncoder(handle_unknown="ignore"))
        ]), cat_cols)
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

param_dist = {
    "model__n_estimators": [80, 120, 180],
    "model__max_depth": [8, 12, None],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", 0.5],
}

pipe = Pipeline([
    ("prep", preprocess),
    ("model", rf)
])

In [None]:
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=8,
    cv=2,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    verbose=2,
    random_state=42
)

print("üîµ Starte RandomizedSearchCV ‚Ä¶")
search.fit(X_train, y_train)
print("‚úÖ Hyperparameter Suche fertig!")

print("BEST PARAMS:", search.best_params_)

best_model = search.best_estimator_

üîµ Starte RandomizedSearchCV ‚Ä¶
Fitting 2 folds for each of 8 candidates, totalling 16 fits
[CV] END model__max_depth=12, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=120; total time=   4.6s
[CV] END model__max_depth=12, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=120; total time=   4.7s
[CV] END model__max_depth=12, model__max_features=0.5, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=120; total time=   5.9s
[CV] END model__max_depth=12, model__max_features=0.5, model__min_samples_leaf=2, model__min_samples_split=5, model__n_estimators=120; total time=   6.2s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimators=120; total time=  10.6s
[CV] END model__max_depth=None, model__max_features=sqrt, model__min_samples_leaf=1, model__min_samples_split=2, model__n_estimato

In [None]:
preds = best_model.predict(X_test)
mae = mean_absolute_error(y_test, preds)

print("üìâ TEST MAE:", mae)

üìâ TEST MAE: 52.06895290141735


In [21]:
import os

# Pfad definieren
model_path = "../models"
model_file = os.path.join(model_path, "best_random_forest.pkl")

# Ordner automatisch anlegen
os.makedirs(model_path, exist_ok=True)

# Modell speichern
joblib.dump(best_model, model_file)

print(f"üíæ Modell gespeichert in: {model_file}")

üíæ Modell gespeichert in: ../models/best_random_forest.pkl


In [None]:
import os
import json
from datetime import datetime
import joblib
import numpy as np
import pandas as pd

# ----------------------------------------
# üìÅ Output-Ordner exakt wie im Beispiel
# ----------------------------------------
output_dir = "../models/tree/pipeline"
os.makedirs(output_dir, exist_ok=True)

# ----------------------------------------
# üíæ Modell speichern (Pipeline mit Preprocessing)
# ----------------------------------------
model_path = os.path.join(output_dir, "best_random_forest_pipeline.pkl")
joblib.dump(best_model, model_path)

print(f"üì¶ Modell gespeichert unter: {model_path}")

NS_PER_DAY = 24 * 60 * 60 * 1e9  # optional falls ben√∂tigt  
# Unser Modell arbeitet aber mit Tagen ‚Üí trotzdem √ºbernehmen wir Struktur

results = {
    "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model_path": model_path,
    "metrics": {
        "MAE_days": float(mae),
    },
    "model_type": "RandomForest_Tuned",
}

metrics_path = os.path.join(output_dir, "random_forest_metrics.json")

with open(metrics_path, "w") as f:
    json.dump(results, f, indent=4)

print(f"üìù Metriken gespeichert unter: {metrics_path}")

üì§ Submission gespeichert: submission_rf_tuned.csv
