# Notebook 4: Hyperparameter-Tuning und Modelloptimierung

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/theavengerscroissant/htw-awe-ki-ana/blob/main/notebooks/lecture_4_optimization.ipynb)

In [None]:
# Install the third-party packages imported below. `%pip` is an IPython magic,
# so this line only works inside a notebook kernel, not in a plain Python script.
%pip install scikit-learn matplotlib numpy pandas

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
# Load the example dataset (California Housing) and attach the target
# as an extra 'MedHouseVal' column next to the feature columns.
california_housing = fetch_california_housing()
df_california_housing = pd.DataFrame(
    california_housing.data,
    columns=california_housing.feature_names,
).assign(MedHouseVal=california_housing.target)

# Separate the feature matrix from the target vector.
y = df_california_housing['MedHouseVal']
X = df_california_housing.drop(columns='MedHouseVal')

# Two-stage split: 60 % training first, then the remaining 40 % is halved
# into a test set and a validation set (20 % of the full data each).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)


### Anwendung des Tunings auf Modelle wie Entscheidungsbäume

In [None]:
# Exhaustive grid search (GridSearchCV) for a DecisionTreeRegressor.
# The grid has 4 * 3 * 3 = 36 combinations, each scored with 5-fold CV.
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seed the tree: scikit-learn permutes the features at every split, so ties
# between equally good splits would otherwise be broken differently on each
# run and the selected parameters/score would not be reproducible.
grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
# The scorer is the *negated* MSE, so flip the sign and take the square root:
# the printed value is the RMSE derived from the mean cross-validated MSE.
print(f'Best score: {np.sqrt(-grid_search.best_score_)}')

### Kreuzvalidierung mit RandomizedSearchCV

In [None]:
# Randomized hyperparameter search with k-fold cross-validation for a
# RandomForestRegressor: 10 candidates are sampled from the
# 3 * 2 * 4 * 3 * 3 = 216 possible combinations, each scored with 5-fold CV.
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# The search's own random_state only fixes WHICH candidates are sampled.
# The forest needs its own seed as well, otherwise bootstrap and feature
# sampling make the scores (and possibly the winner) differ between runs.
random_search = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_dist,
    error_score='raise',  # fail loudly if any fit errors instead of scoring it as NaN
    n_iter=10,
    cv=5,
    scoring='neg_mean_squared_error',
    random_state=42,
)
random_search.fit(X_train, y_train)

print(f'Best parameters: {random_search.best_params_}')
# The scorer is the *negated* MSE, so flip the sign and take the square root:
# the printed value is the RMSE derived from the mean cross-validated MSE.
print(f'Best score: {np.sqrt(-random_search.best_score_)}')

### Modelloptimierung

In [None]:
# Evaluate the tuned models on the held-out validation split. The printed
# metrics are meant to be compared with the baseline models from Notebook 3
# (the baseline numbers themselves are not computed in this cell).
best_tree_model, best_forest_model = (
    grid_search.best_estimator_,
    random_search.best_estimator_,
)

models = {
    'Best Decision Tree': best_tree_model,
    'Best Random Forest': best_forest_model,
}

for label, estimator in models.items():
    y_pred = estimator.predict(X_val)
    # RMSE, MAE and R² all use the same validation predictions.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    print(f'{label} - RMSE: {rmse}, MAE: {mae}, R²: {r2}')

### Aufgaben für Studierende

- Experimentiert mit verschiedenen Hyperparametern für GridSearchCV und RandomizedSearchCV
- Visualisiert die Tuning-Ergebnisse für verschiedene Hyperparameter
- Vergleicht die Leistung der optimierten Modelle mit den Baseline-Modellen aus Notebook 3