# Lösung Lab 09: Statistik & ML Basics

### Setup: Daten generieren

In [None]:
import pandas as pd
import numpy as np

# Fix the legacy global NumPy seed so every run draws the same sample.
np.random.seed(42)
n_samples = 200

# Features: yearly income (30k to 120k) and account balance (0 to 50k),
# both drawn uniformly. Draw order matters for reproducibility:
# income first, then balance, then the noise term below.
income = np.random.uniform(low=30000, high=120000, size=n_samples)
balance = np.random.uniform(low=0, high=50000, size=n_samples)

# Target: the credit limit is a linear function of income and balance
# plus Gaussian noise (mean 0, standard deviation 2000).
noise = np.random.normal(loc=0, scale=2000, size=n_samples)
credit_limit = (income * 0.10) + (balance * 0.20) + noise

# Assemble the frame column by column; keyword order fixes the column order.
df_credit = pd.DataFrame({"income": income}).assign(
    balance=balance,
    credit_limit=credit_limit,
)

print("Kredit-Daten generiert.")

### Lineare Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# 1. Define features & target
X = df_credit[["income", "balance"]]
y = df_credit["credit_limit"]

# 2. Split (80/20); fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3. Training
model = LinearRegression()
model.fit(X_train, y_train)

# 4. Prediction for a new customer (income=85000, balance=12000).
# The model was fitted on a DataFrame, so it remembers the feature names.
# Passing a bare nested list makes scikit-learn warn that X has no valid
# feature names and leaves the column order implicit. Building a one-row
# DataFrame with the training columns keeps names and order explicit.
new_customer = pd.DataFrame([[85000, 12000]], columns=X.columns)
predicted_limit = model.predict(new_customer)

print(f"Empfohlenes Kreditlimit: {predicted_limit[0]:.2f} EUR")

### Evaluation & Pipelines

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# 1. Evaluation: score the fitted model on the held-out test split
y_pred = model.predict(X_test)

# Compute RMSE and R2 (same approach as Theory Code Section 4).
# RMSE is the square root of the mean squared error, so it is expressed
# in the same unit as the target.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = pow(mse, 0.5)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

rmse_line = f"Model Error (RMSE): +/- {rmse:.2f} EUR"
r2_line = f"Explained Variance (R2): {r2:.4f}"  # close to 1.0 is very good
print(rmse_line)
print(r2_line)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 2. Pipeline with scaling (same approach as Theory Code Section 5)
pipeline_steps = [
    # Standardizes each feature to mean 0 and standard deviation 1
    ("scaler", StandardScaler()),
    # The actual regression model
    ("regressor", LinearRegression()),
]
pipe = Pipeline(steps=pipeline_steps)

# Fitting the pipeline fits the scaler and the regressor on the training split.
pipe.fit(X_train, y_train)

# Quick check via the score method (R2 on the held-out test split)
pipeline_score = pipe.score(X_test, y_test)
print(f"Pipeline R2 Score: {pipeline_score:.4f}")