In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the merged dataset, parse the date column, and drop rows that lack
# either the product group ("Warengruppe") or the target ("Umsatz").
df = (
    pd.read_csv("/workspaces/codespaces-jupyter/merged_data.csv")
    .assign(Datum=lambda frame: pd.to_datetime(frame["Datum"]))
    .dropna(subset=["Warengruppe", "Umsatz"])
)

# One-hot encode the product group; the first level is dropped and therefore
# acts as the baseline absorbed by the intercept.
df_encoded = pd.get_dummies(df, columns=["Warengruppe"], drop_first=True)

# Chronological split: everything before 2017-08-01 is used for training,
# 2017-08-01 through 2018-07-31 (inclusive) for testing.
dates = df_encoded["Datum"]
train_mask = dates < "2017-08-01"
test_mask = (dates >= "2017-08-01") & (dates <= "2018-07-31")
train_data = df_encoded.loc[train_mask]
test_data = df_encoded.loc[test_mask]

# Only the product-group dummy columns are used as features.
features = [name for name in df_encoded.columns if name.startswith("Warengruppe_")]
X_train, y_train = train_data[features], train_data["Umsatz"]
X_test, y_test = test_data[features], test_data["Umsatz"]

# Fit the linear regression on the training split (fit() returns the estimator).
model = LinearRegression().fit(X_train, y_train)

# Predict on both splits.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Score the model: mean squared error and R² for each split.
train_mse, train_r2 = (
    mean_squared_error(y_train, y_train_pred),
    r2_score(y_train, y_train_pred),
)
test_mse, test_r2 = (
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

# Compute the adjusted R²
def adjusted_r2(r2, n, k):
    """Return the adjusted coefficient of determination.

    Computes ``1 - (1 - r2) * (n - 1) / (n - k - 1)``.

    Args:
        r2: Ordinary R² value.
        n: Number of observations (rows) the R² was computed on.
        k: Number of predictor columns used by the model.

    Returns:
        The adjusted R² as a float.

    Raises:
        ValueError: If ``n - k - 1 <= 0``. The statistic is undefined there:
            the denominator is zero at ``n == k + 1`` and negative below it,
            which would otherwise yield a meaningless value.
    """
    residual_dof = n - k - 1
    if residual_dof <= 0:
        raise ValueError(
            f"adjusted R² is undefined for n={n}, k={k}: need n > k + 1"
        )
    return 1 - (1 - r2) * ((n - 1) / residual_dof)

# Adjusted R² per split: n = number of rows, k = number of feature columns.
train_adjusted_r2 = adjusted_r2(train_r2, X_train.shape[0], X_train.shape[1])
test_adjusted_r2 = adjusted_r2(test_r2, X_test.shape[0], X_test.shape[1])

# Print the evaluation metrics.
print("Modellbewertung:")
print(f"Trainingsdatensatz - MSE: {train_mse:.2f}, R²: {train_r2:.2f}, Adjustierter R²: {train_adjusted_r2:.2f}")
print(f"Testdatensatz - MSE: {test_mse:.2f}, R²: {test_r2:.2f}, Adjustierter R²: {test_adjusted_r2:.2f}")

# Model equation
coefficients = model.coef_
intercept = model.intercept_

print("Lineare Modellgleichung:")
# Build each term with its own sign so negative coefficients render as
# "- 33.01*x" instead of "+ -33.01*x", and so an empty feature list does not
# leave a dangling " + " after the intercept.
equation_terms = "".join(
    f" {'-' if coeff < 0 else '+'} {abs(coeff):.2f}*{feat}"
    for coeff, feat in zip(coefficients, features)
)
print(f"Umsatz = {intercept:.2f}{equation_terms}")


Modellbewertung:
Trainingsdatensatz - MSE: 7629.32, R²: 0.65
Testdatensatz - MSE: 6669.39, R²: 0.61
Lineare Modellgleichung:
Umsatz = 121.34 + 289.26*Warengruppe_2.0 + 42.81*Warengruppe_3.0 + -33.01*Warengruppe_4.0 + 159.55*Warengruppe_5.0 + -54.36*Warengruppe_6.0
