In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# Steps 1-3, applied identically to each split:
#   1. load the CSV,
#   2. keep only the rows of product group ("warengruppe") 4,
#   3. parse "date" into datetimes and derive the weekday name ("Wochentag").
# Every step returns a new frame, so later column assignments on `train` /
# `test` never write into a view of the raw data.
train = (
    pd.read_csv("analysis/train_split_merged_data_updated.csv")
    .loc[lambda frame: frame["warengruppe"] == 4]
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .assign(Wochentag=lambda frame: frame["date"].dt.day_name())
)

test = (
    pd.read_csv("analysis/test_split_merged_data_updated.csv")
    .loc[lambda frame: frame["warengruppe"] == 4]
    .assign(date=lambda frame: pd.to_datetime(frame["date"]))
    .assign(Wochentag=lambda frame: frame["date"].dt.day_name())
)

# 4. Model inputs and regression target
target = "umsatz"  # column the model is trained to predict
features = ["Temperatur", "Wochentag"]

# Feature matrices for both splits, selected with the same column list so
# the fitted pipeline sees identical columns at predict time.
X, X_test = train[features], test[features]
y = train[target]

# 5. Preprocessing, applied per column group:
#    - "Wochentag" (categorical): fill missing values with the most frequent
#      weekday, then one-hot encode. handle_unknown="ignore" makes a category
#      that was not seen during fit encode as all zeros instead of raising.
#    - "Temperatur" (numeric): fill missing values with the column mean.
#    Columns not listed here are not passed on to the regressor.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), ["Wochentag"]),

        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean"))
        ]), ["Temperatur"])
    ]
)

# 6. Full model: the preprocessing step feeds a linear regression.
model = Pipeline([("prep", preprocessor), ("reg", LinearRegression())])

# 7. Fit imputers, encoder and regressor on the training split only.
model.fit(X, y)

# 8. Predict for the test split and store the result next to the ids.
test["umsatz_Prediction"] = model.predict(X_test)

# Quick sanity check: column list, then the first predictions without and
# with their ids (one object per line, same as three separate prints).
print(
    test.columns,
    test[["umsatz_Prediction"]].head(),
    test[["id", "umsatz_Prediction"]].head(),
    sep="\n",
)


Index(['date', 'warengruppe', 'id', 'umsatz', 'Bewoelkung', 'Temperatur',
       'Windgeschwindigkeit', 'Wettercode', 'KielerWoche', 'Wochentag',
       'umsatz_Prediction'],
      dtype='object')
    umsatz_Prediction
3           68.614075
8           67.909336
13          69.035264
18          77.392929
23         132.449192
         id  umsatz_Prediction
3   1808014          68.614075
8   1808024          67.909336
13  1808034          69.035264
18  1808044          77.392929
23  1808054         132.449192


In [12]:
# 9. Save the submission as CSV: one row per test id with its prediction.
#    index=False keeps the DataFrame index out of the file.
submission = test.loc[:, ["id", "umsatz_Prediction"]].copy()
submission.to_csv(
    "submission_simple_linear_regression.csv",
    index=False,
)


In [13]:
# 10. R² on the training data.
#     This is an in-sample score: the model is evaluated on the same rows it
#     was fitted on, so it measures goodness of fit, not performance on
#     unseen data.
y_train_pred = model.predict(X)
r2_train = r2_score(y, y_train_pred)
print(f"R² für Trainingsdaten: {r2_train:.4f}")


R² für Trainingsdaten: 0.3320
