In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# 1. Load the pre-split training and test data.
train = pd.read_csv("analysis/train_split_merged_data_updated.csv")
test = pd.read_csv("analysis/test_split_merged_data_updated.csv")

# 2. Train on product group ("warengruppe") 4 only.
#    .copy() detaches the filtered rows from the full frame, so the column
#    assignments in step 3 write to an independent DataFrame instead of a
#    slice (avoids pandas' SettingWithCopyWarning).
train = train[train["warengruppe"] == 4].copy()

# 3. Parse the date column and derive the weekday name from it.
train["date"] = pd.to_datetime(train["date"])
test["date"] = pd.to_datetime(test["date"])

train["Wochentag"] = train["date"].dt.day_name()
test["Wochentag"] = test["date"].dt.day_name()

# 4. Features and target variable.
features = ["Temperatur", "Wochentag"]
target = "umsatz"

X = train[features]
y = train[target]

# The test frame is not filtered by product group; X_test keeps test's index,
# which is what allows the boolean mask in step 8 to be applied to it.
X_test = test[features]

# 5. Preprocessing:
#    - Wochentag: fill missing values with the most frequent weekday, then
#      one-hot encode (categories unseen during fit are ignored at predict time).
#    - Temperatur: fill missing values with the training mean.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ]), ["Wochentag"]),

        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="mean"))
        ]), ["Temperatur"])
    ]
)

# 6. Linear regression on top of the preprocessing step.
model = Pipeline(steps=[
    ("prep", preprocessor),
    ("reg", LinearRegression())
])

# 7. Fit the model on the product-group-4 training rows.
model.fit(X, y)

# 8. Predict for the test set. The submission needs one row per test record,
#    but the model is only trained on product group 4.
# 8.1 Fallback value for rows outside product group 4: mean training revenue
#     of product group 4.
mean_wg4 = train["umsatz"].mean()
# 8.2 Every row starts out with the fallback value.
test["umsatz_Prediction"] = mean_wg4
# 8.3 Mask selecting product group 4 rows in the test set.
mask = test["warengruppe"] == 4
# 8.4 Overwrite the fallback with the model prediction for product group 4 only.
#     Guarded because scikit-learn raises on an input with zero rows; with no
#     matching rows the fallback simply stays in place.
if mask.any():
    test.loc[mask, "umsatz_Prediction"] = model.predict(X_test.loc[mask])
# 8.5 Sanity-check the result.
print(test[["id", "umsatz_Prediction"]].head())
print("Anzahl Zeilen in Submission:", len(test))





        id  umsatz_Prediction
0  1808011          88.326489
1  1808012          88.326489
2  1808013          88.326489
3  1808014          68.614075
4  1808015          88.326489
Anzahl Zeilen in Submission: 1830


In [4]:
# 9. Write the submission file: one row per test record, containing only the
#    record id and the predicted revenue (no index column).
submission = test.loc[:, ["id", "umsatz_Prediction"]].copy()
submission.to_csv(
    "submission_simple_linear_regression_updated.csv",
    index=False,
)


In [5]:
# 10. Compute R² on the training data (in-sample fit of the pipeline on the
#     same rows it was trained on).
from sklearn.metrics import r2_score

y_train_pred = model.predict(X)
r2_train = r2_score(y_true=y, y_pred=y_train_pred)
print(f"R² für Trainingsdaten: {r2_train:.4f}")


R² für Trainingsdaten: 0.3320
