In [2]:
   # Install the third-party packages imported further down in this notebook
   # (seaborn, statsmodels, scikit-learn). No versions are pinned here, so the
   # installed releases depend on when this cell is run.
   %pip install seaborn
   %pip install statsmodels
   %pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl (11.1 MB)
[K     |████████████████████████████████| 11.1 MB 3.1 MB/s eta 0:00:01
[?25hCollecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.1-py3-none-any.whl (307 kB)
[K     |███████████

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Read the merged sales / temperature / holiday / weather-impression table.
df = pd.read_csv("/Users/edilbekabdyrakhmanov/Documents/GitHub/bakeryy/0_DataPreparation/initialdata/merged_data_temperature+holidays+weather_impressions.csv")

# Parse 'Datum' so the date-range comparisons below operate on timestamps.
df['Datum'] = pd.to_datetime(df['Datum'])

# Derive the weekday name from the date when the file does not provide it.
if 'Weekday' not in df.columns:
    df['Weekday'] = df['Datum'].dt.day_name()

# Inclusive date boundaries of the three chronological splits.
train_start, train_end = '2013-07-01', '2017-07-31'
val_start, val_end = '2017-08-01', '2018-07-31'
test_start, test_end = '2018-08-01', '2019-07-30'

# Select each split; `between` includes both end points by default.
train_data = df.loc[df['Datum'].between(train_start, train_end)]
validation_data = df.loc[df['Datum'].between(val_start, val_end)]
test_data = df.loc[df['Datum'].between(test_start, test_end)]

# Report the size of every split.
print("Train shape:", train_data.shape)
print("Validation shape:", validation_data.shape)
print("Test shape:", test_data.shape)

def prepare_ols_data(data, median_source=None):
    """
    Prepare a frame for OLS regression: add calendar features and fill gaps.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Datum`` column that ``pd.to_datetime`` can parse.
        The input is copied; the caller's frame is never modified.
    median_source : pandas.DataFrame, optional
        Frame whose column medians are used to impute ``Temp_Deviation`` and
        ``Bewoelkung``. Pass the training split here when preparing
        validation or test data so every split is imputed with the same,
        training-derived value. When omitted (the default), or when the
        column is absent from ``median_source``, the median of ``data``
        itself is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with ``Datum`` converted to datetime and these
        columns added: ``Month``, ``Day_of_Year``, ``Week_of_Year``,
        ``Quarter``, ``Year`` and the sine/cosine pairs ``Month_*``,
        ``Day_*``, ``Week_*``. Where present, ``KielerWoche`` and
        ``Is_Holiday`` have missing values replaced by 0,
        ``Temp_Deviation`` and ``Bewoelkung`` by a median, and
        ``Weather_Impression`` by ``'Unknown'``.
    """
    prepared = data.copy()
    prepared['Datum'] = pd.to_datetime(prepared['Datum'])
    dates = prepared['Datum'].dt

    # Plain calendar features.
    prepared['Month'] = dates.month
    prepared['Day_of_Year'] = dates.dayofyear
    prepared['Week_of_Year'] = dates.isocalendar().week.astype(int)
    prepared['Quarter'] = dates.quarter
    prepared['Year'] = dates.year

    # Cyclical encoding: map each calendar counter onto a circle using the
    # fixed periods 12 / 365 / 52 (leap days and ISO week 53 go slightly
    # past one full turn, exactly as before).
    for prefix, column, period in (
        ('Month', 'Month', 12),
        ('Day', 'Day_of_Year', 365),
        ('Week', 'Week_of_Year', 52),
    ):
        angle = 2 * np.pi * prepared[column] / period
        prepared[f'{prefix}_sin'] = np.sin(angle)
        prepared[f'{prefix}_cos'] = np.cos(angle)

    # Indicator columns: a missing value is treated as "not set".
    for column in ('KielerWoche', 'Is_Holiday'):
        if column in prepared.columns:
            prepared[column] = prepared[column].fillna(0)

    # Continuous columns: impute with a median, preferably taken from the
    # reference frame so hold-out splits do not use their own statistics.
    for column in ('Temp_Deviation', 'Bewoelkung'):
        if column not in prepared.columns:
            continue
        if median_source is not None and column in median_source.columns:
            fill_value = median_source[column].median()
        else:
            fill_value = prepared[column].median()
        prepared[column] = prepared[column].fillna(fill_value)

    # Categorical column: keep missing entries as their own level.
    if 'Weather_Impression' in prepared.columns:
        prepared['Weather_Impression'] = prepared['Weather_Impression'].fillna('Unknown')
    return prepared

def build_ols_model(train_data):
    """
    Assemble the target and design matrix for an OLS regression on 'Umsatz'.

    The frame is first passed through ``prepare_ols_data``. The design
    matrix is then built from whichever of these columns are present:
    dummy-encoded 'Warengruppe', 'Weekday' and 'Weather_Impression', the
    continuous weather columns, the binary flags 'Is_Holiday' and
    'KielerWoche', and a subset of the engineered time features. A constant
    column is added with ``sm.add_constant`` and rows containing NaN in
    either X or Y are removed. Progress is reported with ``print``.

    Parameters
    ----------
    train_data : DataFrame-like
        Must contain 'Umsatz' plus the columns ``prepare_ols_data`` requires.

    NOTE(review): the part of this function visible here stops after the
    cleaning step - no model fit or return statement is shown. Confirm
    against the full notebook cell.
    """
    data = prepare_ols_data(train_data)
    Y = data['Umsatz']

    # Each feature group is collected as its own frame and concatenated once.
    X_components = []

    # 1. Product categories (dummy variables)
    # drop_first=True omits the first level of the category; presumably so the
    # dummies are not collinear with the constant added below.
    # NOTE(review): dummy columns come from the levels present in this frame
    # only; another split with different levels would yield different columns.
    if 'Warengruppe' in data.columns:
        warengruppe_dummies = pd.get_dummies(data['Warengruppe'], prefix='Warengruppe', drop_first=True, dtype=int)
        X_components.append(warengruppe_dummies)
        print(f"Added Warengruppe dummies: {list(warengruppe_dummies.columns)}")

    # 2. Weekday dummies
    if 'Weekday' in data.columns:
        weekday_dummies = pd.get_dummies(data['Weekday'], prefix='Weekday', drop_first=True, dtype=int)
        X_components.append(weekday_dummies)
        print(f"Added Weekday dummies: {list(weekday_dummies.columns)}")

    # 3. Weather impression dummies
    # Missing impressions were filled with 'Unknown' by prepare_ols_data, so
    # they form a level of their own here.
    if 'Weather_Impression' in data.columns:
        weather_dummies = pd.get_dummies(data['Weather_Impression'], prefix='Weather', drop_first=True, dtype=int)
        X_components.append(weather_dummies)
        print(f"Added Weather dummies: {list(weather_dummies.columns)}")

    # 4. Continuous weather variables
    # 'Temperatur' and 'Windgeschwindigkeit' are not imputed by
    # prepare_ols_data; rows where they are NaN are dropped further down.
    continuous_vars = ['Temperatur', 'Windgeschwindigkeit', 'Bewoelkung', 'Temp_Deviation']
    for var in continuous_vars:
        if var in data.columns:
            X_components.append(data[[var]])
            print(f"Added continuous variable: {var}")

    # 5. Binary variables
    binary_vars = ['Is_Holiday', 'KielerWoche']
    for var in binary_vars:
        if var in data.columns:
            X_components.append(data[[var]])
            print(f"Added binary variable: {var}")

    # 6. Time-based features
    # 'Month' and 'Quarter' enter as plain numeric columns next to the
    # sine/cosine encodings; 'Day_of_Year', 'Week_of_Year' and 'Year' are
    # created by prepare_ols_data but are not part of this list.
    time_features = ['Month', 'Quarter', 'Month_sin', 'Month_cos', 'Day_sin', 'Day_cos', 'Week_sin', 'Week_cos']
    for var in time_features:
        if var in data.columns:
            X_components.append(data[[var]])
            print(f"Added time feature: {var}")

    # Combine all components
    X = pd.concat(X_components, axis=1)
    X = sm.add_constant(X)

    print(f"\nFinal feature matrix shape: {X.shape}")
    print(f"Features: {list(X.columns)}")

    # Remove rows with NaN in X or Y
    # Both objects are filtered with the same mask and re-indexed from 0, so
    # their rows stay aligned.
    valid_idx = Y.notna() & X.notna().all(axis=1)
    Y_clean = Y[valid_idx].reset_index(drop=True)
    X_clean = X[valid_idx].reset_index(drop=True)

    print(f"\nData after cleaning:")
    print(f"Observations: {len(Y_clean)}")
    print(f"Features: {X_clean.shape[1]} (including constant)")

    

Train shape: (7517, 15)
Validation shape: (1839, 15)
Test shape: (351, 15)
