ADDING FEATURES

In [12]:
import pandas as pd
import numpy as np
import os

# Load data
data = pd.read_csv('/Users/edilbekabdyrakhmanov/Documents/GitHub/bakeryy/0_DataPreparation/initialdata/cleaned_data_with_KielerWoche_zero.csv')

# Parse date
data['Datum'] = pd.to_datetime(data['Datum'])

# Add time features
data['Week_of_year'] = data['Datum'].dt.isocalendar().week
data['Is_Weekend'] = data['Weekday'].isin(['Saturday', 'Sunday']).astype(int)

# Sort by Warengruppe and Datum for lags
data = data.sort_values(['Warengruppe', 'Datum'])
data['time_idx'] = data.groupby('Warengruppe').cumcount()

# Add lags & diffs for Umsatz
for lag in [1, 7, 365]:
    data[f'Umsatz_lag_{lag}'] = data.groupby('Warengruppe')['Umsatz'].shift(lag)
    data[f'Umsatz_diff_{lag}'] = data['Umsatz'] - data[f'Umsatz_lag_{lag}']

# Add exogenous covariate lags
for col in ['Temperatur', 'Bewoelkung', 'Windgeschwindigkeit']:
    data[f'{col}_lag_1'] = data.groupby('Warengruppe')[col].shift(1)

# Seasonality encoding
data['day_of_year'] = data['Datum'].dt.dayofyear
data['sin_day'] = np.sin(2 * np.pi * data['day_of_year'] / 365.25)
data['cos_day'] = np.cos(2 * np.pi * data['day_of_year'] / 365.25)

# Drop rows with NA
prepared_data = data.dropna().reset_index(drop=True)

# Preview
prepared_data.head()


Unnamed: 0,Datum,Weekday,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,id,Warengruppe,Umsatz,KielerWoche,...,Umsatz_lag_7,Umsatz_diff_7,Umsatz_lag_365,Umsatz_diff_365,Temperatur_lag_1,Bewoelkung_lag_1,Windgeschwindigkeit_lag_1,day_of_year,sin_day,cos_day
0,2014-07-09,Wednesday,0.0,24.65,24,5.0,1407091.0,Brot,116.75,0.0,...,130.35,-13.6,148.83,-32.08,19.6,7.0,15.0,190,-0.126528,-0.991963
1,2014-07-10,Thursday,0.0,24.76,18,0.0,1407101.0,Brot,132.72,0.0,...,158.21,-25.49,159.79,-27.07,24.65,0.0,24.0,191,-0.143572,-0.98964
2,2014-07-11,Friday,1.0,22.94,18,0.0,1407111.0,Brot,180.71,0.0,...,112.9,67.81,111.89,68.82,24.76,0.0,18.0,192,-0.160575,-0.987024
3,2014-07-12,Saturday,0.0,20.38,12,0.0,1407121.0,Brot,142.33,0.0,...,150.98,-8.65,168.86,-26.53,22.94,1.0,18.0,193,-0.177529,-0.984116
4,2014-07-13,Sunday,7.0,18.6,10,61.0,1407131.0,Brot,80.45,0.0,...,84.8,-4.35,171.28,-90.83,20.38,0.0,12.0,194,-0.194431,-0.980916


SPLITTING DATES

In [13]:
# Example date cutoffs (replace by your actual split dates)
# Set date ranges for splits
train_start = '2013-07-01'
train_end = '2017-07-31'
val_start = '2017-08-01'
val_end = '2018-07-31'
test_start = '2018-08-01'
test_end = '2019-07-30'

# Split the data based on date ranges
train_data = prepared_data[(prepared_data['Datum'] >= train_start) & (prepared_data['Datum'] <= train_end)]
valid_data = prepared_data[(prepared_data['Datum'] >= val_start) & (prepared_data['Datum'] <= val_end)]
test_data = prepared_data[(prepared_data['Datum'] >= test_start) & (prepared_data['Datum'] <= test_end)]
prepared_data['Datum'].max()


Timestamp('2018-07-31 00:00:00')

SELECT FEATURES AND LABELS

In [10]:
feature_cols = [col for col in prepared_data.columns if col not in ['Umsatz', 'Datum']]

X_train = train_data[feature_cols]
y_train = train_data['Umsatz']

X_valid = valid_data[feature_cols]
y_valid = valid_data['Umsatz']

X_test = test_data[feature_cols]
y_test = test_data['Umsatz']

SAVE AS PICKLE

In [14]:
os.makedirs("pickle_data_2", exist_ok=True)

X_train.to_pickle("pickle_data_2/training_features.pkl")
y_train.to_pickle("pickle_data_2/training_labels.pkl")

X_valid.to_pickle("pickle_data_2/validation_features.pkl")
y_valid.to_pickle("pickle_data_2/validation_labels.pkl")

X_test.to_pickle("pickle_data_2/test_features.pkl")
y_test.to_pickle("pickle_data_2/test_labels.pkl")

print("Data preparation completed.")
print("Train shape:", X_train.shape)
print("Validation shape:", X_valid.shape)
print("Test shape:", X_test.shape)

Data preparation completed.
Train shape: (5427, 28)
Validation shape: (1725, 28)
Test shape: (0, 28)
