In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Import Data
data = pd.read_csv("/Users/edilbekabdyrakhmanov/Documents/GitHub/bakeryy/0_DataPreparation/initialdata/cleaned_data_fixed_month_year.csv")
data.head()  # Print first few rows to verify

Unnamed: 0.1,Unnamed: 0,Datum,Weekday,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,id,Warengruppe,Umsatz,KielerWoche,AverageTemp,Temp_Deviation,Temp_vs_Avg,Weather_Impression,Is_Holiday,Month,Year,is_outlier
0,394,2013-07-01,Monday,6.0,17.84,15,20.0,1307011.0,Brot,148.83,0.0,20.525278,-2.685278,Significantly colder,okay,0,7,2013,0
1,395,2013-07-01,Monday,6.0,17.84,15,20.0,1307012.0,Brötchen,535.86,0.0,20.525278,-2.685278,Significantly colder,okay,0,7,2013,0
2,396,2013-07-01,Monday,6.0,17.84,15,20.0,1307013.0,Croissant,201.2,0.0,20.525278,-2.685278,Significantly colder,okay,0,7,2013,0
3,397,2013-07-01,Monday,6.0,17.84,15,20.0,1307014.0,Konditorei,65.89,0.0,20.525278,-2.685278,Significantly colder,okay,0,7,2013,0
4,398,2013-07-01,Monday,6.0,17.84,15,20.0,1307015.0,Kuchen,317.48,0.0,20.525278,-2.685278,Significantly colder,okay,0,7,2013,0


DATA PREPARATION

In [3]:
# Drop 'id' column first
if 'id' in data.columns:
    data = data.drop(columns=['id'])

# Define categorical features
categorical_features = ['Weekday', 'Warengruppe', 'Weather_Impression', 'Temp_vs_Avg',
                        'KielerWoche', 'Is_Holiday', 'is_outlier', 'Month', 'Year']
numerical_features = ['Bewoelkung', 'Temperatur', 'Windgeschwindigkeit']

# Inspect data types and unique values for categorical columns
print(data[categorical_features].dtypes)
print("Unique Values:\n", data[categorical_features].apply(lambda x: x.unique()))

# Ensure categorical columns are treated as categories
for col in categorical_features:
    data[col] = data[col].astype('category')

# Encode categorical variables using pd.get_dummies
features = pd.get_dummies(data[categorical_features], drop_first=True, dtype=int)

# Include numerical features and date column
for col in numerical_features:
    features[col] = data[col]
features['Datum'] = data['Datum']  # Keep the date column

# Construct the prepared data set including the dependent variable ('Umsatz')
prepared_data = pd.concat([data[['Umsatz']], features], axis=1)

# Handle missing values by removing rows with any missing values
# Ensure 'Datum' column is datetime for comparison
prepared_data['Datum'] = pd.to_datetime(prepared_data['Datum'])

# Split into two parts: before and after 2018-08-01
cutoff_date = pd.to_datetime('2018-08-01')
before_cutoff = prepared_data[prepared_data['Datum'] < cutoff_date]
after_cutoff = prepared_data[prepared_data['Datum'] >= cutoff_date]

# Drop NaNs only in the data before cutoff
before_cutoff_cleaned = before_cutoff.dropna()

# Concatenate back together
prepared_data = pd.concat([before_cutoff_cleaned, after_cutoff], ignore_index=True)

# Display the new shape
print(prepared_data.shape)
prepared_data.head()

# Display the shape of the prepared data set
print(prepared_data.shape)
prepared_data.head()


Weekday               category
Warengruppe           category
Weather_Impression    category
Temp_vs_Avg           category
KielerWoche           category
Is_Holiday            category
is_outlier            category
Month                 category
Year                  category
dtype: object
Unique Values:
 Weekday               ['Monday', 'Tuesday', 'Wednesday', 'Thursday',...
Warengruppe           ['Brot', 'Brötchen', 'Croissant', 'Konditorei'...
Weather_Impression    ['okay', 'very good', 'good', 'very bad', 'bad...
Temp_vs_Avg           ['Significantly colder', 'Slightly warmer', 'S...
KielerWoche              [0.0, 1.0]
Categories (2, float64): [0.0, 1.0]
Is_Holiday                         [0, 1]
Categories (2, int64): [0, 1]
is_outlier                         [0, 1]
Categories (2, int64): [0, 1]
Month                 [7, 8, 9, 10, 11, ..., 2, 3, 4, 5, 6]
Length: ...
Year                  [2013, 2014, 2015, 2016, 2017, 2018, 2019]
Cat...
dtype: object
(13034, 44)
(13034, 44)


Unnamed: 0,Umsatz,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday,Warengruppe_Brötchen,Warengruppe_Croissant,Warengruppe_Konditorei,...,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,Year_2019,Bewoelkung,Temperatur,Windgeschwindigkeit,Datum
0,148.83,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01
1,535.86,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01
2,201.2,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01
3,65.89,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01
4,317.48,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01


Selection of Training, Validation and Test Data

In [4]:
# Convert date column to datetime if not already (assuming you have a 'Datum' column)
data['Datum'] = pd.to_datetime(data['Datum'])

# Set date ranges for splits
train_start = '2013-07-01'
train_end = '2017-07-31'
val_start = '2017-08-01'
val_end = '2018-07-31'
test_start = '2018-08-01'
test_end = '2019-07-30'

# Split the data based on date ranges
training_data = prepared_data[(prepared_data['Datum'] >= train_start) & (prepared_data['Datum'] <= train_end)]
validation_data = prepared_data[(prepared_data['Datum'] >= val_start) & (prepared_data['Datum'] <= val_end)]
test_data = prepared_data[(prepared_data['Datum'] >= test_start) & (prepared_data['Datum'] <= test_end)]

# Separating features and labels
training_features = training_data.drop('Umsatz', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1)
test_features = test_data.drop('Umsatz', axis=1)

training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)

Training features dimensions: (8826, 43)
Validation features dimensions: (2090, 43)
Test features dimensions: (2106, 43)

Training labels dimensions: (8826, 1)
Validation labels dimensions: (2090, 1)
Test labels dimensions: (2106, 1)


Data Export

In [5]:
# Create subdirectory for the pickle files
subdirectory = "final_data"
os.makedirs(subdirectory, exist_ok=True)

# Export of the prepared data to subdirectory as pickle files
training_features.to_pickle(f"{subdirectory}/training_features.pkl")
validation_features.to_pickle(f"{subdirectory}/validation_features.pkl")
test_features.to_pickle(f"{subdirectory}/test_features.pkl")
training_labels.to_pickle(f"{subdirectory}/training_labels.pkl")
validation_labels.to_pickle(f"{subdirectory}/validation_labels.pkl")
test_labels.to_pickle(f"{subdirectory}/test_labels.pkl")