In [1]:
import pandas as pd
import numpy as np
import os

# Read merged_data_clean.csv into `data` and preview the first rows.
data = pd.read_csv(
    filepath_or_buffer="../../1_DatasetCharacteristics/merged_data_clean.csv"
)
data.head()

Unnamed: 0,Datum,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,id,Warengruppe,Umsatz,KielerWoche
0,2012-01-01,8.0,9.82,14,58.0,,,,
1,2012-01-02,7.0,7.44,12,,,,,
2,2012-01-03,8.0,5.54,18,63.0,,,,
3,2012-01-04,4.0,5.69,19,80.0,,,,
4,2012-01-05,6.0,5.3,23,80.0,,,,


In [2]:
# Parse the 'Datum' column into datetime values (written back into `data`)
data['Datum'] = data['Datum'].pipe(pd.to_datetime)

# Replace missing 'KielerWoche' entries with 0 (written back into `data`)
data['KielerWoche'] = data['KielerWoche'].fillna(value=0)

In [3]:
# Feature groups: columns that get one-hot encoded and columns used as-is
num_features = ['Bewoelkung', 'Temperatur', 'Windgeschwindigkeit']
cat_features = ['Wettercode', 'KielerWoche', 'Warengruppe']

# Show the dtypes and the distinct values of the categorical columns
print(data[cat_features].dtypes)
print("Unique Values:\n", data[cat_features].apply(lambda column: column.unique()))

# Cast every categorical column to the 'category' dtype (written back into `data`)
for name in cat_features:
    data[name] = data[name].astype('category')

# Build the feature frame in one step: one-hot encoded categoricals
# (first level of each variable dropped, 0/1 integers), followed by the
# numerical columns and finally the date column.
# NOTE(review): get_dummies is called without dummy_na, so a row with a
# missing Wettercode gets all-zero Wettercode dummies (the same encoding as
# the dropped first level) and is NOT removed by the dropna below.
# Confirm this is the intended treatment of missing weather codes.
features = pd.concat(
    [
        pd.get_dummies(data[cat_features], drop_first=True, dtype=int),
        data[num_features],
        data[['Datum']],
    ],
    axis=1,
)

# Put the dependent variable ('Umsatz') in front of the features and
# discard every row that still contains a missing value
prepared_data = pd.concat([data[['Umsatz']], features], axis=1).dropna()

# Report the size of the prepared data set and preview it
print(prepared_data.shape)
prepared_data.head()

Wettercode     float64
KielerWoche    float64
Warengruppe    float64
dtype: object
Unique Values:
 Wettercode     [58.0, nan, 63.0, 80.0, 61.0, 51.0, 26.0, 68.0...
KielerWoche                                           [0.0, 1.0]
Warengruppe                  [nan, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
dtype: object
(9264, 51)


Unnamed: 0,Umsatz,Wettercode_1.0,Wettercode_2.0,Wettercode_3.0,Wettercode_5.0,Wettercode_10.0,Wettercode_13.0,Wettercode_17.0,Wettercode_20.0,Wettercode_21.0,...,KielerWoche_1.0,Warengruppe_2.0,Warengruppe_3.0,Warengruppe_4.0,Warengruppe_5.0,Warengruppe_6.0,Bewoelkung,Temperatur,Windgeschwindigkeit,Datum
394,148.83,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,6.0,17.84,15,2013-07-01
395,535.86,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,6.0,17.84,15,2013-07-01
396,201.2,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,6.0,17.84,15,2013-07-01
397,65.89,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,6.0,17.84,15,2013-07-01
398,317.48,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,6.0,17.84,15,2013-07-01


In [4]:
# Split `prepared_data` into training (70 %), validation (20 %) and test
# (remaining rows) sets after a seeded random shuffle.
#
# The shuffle result is stored in a NEW frame (`shuffled_data`) and the seed
# is passed explicitly to `sample`. Previously `prepared_data` was overwritten
# with its shuffled version, so running this cell a second time shuffled the
# already-shuffled rows and produced a different split than the first run.
# With the explicit `random_state` the cell is idempotent: every execution
# yields the same split for the same `prepared_data`.
#
# NOTE(review): the rows carry a date ('Datum') but are split at random; if
# the downstream model is meant to predict future sales, a chronological
# split may be more appropriate - confirm with the modelling notebooks.

# Seed used for the shuffle; the global NumPy seed is still set so code that
# relies on the global random state keeps a defined starting point
SEED = 42
np.random.seed(SEED)

# Shuffle the rows into a new frame, leaving `prepared_data` untouched
shuffled_data = prepared_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Calculate the number of rows for each dataset
# (the test set receives whatever remains after training and validation)
n_total = len(shuffled_data)
n_training = int(0.7 * n_total)
n_validation = int(0.20 * n_total)

# Slice the shuffled rows into three consecutive, non-overlapping parts
training_data = shuffled_data.iloc[:n_training]
validation_data = shuffled_data.iloc[n_training:n_training + n_validation]
test_data = shuffled_data.iloc[n_training + n_validation:]

# Sanity check: the three parts together must cover every row exactly once
assert len(training_data) + len(validation_data) + len(test_data) == n_total

# Separating features and labels
training_features = training_data.drop('Umsatz', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1)
test_features = test_data.drop('Umsatz', axis=1)

# Labels are kept as single-column DataFrames (double brackets)
training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)

Training features dimensions: (6484, 50)
Validation features dimensions: (1852, 50)
Test features dimensions: (928, 50)

Training labels dimensions: (6484, 1)
Validation labels dimensions: (1852, 1)
Test labels dimensions: (928, 1)


In [5]:
# Make sure the output folder for the pickle files exists
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Pair every split with its target file inside the folder
# (features first, then labels) and write them out in that order
exports = [
    (training_features, f"{subdirectory}/training_features.pkl"),
    (validation_features, f"{subdirectory}/validation_features.pkl"),
    (test_features, f"{subdirectory}/test_features.pkl"),
    (training_labels, f"{subdirectory}/training_labels.pkl"),
    (validation_labels, f"{subdirectory}/validation_labels.pkl"),
    (test_labels, f"{subdirectory}/test_labels.pkl"),
]
for frame, target_path in exports:
    frame.to_pickle(target_path)