In [1]:
import pandas as pd
import numpy as np
import os

# Relative location of the merged_data_clean.csv file used as input
merged_csv_path = "../../1_DatasetCharacteristics/merged_data_clean.csv"

# Read the CSV into a DataFrame and preview its first rows
data = pd.read_csv(merged_csv_path)
data.head()

Unnamed: 0,Datum,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,id,Warengruppe,Umsatz,KielerWoche
0,2012-01-01,8.0,9.82,14,58.0,,,,
1,2012-01-02,7.0,7.44,12,,,,,
2,2012-01-03,8.0,5.54,18,63.0,,,,
3,2012-01-04,4.0,5.69,19,80.0,,,,
4,2012-01-05,6.0,5.3,23,80.0,,,,


In [8]:
# Parse the 'Datum' column into datetime values
datum_parsed = pd.to_datetime(data['Datum'])
data['Datum'] = datum_parsed

# Missing KielerWoche entries (NaN) are replaced with 0
kieler_woche_filled = data['KielerWoche'].fillna(value=0)
data['KielerWoche'] = kieler_woche_filled

In [9]:
# Every column except the label ('Umsatz') and the date ('Datum') is a feature
non_feature_columns = ['Umsatz', 'Datum']
features = data.drop(non_feature_columns, axis=1)

# NOTE(review): the 'id' column stays among the features — confirm this is intended.

# Put the label column first, append the features, and discard every row
# that has at least one missing value
label = data[['Umsatz']]
prepared_data = pd.concat([label, features], axis=1).dropna(how='any')

# Report the size of the prepared data set, then preview its first rows
print(prepared_data.shape)
prepared_data.head()

(7009, 8)


Unnamed: 0,Umsatz,Bewoelkung,Temperatur,Windgeschwindigkeit,Wettercode,id,Warengruppe,KielerWoche
394,148.83,6.0,17.84,15,20.0,1307011.0,1.0,0.0
395,535.86,6.0,17.84,15,20.0,1307012.0,2.0,0.0
396,201.2,6.0,17.84,15,20.0,1307013.0,3.0,0.0
397,65.89,6.0,17.84,15,20.0,1307014.0,4.0,0.0
398,317.48,6.0,17.84,15,20.0,1307015.0,5.0,0.0


In [10]:
# Split configuration: seed for the shuffle and the share of rows per subset.
# Whatever is left after training and validation becomes the test set.
SEED = 42
TRAIN_FRACTION = 0.7
VALIDATION_FRACTION = 0.20

# Seed NumPy's global RNG as well, so any code that draws from it afterwards
# stays deterministic.
np.random.seed(SEED)

# Shuffle into a NEW variable with an explicit random_state. Rebinding
# `prepared_data` to its own shuffled copy made this cell non-idempotent:
# running it a second time shuffled already-shuffled rows and silently
# produced a different split.
shuffled_data = prepared_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Calculate the number of rows for each dataset (fractions are truncated)
n_total = len(shuffled_data)
n_training = int(TRAIN_FRACTION * n_total)
n_validation = int(VALIDATION_FRACTION * n_total)

# Consecutive, non-overlapping slices of the shuffled rows
training_data = shuffled_data.iloc[:n_training]
validation_data = shuffled_data.iloc[n_training:n_training + n_validation]
test_data = shuffled_data.iloc[n_training + n_validation:]

# Sanity check: the three subsets together must cover every row exactly once
assert len(training_data) + len(validation_data) + len(test_data) == n_total, \
    "train/validation/test split does not cover all rows"

# Separating features and labels
training_features = training_data.drop('Umsatz', axis=1)
validation_features = validation_data.drop('Umsatz', axis=1)
test_features = test_data.drop('Umsatz', axis=1)

# Labels are kept as single-column DataFrames (double brackets), not Series
training_labels = training_data[['Umsatz']]
validation_labels = validation_data[['Umsatz']]
test_labels = test_data[['Umsatz']]

# Features and labels of each subset must stay row-aligned
assert len(training_features) == len(training_labels)
assert len(validation_features) == len(validation_labels)
assert len(test_features) == len(test_labels)

# Print dimensions of the dataframes
print("Training features dimensions:", training_features.shape)
print("Validation features dimensions:", validation_features.shape)
print("Test features dimensions:", test_features.shape)
print()
print("Training labels dimensions:", training_labels.shape)
print("Validation labels dimensions:", validation_labels.shape)
print("Test labels dimensions:", test_labels.shape)

Training features dimensions: (4906, 7)
Validation features dimensions: (1401, 7)
Test features dimensions: (702, 7)

Training labels dimensions: (4906, 1)
Validation labels dimensions: (1401, 1)
Test labels dimensions: (702, 1)


In [11]:
# Target folder for the exported pickle files; created if it does not exist yet
subdirectory = "pickle_data"
os.makedirs(subdirectory, exist_ok=True)

# Map each output file stem to the DataFrame that is written under that name
frames_to_export = {
    "training_features": training_features,
    "validation_features": validation_features,
    "test_features": test_features,
    "training_labels": training_labels,
    "validation_labels": validation_labels,
    "test_labels": test_labels,
}

# Write every frame to "<subdirectory>/<file stem>.pkl"
for file_stem, frame in frames_to_export.items():
    frame.to_pickle(f"{subdirectory}/{file_stem}.pkl")