In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import os
import tensorflow
import missingno as msno
from fancyimpute import IterativeImputer, KNN

2025-01-05 21:55:33.609994: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-05 21:55:33.632706: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-05 21:55:33.807629: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-05 21:55:33.894977: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736114134.026031   56603 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736114134.07

In [2]:
# Load all data files
kiwo_url = '../Data/kiwo.csv'
umsatz_url = '../Data/train.csv'
wetter_url = '../Data/wetter.csv'
test_url = '../Data/test.csv'

df_kiwo = pd.read_csv(kiwo_url)
df_umsatz = pd.read_csv(umsatz_url)
df_wetter = pd.read_csv(wetter_url)
df_test = pd.read_csv(test_url)

# Prepare training data
df_train = pd.merge(df_umsatz, df_wetter, on='Datum', how='left')
df_train = pd.merge(df_train, df_kiwo, on='Datum', how='left')

# Prepare test data
df_test = pd.merge(df_test, df_wetter, on='Datum', how='left')
df_test = pd.merge(df_test, df_kiwo, on='Datum', how='left')

# Apply the same preprocessing to both training and test data
def prepare_data(data):
    # Convert Datum to datetime
    data['Datum'] = pd.to_datetime(data['Datum'])
    
    # Extract additional features from date
    data['DayOfWeek'] = data['Datum'].dt.dayofweek
    data['Month'] = data['Datum'].dt.month
    
    # Create 'temperatur_binned' feature
    def bin_temperature(row):
        month = row['Month']
        temperature = row['Temperatur']
        
        if month in [12, 1, 2]:  # Winter
            if temperature <= 0: return 'Very Cold'
            elif temperature <= 5: return 'Cold'
            elif temperature <= 10: return 'Mild'
            else: return 'Warm'
        elif month in [3, 4, 5]:  # Spring
            if temperature <= 10: return 'Cool'
            elif temperature <= 15: return 'Mild'
            elif temperature <= 25: return 'Warm'
            else: return 'Hot'
        elif month in [6, 7, 8]:  # Summer
            if temperature <= 15: return 'Cool'
            elif temperature <= 20: return 'Mild'
            elif temperature <= 30: return 'Warm'
            else: return 'Hot'
        else:  # Fall
            if temperature <= 10: return 'Cool'
            elif temperature <= 15: return 'Mild'
            elif temperature <= 25: return 'Warm'
            else: return 'Hot'

    data['Temperatur_binned'] = data.apply(bin_temperature, axis=1)

    # Handle KielerWoche
    data['KielerWoche'] = data['KielerWoche'].fillna(0).apply(lambda x: 1 if x != 0 else 0)
    
    # Handle missing data with KNN imputation
    numerical_columns = data.select_dtypes(include=['float64', 'int64']).columns
    non_numerical_columns = data.select_dtypes(exclude=['float64', 'int64']).columns

    imputer_irmi = IterativeImputer()
    imputed_numerical = pd.DataFrame(
        imputer_irmi.fit_transform(data[numerical_columns]),
        columns=numerical_columns,
        index=data.index
    )

    data = pd.concat([imputed_numerical, data[non_numerical_columns]], axis=1)
    
    return data

# Prepare both datasets
df_train = prepare_data(df_train)
df_test = prepare_data(df_test)

# Split training data into train and validation
train_size = int(0.85 * len(df_train))  # Using 85% for training, 15% for validation
df_train_final = df_train[:train_size]
df_val = df_train[train_size:]

# Define categorical features
categorical_features = ['Warengruppe', 'Temperatur_binned', 'DayOfWeek', 'KielerWoche', 'Month', 'Wettercode']

# Process categorical features
for col in categorical_features:
    df_train_final[col] = df_train_final[col].astype('category')
    df_val[col] = df_val[col].astype('category')
    df_test[col] = df_test[col].astype('category')

# Create feature sets
def prepare_features(data):
    features = pd.get_dummies(data[categorical_features], drop_first=False, dtype=int)
    features['Windgeschwindigkeit'] = data['Windgeschwindigkeit']
    features['Datum'] = data['Datum']
    return features

# Prepare features for all datasets
train_features = prepare_features(df_train_final)
val_features = prepare_features(df_val)
test_features = prepare_features(df_test)

# Prepare labels (only for train and validation)
train_labels = df_train_final[['Umsatz']]
val_labels = df_val[['Umsatz']]

# Print shapes to verify
print("Training features shape:", train_features.shape)
print("Validation features shape:", val_features.shape)
print("Test features shape:", test_features.shape)
print("Training labels shape:", train_labels.shape)
print("Validation labels shape:", val_labels.shape)

Training features shape: (7933, 2119)
Validation features shape: (1401, 317)
Test features shape: (1830, 389)
Training labels shape: (7933, 1)
Validation labels shape: (1401, 1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_final[col] = df_train_final[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val[col] = df_val[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_final[col] = df_train_final[col].astype('category')
A value is trying to be set on a copy 

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import numpy as np

# Custom MAPE metric
def mean_absolute_percentage_error(y_true, y_pred):
    """Calculate Mean Absolute Percentage Error"""
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    return tf.reduce_mean(tf.abs((y_true - y_pred) / y_true)) * 100

def create_sequences(features, labels, sequence_length=7):
    """Create sequences for LSTM input"""
    X, y = [], []
    
    # Remove 'Datum' column for sequence creation
    features_no_date = features.drop('Datum', axis=1)
    
    for i in range(len(features) - sequence_length + 1):
        sequence = features_no_date.iloc[i:i+sequence_length].values
        
        # Check if there is a valid label available for this sequence
        if i + sequence_length <= len(labels):
            X.append(sequence)
            y.append(labels.iloc[i+sequence_length-1])
    
    return np.array(X), np.array(y)


def create_lstm_model(input_shape):
    """Create LSTM model with the specified input shape"""
    model = tf.keras.Sequential([
        LSTM(64, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae', mean_absolute_percentage_error]
    )
    return model




# Create sequences from your existing splits
sequence_length = 7  # Week of historical data
X_train, y_train = create_sequences(training_features, training_labels, sequence_length)
X_val, y_val = create_sequences(validation_features, validation_labels, sequence_length)
X_test, y_test = create_sequences(test_features, test_labels, sequence_length)

# Get input shape from the processed data
n_features = X_train.shape[2]  # Number of features after one-hot encoding
input_shape = (X_train.shape[1], n_features)  # Sequence length and number of features

# Create and train the model
model = create_lstm_model(input_shape)


# Define callbacks
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True
    ),
    ModelCheckpoint(
        'best_model.keras',
        monitor='val_loss',
        save_best_only=True
    )
]

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

# Evaluate on test set
test_loss, test_mae, test_mape = model.evaluate(X_test, y_test, verbose=0)
print(f'\nTest Results:')
print(f'MAE: {test_mae:.2f}')
print(f'MAPE: {test_mape:.2f}%')

# Make predictions
train_predictions = model.predict(X_train)
val_predictions = model.predict(X_val)
test_predictions = model.predict(X_test)

# Plotting training history
plt.figure(figsize=(15, 4))

# Loss plot
plt.subplot(1, 3, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# MAE plot
plt.subplot(1, 3, 2)
plt.plot(history.history['mae'], label='Training MAE')
plt.plot(history.history['val_mae'], label='Validation MAE')
plt.title('Model MAE')
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.legend()

# MAPE plot
plt.subplot(1, 3, 3)
plt.plot(history.history['mean_absolute_percentage_error'], label='Training MAPE')
plt.plot(history.history['val_mean_absolute_percentage_error'], label='Validation MAPE')
plt.title('Model MAPE')
plt.xlabel('Epoch')
plt.ylabel('MAPE (%)')
plt.legend()

plt.tight_layout()
plt.show()

# Calculate and print detailed metrics for each dataset
def print_metrics(y_true, y_pred, dataset_name):
    mae = np.mean(np.abs(y_true - y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    print(f"\n{dataset_name} Metrics:")
    print(f"MAE: {mae:.2f}")
    print(f"MAPE: {mape:.2f}%")

print_metrics(y_train, train_predictions, "Training")
print_metrics(y_val, val_predictions, "Validation")
print_metrics(y_test, test_predictions, "Test")

NameError: name 'training_features' is not defined