In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import os
import tensorflow
import missingno as msno
from fancyimpute import IterativeImputer, KNN

2025-01-06 10:37:17.536992: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-06 10:37:17.539079: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-06 10:37:17.542254: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-06 10:37:17.551340: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736159837.566487    3584 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736159837.57

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import matplotlib.pyplot as plt

# Load data
kiwo_url = '../../Data/kiwo.csv'
umsatz_url = '../../Data/train.csv'
wetter_url = '../../Data/wetter.csv'
test_url = '../../Data/test.csv'

df_kiwo = pd.read_csv(kiwo_url)
df_umsatz = pd.read_csv(umsatz_url)
df_wetter = pd.read_csv(wetter_url)
df_test = pd.read_csv(test_url)

# Prepare datasets
df_train = pd.merge(df_umsatz, df_wetter, on='Datum', how='left')
df_train = pd.merge(df_train, df_kiwo, on='Datum', how='left')
df_test = pd.merge(df_test, df_wetter, on='Datum', how='left')
df_test = pd.merge(df_test, df_kiwo, on='Datum', how='left')

def prepare_data(data):
    data['Datum'] = pd.to_datetime(data['Datum'])
    data['DayOfWeek'] = data['Datum'].dt.dayofweek
    data['Month'] = data['Datum'].dt.month
    
    def bin_temperature(row):
        month = row['Month']
        temperature = row['Temperatur']
        
        if month in [12, 1, 2]:
            if temperature <= 0: return 'Very Cold'
            elif temperature <= 5: return 'Cold'
            elif temperature <= 10: return 'Mild'
            else: return 'Warm'
        elif month in [3, 4, 5]:
            if temperature <= 10: return 'Cool'
            elif temperature <= 15: return 'Mild'
            elif temperature <= 25: return 'Warm'
            else: return 'Hot'
        elif month in [6, 7, 8]:
            if temperature <= 15: return 'Cool'
            elif temperature <= 20: return 'Mild'
            elif temperature <= 30: return 'Warm'
            else: return 'Hot'
        else:
            if temperature <= 10: return 'Cool'
            elif temperature <= 15: return 'Mild'
            elif temperature <= 25: return 'Warm'
            else: return 'Hot'

    data['Temperatur_binned'] = data.apply(bin_temperature, axis=1)
    data['KielerWoche'] = data['KielerWoche'].fillna(0).astype(int)
    
    numerical_columns = ['Temperatur', 'Windgeschwindigkeit']
    imputer = IterativeImputer(random_state=42)
    data[numerical_columns] = imputer.fit_transform(data[numerical_columns])
    
    return data

# Define categorical features
categorical_features = ['DayOfWeek', 'Month', 'Temperatur_binned', 'KielerWoche']

# Prepare datasets
df_train = prepare_data(df_train)
df_test = prepare_data(df_test)

# Split train/validation
df_train_shuffled = df_train.sample(frac=1, random_state=42)
train_size = int(0.85 * len(df_train))
df_train_final = df_train_shuffled.iloc[:train_size].copy()
df_val = df_train_shuffled.iloc[train_size:].copy()

def prepare_features(data):
    features = pd.get_dummies(data[categorical_features], drop_first=False, dtype=int)
    features['Windgeschwindigkeit'] = data['Windgeschwindigkeit']
    features['Temperatur'] = data['Temperatur']
    features['Datum'] = data['Datum']
    return features

train_features = prepare_features(df_train_final)
val_features = prepare_features(df_val)
test_features = prepare_features(df_test)

train_labels = df_train_final[['Umsatz']]
val_labels = df_val[['Umsatz']]

def create_sequences(features, labels=None, sequence_length=7):
    X = []
    y = [] if labels is not None else None
    
    features_no_date = features.drop('Datum', axis=1)
    
    for i in range(len(features) - sequence_length + 1):
        sequence = features_no_date.iloc[i:i+sequence_length].values
        X.append(sequence)
        
        if labels is not None and i + sequence_length <= len(labels):
            y.append(labels.iloc[i+sequence_length-1].values[0])
    
    return (np.array(X), np.array(y)) if labels is not None else np.array(X)

# Create sequences
X_train, y_train = create_sequences(train_features, train_labels)
X_val, y_val = create_sequences(val_features, val_labels)
X_test = create_sequences(test_features)

def create_lstm_model(input_shape):
    model = tf.keras.Sequential([
        LSTM(64, return_sequences=True, input_shape=input_shape),
        Dropout(0.2),
        LSTM(32),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dense(1)
    ])
    
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae', tf.keras.metrics.MeanAbsolutePercentageError()]
    )
    return model

# Train model
input_shape = (X_train.shape[1], X_train.shape[2])
model = create_lstm_model(input_shape)

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True)
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

# Generate predictions
test_predictions = model.predict(X_test)

# Plot results
plt.figure(figsize=(15, 4))

plt.subplot(1, 3, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 3, 2)
plt.plot(history.history['mae'], label='Training MAE')
plt.plot(history.history['val_mae'], label='Validation MAE')
plt.title('Model MAE')
plt.xlabel('Epoch')
plt.ylabel('MAE')
plt.legend()

plt.subplot(1, 3, 3)
plt.plot(history.history['mean_absolute_percentage_error'], label='Training MAPE')
plt.plot(history.history['val_mean_absolute_percentage_error'], label='Validation MAPE')
plt.title('Model MAPE')
plt.xlabel('Epoch')
plt.ylabel('MAPE (%)')
plt.legend()

plt.tight_layout()
plt.show()

# Save predictions
predictions_df = pd.DataFrame({
    'Datum': df_test['Datum'].iloc[6:],  # Adjust for sequence length
    'Predicted_Umsatz': test_predictions.flatten()
})
predictions_df.to_csv('predictions.csv', index=False)

Epoch 1/100


2025-01-06 10:37:20.742821: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  super().__init__(**kwargs)


[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 58733.8633 - mae: 196.2107 - mean_absolute_percentage_error: 91.8879 - val_loss: 43681.3516 - val_mae: 149.2808 - val_mean_absolute_percentage_error: 58.8203
Epoch 2/100
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 38759.6602 - mae: 135.7243 - mean_absolute_percentage_error: 53.8050 - val_loss: 25558.0449 - val_mae: 108.4926 - val_mean_absolute_percentage_error: 57.2460
Epoch 3/100
[1m248/248[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 23506.1074 - mae: 110.1181 - mean_absolute_percentage_error: 63.6137 - val_loss: 21844.8906 - val_mae: 110.1299 - val_mean_absolute_percentage_error: 77.8672
Epoch 4/100


KeyboardInterrupt: 