In [12]:
from math import sqrt
from os import getenv
from urllib.parse import quote

import pandas as pd
import plotly.express as px
from joblib import Parallel, delayed
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [2]:
# Define database connection parameters.
# All values come from environment variables (presumably loaded from a .env
# file by the %dotenv magic in the import cell -- confirm for your setup).
db_settings = {name: getenv(name) for name in ('DB_USER', 'DB_HOST', 'DB_PORT', 'DB_NAME')}
missing_settings = [name for name, value in db_settings.items() if not value]
if missing_settings:
    # Fail early with a readable message instead of an AttributeError from
    # None.lower() or a connection string containing the literal text "None".
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}"
    )

username = db_settings['DB_USER'].lower()
password = getenv('DB_PASSWORD')  # optional: omitted from the URL when unset/empty
host = db_settings['DB_HOST']
port = db_settings['DB_PORT']
database = db_settings['DB_NAME']

# Define the connection string
# Format: dialect+driver://username:password@host:port/database
# The credentials are percent-encoded so that characters such as '@', ':',
# '/' or '%' inside them cannot be mistaken for URL delimiters.
credentials = quote(username, safe='')
if password:
    credentials += f":{quote(password, safe='')}"
connection_string = f'postgresql://{credentials}@{host}:{port}/{database}'

# Create the engine
engine = create_engine(connection_string)
# Load the source table and derive the modelling frame in one non-mutating chain:
#   1. one-hot encode the 'land' column,
#   2. parse 'date' into datetimes,
#   3. add 'year' and 'month' columns taken from the parsed date,
#   4. drop rows whose target 'uebernachtungen_anzahl' is NaN
#      (NaNs in other columns are deliberately kept).
df_main = (
    pd.get_dummies(
        pd.read_sql("SELECT * FROM original_data.super_duper_table_of_doom", engine),
        columns=['land'],
    )
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )
    .dropna(subset=['uebernachtungen_anzahl'])
)

In [5]:
# Define the features and the target.
# The feature list is assembled from three groups; the resulting column order
# is: calendar columns, measured/numeric columns, then the 'land_*' indicators.
calendar_features = ['year', 'month']
numeric_features = [
    'mean_air_temp_mean',
    'durchsch_aufenthaltsdauer_tage',
    'urlaubs_campingplaetze_offen',
    'urlaubs_stellplaetze_offen',
    'mean_drought_index',
    'mean_evapo_p',
    'mean_evapo_r',
    'mean_frost_depth',
    'mean_precipitation',
    'mean_soil_moist',
    'mean_soil_temperature_5cm',
    'mean_sunshine_duration',
]
# One indicator column per state, named 'land_<state>'.
land_features = [
    f'land_{state}'
    for state in (
        'Baden-Württemberg',
        'Bayern',
        'Berlin',
        'Brandenburg',
        'Bremen',
        'Hamburg',
        'Hessen',
        'Mecklenburg-Vorpommern',
        'Niedersachsen',
        'Nordrhein-Westfalen',
        'Rheinland-Pfalz',
        'Saarland',
        'Sachsen',
        'Sachsen-Anhalt',
        'Schleswig-Holstein',
        'Thüringen',
    )
]

X = df_main[calendar_features + numeric_features + land_features]
y = df_main['uebernachtungen_anzahl']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Learn the scaling statistics from the training split only, so nothing about
# the test split influences the fitted scaler.
scaler = StandardScaler()
scaler.fit(X_train)

# Apply the same fitted scaler to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Hyperparameter grid searched for the gradient boosting model:
# only 'max_iter' varies, the other two settings are fixed.
gradient_boosting_grid = {
    'learning_rate': [0.01],
    'max_iter': [100, 1000, 10000],
    'max_depth': [5],
}

# Maps a display name to an (unfitted estimator, parameter grid) pair.
models = {
    "GradientBoosting": (HistGradientBoostingRegressor(), gradient_boosting_grid),
}

# Function to perform grid search and evaluate model
def evaluate_model(model_name, model, param_grid, X_train, y_train, X_test, y_test):
    """Tune ``model`` with a grid search and score the winner on the test split.

    Runs a 5-fold ``GridSearchCV`` over ``param_grid`` (scored by negative mean
    squared error, using all available cores), then predicts ``X_test`` with
    the best estimator found.

    Args:
        model_name: Label passed through unchanged as the first result element.
        model: Unfitted estimator handed to the grid search.
        param_grid: Parameter grid for the grid search.
        X_train, y_train: Data the grid search is fitted on.
        X_test, y_test: Held-out data used for the reported metrics.

    Returns:
        Tuple ``(model_name, mse, mae, rmse, best_params, feature_importance)``.
        ``feature_importance`` is the best estimator's ``feature_importances_``
        attribute, or ``None`` when the estimator does not expose one.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    predictions = winner.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)

    # Not every estimator exposes importances; fall back to None in that case.
    importances = getattr(winner, 'feature_importances_', None)

    return model_name, mse, mae, sqrt(mse), search.best_params_, importances

# Run grid search and evaluation in parallel (one job per configured model)
results = Parallel(n_jobs=-1)(
    delayed(evaluate_model)(
        model_name, model, param_grid, X_train_scaled, y_train, X_test_scaled, y_test
    )
    for model_name, (model, param_grid) in models.items()
)

# Extract and store results: transpose the per-model tuples into per-metric tuples
model_names, mse_results, mae_results, rmse_results, best_params, feature_importances = zip(*results)
results_df = pd.DataFrame({
    'Model': model_names,
    'MSE': mse_results,
    'MAE': mae_results,
    'RMSE': rmse_results,
    'Best Parameters': best_params,
    'Feature Importances': feature_importances
}).sort_values('RMSE')

# Display the results
print(results_df)

# Debugging: print feature importances.
# evaluate_model returns None for estimators without a `feature_importances_`
# attribute, so a "No feature importances" line is expected for those models.
for model_name, feature_importance in zip(model_names, feature_importances):
    if feature_importance is not None:
        print(f"Feature importances for {model_name}: {feature_importance}")
    else:
        print(f"No feature importances for {model_name}")

# The scaled matrices come straight from StandardScaler and are plain NumPy
# arrays without a `.columns` attribute, so the feature labels are taken from
# the unscaled training frame (same columns, same order).
feature_names = list(X_train.columns)

# Plotting feature importance for models that provide it using Plotly
for model_name, feature_importance in zip(model_names, feature_importances):
    if feature_importance is not None:
        fig = px.bar(
            x=feature_importance,
            y=feature_names,
            orientation='h',
            labels={'x': 'Feature Importance', 'y': 'Feature'},
            title=f'Feature Importance for {model_name}'
        )
        fig.show()


              Model           MSE           MAE          RMSE  \
0  GradientBoosting  3.663793e+09  31597.624711  60529.273857   

                                     Best Parameters Feature Importances  
0  {'learning_rate': 0.01, 'max_depth': 5, 'max_i...                None  
No feature importances for GradientBoosting
