In [7]:
from math import sqrt
from os import getenv
from urllib.parse import quote

import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [8]:
# Read the database connection parameters from the environment
# (populated by the %dotenv magic in the imports cell).
# Fail early with a readable message: the original crashed with an
# AttributeError on `None.lower()` when DB_USER was not set.
required_settings = ('DB_USER', 'DB_HOST', 'DB_PORT', 'DB_NAME')
missing_settings = [name for name in required_settings if getenv(name) is None]
if missing_settings:
    raise RuntimeError(
        f"Missing environment variables: {', '.join(missing_settings)}"
    )

username = getenv('DB_USER').lower()
password = getenv('DB_PASSWORD')
host = getenv('DB_HOST')
port = getenv('DB_PORT')
database = getenv('DB_NAME')

# Define the connection string
# Format: dialect+driver://username:password@host:port/database
# Username and password are percent-encoded so that reserved characters
# ('@', ':', '/', '%', ...) cannot corrupt the URL structure.
credentials = quote(username, safe='')
if password is not None:
    credentials += f":{quote(password, safe='')}"
connection_string = f'postgresql://{credentials}@{host}:{port}/{database}'

# Create the engine
engine = create_engine(connection_string)

# Load the source table into a single dataframe
df_raw = pd.read_sql("SELECT * FROM original_data.super_duper_table_of_doom", engine)

# Build the modelling frame without mutating the raw data, so this cell can be
# re-run safely:
#   1. drop every row that contains a missing value,
#   2. one-hot encode 'land' into land_<value> columns,
#   3. parse 'date' and derive 'year' / 'month' from it.
# The 'date' column is kept in the frame alongside the derived columns.
df_main = (
    pd.get_dummies(df_raw.dropna(), columns=['land'])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )
)

In [9]:
# Inspect the prepared frame; pandas elides the middle rows/columns in the rendering.
df_main

Unnamed: 0,date,ankuenfte_anzahl,ankuenfte_veraenderung_zum_vorjahreszeitraum_prozent,uebernachtungen_anzahl,uebernachtungen_veraenderung_zum_vorjahreszeitraum_prozent,durchsch_aufenthaltsdauer_tage,mean_air_temp_max,mean_air_temp_mean,mean_air_temp_min,mean_drought_index,...,land_Niedersachsen,land_Nordrhein-Westfalen,land_Rheinland-Pfalz,land_Saarland,land_Sachsen,land_Sachsen-Anhalt,land_Schleswig-Holstein,land_Thüringen,year,month
241,2009-01-01,6347.0,7.1,33881.0,9.2,5.3,0.824230,-2.85785,-6.28443,4.704827,...,False,False,False,False,False,False,False,False,2009,1
242,2009-01-01,9338.0,-5.3,76208.0,4.0,8.2,-0.314553,-3.92341,-7.23113,4.795684,...,False,False,False,False,False,False,False,False,2009,1
243,2009-01-01,143.0,70.2,507.0,90.6,3.5,0.816027,-1.89988,-5.26602,2.102708,...,False,False,False,False,False,False,False,False,2009,1
244,2009-01-01,189.0,87.1,874.0,91.7,4.6,0.539042,-2.28491,-5.86927,2.440468,...,False,False,False,False,False,False,False,False,2009,1
245,2009-01-01,2817.0,-2.6,10990.0,-5.1,3.9,0.592551,-2.87611,-6.35264,5.279178,...,False,False,False,False,False,False,False,False,2009,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4056,2008-07-01,69815.0,13.4,227792.0,14.7,3.3,25.012440,19.08210,13.15898,1.567899,...,False,False,False,False,False,False,False,False,2008,7
4057,2008-07-01,246519.0,14.1,1214623.0,20.0,4.9,23.600570,18.26186,12.87579,1.408681,...,False,False,False,False,False,False,False,False,2008,7
4058,2008-07-01,43946.0,5.5,133320.0,11.5,3.0,23.958940,18.14972,12.41437,2.328665,...,False,False,False,False,True,False,False,False,2008,7
4059,2008-07-01,26671.0,17.7,64395.0,15.0,2.4,24.916730,18.78346,12.84860,2.062915,...,False,False,False,False,False,True,False,False,2008,7


In [10]:
# Define the features and the target
X  = df_main[['year', 'month', 'land_Baden-Württemberg', 'land_Bayern', 'land_Berlin', 'land_Brandenburg', 'land_Bremen', 'land_Hamburg', 'land_Hessen', 'land_Mecklenburg-Vorpommern', 'land_Niedersachsen', 'land_Nordrhein-Westfalen', 'land_Rheinland-Pfalz', 'land_Saarland', 'land_Sachsen', 'land_Sachsen-Anhalt', 'land_Schleswig-Holstein', 'land_Thüringen']]
y = df_main['ankuenfte_anzahl']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Fit the scaler on the training split only, so no statistics
# from the test split influence the scaling.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [12]:
# Models to evaluate, keyed by the name shown in the results table.
# random_state is pinned (same seed as the train/test split) so that the fitted
# trees -- and therefore the reported metrics -- are reproducible on every run.
models = {
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=20000,
        learning_rate=0.1,
        max_depth=10,
        random_state=42,
    )
}

# Parallel lists collecting one entry per evaluated model
model_names = []
mse_results = []
mae_results = []
rmse_results = []

# Fit every model on the scaled training data and score it on the held-out test set
for model_name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Calculate metrics; RMSE is derived from MSE so it is in the target's units
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = sqrt(mse)

    # Store results
    model_names.append(model_name)
    mse_results.append(mse)
    mae_results.append(mae)
    rmse_results.append(rmse)

# One row per model, best (lowest RMSE) first
results = pd.DataFrame({
    'Model': model_names,
    'MSE': mse_results,
    'MAE': mae_results,
    'RMSE': rmse_results
}).sort_values('RMSE')

# Display the results
print(results)

              Model           MSE          MAE          RMSE
0  GradientBoosting  2.119186e+08  6970.898467  14557.423075
