In [167]:
import numpy as np
import pandas as pd
from os import getenv
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import ARDRegression, BayesianRidge, HuberRegressor, Lars, LassoLars, PassiveAggressiveRegressor, RANSACRegressor, TheilSenRegressor
from sklearn.svm import NuSVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor
from math import sqrt
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [4]:
# Define database connection parameters
username = getenv('DB_USER').lower()
password = getenv('DB_PASSWORD')
host = getenv('DB_HOST')
port = getenv('DB_PORT')
database = getenv('DB_NAME')

# Define the connection string
# Format: dialect+driver://username:password@host:port/database
connection_string = f'postgresql://{username}:{password}@{host}:{port}/{database}'

# Create the engine
engine = create_engine(connection_string)
# Import the data to separate dataframes
df_gastro = pd.read_sql("SELECT * FROM original_data.Gastronomieumsaetze_flat", engine)
df_weather = pd.read_sql("SELECT * FROM original_data.weather_data", engine)
df_uebernachtungen_pro_land = pd.read_sql("SELECT * FROM original_data.uebernachtungen_pro_land", engine)
df_herkunft = pd.read_sql("SELECT * FROM original_data.uebernachtungen_nach_herkunftsland", engine)
df_laender_infos = pd.read_sql("SELECT * FROM original_data.germany_laender_infos", engine)

In [121]:
df_gastro_short = df_gastro[['zeit','4_Auspraegung_Code','5_Auspraegung_Code','UMS002__Umsatz__2015=100']].copy()
df_gastro_short.rename(columns={'zeit':'jahr', '4_Auspraegung_Code': 'monat', 'UMS002__Umsatz__2015=100': 'umsatz'}, inplace=True)
df_gastro_short['monat'] = df_gastro_short['monat'].str.replace('MONAT', '')
df_gastro_camping = df_gastro_short[df_gastro_short['5_Auspraegung_Code'] == 'WZ08-553'][['jahr', 'monat', 'umsatz']]
df_gastro_camping['datum'] = df_gastro_camping['jahr'].astype(str) + df_gastro_camping['monat']
df_gastro_camping['datum'] = pd.to_datetime(df_gastro_camping['datum'], format='%Y%m')
df_gastro_camping.drop(columns=['jahr', 'monat'], inplace=True)
df_gastro_camping = df_gastro_camping.groupby('datum').sum().reset_index()

df_weather_re = df_weather.copy()
df_weather_re['MESS_DATUM'] = pd.to_datetime(df_weather_re['MESS_DATUM'], format='%Y%m%d')
df_weather_re = df_weather_re[~(df_weather_re['MESS_DATUM'] < '1994-01-01')]
df_weather_re.replace(-999, np.nan, inplace=True)
df_weather_re.rename(columns={'MESS_DATUM': 'mess_datum', 'QN_3':'qualitaetsniveau_wind', 'FX': 'wind_tagesmax', 'FM': 'wind_tagesmittel', 'QN_4': 'qualitaetsniveau_rest', 'RSK': 'niederschlagshoehe_tagessumme', 'RSKF': 'niederschlagsform', 'SDK': 'sonnenstunden_tagessumme', 'SHK_TAG': 'schneehoehe_tagessumme', 'NM': 'bedeckungsgrad_tagesmittel', 'VPM': 'dampfdruck_tagesmittel', 'PM': 'luftdruck_tagesmittel', 'TMK': 'lufttemperatur_2m_tagesmittel', 'UPM': 'rel_feuchte_tagesmittel', 'TXK': 'lufttemperatur_2m_tagesmax', 'TNK': 'lufttemperatur_2m_tagesmin', 'TGK': 'lufttemperatur_5cm_tagesmin'}, inplace=True)
df_weather_re.drop(columns=['eor'], inplace=True)
df_weather_re.dropna(inplace=True)
df_weather_re['station_id'] = df_weather_re['STATIONS_ID']
df_monthly = df_weather_re.groupby(['STATIONS_ID', df_weather_re.mess_datum.dt.year, df_weather_re.mess_datum.dt.month]).mean()
df_monthly['datum'] = pd.to_datetime(df_monthly.index.get_level_values(1).astype(str) + '-' + df_monthly.index.get_level_values(2).astype(str), format='%Y-%m')
df_monthly.drop(columns=['mess_datum'], inplace=True)
df_monthly.reset_index(drop=True, inplace=True)

df_monthly = df_monthly.merge(df_gastro_camping, how='left', left_on='datum', right_on='datum')

In [122]:
df_monthly

Unnamed: 0,qualitaetsniveau_wind,wind_tagesmax,wind_tagesmittel,qualitaetsniveau_rest,niederschlagshoehe_tagessumme,niederschlagsform,sonnenstunden_tagessumme,schneehoehe_tagessumme,bedeckungsgrad_tagesmittel,dampfdruck_tagesmittel,luftdruck_tagesmittel,lufttemperatur_2m_tagesmittel,rel_feuchte_tagesmittel,lufttemperatur_2m_tagesmax,lufttemperatur_2m_tagesmin,lufttemperatur_5cm_tagesmin,station_id,datum,umsatz
0,10.0,13.387097,5.225806,10.0,2.896774,6.516129,0.796774,0.096774,6.574194,6.696774,1003.774194,3.529032,83.935484,5.680645,1.500000,0.887097,433.0,1994-01-01,68.4
1,10.0,9.321429,3.392857,10.0,0.221429,3.321429,3.221429,0.607143,5.542857,4.710714,1012.607143,-0.492857,75.678571,2.335714,-3.271429,-5.035714,433.0,1994-02-01,52.2
2,10.0,15.580645,5.612903,10.0,3.074194,6.193548,2.938710,0.000000,6.167742,7.451613,1007.354839,6.229032,77.548387,9.722581,3.032258,1.309677,433.0,1994-03-01,81.8
3,10.0,11.333333,3.400000,10.0,2.010000,3.133333,6.303333,0.000000,4.353333,8.173333,1005.533333,9.943333,65.933333,14.353333,5.040000,1.166667,433.0,1994-04-01,119.4
4,10.0,10.419355,3.709677,10.0,2.451613,4.064516,6.932258,0.000000,4.845161,10.270968,1008.193548,13.848387,65.161290,18.100000,8.867742,5.941935,433.0,1994-05-01,99.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2038,10.0,11.193548,3.064516,3.0,2.164516,4.838710,6.672677,0.000000,5.341935,16.538710,983.129032,19.664516,73.278065,25.503226,14.661290,13.480645,5705.0,2023-08-01,189.4
2039,10.0,9.000000,2.433333,3.0,0.540000,2.333333,8.727200,0.000000,4.006667,13.916667,986.766667,18.490000,68.113667,25.800000,11.890000,10.300000,5705.0,2023-09-01,146.8
2040,10.0,10.806452,3.161290,3.0,2.448387,3.677419,4.258645,0.000000,6.522581,11.293548,980.870968,12.709677,77.475161,18.029032,8.177419,6.751613,5705.0,2023-10-01,96.1
2041,3.0,12.733333,4.600000,3.0,2.540000,6.100000,1.450533,0.366667,6.800000,8.030000,976.300000,6.096667,83.588667,8.893333,3.190000,1.733333,5705.0,2023-11-01,45.3


In [None]:
X = df_monthly[['wind_tagesmax', 'wind_tagesmittel', 'niederschlagshoehe_tagessumme', 'sonnenstunden_tagessumme', 'schneehoehe_tagessumme', 'bedeckungsgrad_tagesmittel', 'dampfdruck_tagesmittel', 'luftdruck_tagesmittel', 'lufttemperatur_2m_tagesmittel', 'rel_feuchte_tagesmittel', 'lufttemperatur_2m_tagesmax', 'lufttemperatur_2m_tagesmin', 'lufttemperatur_5cm_tagesmin', 'station_id']].copy()
y = df_monthly['umsatz'].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data
X_test_scaled = scaler.transform(X_test)

In [175]:


# Define a dictionary of models
# Define a dictionary of models with hyperparameters
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    "SVM": SVR(kernel='rbf', C=1.0, gamma='scale'),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "DecisionTree": DecisionTreeRegressor(max_depth=10),
    "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10),
    "SGD": SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001),
    "ARD": ARDRegression(max_iter=300),
    "BayesianRidge": BayesianRidge(max_iter=300),
    "Huber": HuberRegressor(max_iter=1000),
    #"Lars": Lars(n_nonzero_coefs=500),
    "LassoLars": LassoLars(alpha=0.1),
    "PassiveAggressive": PassiveAggressiveRegressor(max_iter=1000, tol=1e-3),
    "RANSAC": RANSACRegressor(max_trials=100),
    "TheilSen": TheilSenRegressor(max_subpopulation=10000),
    "NuSVR": NuSVR(C=1.0, nu=0.1),
    "LinearSVR": LinearSVR(C=1.0, max_iter=1000, dual='auto'),
    "KNeighbors": KNeighborsRegressor(n_neighbors=5),
    "GaussianProcess": GaussianProcessRegressor(normalize_y=True),
    "PLS": PLSRegression(n_components=2),
    "ExtraTree": ExtraTreeRegressor(max_depth=10),
    "AdaBoost": AdaBoostRegressor(n_estimators=50),
    "Bagging": BaggingRegressor(n_estimators=10),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=100),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100),
    "Stacking": StackingRegressor(estimators=[('lr', Ridge(alpha=1.0)), ('svr', SVR(kernel='rbf', C=1.0, gamma='scale'))]),
    "Voting": VotingRegressor(estimators=[('lr', Ridge(alpha=1.0)), ('svr', SVR(kernel='rbf', C=1.0, gamma='scale'))])
}

# Initialize lists to store results
model_names = []
mse_results = []
mae_results = []
rmse_results = []

# Loop over the models
for model_name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = sqrt(mse)

    # Store results
    model_names.append(model_name)
    mse_results.append(mse)
    mae_results.append(mae)
    rmse_results.append(rmse)

# Create a DataFrame with the results
results = pd.DataFrame({
    'Model': model_names,
    'MSE': mse_results,
    'MAE': mae_results,
    'RMSE': rmse_results
}).sort_values('RMSE')

# Display the results
print(results)

                Model          MSE        MAE       RMSE
23         ExtraTrees   616.509552  19.295347  24.829610
24   GradientBoosting   619.860414  19.614446  24.896996
6        RandomForest   637.536984  19.621368  25.249495
22            Bagging   653.575029  19.962372  25.565114
17         KNeighbors   717.144961  20.534621  26.779562
25           Stacking   745.810404  21.542781  27.309530
1               Ridge   750.116002  21.582996  27.388246
9       BayesianRidge   750.207725  21.587242  27.389920
0              Linear   750.263393  21.581940  27.390936
8                 ARD   751.232957  21.619069  27.408629
2               Lasso   751.961357  21.647864  27.421914
11          LassoLars   751.992746  21.648406  27.422486
26             Voting   757.980462  21.395544  27.531445
7                 SGD   764.095980  21.872841  27.642286
14           TheilSen   776.439543  21.785319  27.864665
10              Huber   779.954781  21.241839  27.927671
21           AdaBoost   788.882