In [1]:
import numpy as np
import pandas as pd
from os import getenv
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import ARDRegression, BayesianRidge, HuberRegressor, Lars, LassoLars, PassiveAggressiveRegressor, RANSACRegressor, TheilSenRegressor
from sklearn.svm import NuSVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, StackingRegressor, VotingRegressor
from math import sqrt
%load_ext dotenv
%dotenv

In [26]:
# Define database connection parameters
username = getenv('DB_USER').lower()
password = getenv('DB_PASSWORD')
host = getenv('DB_HOST')
port = getenv('DB_PORT')
database = getenv('DB_NAME')

# Define the connection string
# Format: dialect+driver://username:password@host:port/database
connection_string = f'postgresql://{username}:{password}@{host}:{port}/{database}'

# Create the engine
engine = create_engine(connection_string)
# Import the data to separate dataframes
df_uebernachtungen_raw = pd.read_sql("SELECT * FROM original_data.uebernachtungen_pro_land", engine)
df_wetter_raw = pd.read_sql("SELECT * FROM original_data.weather_area", engine)
month_dict = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
df_uebernachtungen = df_uebernachtungen_raw.copy()
df_uebernachtungen = df_uebernachtungen[~df_uebernachtungen['wohnsitz'].isin(['Inland', 'Ausland'])]
df_uebernachtungen = df_uebernachtungen[~df_uebernachtungen['land'].isin(['Gesamt'])]
df_uebernachtungen['monat'] = df_uebernachtungen['monat'].map(month_dict)
df_uebernachtungen['tag'] = 1
df_uebernachtungen['date'] = pd.to_datetime(df_uebernachtungen[['jahr', 'monat', 'tag']].rename(columns={'jahr': 'year', 'monat': 'month', 'tag': 'day'}))
df_uebernachtungen.drop(columns=['wohnsitz', 'tag', 'monat', 'jahr'], inplace=True)

df_wetter = df_wetter_raw.copy()
df_wetter['date'] = pd.to_datetime(df_wetter['date'].astype(str) + '01', format='%Y%m%d')
state_code_dict = {'DE-BW': 'Baden-Württemberg', 'DE-BY': 'Bayern', 'DE-BE': 'Berlin', 'DE-BB': 'Brandenburg', 'DE-HB': 'Bremen', 'DE-HH': 'Hamburg', 'DE-HE': 'Hessen', 'DE-MV': 'Mecklenburg-Vorpommern', 'DE-NI': 'Niedersachsen', 'DE-NW': 'Nordrhein-Westfalen', 'DE-RP': 'Rheinland-Pfalz', 'DE-SL': 'Saarland', 'DE-SN': 'Sachsen', 'DE-ST': 'Sachsen-Anhalt', 'DE-SH': 'Schleswig-Holstein', 'DE-TH': 'Thüringen'}

df_wetter['state_code'] = df_wetter['state_code'].map(state_code_dict)
df_wetter.drop(columns=['objectid', 'state_id', 'count'], inplace=True)
df_wetter_pivot = df_wetter.pivot_table(index=['date', 'state_code'], columns='parameter_name', aggfunc='mean')
df_wetter_pivot.columns = ['_'.join(col).rstrip('_') for col in df_wetter_pivot.columns.values]
df_wetter_pivot.reset_index(inplace=True)
df_wetter_pivot = df_wetter_pivot.rename(columns={'state_code': 'land'})
merged_df = pd.merge(df_uebernachtungen, df_wetter_pivot, on=['date', 'land'], how='inner')
date = merged_df.pop('date')
merged_df.insert(1, 'date', date)
std_cols = merged_df.filter(regex='^std').columns
merged_df[std_cols] = merged_df[std_cols].fillna(0)
merged_df['mean_frost_depth'] = merged_df['mean_frost_depth'].fillna(0)

merged_df.dropna(inplace=True)

#merged_df = pd.get_dummies(merged_df, columns=['land'])

In [27]:
merged_df.columns

Index(['land', 'date', 'ankuenfte_anzahl',
       'ankuenfte_veraenderung_zum_vorjahreszeitraum_prozent',
       'uebernachtungen_anzahl',
       'uebernachtungen_veraenderung_zum_vorjahreszeitraum_prozent',
       'durchsch_aufenthaltsdauer_tage', 'mean_air_temp_max',
       'mean_air_temp_mean', 'mean_air_temp_min', 'mean_drought_index',
       'mean_evapo_p', 'mean_evapo_r', 'mean_frost_depth',
       'mean_precipitation', 'mean_soil_moist', 'mean_soil_temperature_5cm',
       'mean_sunshine_duration', 'std_air_temp_max', 'std_air_temp_mean',
       'std_air_temp_min', 'std_drought_index', 'std_evapo_p', 'std_evapo_r',
       'std_frost_depth', 'std_precipitation', 'std_soil_moist',
       'std_soil_temperature_5cm', 'std_sunshine_duration'],
      dtype='object')

In [33]:
# Define the features and the target
X  = merged_df.drop(columns=['land', 'uebernachtungen_anzahl', 'ankuenfte_veraenderung_zum_vorjahreszeitraum_prozent', 'uebernachtungen_veraenderung_zum_vorjahreszeitraum_prozent', 'ankuenfte_anzahl', 'durchsch_aufenthaltsdauer_tage', 'date', 'std_air_temp_max', 'std_air_temp_mean', 'std_air_temp_min', 'std_drought_index', 'std_evapo_p', 'std_evapo_r', 'std_frost_depth', 'std_precipitation', 'std_soil_moist', 'std_soil_temperature_5cm', 'std_sunshine_duration'])
y = merged_df['uebernachtungen_anzahl']
split_date = pd.Timestamp('2020-01-01')
X_train = X.loc[merged_df['date'] < split_date]
y_train = y.loc[merged_df['date'] < split_date]
X_test = X.loc[merged_df['date'] >= split_date]
y_test = y.loc[merged_df['date'] >= split_date]

In [35]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the scaler to the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data
X_test_scaled = scaler.transform(X_test)

In [36]:
# Define a dictionary of models with hyperparameters
models = {
    "Linear": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.1),
    "SVM": SVR(kernel='rbf', C=1.0, gamma='scale'),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
    "DecisionTree": DecisionTreeRegressor(max_depth=10),
    "RandomForest": RandomForestRegressor(n_estimators=100, max_depth=10),
    "SGD": SGDRegressor(max_iter=1000, tol=1e-3, penalty='l2', alpha=0.0001),
    "ARD": ARDRegression(max_iter=300),
    "BayesianRidge": BayesianRidge(max_iter=300),
    "Huber": HuberRegressor(max_iter=1000),
    #"Lars": Lars(n_nonzero_coefs=500),
    "LassoLars": LassoLars(alpha=0.1),
    "PassiveAggressive": PassiveAggressiveRegressor(max_iter=1000, tol=1e-3),
    "RANSAC": RANSACRegressor(max_trials=100),
    "TheilSen": TheilSenRegressor(max_subpopulation=10000),
    "NuSVR": NuSVR(C=1.0, nu=0.1),
    "LinearSVR": LinearSVR(C=1.0, max_iter=1000, dual='auto'),
    "KNeighbors": KNeighborsRegressor(n_neighbors=5),
    "GaussianProcess": GaussianProcessRegressor(normalize_y=True),
    "PLS": PLSRegression(n_components=2),
    "ExtraTree": ExtraTreeRegressor(max_depth=10),
    "AdaBoost": AdaBoostRegressor(n_estimators=50),
    "Bagging": BaggingRegressor(n_estimators=10),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=100),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100),
    "Stacking": StackingRegressor(estimators=[('lr', Ridge(alpha=1.0)), ('svr', SVR(kernel='rbf', C=1.0, gamma='scale'))]),
    "Voting": VotingRegressor(estimators=[('lr', Ridge(alpha=1.0)), ('svr', SVR(kernel='rbf', C=1.0, gamma='scale'))])
}

# Initialize lists to store results
model_names = []
mse_results = []
mae_results = []
rmse_results = []

# Loop over the models
for model_name, model in models.items():
    # Fit the model
    model.fit(X_train_scaled, y_train)

    # Make predictions
    y_pred = model.predict(X_test_scaled)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = sqrt(mse)

    # Store results
    model_names.append(model_name)
    mse_results.append(mse)
    mae_results.append(mae)
    rmse_results.append(rmse)

# Create a DataFrame with the results
results = pd.DataFrame({
    'Model': model_names,
    #'MSE': mse_results,
    'MAE': mae_results,
    'RMSE': rmse_results
}).sort_values('RMSE')

# Display the results
print(results)

  model = cd_fast.enet_coordinate_descent(


                Model            MAE           RMSE
23         ExtraTrees  182422.387060  297600.785865
6        RandomForest  184463.335730  302248.740609
24   GradientBoosting  185150.102764  306195.438642
22            Bagging  185940.287736  306954.451690
0              Linear  189317.231350  307111.921418
11          LassoLars  189316.725746  307112.022368
9       BayesianRidge  189167.474342  307208.852342
2               Lasso  189655.814203  307345.506321
25           Stacking  189296.291040  307652.329904
1               Ridge  189409.158422  307675.894180
8                 ARD  189202.575831  307935.511121
21           AdaBoost  198679.640666  308916.282009
7                 SGD  193331.895348  311321.912917
4          ElasticNet  197670.567551  317984.600691
14           TheilSen  188706.319831  318916.871112
19                PLS  200460.581392  320258.661529
17         KNeighbors  190854.769182  328725.574435
5        DecisionTree  196791.281189  337964.987979
10          