In [1]:
import numpy as np
import pandas as pd
import os
from os import getenv
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import wandb
%load_ext dotenv
%dotenv

In [2]:
os.environ["WANDB_MODE"] = "dryrun"

wandb.init(project="DABI2")

In [25]:
# Define database connection parameters
username = getenv('DB_USER').lower()
password = getenv('DB_PASSWORD')
host = getenv('DB_HOST')
port = getenv('DB_PORT')
database = getenv('DB_NAME')

# Define the connection string
# Format: dialect+driver://username:password@host:port/database
connection_string = f'postgresql://{username}:{password}@{host}:{port}/{database}'

# Create the engine
engine = create_engine(connection_string)
# Import the data to separate dataframes
df_uebernachtungen_raw = pd.read_sql("SELECT * FROM original_data.uebernachtungen_pro_land", engine)
df_wetter_raw = pd.read_sql("SELECT * FROM original_data.weather_area", engine)
month_dict = {'Januar': 1, 'Februar': 2, 'März': 3, 'April': 4, 'Mai': 5, 'Juni': 6, 'Juli': 7, 'August': 8, 'September': 9, 'Oktober': 10, 'November': 11, 'Dezember': 12}
df_uebernachtungen = df_uebernachtungen_raw.copy()
df_uebernachtungen = df_uebernachtungen[~df_uebernachtungen['wohnsitz'].isin(['Inland', 'Ausland'])]
df_uebernachtungen = df_uebernachtungen[~df_uebernachtungen['land'].isin(['Gesamt'])]
df_uebernachtungen['monat'] = df_uebernachtungen['monat'].map(month_dict)
df_uebernachtungen['tag'] = 1
df_uebernachtungen['date'] = pd.to_datetime(df_uebernachtungen[['jahr', 'monat', 'tag']].rename(columns={'jahr': 'year', 'monat': 'month', 'tag': 'day'}))
df_uebernachtungen.drop(columns=['wohnsitz', 'tag', 'monat', 'jahr'], inplace=True)

df_wetter = df_wetter_raw.copy()
df_wetter['date'] = pd.to_datetime(df_wetter['date'].astype(str) + '01', format='%Y%m%d')
state_code_dict = {'DE-BW': 'Baden-Württemberg', 'DE-BY': 'Bayern', 'DE-BE': 'Berlin', 'DE-BB': 'Brandenburg', 'DE-HB': 'Bremen', 'DE-HH': 'Hamburg', 'DE-HE': 'Hessen', 'DE-MV': 'Mecklenburg-Vorpommern', 'DE-NI': 'Niedersachsen', 'DE-NW': 'Nordrhein-Westfalen', 'DE-RP': 'Rheinland-Pfalz', 'DE-SL': 'Saarland', 'DE-SN': 'Sachsen', 'DE-ST': 'Sachsen-Anhalt', 'DE-SH': 'Schleswig-Holstein', 'DE-TH': 'Thüringen'}

df_wetter['state_code'] = df_wetter['state_code'].map(state_code_dict)
df_wetter.drop(columns=['objectid', 'state_id', 'count'], inplace=True)
df_wetter_pivot = df_wetter.pivot_table(index=['date', 'state_code'], columns='parameter_name', aggfunc='mean')
df_wetter_pivot.columns = ['_'.join(col).rstrip('_') for col in df_wetter_pivot.columns.values]
df_wetter_pivot.reset_index(inplace=True)
df_wetter_pivot = df_wetter_pivot.rename(columns={'state_code': 'land'})
merged_df = pd.merge(df_uebernachtungen, df_wetter_pivot, on=['date', 'land'], how='inner')
date = merged_df.pop('date') 
merged_df.insert(1, 'date', date)
std_cols = merged_df.filter(regex='^std').columns
merged_df[std_cols] = merged_df[std_cols].fillna(0)
merged_df['mean_frost_depth'] = merged_df['mean_frost_depth'].fillna(0)

merged_df.dropna(inplace=True)

In [38]:
merged_df

Unnamed: 0,land,date,ankuenfte_anzahl,ankuenfte_veraenderung_zum_vorjahreszeitraum_prozent,uebernachtungen_anzahl,uebernachtungen_veraenderung_zum_vorjahreszeitraum_prozent,durchsch_aufenthaltsdauer_tage,mean_air_temp_max,mean_air_temp_mean,mean_air_temp_min,...,std_air_temp_mean,std_air_temp_min,std_drought_index,std_evapo_p,std_evapo_r,std_frost_depth,std_precipitation,std_soil_moist,std_soil_temperature_5cm,std_sunshine_duration
0,Baden-Württemberg,2024-03-01,78875.0,105.8,219418.0,132.4,2.8,12.37474,7.372771,2.708467,...,1.203247,1.146596,2.064584,2.265238,2.228902,0.0,31.037250,3.031370,1.095861,5.038108
1,Bayern,2024-03-01,111365.0,160.6,307709.0,161.7,2.8,12.54522,7.136138,2.099823,...,1.043781,0.982961,2.210020,3.039621,2.951640,0.0,29.322170,3.638682,0.852588,7.238923
2,Berlin,2024-03-01,1457.0,90.5,5734.0,117.1,3.9,13.16839,8.178442,3.079458,...,0.115004,0.184030,0.171887,0.689766,0.556828,0.0,2.292415,0.115587,0.079462,7.108747
3,Brandenburg,2024-03-01,12123.0,193.5,34061.0,281.4,2.8,12.79973,7.691393,2.461740,...,0.267239,0.460389,0.360204,3.164497,2.272613,0.0,5.693805,0.760745,0.320713,7.685427
4,Bremen,2024-03-01,6883.0,108.3,12945.0,89.5,1.9,11.95506,8.040822,4.297468,...,0.078422,0.064090,0.000000,0.244864,0.267516,0.0,2.310950,0.397123,0.061389,0.499358
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4056,Brandenburg,2008-07-01,69815.0,13.4,227792.0,14.7,3.3,25.01244,19.082100,13.158980,...,0.304050,0.420218,0.536479,1.069119,4.735381,0.0,13.101090,1.404636,0.196230,10.404730
4057,Mecklenburg-Vorpommern,2008-07-01,246519.0,14.1,1214623.0,20.0,4.9,23.60057,18.261860,12.875790,...,0.274342,0.441314,0.637362,1.695321,9.204719,0.0,17.260800,3.345063,0.116900,20.143340
4058,Sachsen,2008-07-01,43946.0,5.5,133320.0,11.5,3.0,23.95894,18.149720,12.414370,...,1.072762,0.785271,0.705370,6.029822,4.028210,0.0,15.528350,4.350149,0.914463,6.656854
4059,Sachsen-Anhalt,2008-07-01,26671.0,17.7,64395.0,15.0,2.4,24.91673,18.783460,12.848600,...,0.687402,0.601549,0.457719,3.912404,2.761256,0.0,11.368440,3.024103,0.535875,4.546651


In [39]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'X' is not defined

In [None]:
sweep_config = {
    'method': 'random', 
    'metric': {
        'name': 'mse',
        'goal': 'minimize'
    },
    'parameters': {
        'n_estimators': {
            'values': [50, 100, 200]
        },
        'learning_rate': {
            'min': 0.01,
            'max': 0.1
        },
        'max_depth': {
            'values': [3, 5, 7]
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project="DABI2")

def train():
    run = wandb.init()
    params = run.config
    model = GradientBoostingRegressor(n_estimators=params.n_estimators,
                                      learning_rate=params.learning_rate,
                                      max_depth=params.max_depth,
                                      random_state=42)

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse) 

    wandb.log({"rmse": rmse}) 

wandb.agent(sweep_id, train)