# Template used to test a model locally:

### Outline: 

- Import the Dataset using the correct function

- Use the function transformers to do the feature engineering - ```Please Comment any step that you have changed versus the best submission```

- Retrieve the transformed X_train and run ydata_profiling to find insights

- Create pipeline

- Run CV to get a baseline RMSE score

- Print the most important hyperparameters & run the Optuna training 

- Rerun the pipeline to see the impact - ```Please share any relevant information here```

- Update the script with the final model and test submission

## Example with Best Score Script using the `Coco` +  ['temp', 'rhum', 'wspd', 'prcp']

### Import the Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
import holidays
from lockdowndates.core import LockdownDates
import haversine as hs
from datetime import datetime
from meteostat import Point, Hourly
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import numpy as np
from pathlib import Path
from datetime import datetime
from meteostat import Point, Hourly

In [2]:
def _read_data():
    _target_column_name = 'log_bike_count'
    data = pd.read_parquet(Path("data") / "train.parquet")
    # Sort by date first, so that time based cross-validation would produce correct results
    data = data.sort_values(["date", "counter_name"])
    y_array = data[_target_column_name].values
    X_df = data.drop([_target_column_name, "bike_count"], axis=1)
    return X_df, y_array

In [3]:
X_train, y_train = _read_data()
X_final = pd.read_parquet(Path("data") / "final_test.parquet")

### Function Transformers

In [4]:
# Encode Date with Fourrier + Covid + Holidays (To test Perf With Weekend + Holidays)

def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "date_ws"] = X["date"].dt.date.astype('datetime64[ns]') # To be used for merging
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour
    X.loc[:, "week_of_year"] = X["date"].dt.isocalendar().week
    X.loc[:, "season"] = X["week_of_year"].apply(lambda w: (w // 13) % 4 + 1)

    # Add the column corresponding to holidays
    fr_holidays = holidays.FR(years=X["year"].unique().tolist())
    X.loc[: ,'holidays'] = X['date'].apply(lambda x: x in fr_holidays or x.weekday() >= 5).astype(int)

    # Add covid restrictions
    ld = LockdownDates("France", "2020-09-01", "2022-01-01", ("stay_at_home", "masks"))
    lockdown_dates = ld.dates()
    X = X.reset_index().merge(lockdown_dates['france_stay_at_home'], how='left', left_on='date_ws', right_index=True).set_index('index')
    
    # Month
    months_in_year = 12
    X['sin_month'] = np.sin(2*np.pi*X.month/months_in_year)
    X['cos_month'] = np.cos(2*np.pi*X.month/months_in_year)
    X.drop('month', axis=1, inplace=True)
    
    # Day
    day_in_month = 31
    X['sin_day'] = np.sin(2*np.pi*X.day/day_in_month)
    X['cos_day'] = np.cos(2*np.pi*X.day/day_in_month)
    X.drop('day', axis=1, inplace=True)
    
    # Weekday
    day_in_week = 7
    X['sin_weekday'] = np.sin(2*np.pi*X.weekday/day_in_week)
    X['cos_weekday'] = np.cos(2*np.pi*X.weekday/day_in_week)
    X.drop('weekday', axis=1, inplace=True)
    
    # Hour
    hours_in_day = 24
    X['sin_hour'] = np.sin(2*np.pi*X.hour/hours_in_day)
    X['cos_hour'] = np.cos(2*np.pi*X.hour/hours_in_day)
    X.drop('hour', axis=1, inplace=True)
    
    # Week of year
    weeks_in_year = 52
    X['sin_weekyear'] = np.sin(2*np.pi*X.week_of_year/weeks_in_year)
    X['cos_weekyear'] = np.cos(2*np.pi*X.week_of_year/weeks_in_year)
    X.drop('week_of_year', axis=1, inplace=True)
    
    # Season
    seasons_in_year = 4
    X['sin_season'] = np.sin(2*np.pi*X.season/seasons_in_year)
    X['cos_season'] = np.cos(2*np.pi*X.season/seasons_in_year)
    X.drop('season', axis=1, inplace=True)

    return X

In [5]:
# Retrieve the closest transport station // Test with number of stations in a radius (Categorical)

def _closest_transport(X): 
    column_names = ['longitude', 'latitude', 'station_name']  # Replace with your actual column names
    idf_stations = pd.read_csv(Path("data") / "Stations_IDF.csv", delimiter=';', header=None, names=column_names)

    X = X.copy()
    # Create an empty DataFrame to store the results
    result_df = pd.DataFrame(columns=['counter_id', 'closest_metro_distance'])

    # Iterate over unique counter_ids in X
    for counter_id in X['counter_id'].unique():
        coordinates_counter = (X.loc[X['counter_id'] == counter_id, 'latitude'].values[0],
                               X.loc[X['counter_id'] == counter_id, 'longitude'].values[0])

        # Calculate distances to all metro stations
        distances = []
        for _, station_row in idf_stations.iterrows():
            coordinates_station = (station_row['latitude'], station_row['longitude'])
            distance = hs.haversine(coordinates_counter, coordinates_station)
            distances.append(distance)

        # Get the k closest distances
        closest_distance = sorted(distances)[0]

        # Append to the result DataFrame
        result_df = pd.concat([result_df, pd.DataFrame({'counter_id': [counter_id], 'closest_metro_distance': [closest_distance]})])
    
    result_df = result_df.set_index('counter_id')
    X = X.reset_index().merge(result_df['closest_metro_distance'], how='left', left_on='counter_id', right_index=True).set_index('index')
        
    return X

In [6]:
# Add Weather Data //  'Coco' column + All Weather this time

def _add_weather_data(X):
    X = X.copy()
    dfs = []

    for counter_id in X['counter_id'].unique():
        # Get the coordinates of the counter
        coordinates_counter = (
            X.loc[X['counter_id'] == counter_id, 'latitude'].values[0],
            X.loc[X['counter_id'] == counter_id, 'longitude'].values[0]
        )

        # Create a Point object with the counter's coordinates
        counter_point = Point(*coordinates_counter)

        # Define the time range (start and end dates)
        start = datetime(2020, 8, 1)
        end = datetime(2022, 1, 1)

        # Create a Hourly object and fetch the weather data
        weather_data = Hourly(counter_point, start, end).interpolate().fetch()
        selected_columns = ['temp', 'rhum', 'wspd', 'prcp', 'coco']
        weather_data = weather_data[selected_columns].shift(-1)

        # Append counter_id to the weather_data DataFrame
        weather_data['counter_id'] = counter_id

        # Append the DataFrame to the list
        dfs.append(weather_data)

    # Concatenate all DataFrames into a single result_df
    result_df = pd.concat(dfs)
    
    # Convert 'time' index to 'date' column for merging
    result_df['date'] = result_df.index
    result_df['date'] = pd.to_datetime(result_df['date'])
    
    # Merge the result DataFrame with the original DataFrame on 'counter_id' and 'date'
    X = X.reset_index().merge(result_df, how='left', on=['counter_id', 'date']).set_index('index')

    return X.drop(columns=["date", "counter_name", "site_id", "site_name", 
                           "counter_installation_date", "coordinates", "counter_technical_id",
                           "longitude", "latitude", "date_ws"])

### Call all functions on X train, concatenate with y_train and run ydata_profiling

In [7]:
# Please don't change the X_train_encode to not overwrite the initial dataset that will be used later for the modelling

X_train_encoded = _encode_dates(X_train)
X_train_encoded = _closest_transport(X_train_encoded)
X_train_encoded = _add_weather_data(X_train_encoded)

Fetching lockdown dates...
Fetched lockdown dates for: France


In [8]:
# Do the necessary changes to proceed to ydata_profiling

y_train_transformed = pd.Series(y_train).reset_index(drop=True)

concatenated_df = pd.concat([X_train_encoded.reset_index(drop=True), y_train_transformed], axis=1)

In [9]:
concatenated_df.head()

Unnamed: 0,counter_id,year,holidays,france_stay_at_home,sin_month,cos_month,sin_day,cos_day,sin_weekday,cos_weekday,...,cos_weekyear,sin_season,cos_season,closest_metro_distance,temp,rhum,wspd,prcp,coco,0
0,100049407-353255860,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,...,-0.354605,-1.0,-1.83697e-16,0.26864,13.0,79.0,3.6,0.0,1.0,1.609438
1,100049407-353255859,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,...,-0.354605,-1.0,-1.83697e-16,0.26864,13.0,79.0,3.6,0.0,1.0,1.386294
2,100036719-104036719,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,...,-0.354605,-1.0,-1.83697e-16,0.016574,13.0,79.0,3.6,0.0,1.0,0.0
3,100036719-103036719,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,...,-0.354605,-1.0,-1.83697e-16,0.016574,13.0,79.0,3.6,0.0,1.0,0.693147
4,100063175-353277233,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,...,-0.354605,-1.0,-1.83697e-16,0.197986,13.0,79.0,3.6,0.0,5.0,2.079442


In [10]:
#import ydata_profiling

#concatenated_df.profile_report()

**Note on the correlation matrix that some of the cos/sin features that we derive are correlated between each other. Try to keep only certain of them and keep the other as categories** 

### Create Pipeline and CV

In [11]:
def get_estimator():
    date_encoder = FunctionTransformer(_encode_dates)
    transport_encoder = FunctionTransformer(_closest_transport)
    weather_encoder = FunctionTransformer(_add_weather_data)
    
    date_cols = ["sin_month", "sin_day", "sin_hour", "sin_weekyear", "sin_weekday",
                 "cos_month", "cos_day", "cos_hour", "cos_weekyear", "cos_weekday"]

    categorical_cols = ["counter_id", "closest_metro_distance", "holidays", "france_stay_at_home", "year", "coco"]
    numerical_cols = ['temp', 'rhum', 'wspd', 'prcp']

    preprocessor = ColumnTransformer(
        [
            ("date", 'passthrough', date_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
            ("num", StandardScaler(), numerical_cols)
        ]
    )
    regressor = CatBoostRegressor()

    pipe = make_pipeline(date_encoder, transport_encoder, weather_encoder, preprocessor, regressor)

    return pipe

In [12]:
pipe = get_estimator()

pipe.fit(X_train, y_train)

cv = TimeSeriesSplit(n_splits=6)

# When using a scorer in scikit-learn it always needs to be better when smaller, hence the minus sign.
scores = cross_val_score(
    pipe, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
print("RMSE: ", scores)
print(f"RMSE (all folds): {-scores.mean():.3} ± {(-scores).std():.3}")

Fetching lockdown dates...
Fetched lockdown dates for: France
Learning rate set to 0.109189
0:	learn: 1.5455387	total: 78.4ms	remaining: 1m 18s
1:	learn: 1.4470398	total: 97ms	remaining: 48.4s
2:	learn: 1.3624128	total: 114ms	remaining: 38s
3:	learn: 1.2911092	total: 134ms	remaining: 33.4s
4:	learn: 1.2278214	total: 153ms	remaining: 30.4s
5:	learn: 1.1719356	total: 173ms	remaining: 28.6s
6:	learn: 1.1243903	total: 191ms	remaining: 27s
7:	learn: 1.0833677	total: 209ms	remaining: 25.9s
8:	learn: 1.0495571	total: 227ms	remaining: 25s
9:	learn: 1.0190622	total: 246ms	remaining: 24.4s
10:	learn: 0.9941027	total: 262ms	remaining: 23.6s
11:	learn: 0.9719429	total: 279ms	remaining: 23s
12:	learn: 0.9526136	total: 296ms	remaining: 22.4s
13:	learn: 0.9356389	total: 311ms	remaining: 21.9s
14:	learn: 0.9206048	total: 327ms	remaining: 21.4s
15:	learn: 0.9047137	total: 345ms	remaining: 21.2s
16:	learn: 0.8925117	total: 362ms	remaining: 20.9s
17:	learn: 0.8823640	total: 379ms	remaining: 20.7s
18:	lea

In [13]:
pipe

### Hyperparameter Tuning with Optuna

In [14]:
import optuna
from sklearn.metrics import mean_squared_error

date_encoder = FunctionTransformer(_encode_dates)
transport_encoder = FunctionTransformer(_closest_transport)
weather_encoder = FunctionTransformer(_add_weather_data)
    
date_cols = ["sin_month", "sin_day", "sin_hour", "sin_weekyear", "sin_weekday",
                 "cos_month", "cos_day", "cos_hour", "cos_weekyear", "cos_weekday"]

categorical_cols = ["counter_id", "closest_metro_distance", "holidays", "france_stay_at_home", "year", "coco"]
numerical_cols = ['temp', 'rhum', 'wspd', 'prcp']

preprocessor = ColumnTransformer(
        [
            ("date", 'passthrough', date_cols),
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
            ("num", StandardScaler(), numerical_cols)
        ]
)

def objective(trial):
    params = {
        "iterations": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
    }

    model = make_pipeline(date_encoder, transport_encoder, weather_encoder, preprocessor, CatBoostRegressor(**params, silent=True))
    model.fit(X_train, y_train)
    predictions = model.predict(X_train)
    rmse = mean_squared_error(y_train, predictions, squared=False)
    return rmse

In [15]:
study = optuna.create_study(direction='minimize') 
study.optimize(objective, n_trials=50)

[I 2023-12-06 21:12:40,865] A new study created in memory with name: no-name-ba213ea3-f3b9-4fb4-9ec8-ae508cbbf092


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:13:04,441] Trial 0 finished with value: 0.5435038461978766 and parameters: {'learning_rate': 0.06730637646320073, 'depth': 3, 'subsample': 0.6152568018709638, 'colsample_bylevel': 0.6943768607716672, 'min_data_in_leaf': 66}. Best is trial 0 with value: 0.5435038461978766.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:13:35,032] Trial 1 finished with value: 0.6616268804817701 and parameters: {'learning_rate': 0.004449863784605369, 'depth': 8, 'subsample': 0.46574269571862603, 'colsample_bylevel': 0.9231612180228402, 'min_data_in_leaf': 82}. Best is trial 0 with value: 0.5435038461978766.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:14:07,811] Trial 2 finished with value: 0.5771113848503127 and parameters: {'learning_rate': 0.012797768105777074, 'depth': 6, 'subsample': 0.8378478873577272, 'colsample_bylevel': 0.4403667748474213, 'min_data_in_leaf': 71}. Best is trial 0 with value: 0.5435038461978766.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:14:35,527] Trial 3 finished with value: 0.37772580673231515 and parameters: {'learning_rate': 0.040892501992474586, 'depth': 9, 'subsample': 0.2444805464081118, 'colsample_bylevel': 0.5025794328439444, 'min_data_in_leaf': 26}. Best is trial 3 with value: 0.37772580673231515.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:15:10,813] Trial 4 finished with value: 0.5829373980741882 and parameters: {'learning_rate': 0.00599883706508035, 'depth': 9, 'subsample': 0.6489936328712163, 'colsample_bylevel': 0.9688723122441241, 'min_data_in_leaf': 75}. Best is trial 3 with value: 0.37772580673231515.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:15:38,723] Trial 5 finished with value: 0.4437807896202989 and parameters: {'learning_rate': 0.02360727254026145, 'depth': 8, 'subsample': 0.3118662269880482, 'colsample_bylevel': 0.4538893508071354, 'min_data_in_leaf': 95}. Best is trial 3 with value: 0.37772580673231515.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:16:10,859] Trial 6 finished with value: 0.3895388228484279 and parameters: {'learning_rate': 0.06954299312373818, 'depth': 7, 'subsample': 0.6932235011357959, 'colsample_bylevel': 0.32636975602904666, 'min_data_in_leaf': 2}. Best is trial 3 with value: 0.37772580673231515.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:16:32,223] Trial 7 finished with value: 0.4044942008961103 and parameters: {'learning_rate': 0.07637047902079623, 'depth': 6, 'subsample': 0.1108537170545075, 'colsample_bylevel': 0.6794995043257073, 'min_data_in_leaf': 5}. Best is trial 3 with value: 0.37772580673231515.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:16:58,144] Trial 8 finished with value: 0.40786821309617016 and parameters: {'learning_rate': 0.04453328769914575, 'depth': 7, 'subsample': 0.2991864735164913, 'colsample_bylevel': 0.8538345416431975, 'min_data_in_leaf': 32}. Best is trial 3 with value: 0.37772580673231515.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:17:20,852] Trial 9 finished with value: 0.5399169373531322 and parameters: {'learning_rate': 0.07351514806602351, 'depth': 3, 'subsample': 0.5592988470103494, 'colsample_bylevel': 0.5508731267324551, 'min_data_in_leaf': 77}. Best is trial 3 with value: 0.37772580673231515.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:17:44,641] Trial 10 finished with value: 1.1466120111490443 and parameters: {'learning_rate': 0.0010335965621350687, 'depth': 10, 'subsample': 0.06711377667950416, 'colsample_bylevel': 0.0735335501998558, 'min_data_in_leaf': 35}. Best is trial 3 with value: 0.37772580673231515.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:18:04,122] Trial 11 finished with value: 0.8493532029690362 and parameters: {'learning_rate': 0.030669221059840584, 'depth': 1, 'subsample': 0.9482346568368679, 'colsample_bylevel': 0.2611438756195565, 'min_data_in_leaf': 1}. Best is trial 3 with value: 0.37772580673231515.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:18:47,203] Trial 12 finished with value: 0.32331358636195207 and parameters: {'learning_rate': 0.09598946340975474, 'depth': 10, 'subsample': 0.7771849617556571, 'colsample_bylevel': 0.30040958652995786, 'min_data_in_leaf': 21}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:19:29,417] Trial 13 finished with value: 0.3889332272602319 and parameters: {'learning_rate': 0.03096956373665865, 'depth': 10, 'subsample': 0.8063333286995364, 'colsample_bylevel': 0.24612679689913505, 'min_data_in_leaf': 27}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:20:08,524] Trial 14 finished with value: 0.37212640356339377 and parameters: {'learning_rate': 0.09171031467288204, 'depth': 10, 'subsample': 0.9971530378731234, 'colsample_bylevel': 0.05037934253270532, 'min_data_in_leaf': 19}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:20:34,070] Trial 15 finished with value: 0.5304366211504161 and parameters: {'learning_rate': 0.09239957588836849, 'depth': 4, 'subsample': 0.9997171934440946, 'colsample_bylevel': 0.060437319782552706, 'min_data_in_leaf': 48}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:21:14,964] Trial 16 finished with value: 0.4478959197302367 and parameters: {'learning_rate': 0.018514567543082033, 'depth': 10, 'subsample': 0.8671063835512745, 'colsample_bylevel': 0.15148438856134905, 'min_data_in_leaf': 16}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:21:48,795] Trial 17 finished with value: 0.36486124219022664 and parameters: {'learning_rate': 0.0979644578078054, 'depth': 8, 'subsample': 0.762352313910656, 'colsample_bylevel': 0.1502411131181498, 'min_data_in_leaf': 47}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:22:24,637] Trial 18 finished with value: 0.39911235973409637 and parameters: {'learning_rate': 0.04241906599750503, 'depth': 8, 'subsample': 0.7361209803192267, 'colsample_bylevel': 0.338254243729309, 'min_data_in_leaf': 48}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:22:51,751] Trial 19 finished with value: 0.44196128054809575 and parameters: {'learning_rate': 0.09902500757887696, 'depth': 5, 'subsample': 0.7344000868545146, 'colsample_bylevel': 0.1753746932242465, 'min_data_in_leaf': 62}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:23:30,512] Trial 20 finished with value: 0.3808351918159266 and parameters: {'learning_rate': 0.050292412079205775, 'depth': 9, 'subsample': 0.88193021210747, 'colsample_bylevel': 0.16796685056817182, 'min_data_in_leaf': 40}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:24:11,096] Trial 21 finished with value: 0.3592095379737275 and parameters: {'learning_rate': 0.09686200454215603, 'depth': 10, 'subsample': 0.9845300441291527, 'colsample_bylevel': 0.06313776757105145, 'min_data_in_leaf': 20}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:24:47,385] Trial 22 finished with value: 0.3801203541700188 and parameters: {'learning_rate': 0.053329249256001056, 'depth': 9, 'subsample': 0.7649467106730788, 'colsample_bylevel': 0.14269771858701683, 'min_data_in_leaf': 15}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:25:24,122] Trial 23 finished with value: 0.35417785315434336 and parameters: {'learning_rate': 0.09957714682624862, 'depth': 8, 'subsample': 0.8764468648145668, 'colsample_bylevel': 0.2503338181231783, 'min_data_in_leaf': 41}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:25:59,574] Trial 24 finished with value: 0.41210933617295725 and parameters: {'learning_rate': 0.0512881363902248, 'depth': 7, 'subsample': 0.9125801628368851, 'colsample_bylevel': 0.2630430466991757, 'min_data_in_leaf': 57}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:26:45,950] Trial 25 finished with value: 0.3409398343372778 and parameters: {'learning_rate': 0.062120411625928414, 'depth': 10, 'subsample': 0.846164086708666, 'colsample_bylevel': 0.34244644945907166, 'min_data_in_leaf': 11}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:27:26,096] Trial 26 finished with value: 0.3603916628177582 and parameters: {'learning_rate': 0.05735623442485782, 'depth': 9, 'subsample': 0.8250745386035739, 'colsample_bylevel': 0.3315231940249053, 'min_data_in_leaf': 39}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:28:07,683] Trial 27 finished with value: 0.39452786435932546 and parameters: {'learning_rate': 0.033688749827961134, 'depth': 9, 'subsample': 0.9175888648399666, 'colsample_bylevel': 0.33070980233357655, 'min_data_in_leaf': 11}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:28:55,932] Trial 28 finished with value: 0.34027615453588905 and parameters: {'learning_rate': 0.06105885597582281, 'depth': 10, 'subsample': 0.816242790999513, 'colsample_bylevel': 0.39707381590676394, 'min_data_in_leaf': 26}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:29:15,410] Trial 29 finished with value: 0.8075874718510795 and parameters: {'learning_rate': 0.06263655593209323, 'depth': 1, 'subsample': 0.614850950958451, 'colsample_bylevel': 0.39316634592340105, 'min_data_in_leaf': 9}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:30:03,194] Trial 30 finished with value: 0.3367853827698094 and parameters: {'learning_rate': 0.06550741545866583, 'depth': 10, 'subsample': 0.7956531420891686, 'colsample_bylevel': 0.3834393087096636, 'min_data_in_leaf': 26}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:30:51,057] Trial 31 finished with value: 0.3376868191546287 and parameters: {'learning_rate': 0.06468846156881515, 'depth': 10, 'subsample': 0.8130703994577623, 'colsample_bylevel': 0.40627268091153146, 'min_data_in_leaf': 25}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:31:39,666] Trial 32 finished with value: 0.3643929404691381 and parameters: {'learning_rate': 0.03854318906544161, 'depth': 10, 'subsample': 0.7975350261706974, 'colsample_bylevel': 0.4290283613666614, 'min_data_in_leaf': 26}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:32:18,226] Trial 33 finished with value: 0.3492907982236418 and parameters: {'learning_rate': 0.06585320250911848, 'depth': 9, 'subsample': 0.6835100615585192, 'colsample_bylevel': 0.5449732474675885, 'min_data_in_leaf': 32}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:33:05,088] Trial 34 finished with value: 0.39500659673914895 and parameters: {'learning_rate': 0.024975532280629393, 'depth': 10, 'subsample': 0.7007656325180379, 'colsample_bylevel': 0.501082527167587, 'min_data_in_leaf': 24}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:33:42,203] Trial 35 finished with value: 0.3999864183763585 and parameters: {'learning_rate': 0.03996108517442378, 'depth': 8, 'subsample': 0.7847680347381678, 'colsample_bylevel': 0.40785194093419985, 'min_data_in_leaf': 22}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:34:18,888] Trial 36 finished with value: 0.34639573144248537 and parameters: {'learning_rate': 0.07207379183081447, 'depth': 9, 'subsample': 0.6300593639218613, 'colsample_bylevel': 0.47678803750979637, 'min_data_in_leaf': 29}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:34:59,318] Trial 37 finished with value: 0.44658354631337627 and parameters: {'learning_rate': 0.018326751089604885, 'depth': 9, 'subsample': 0.8248623506874296, 'colsample_bylevel': 0.38380290065082373, 'min_data_in_leaf': 55}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:35:30,261] Trial 38 finished with value: 0.3996356773019205 and parameters: {'learning_rate': 0.052329052768674965, 'depth': 7, 'subsample': 0.5542232285679806, 'colsample_bylevel': 0.5782037106536353, 'min_data_in_leaf': 36}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:35:59,358] Trial 39 finished with value: 0.43573511099736545 and parameters: {'learning_rate': 0.07579081529945156, 'depth': 5, 'subsample': 0.7189286738560114, 'colsample_bylevel': 0.46716532238180875, 'min_data_in_leaf': 93}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:36:36,700] Trial 40 finished with value: 0.3570257717696232 and parameters: {'learning_rate': 0.04468344872470866, 'depth': 10, 'subsample': 0.4534986069121389, 'colsample_bylevel': 0.4097256945321601, 'min_data_in_leaf': 43}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:37:24,164] Trial 41 finished with value: 0.3413645159888762 and parameters: {'learning_rate': 0.06085092676604181, 'depth': 10, 'subsample': 0.8511669309192973, 'colsample_bylevel': 0.3625049305769924, 'min_data_in_leaf': 10}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:38:08,048] Trial 42 finished with value: 0.3466003793850451 and parameters: {'learning_rate': 0.07228088205515594, 'depth': 9, 'subsample': 0.9215718954204127, 'colsample_bylevel': 0.4272556482158346, 'min_data_in_leaf': 15}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:38:52,922] Trial 43 finished with value: 0.3447701539986798 and parameters: {'learning_rate': 0.058275395816810585, 'depth': 10, 'subsample': 0.8337962382499446, 'colsample_bylevel': 0.30102863650656636, 'min_data_in_leaf': 6}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:39:36,097] Trial 44 finished with value: 0.3307807303413826 and parameters: {'learning_rate': 0.07731044072424613, 'depth': 10, 'subsample': 0.6758192165973221, 'colsample_bylevel': 0.3709622178333996, 'min_data_in_leaf': 31}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:40:12,202] Trial 45 finished with value: 0.35794094947089866 and parameters: {'learning_rate': 0.08322324585942727, 'depth': 8, 'subsample': 0.7606535448221445, 'colsample_bylevel': 0.3725962912348478, 'min_data_in_leaf': 31}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:40:33,234] Trial 46 finished with value: 0.6561918280273993 and parameters: {'learning_rate': 0.07747675143481772, 'depth': 2, 'subsample': 0.6823673223838363, 'colsample_bylevel': 0.29312227886652803, 'min_data_in_leaf': 35}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:41:13,339] Trial 47 finished with value: 0.38641108493245224 and parameters: {'learning_rate': 0.0363039413550533, 'depth': 9, 'subsample': 0.7699888759090626, 'colsample_bylevel': 0.4411667541311272, 'min_data_in_leaf': 23}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:42:00,301] Trial 48 finished with value: 0.3520773974443592 and parameters: {'learning_rate': 0.0471558579237265, 'depth': 10, 'subsample': 0.6531916934506677, 'colsample_bylevel': 0.5056034583743076, 'min_data_in_leaf': 17}. Best is trial 12 with value: 0.32331358636195207.


Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France


[I 2023-12-06 21:42:31,224] Trial 49 finished with value: 0.40182715185271817 and parameters: {'learning_rate': 0.08257578412187312, 'depth': 6, 'subsample': 0.734705956881153, 'colsample_bylevel': 0.3748345095737612, 'min_data_in_leaf': 27}. Best is trial 12 with value: 0.32331358636195207.


In [16]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

Best hyperparameters: {'learning_rate': 0.09598946340975474, 'depth': 10, 'subsample': 0.7771849617556571, 'colsample_bylevel': 0.30040958652995786, 'min_data_in_leaf': 21}
Best RMSE: 0.32331358636195207


### Cross Validation using best_params

In [17]:
tuned_pipe = make_pipeline(date_encoder, transport_encoder, weather_encoder, preprocessor, CatBoostRegressor(**study.best_params, silent=True))
tuned_pipe.fit(X_train, y_train)

cv = TimeSeriesSplit(n_splits=6)

# When using a scorer in scikit-learn it always needs to be better when smaller, hence the minus sign.
scores_tuned = cross_val_score(
    tuned_pipe, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
print("RMSE: ", scores_tuned)
print(f"RMSE (all folds): {-scores_tuned.mean():.3} ± {(-scores_tuned).std():.3}")

Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
Fetching lockdown dates...
Fetched lockdown dates for: France
RMSE:  [-0.74694763 -0.73901564 -0.74645606 -0.79295119 -0.6283122  -0.70128943]
RMSE (all folds): 0.726 ± 0.0511
