In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import HistGradientBoostingRegressor

import utils
from vs_feature_eng import feature_eng_v3

In [2]:
# Load the training features (X) and target (y) through the project-local helper.
# NOTE(review): X is expected to expose a "date" column, since the temporal
# split later in this notebook reads X["date"] — confirm against utils.
X, y = utils.get_train_data()

In [22]:
from sklearn.pipeline import make_pipeline
from catboost import CatBoostRegressor

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# DATA PREPROCESSING
# ==============================================================================
# Columns passed through the scaler. Every column not listed here is discarded
# before the regressor (see remainder='drop' below).
# NOTE(review): these columns are presumably produced by the feature-engineering
# step that precedes the preprocessor in the pipeline — confirm.
numeric_features = [
    # --- DATA: relative to dates ---
    'year',
    'season_sin', 'season_cos',
    'month_sin', 'month_cos',
    'weekday_sin', 'weekday_cos',
    'hour_sin', 'hour_cos',
    'is_weekend', 'is_holiday', 'is_covid',
    # --- DATA: relative to location ---
    'site_id', 'latitude', 'longitude',
    # --- EXTERNAL DATA ---
    't', 'raf10', 'etat_sol', 'nnuage1', 'rr12',
    # Currently disabled external columns: 'rr24', 'cl', 'ssfrai', 'w2'
]

preprocessor = ColumnTransformer(
    transformers=[('standard-scaler', StandardScaler(), numeric_features)],
    remainder='drop',
)

# Full pipeline: feature engineering -> column selection/scaling -> regressor.
# NOTE(review): the hyperparameters below are set explicitly; an earlier comment
# labelled them "default values" — verify against the CatBoost documentation.
model = make_pipeline(
    feature_eng_v3.FeatureEngineering(),
    preprocessor,
    CatBoostRegressor(
        iterations=300,
        learning_rate=0.05,
        depth=10,
        silent=True,
    ),
)

In [23]:
### TRAIN_TEST_split and RMSE measures:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    """Split (X, y) chronologically on the ``X["date"]`` column.

    Rows dated up to and including ``X["date"].max() - delta_threshold`` form
    the training part; all later rows form the validation part.

    Parameters
    ----------
    X : DataFrame with a "date" column.
    y : target values, indexed with the same boolean mask as X.
    delta_threshold : anything accepted by ``pd.Timedelta`` (default "30 days").

    Returns
    -------
    X_train, y_train, X_valid, y_valid
    """
    last_date = X["date"].max()
    is_train = X["date"] <= (last_date - pd.Timedelta(delta_threshold))
    is_valid = ~is_train

    X_train = X.loc[is_train]
    y_train = y[is_train]
    X_valid = X.loc[is_valid]
    y_valid = y[is_valid]
    return X_train, y_train, X_valid, y_valid

# Hold out the most recent 30 days (the function's default window) for validation.
X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

In [24]:
# Fit the whole pipeline (feature engineering, scaling, regressor) on the training split only.
model.fit(X_train, y_train)

In [25]:
from sklearn.metrics import mean_squared_error

# Score the fitted pipeline on both the training and the validation split.
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that keyword
# is deprecated and then removed in newer scikit-learn releases, whereas
# sqrt(MSE) gives the same value on every version.
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
valid_rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))

# Calculate and print RMSE for train and validation sets
print(f"Train set, RMSE={train_rmse:.2f}")
print(f"Validation set, RMSE={valid_rmse:.2f}")

Train set, RMSE=0.56
Validation set, RMSE=0.53




In [26]:
# Load the final test set from a parquet file (path is relative to the notebook's working directory).
test_set = pd.read_parquet("./data/final_test.parquet")

In [27]:
# Predict with the fitted pipeline; the raw test frame goes through the same
# feature-engineering and preprocessing steps as the training data.
predictions = model.predict(test_set)

In [28]:
# Assemble the submission table: one row per test row, keyed by the test-set index.
output_df = pd.DataFrame({'Id': test_set.index, 'log_bike_count': predictions})

# Render each prediction as a fixed 4-decimal string before writing.
output_df['log_bike_count'] = output_df['log_bike_count'].map("{:.4f}".format)

# Write the CSV without the DataFrame's own index column, then confirm.
output_df.to_csv('vs_sub_full_dataset_CatBoost_v3.csv', index=False)
print("Predictions saved to 'vs_sub_full_dataset_CatBoost_v3.csv'.")

Predictions saved to 'vs_sub_full_dataset_CatBoost_v3.csv'.


# __OPTUNA__

In [48]:
import optuna

In [None]:
def objective(trial):
    """Optuna objective: validation RMSE of a HistGradientBoosting pipeline.

    Samples ``learning_rate``, ``max_iter``, ``max_depth`` and
    ``min_samples_leaf`` from the trial, fits the pipeline on the temporal
    training split and returns the RMSE on the validation split.

    Parameters
    ----------
    trial : optuna trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Validation RMSE (lower is better; the study minimizes it).
    """
    # Suggest hyperparameters.
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`
    # and samples from the same log-uniform range.
    learning_rate = trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True)
    max_iter = trial.suggest_int('max_iter', 100, 500)
    max_depth = trial.suggest_int('max_depth', 3, 12)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    # Create a pipeline with the suggested hyperparameters.
    # feature_eng_v3 is the feature-engineering module this notebook imports
    # (feature_eng_v2 was never imported and raised NameError on a fresh kernel);
    # it is also the one the shared `preprocessor` is used with in the main model.
    model = make_pipeline(
        feature_eng_v3.FeatureEngineering(),
        preprocessor,
        HistGradientBoostingRegressor(
            learning_rate=learning_rate,
            max_iter=max_iter,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf
        )
    )

    # Split the data into training and validation sets (same temporal split
    # as the rest of the notebook, so trial scores are comparable).
    X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions and calculate the RMSE.
    # sqrt(MSE) is used instead of `squared=False`, which newer scikit-learn
    # releases no longer accept.
    y_valid_pred = model.predict(X_valid)
    rmse = float(np.sqrt(mean_squared_error(y_valid, y_valid_pred)))

    return rmse  # We want to minimize RMSE

# Run Optuna optimization: minimize the validation RMSE returned by `objective`.
study = optuna.create_study(direction='minimize')  # Minimize the objective
study.optimize(objective, n_trials=100)  # Run for 100 trials

# Print the best parameters
print("Best trial:")
trial = study.best_trial

print(f"  RMSE: {trial.value}")
print("  Best hyperparameters:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# After optimization, rebuild the pipeline with the best hyperparameters.
# feature_eng_v3 is used because it is the module this notebook imports
# (feature_eng_v2 was never imported and raised NameError on a fresh kernel).
best_params = trial.params
final_model = make_pipeline(
    feature_eng_v3.FeatureEngineering(),
    preprocessor,
    HistGradientBoostingRegressor(
        learning_rate=best_params['learning_rate'],
        max_iter=best_params['max_iter'],
        max_depth=best_params['max_depth'],
        min_samples_leaf=best_params['min_samples_leaf']
    )
)

# Fit the final model on the training split only (X_train excludes the held-out
# validation window), so the validation score below stays out-of-sample.
final_model.fit(X_train, y_train)

# Make predictions on the validation set
y_valid_pred = final_model.predict(X_valid)

# Evaluate the model. sqrt(MSE) is used instead of `squared=False`, which newer
# scikit-learn releases no longer accept.
final_valid_rmse = np.sqrt(mean_squared_error(y_valid, y_valid_pred))
print(f"Final validation RMSE: {final_valid_rmse:.2f}")

[I 2024-12-10 16:21:19,031] A new study created in memory with name: no-name-b27909ff-68a8-44ea-832c-242c7183f4ca
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 1e-1)
