In [28]:
import utils
import pandas as pd
import numpy as np
from vs_feature_eng import feature_eng_v3

In [29]:
X, y = utils.get_train_data()

In [None]:
train_data = pd.read_parquet('data/train.parquet')
train_data = feature_eng_v3._merge_external_data(train_data)
train_data.select_dtypes(exclude=['category']).corr()['log_bike_count']

site_id                      0.020998
bike_count                   0.738168
date                         0.044621
counter_installation_date    0.118209
latitude                     0.113355
longitude                    0.044494
log_bike_count               1.000000
t                            0.299766
cl                           0.037632
tend24                       0.014070
etat_sol                    -0.094691
rr3                         -0.022647
w1                          -0.019882
nbas                        -0.011700
nnuage1                     -0.075017
w2                          -0.024153
n                            0.005702
raf10                        0.151816
ht_neige                     0.003080
ssfrai                      -0.030301
rr12                        -0.056783
rr24                        -0.050975
Name: log_bike_count, dtype: float64

In [27]:
train_data.select_dtypes(exclude=['category']).corr()['log_bike_count'].abs().nlargest(15)

log_bike_count               1.000000
bike_count                   0.738168
t                            0.299766
raf10                        0.151816
counter_installation_date    0.118209
latitude                     0.113355
etat_sol                     0.094691
nnuage1                      0.075017
rr12                         0.056783
rr24                         0.050975
date                         0.044621
longitude                    0.044494
cl                           0.037632
ssfrai                       0.030301
w2                           0.024153
Name: log_bike_count, dtype: float64

In [34]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# DATA PREPROCESSING
# ==============================================================================
numeric_features = [### DATA
                    # Relative to dates
                    'year',
                    'season_sin',
                    'season_cos',
                    'month_sin',
                    'month_cos',
                    'weekday_sin',
                    'weekday_cos',
                    'hour_sin',
                    'hour_cos',
                    'is_weekend',
                    'is_holiday',
                    'is_covid',
                    # Relative to location:
                    'site_id',
                    'latitude',
                    'longitude',

                    ### EXTERNAL DATA
                    't',
                    'raf10', 
                    'etat_sol', 
                    'nnuage1', 
                    'rr12', 
                    'rr24',
                    'cl', 
                    'ssfrai', 
                    'w2'
                    ]


preprocessor = ColumnTransformer(
    [('standard-scaler', StandardScaler(), numeric_features)],
    remainder='drop'
    )

# We create the full pipeline:
model = make_pipeline(
    feature_eng_v3.FeatureEngineering(),
    preprocessor,
    HistGradientBoostingRegressor(learning_rate=0.05, max_iter=300, max_depth=10) # default values
)

In [35]:
### TRAIN_TEST_split and RMSE measures:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = (X["date"] <= cutoff_date)
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    return X_train, y_train, X_valid, y_valid

X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

In [36]:
model.fit(X_train, y_train)

In [37]:
from sklearn.metrics import mean_squared_error

y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)

# Calculate and print RMSE for train and validation sets
print(f"Train set, RMSE={mean_squared_error(y_train, y_train_pred, squared=False):.2f}")
print(f"Validation set, RMSE={mean_squared_error(y_valid, y_valid_pred, squared=False):.2f}")

Train set, RMSE=0.58
Validation set, RMSE=0.55




In [38]:
test_set = pd.read_parquet("./data/final_test.parquet")

In [39]:
predictions = model.predict(test_set)

In [40]:
# Prepare submission
output_df = pd.DataFrame({
    'Id': test_set.index,
    'log_bike_count': predictions
})

# Format log_bike_count:
output_df['log_bike_count'] = output_df['log_bike_count'].map(lambda x: f"{x:.4f}")

# Save to CSV:
output_df.to_csv('vs_sub_full_dataset_HGB_v3.csv', index=False)
print("Predictions saved to 'vs_sub_full_dataset_HGB_v3.csv'.")

Predictions saved to 'vs_sub_full_dataset_HGB_v3.csv'.


# __OPTUNA__

In [41]:
import optuna

In [42]:
def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 1e-1)
    max_iter = trial.suggest_int('max_iter', 100, 500)
    max_depth = trial.suggest_int('max_depth', 3, 12)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    
    # Create a pipeline with the suggested hyperparameters
    model = make_pipeline(
        feature_eng_v3.FeatureEngineering(),
        preprocessor,
        HistGradientBoostingRegressor(
            learning_rate=learning_rate,
            max_iter=max_iter,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf
        )
    )
    
    # Split your data into training and validation sets
    X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions and calculate the RMSE
    y_valid_pred = model.predict(X_valid)
    rmse = mean_squared_error(y_valid, y_valid_pred, squared=False)
    
    return rmse  # We want to minimize RMSE

# Run Optuna optimization
study = optuna.create_study(direction='minimize')  # Minimize the objective
study.optimize(objective, n_trials=100)  # Run for 100 trials

# Print the best parameters
print("Best trial:")
trial = study.best_trial

print(f"  RMSE: {trial.value}")
print("  Best hyperparameters:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# After optimization, you can use the best hyperparameters to train your final model
best_params = trial.params
final_model = make_pipeline(
    feature_eng_v3.FeatureEngineering(),
    preprocessor,
    HistGradientBoostingRegressor(
        learning_rate=best_params['learning_rate'],
        max_iter=best_params['max_iter'],
        max_depth=best_params['max_depth'],
        min_samples_leaf=best_params['min_samples_leaf']
    )
)

# Fit the final model on the full training set
final_model.fit(X_train, y_train)

# Make predictions on the validation set
y_valid_pred = final_model.predict(X_valid)

# Evaluate the model
print(f"Final validation RMSE: {mean_squared_error(y_valid, y_valid_pred, squared=False):.2f}")

[I 2024-12-10 18:15:15,764] A new study created in memory with name: no-name-c1508c02-afbc-4f9d-ba4f-699565770ed6
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 1e-1)
[I 2024-12-10 18:15:37,686] Trial 0 finished with value: 1.1213534559053844 and parameters: {'learning_rate': 0.0028791538458069455, 'max_iter': 218, 'max_depth': 4, 'min_samples_leaf': 16}. Best is trial 0 with value: 1.1213534559053844.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 1e-1)
[I 2024-12-10 18:16:01,494] Trial 1 finished with value: 1.074273399053178 and parameters: {'learning_rate': 0.003925243865644665, 'max_iter': 157, 'max_depth': 7, 'min_samples_leaf': 6}. Best is trial 1 with value: 1.074273399053178.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 1e-1)
[I 2024-12-10 18:16:15,576] Trial 2 finished with value: 0.8097040199908195 and parameters: {'learning_rate': 0.016796453947694755, 'max_iter': 197, 'max_depth': 3, 'min_samples_leaf': 12}. Best i

Best trial:
  RMSE: 0.5409006584096521
  Best hyperparameters:
    learning_rate: 0.08044476287861722
    max_iter: 500
    max_depth: 10
    min_samples_leaf: 7
Final validation RMSE: 0.54




In [43]:
final_predictions = final_model.predict(test_set)

In [44]:
# Prepare submission
output_df = pd.DataFrame({
    'Id': test_set.index,
    'log_bike_count': final_predictions
})

# Format log_bike_count:
output_df['log_bike_count'] = output_df['log_bike_count'].map(lambda x: f"{x:.4f}")

# Save to CSV:
output_df.to_csv('vs_sub_full_dataset_HGB_v3.csv', index=False)
print("Predictions saved to 'vs_sub_full_dataset_HGB_v3.csv'.")

Predictions saved to 'vs_sub_full_dataset_HGB_v3.csv'.
