# Clean Pipeline

In [None]:
import utils
import pandas as pd
import numpy as np
from feature_eng_adrien import feature_test_adrien

# Load the training data and the external dataset.
X, y = utils.get_train_data()
df_ext = pd.read_csv("./external_data/external_data.csv")

# Columns of the external dataset that are merged onto the training data.
col_ext = [
    "date",
    "t",         # temperature, obvious factor
    "cl",        # cloud covering
    "tend24",    # weather tendency over 24h
    "etat_sol",  # state of the ground
    "rr3",       # rain during the last 3 hours
    "w1",
    "nbas",
    "nnuage1",
    "w2",
    "n",
    "raf10",
    "ht_neige",
    "ssfrai",
    "rr12",
    "rr24",
]

# Merge the selected external columns, then add the date-derived features.
training_full_set = feature_test_adrien._encode_dates(
    feature_test_adrien.merge_external_data(X, df_ext, col_ext)
)
training_full_set.columns

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(mode_value, inplace=True)


Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude', 't', 'cl', 'tend24', 'etat_sol', 'rr3', 'w1',
       'nbas', 'nnuage1', 'w2', 'n', 'raf10', 'ht_neige', 'ssfrai', 'rr12',
       'rr24', 'year', 'month', 'weekday', 'hour', 'is_night', 'is_weekend',
       'is_holiday', 'is_covid', 'month_sin', 'month_cos', 'season'],
      dtype='object')

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.impute import SimpleImputer # To handle the NaNs
from sklearn.preprocessing import OneHotEncoder

### PIPELINE CREATION
# Columns passed through a StandardScaler.
scaling_columns = [
    "t",
    "etat_sol",
    "cl",
    "nbas",
    "nnuage1",
    "raf10",
    "ht_neige",
    "ssfrai",
    "rr3",
    "longitude",
    "latitude",
]

# Columns that are one-hot encoded.
one_hot_columns = [
    "site_id",
    "n",  # cloud covering, scale 0 to 8
    "tend24",
    "w1",
]

# Binary flags, encoded with an OrdinalEncoder.
binary_encoding_columns = [
    "is_weekend",
    "is_holiday",
    "is_covid",
    "is_night",
]

# TODO: experiment with a RepeatingBasisFunction on "month"
# (n_periods=12, input_range=(1, 12), remainder="drop").

# Date-derived numeric features; only missing values are imputed for these.
numeric_features = [
    "hour",
    "season",
    "weekday",
    "month_cos",
    "month_sin",
    "year",
]

# Mean imputation for the numeric features (used in the ColumnTransformer below).
numeric_imputer = SimpleImputer(strategy='mean')

preprocessor = ColumnTransformer(
    transformers=[
        ("standard-scaler", StandardScaler(), scaling_columns),
        ("numeric-feature", numeric_imputer, numeric_features),
        ("one-hot-encoding", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), one_hot_columns),
        # NOTE: the step name below is misspelled but kept unchanged so that
        # any parameter path referring to it stays valid.
        ("binoary-encoder", OrdinalEncoder(), binary_encoding_columns)],
    remainder="drop"  # Drop columns not specified
)

# Full pipeline: preprocessing followed by a gradient-boosting regressor.
# random_state is fixed so that repeated runs give the same model: with the
# default early_stopping="auto" the estimator may draw a random internal
# validation split.
pipeline = make_pipeline(
    preprocessor,
    HistGradientBoostingRegressor(random_state=42)  # other hyper-parameters: defaults
)

### TRAIN_TEST_split and RMSE measures:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    """Split X and y chronologically on the "date" column of X.

    Rows whose date is at most ``X["date"].max() - delta_threshold`` form the
    training part; all remaining (more recent) rows form the validation part.

    Parameters
    ----------
    X : DataFrame with a "date" column.
    y : target values, indexed with the same boolean mask as X.
    delta_threshold : anything accepted by ``pd.Timedelta``; length of the
        validation window counted back from the latest date.

    Returns
    -------
    (X_train, y_train, X_valid, y_valid)
    """
    latest_date = X["date"].max()
    is_train = X["date"] <= latest_date - pd.Timedelta(delta_threshold)
    is_valid = ~is_train

    return X.loc[is_train], y[is_train], X.loc[is_valid], y[is_valid]

# Chronological split using the function's default window ("30 days"):
# the most recent rows become the validation part.
X_train, y_train, X_valid, y_valid = train_test_split_temporal(training_full_set, y)

In [42]:
# Fit the whole pipeline on the training part of the split.
pipeline.fit(X_train, y_train)

In [43]:
# Load the final test set and apply the same two feature-building steps
# (external-data merge, then date encoding) that were applied to the training data.
test_set = pd.read_parquet("./data/final_test.parquet")
test_full_set = feature_test_adrien._encode_dates(
    feature_test_adrien.merge_external_data(test_set, df_ext, col_ext)
)
test_full_set.columns

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X[col].fillna(mode_value, inplace=True)


Index(['counter_id', 'counter_name', 'site_id', 'site_name', 'date',
       'counter_installation_date', 'coordinates', 'counter_technical_id',
       'latitude', 'longitude', 't', 'cl', 'tend24', 'etat_sol', 'rr3', 'w1',
       'nbas', 'nnuage1', 'w2', 'n', 'raf10', 'ht_neige', 'ssfrai', 'rr12',
       'rr24', 'year', 'month', 'weekday', 'hour', 'is_night', 'is_weekend',
       'is_holiday', 'is_covid', 'month_sin', 'month_cos', 'season'],
      dtype='object')

In [45]:
# Report the RMSE obtained on the training and validation parts of the split.
# RMSE is computed as sqrt(MSE) because the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases.
train_rmse = np.sqrt(mean_squared_error(y_train, pipeline.predict(X_train)))
valid_rmse = np.sqrt(mean_squared_error(y_valid, pipeline.predict(X_valid)))
print(f"Train set, RMSE={train_rmse:.2f}")
print(f"Validation set, RMSE={valid_rmse:.2f}")

### PREDICTION

predictions = pipeline.predict(test_full_set)

### SUBMISSION
# NOTE(review): this pairs test_set.index with predictions made on
# test_full_set, so it assumes the feature-building steps keep the rows of
# test_set in their original order -- confirm against merge_external_data.
output_df = pd.DataFrame({
    'Id': test_set.index,
    'log_bike_count': predictions
})

# Format log_bike_count as a string with 4 decimals:
output_df['log_bike_count'] = output_df['log_bike_count'].map(lambda x: f"{x:.4f}")

# Save to CSV (the file name is defined once and reused in the message):
submission_path = 'senghor_sub_full_dataset_HGBooster_encoded_v4.csv'
output_df.to_csv(submission_path, index=False)
print(f"Predictions saved to '{submission_path}'.")



Train set, RMSE=0.59




Validation set, RMSE=0.57
Predictions saved to 'senghor_sub_full_dataset_HGBooster_encoded_v4.csv'.
