In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# __DATA EXPLORATION__

In [2]:
# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_parquet(Path("data") / "train.parquet")
# Preview the first rows to get a feel for the columns.
data.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude,log_bike_count
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.693147
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,0.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,1.609438
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429,2.302585


In [3]:
# Column dtypes and non-null counts of the training frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 496827 entries, 48321 to 929187
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[us]
 6   counter_installation_date  496827 non-null  datetime64[us]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   latitude                   496827 non-null  float64       
 10  longitude                  496827 non-null  float64       
 11  log_bike_count             496827 non-null  float64  

# __FEATURE ENGINEERING__

We will do feature extraction from the "date" column as seen in the starter kit, adding new features to the _encode_dates() function:

In [4]:

from datetime import datetime

def _encode_dates(X):
    """
    We will add feature engineering related to dates, including:
    - Weekend, since we saw in the starter kit that it has a big influence on the pattern
    - Public holiday detection, since it can have a big influence on the pattern
    - We will introducecCyclical encoding for month and weekday, 
    for Python to understand these features as cyclical and not linear
    
    Parameters:
    X (pd.DataFrame): Input DataFrame with a "date" column.

    Returns:
    pd.DataFrame: Transformed DataFrame with new features.
    """

    X = X.copy()  # modify a copy of X

    # Encode the date information from the DateOfDeparture columns
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour

    # Identify weekends
    X["is_weekend"] = X["weekday"].isin([5, 6]).astype(int)
    
    # Public holidays
    french_holidays = [
        # 2020 Holidays
        datetime(2020, 1, 1),   # New Year's Day
        datetime(2020, 4, 13),  # Easter Monday
        datetime(2020, 5, 1),   # Labor Day
        datetime(2020, 5, 8),   # Victory in Europe Day
        datetime(2020, 5, 21),  # Ascension Day
        datetime(2020, 6, 1),   # Whit Monday
        datetime(2020, 7, 14),  # Bastille Day
        datetime(2020, 8, 15),  # Assumption Day
        datetime(2020, 11, 1),  # All Saints' Day
        datetime(2020, 11, 11), # Armistice Day
        datetime(2020, 12, 25), # Christmas Day

        # 2021 Holidays
        datetime(2021, 1, 1),   # New Year's Day
        datetime(2021, 4, 5),   # Easter Monday
        datetime(2021, 5, 1),   # Labor Day
        datetime(2021, 5, 8),   # Victory in Europe Day
        datetime(2021, 5, 13),  # Ascension Day
        datetime(2021, 5, 24),  # Whit Monday
        datetime(2021, 7, 14),  # Bastille Day
        datetime(2021, 8, 15),  # Assumption Day
        datetime(2021, 11, 1),  # All Saints' Day
        datetime(2021, 11, 11), # Armistice Day
        datetime(2021, 12, 25), # Christmas Day
    ]
    X["is_holiday"] = X["date"].isin(french_holidays).astype(int)

    # Cyclical encoding for months
    X["month_sin"] = np.sin(2 * np.pi * X["month"] / 12)
    X["month_cos"] = np.cos(2 * np.pi * X["month"] / 12)

    X["weekday_sin"] = np.sin(2 * np.pi * X["weekday"] / 7)
    X["weekday_cos"] = np.cos(2 * np.pi * X["weekday"] / 7)

    return X.drop(columns=["date", "month", "weekday"])

Then, we extract features from the "latitude" and "longitude" columns. The "coordinates" column is redundant with these two (it holds the same pair of values written as a single string), so we do not use it as a feature:

In [5]:

def _encode_lat_lon(X):
    """
    We will add feature engineering related to latitude and longitude, including:
    - The parisian arrondissement the counter is located
    
    Parameters:
    X (pd.DataFrame): Input DataFrame with a "latitude" and a "longitude" column.

    Returns:
    pd.DataFrame: Transformed DataFrame with new features.
    """

    X = X.copy()  # modify a copy of X

    # We took some help to find the list of the coordinates of the Parisian arrondissements:
    arrondissement_ranges = {
    1: ((48.861992, 48.865215), (2.332125, 2.336405)),  # 1st arrondissement
    2: ((48.863206, 48.866611), (2.341881, 2.347173)),  # 2nd arrondissement
    3: ((48.861255, 48.865822), (2.360033, 2.367505)),  # 3rd arrondissement
    4: ((48.843799, 48.861194), (2.354573, 2.365420)),  # 4th arrondissement
    5: ((48.831783, 48.843197), (2.342185, 2.354164)),  # 5th arrondissement
    6: ((48.841360, 48.851082), (2.322960, 2.335156)),  # 6th arrondissement
    7: ((48.855246, 48.861785), (2.292324, 2.304169)),  # 7th arrondissement
    8: ((48.871865, 48.876433), (2.298935, 2.316489)),  # 8th arrondissement
    9: ((48.878293, 48.886988), (2.332082, 2.342776)),  # 9th arrondissement
    10: ((48.867081, 48.877137), (2.354785, 2.368476)), # 10th arrondissement
    11: ((48.860539, 48.868028), (2.368059, 2.377489)), # 11th arrondissement
    12: ((48.838929, 48.853496), (2.372685, 2.395029)), # 12th arrondissement
    13: ((48.832859, 48.845874), (2.364123, 2.377156)), # 13th arrondissement
    14: ((48.835038, 48.844455), (2.308681, 2.334001)), # 14th arrondissement
    15: ((48.846062, 48.868057), (2.285918, 2.314678)), # 15th arrondissement
    16: ((48.846780, 48.876165), (2.246473, 2.296048)), # 16th arrondissement
    17: ((48.873200, 48.887113), (2.284953, 2.319586)), # 17th arrondissement
    18: ((48.877165, 48.895797), (2.324915, 2.363556)), # 18th arrondissement
    19: ((48.868440, 48.886066), (2.377394, 2.396128)), # 19th arrondissement
    20: ((48.855109, 48.873755), (2.382319, 2.411434))  # 20th arrondissement
    }
    
    # We will use a function to determine, based on the lat & lon of the counter,
    # in which arrondissement it is located:
    def find_arrondissement(lat, lon):
        for arr, ((lat_min, lat_max), (lon_min, lon_max)) in arrondissement_ranges.items():
            if lat_min <= lat <= lat_max and lon_min <= lon <= lon_max:
                return arr
        return None
    
    X['arrondissement'] = X.apply(lambda row: find_arrondissement(row['latitude'], row['longitude']), axis=1)

    return X.drop(columns=["latitude", "longitude"])

In [6]:
# Inspect the frame after the lat/lon transformation; the non-null count of
# "arrondissement" shows how many rows fall inside one of the bounding boxes.
_encode_lat_lon(data).info()

<class 'pandas.core.frame.DataFrame'>
Index: 496827 entries, 48321 to 929187
Data columns (total 11 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   counter_id                 496827 non-null  category      
 1   counter_name               496827 non-null  category      
 2   site_id                    496827 non-null  int64         
 3   site_name                  496827 non-null  category      
 4   bike_count                 496827 non-null  float64       
 5   date                       496827 non-null  datetime64[us]
 6   counter_installation_date  496827 non-null  datetime64[us]
 7   coordinates                496827 non-null  category      
 8   counter_technical_id       496827 non-null  category      
 9   log_bike_count             496827 non-null  float64       
 10  arrondissement             259323 non-null  float64       
dtypes: category(5), datetime64[us](2), float64(3), int64(

# __PIPELINE CREATION__

First, we wrap our functions with [FunctionTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html) to use them with scikit-learn estimators:

In [7]:
from sklearn.preprocessing import FunctionTransformer

# Wrap the plain feature-engineering functions as scikit-learn transformers.
date_encoder = FunctionTransformer(_encode_dates, validate=False)
lat_lon_encoder = FunctionTransformer(_encode_lat_lon, validate=False)

# Date features first, then the latitude/longitude features on top of that result.
data_encoded = lat_lon_encoder.fit_transform(date_encoder.fit_transform(data))

data_encoded

Unnamed: 0,counter_id,counter_name,site_id,site_name,bike_count,counter_installation_date,coordinates,counter_technical_id,log_bike_count,year,day,hour,is_weekend,is_holiday,month_sin,month_cos,weekday_sin,weekday_cos,arrondissement
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,"48.846028,2.375429",Y2H15027244,0.000000,2020,1,2,0,0,-1.0,-1.836970e-16,0.781831,0.623490,12.0
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,1.0,2013-01-18,"48.846028,2.375429",Y2H15027244,0.693147,2020,1,3,0,0,-1.0,-1.836970e-16,0.781831,0.623490,12.0
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,0.0,2013-01-18,"48.846028,2.375429",Y2H15027244,0.000000,2020,1,4,0,0,-1.0,-1.836970e-16,0.781831,0.623490,12.0
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,4.0,2013-01-18,"48.846028,2.375429",Y2H15027244,1.609438,2020,1,15,0,0,-1.0,-1.836970e-16,0.781831,0.623490,12.0
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,9.0,2013-01-18,"48.846028,2.375429",Y2H15027244,2.302585,2020,1,18,0,0,-1.0,-1.836970e-16,0.781831,0.623490,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929175,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,445.0,2020-11-29,"48.83977,2.30198",Y2H20114504,6.100319,2021,9,6,0,0,-1.0,-1.836970e-16,0.433884,-0.900969,
929178,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,145.0,2020-11-29,"48.83977,2.30198",Y2H20114504,4.983607,2021,9,10,0,0,-1.0,-1.836970e-16,0.433884,-0.900969,
929181,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,218.0,2020-11-29,"48.83977,2.30198",Y2H20114504,5.389072,2021,9,15,0,0,-1.0,-1.836970e-16,0.433884,-0.900969,
929184,300014702-353245971,254 rue de Vaugirard SO-NE,300014702,254 rue de Vaugirard,21.0,2020-11-29,"48.83977,2.30198",Y2H20114504,3.091042,2021,9,22,0,0,-1.0,-1.836970e-16,0.433884,-0.900969,


We will apply OneHotEncoder to the categorical features, and also to numeric features whose relationship with the target is not linear, such as "hour", "arrondissement", "month_sin", "month_cos", "weekday_sin", "weekday_cos":

In [8]:
from sklearn.preprocessing import OneHotEncoder

# Dense output so the encoded matrix can be displayed directly.
enc = OneHotEncoder(sparse_output=False)

# Every distinct value of each selected column becomes its own indicator column.
enc.fit_transform(
    data_encoded.loc[
        :,
        ["hour", "arrondissement", "month_sin", "month_cos", "weekday_sin", "weekday_cos"],
    ]
)

array([[0., 0., 1., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### __Linear model__

In [9]:
import utils

# Features and target from the project-local `utils` module.
# NOTE(review): presumably the starter-kit helper, with y holding log_bike_count — confirm.
X, y = utils.get_train_data()

def train_test_split_temporal(X, y, delta_threshold="30 days"):
    """
    Split (X, y) chronologically into a training part and a validation part.

    The validation part is made of the rows whose "date" falls within the last
    `delta_threshold` before the most recent date in X; every row dated at or
    before that cutoff goes to the training part.

    Parameters:
    X (pd.DataFrame): Features, with a datetime "date" column.
    y: Targets aligned with the rows of X; must support boolean-mask indexing.
    delta_threshold: Length of the validation window, in any form accepted by
        pd.Timedelta.

    Returns:
    tuple: (X_train, y_train, X_valid, y_valid)
    """
    dates = X["date"]
    is_train = dates <= dates.max() - pd.Timedelta(delta_threshold)
    is_valid = ~is_train

    return X.loc[is_train], y[is_train], X.loc[is_valid], y[is_valid]

# Hold out the most recent 30 days (the function's default threshold) for validation.
X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

In [10]:
# Preview the training split.
X_train.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
400125,100049407-353255860,152 boulevard du Montparnasse E-O,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233
408305,100049407-353255859,152 boulevard du Montparnasse O-E,100049407,152 boulevard du Montparnasse,2020-09-01 01:00:00,2018-12-07,"48.840801,2.333233",Y2H19070373,48.840801,2.333233
87516,100036719-104036719,18 quai de l'Hôtel de Ville NO-SE,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,2.35702
98518,100036719-103036719,18 quai de l'Hôtel de Ville SE-NO,100036719,18 quai de l'Hôtel de Ville,2020-09-01 01:00:00,2017-07-12,"48.85372,2.35702",Y2H19027732,48.85372,2.35702
875137,100063175-353277233,20 Avenue de Clichy NO-SE,100063175,20 Avenue de Clichy,2020-09-01 01:00:00,2020-07-22,"48.88529,2.32666",Y2H20073268,48.88529,2.32666


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer # To handle the NaNs not supported by the Ridge() regressor...

# Fresh transformer instances for the modelling pipeline.
date_encoder = FunctionTransformer(_encode_dates, validate=False)
lat_lon_encoder = FunctionTransformer(_encode_lat_lon, validate=False)

# Each engineered branch is followed by a mean imputer, because Ridge rejects NaNs.
# NOTE(review): on the lat/lon branch this fills missing arrondissement numbers
# with their mean and feeds the number to the model as a quantity — confirm this
# is intended rather than one-hot encoding the arrondissement.
date_pipeline = make_pipeline(date_encoder, SimpleImputer(strategy="mean"))
lat_lon_pipeline = make_pipeline(lat_lon_encoder, SimpleImputer(strategy="mean"))

# Categories unseen at fit time are encoded as all-zero rows instead of raising.
categorical_encoder = OneHotEncoder(handle_unknown="ignore")

# Route each group of input columns to its own branch; columns not listed are dropped.
preprocessor = ColumnTransformer(
    [
        ("date", date_pipeline, ["date"]),
        ("lat_lon", lat_lon_pipeline, ["latitude", "longitude"]),
        ("categorical", categorical_encoder, ["counter_name", "site_name"]),
    ],
    remainder="drop",
)

# Preprocessing followed by a Ridge regression with default settings.
pipeline = make_pipeline(preprocessor, Ridge())

pipeline.fit(X_train, y_train)

In [12]:
from sklearn.metrics import mean_squared_error

# RMSE on both splits. The square root is taken explicitly: the `squared=False`
# shortcut of mean_squared_error is deprecated in scikit-learn 1.4 and removed
# in 1.6, whereas this form works on every version.
print(
    f"Train set, RMSE={np.sqrt(mean_squared_error(y_train, pipeline.predict(X_train))):.2f}"
)
print(
    f"Valid set, RMSE={np.sqrt(mean_squared_error(y_valid, pipeline.predict(X_valid))):.2f}"
)



Train set, RMSE=1.43
Valid set, RMSE=1.16




# __CROSS VAL - MODEL SELECTION__

In [None]:
# TODO

# __HYPERPARAMETER TUNING__

In [None]:
# TODO

# __MODEL VALIDATION__

In [None]:
# TODO

# __TEST__

In [13]:
# Load the final test set; the path is relative to the notebook's working directory.
test_data = pd.read_parquet(Path("data") / "final_test.parquet")

In [15]:
# Predict with the fitted pipeline. The preprocessor only reads "date", "latitude",
# "longitude", "counter_name" and "site_name"; any other column is dropped.
predictions = pipeline.predict(test_data)

In [17]:
# Build the submission frame: one row per test sample, identified by the test set's index.
output_df = pd.DataFrame({"Id": test_data.index, "log_bike_count": predictions})

# Render the predictions as fixed-point strings with four decimals.
output_df["log_bike_count"] = output_df["log_bike_count"].map("{:.4f}".format)

# Write the CSV without the DataFrame's own index, then report where it went.
output_df.to_csv('predictions_victor_SOTO_Ridge_3.csv', index=False)
print("Predictions saved to 'predictions_victor_SOTO_Ridge_3.csv'.")

Predictions saved to 'predictions_victor_SOTO_Ridge_3.csv'.
