In [21]:
!pip install holidays

Collecting holidays
  Downloading holidays-0.62-py3-none-any.whl.metadata (26 kB)
Downloading holidays-0.62-py3-none-any.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ---------------------------------------- 1.2/1.2 MB 9.7 MB/s eta 0:00:00
Installing collected packages: holidays
Successfully installed holidays-0.62


In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from pathlib import Path
import holidays

import utils
_target_column_name = "log_bike_count"

In [110]:
def get_data():
    # Define the starting directory (e.g., current directory)
    file_name = "train.parquet"
    start_dir = Path(".")  # Current directory

    # Search for the specific file
    parquet_file = next(start_dir.rglob(file_name), None)

    if parquet_file:
        print(f"Loading file: {parquet_file}")
        
        # Load the Parquet file into a pandas DataFrame
        df = pd.read_parquet(parquet_file)  # Use the found file path

        y_array = df["log_bike_count"].values
        X_df = df[["date", "counter_name"]]
        return X_df, y_array
    else:
        print(f"File '{file_name}' not found in the directory.")


In [111]:
def merge_ext_data(X):
    # Define the starting directory (e.g., current directory)
    file_name = "external_data.csv"
    start_dir = Path(".")  # Current directory

    # Search for the specific file
    csv_file = next(start_dir.rglob(file_name), None)

    if csv_file:
        print(f"Loading file: {csv_file}")
        
        # Load the Parquet file into a pandas DataFrame
        df_ext = pd.read_csv(csv_file)  # Use the found file path
        df_ext = df_ext.fillna(0)
        df_ext["date"] = df_ext["date"].astype('datetime64[us]')
        X = X.copy()
        X["orig_index"] = np.arange(X.shape[0])
        X = pd.merge_asof(
        X.sort_values("date"), df_ext[["date", "t", "etat_sol", 'rr1', "ff"]].sort_values("date"), on="date")
        X = X.sort_values("orig_index")
        del X["orig_index"]

        df_expanded = X.loc[X.index.repeat(3)].reset_index(drop=True)
        # Add hourly intervals to the timestamp
        df_expanded["date"] += pd.to_timedelta(df_expanded.groupby(df_expanded.index // 3).cumcount(), unit="h")
        # Sort by timestamp
        df_expanded = df_expanded.sort_values("date").reset_index(drop=True)
        return X
    else:
        print(f"File '{file_name}' not found in the directory.")


In [112]:
def encode_dates(X):
    X = X.copy()  # modify a copy of X
    X["year"] = X["date"].dt.year
    X["month"] = X["date"].dt.month
    X["day"] = X["date"].dt.day
    X["weekday"] = X["date"].dt.weekday
    X["hour"] = X["date"].dt.hour
    for hour in range(24):
        weekend_feature = f"weekend_hour_{hour:02d}"
        weekday_feature = f"weekday_hour_{hour:02d}"

        X[weekend_feature] = ((X["weekday"].isin([0, 1, 2, 3, 4])) & (X["hour"] == hour)).astype(int)
        X[weekday_feature] = ((X["weekday"].isin([5, 6])) & (X["hour"] == hour)).astype(int)


    fr_holidays = holidays.France(years=[(2015 + i) for i in range(10)])
    X["is_red_day"] = X["date"].dt.date.isin(fr_holidays.keys()).astype(int)

    X = X.drop(columns=["date", "hour"])

    return X

In [113]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
X, y = get_data()
X = merge_ext_data(X)

date_encoder = FunctionTransformer(encode_dates)
date_cols = encode_dates(X[["date"]]).columns.tolist()

categorical_encoder = OneHotEncoder(handle_unknown="ignore")
categorical_cols = ["counter_name", "etat_sol"]
numerical_cols = ["t", "ff", "rr1"]

preprocessor = ColumnTransformer(
    [
        ("date", OneHotEncoder(handle_unknown="ignore"), date_cols),
        ("cat", categorical_encoder, categorical_cols),
        ("num", StandardScaler(), numerical_cols)
    ]
)


Loading file: data\train.parquet
Loading file: data\external_data.csv


(496827, 6)

In [122]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = (X["date"] <= cutoff_date)
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]
    return X_train, y_train, X_valid, y_valid

X_train, y_train, X_valid, y_valid = train_test_split_temporal(X, y)

In [123]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn import ensemble


# params = {
#     "n_estimators": 500,
#     "max_depth": 6,
#     "min_samples_split": 5,
#     "learning_rate": 0.1,
#     "loss": "squared_error",
# }

# regressor = ensemble.GradientBoostingRegressor(**params)

regressor = LinearRegression()

pipe = make_pipeline(date_encoder, preprocessor, regressor)

In [124]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

cv = TimeSeriesSplit(n_splits=5)

# When using a scorer in scikit-learn it always needs to be better when smaller, hence the minus sign.
scores = cross_val_score(
    pipe, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
print("RMSE: ", scores)
print(f"RMSE (all folds): {-scores.mean():.3} ± {(-scores).std():.3}")

RMSE:  [-1.30226406e+12 -6.43542918e+11 -1.60835657e+12 -9.05634152e-01
 -1.33591733e+00]
RMSE (all folds): 7.11e+11 ± 6.59e+11
