In [100]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import holidays

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import BallTree
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline

In [101]:
df_train = pd.read_parquet(Path("../data") / "train.parquet")
df_test = pd.read_parquet(Path("../data") / "final_test.parquet")

X_train = df_train.drop(columns=['log_bike_count', 'bike_count'])
y_train = df_train['log_bike_count']
X_test = df_test

X_test.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
0,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 01:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
1,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 13:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
2,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 17:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
3,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 19:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
4,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2021-09-10 22:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429


In [102]:
X_train.head()

Unnamed: 0,counter_id,counter_name,site_id,site_name,date,counter_installation_date,coordinates,counter_technical_id,latitude,longitude
48321,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 02:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
48324,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 03:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
48327,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 04:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
48330,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 15:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429
48333,100007049-102007049,28 boulevard Diderot E-O,100007049,28 boulevard Diderot,2020-09-01 18:00:00,2013-01-18,"48.846028,2.375429",Y2H15027244,48.846028,2.375429


In the following cell, we drop columns from the dataset:

In [103]:
def _encode_dates(X):
    X = X.copy()  # modify a copy of X
    # Encode the date information from the DateOfDeparture columns
    X.loc[:, "year"] = X["date"].dt.year
    X.loc[:, "month"] = X["date"].dt.month
    X.loc[:, "day"] = X["date"].dt.day
    X.loc[:, "weekday"] = X["date"].dt.weekday
    X.loc[:, "hour"] = X["date"].dt.hour

    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["date"])

def _drop_init_cols(X):
    return X.drop(columns=['counter_name',
                           'coordinates',
                           'site_name',
                           'site_id',
                           'counter_technical_id',
                           'counter_installation_date'
                           ])

def _merge_external_data(X):
    file_path = Path("../Notebooks") / "weather_v1.csv"
    df_ext = pd.read_csv(file_path, parse_dates=["date"])

    X = X.copy()
    # When using merge_asof left frame need to be sorted
    X["orig_index"] = np.arange(X.shape[0])
    X = pd.merge_asof(
        X.sort_values("date"), df_ext[['date', 'pmer', 'tend', 'cod_tend', 
                                       'dd', 'ff', 't', 'td','u', 'vv', 'ww',  
                                       'n', 'nbas', 'pres', 'tend24', 'raf10', 'ht_neige',
                                       'rr1', 'rr6']].sort_values("date"), on="date"
    ) #'w1', 'w2', 'rafper', 'etat_sol',  'rr3', 'rr12', 'rr24', 'numer_sta', per
    # Sort back to the original order
    X = X.sort_values("orig_index")
    del X["orig_index"]
    return X

def _add_holiday_column(X):
    years=[2020, 2021]
    fr_holidays = holidays.France(years=years)
    X = X.copy()

    def is_holiday(date):
        weekday = date.weekday()
        if weekday > 4 or date in fr_holidays:
            return 1
        else:
            return 0

    X['is_holiday'] = X['date'].apply(is_holiday)

    return X

def _merge_connected_roads(X):
    file_path = Path("../Notebooks") / "reseau_cyclable_v1.csv"
    df_cyclable_roads = pd.read_csv(file_path)

    # Convert lat/lon to radians for haversine distance calculation
    lat_lon_cyclable_roads = np.deg2rad(df_cyclable_roads[['latitude', 'longitude']].values)
    lat_lon_original = np.deg2rad(X[['latitude', 'longitude']].values)

    # Create a BallTree with cyclable roads' coordinates
    tree = BallTree(lat_lon_cyclable_roads, metric='haversine')

    # Define your search radius in meters and convert to radians (Earth radius is approximately 6371 km)
    radius = 100 / 6371000

    # Query the tree for roads within the radius for each bike traffic point
    indices = tree.query_radius(lat_lon_original, r=radius)

    # Copy the DataFrame to avoid modifying the original one
    X = X.copy()

    # Count the number of roads within the radius for each site and add to DataFrame
    X['number_of_connected_roads'] = [len(index) for index in indices]

    return X

def _merge_velib_info(X):
    file_path = Path("../Notebooks") / "info_velib_v1.csv"
    df_velib = pd.read_csv(file_path)

    # Convert lat/lon to radians for haversine distance calculation
    lat_lon_stations = np.deg2rad(df_velib[['latitude', 'longitude']].values)
    lat_lon_original = np.deg2rad(X[['latitude', 'longitude']].values)

    # Create a BallTree with station coordinates
    tree = BallTree(lat_lon_stations, metric='haversine')

    # Define your search radius in meters and convert to radians (Earth radius is approximately 6371 km)
    radius = 150 / 6371000  # Example radius of 500 meters

    # Query the tree for stations within the radius for each point in X
    indices = tree.query_radius(lat_lon_original, r=radius)

    X = X.copy() # copy the DataFrame to avoid modifying the original one

    # Calculate the sum of capacities for stations within the radius for each site in X
    X['total_nearby_station_capacity'] = [df_velib.iloc[index]['Capacité de la station'].sum() for index in indices]
    X['number_of_nearby_stations'] = [len(index) for index in indices]

    return X

def _process_column_id(X):
    X = X.copy() 
    
    def process_id(counter_id):
        parts = counter_id.split('-')
        # If the second part is numeric, use it; otherwise, use the first part
        return parts[1] if parts[1].isdigit() else parts[0]

    X['counter_id'] = X['counter_id'].apply(process_id)
    X['counter_id'] = pd.to_numeric(X['counter_id']) # Convert the processed column_id to numeric

    return X


In [104]:
date_encoder = FunctionTransformer(_encode_dates)
is_holiday_col = FunctionTransformer(_add_holiday_column)
init_data_eng = FunctionTransformer(_drop_init_cols)
merge_external_data = FunctionTransformer(_merge_external_data, validate=False)
merge_connected_roads = FunctionTransformer(_merge_connected_roads)
merge_velib_info = FunctionTransformer(_merge_velib_info)
process_column_id = FunctionTransformer(_process_column_id)

In the following cell, we do some feature engineering on our dataset:

In [105]:
categorical_columns = ['counter_id']
#num_cols = 
one_hot = OneHotEncoder(handle_unknown='ignore')

def one_hot_encode_and_concat(X):
    one_hot_encoded_data = one_hot.fit_transform(X[categorical_columns])

    one_hot_encoded_df = pd.DataFrame(one_hot_encoded_data.toarray(), 
                                      columns=one_hot.get_feature_names_out(categorical_columns))

    X_dropped = X.drop(columns=categorical_columns)
    X_encoded = pd.concat([X_dropped.reset_index(drop=True), one_hot_encoded_df.reset_index(drop=True)], axis=1)

    return X_encoded

one_hot_transformer = FunctionTransformer(one_hot_encode_and_concat)


In [106]:
regressor = RandomForestRegressor(max_depth=20, 
                                  n_estimators=100, 
                                  n_jobs=-1, 
                                  random_state=42)

In [107]:
regressor_dummy = Ridge(alpha=0.1)

In [108]:
pipeline = Pipeline([
    ("merging_ext_data", merge_external_data),
    ("adding_is_holiday_column", is_holiday_col),
    ("merging_connected_roads_data", merge_connected_roads),
    ("merging_velib_info", merge_velib_info),
    ("encoding_dates", date_encoder),
    ("dropping_redundant_columns_in_initial_data", init_data_eng),
    ("processing_column_id", process_column_id),
    # ("one_hot_encoding", one_hot_transformer),
    #('scaler', StandardScaler(),      #for the ridge regression
    ("regressor", regressor)
])

In [95]:
# X_train.to_csv("df_original.csv", index=False)

In [96]:
# pipeline.fit_transform(X_train, y_train)

x = pipeline.fit_transform(X_train, y_train)
x_df = pd.DataFrame(x)

# x_df [x_df['site_name'] == 'Totem 73 boulevard de Sébastopol']



AttributeError: This 'Pipeline' has no attribute 'fit_transform'

In [79]:
x_df.shape

(496827, 30)

In [80]:
x_df.head()

Unnamed: 0,counter_id,latitude,longitude,pmer,tend,cod_tend,dd,ff,t,td,...,rr6,is_holiday,number_of_connected_roads,total_nearby_station_capacity,number_of_nearby_stations,year,month,day,weekday,hour
107,102007049,48.846028,2.375429,102050,-10,8,340,1.6,285.75,282.55,...,0.0,0,16,45,1,2020,9,1,1,2
153,102007049,48.846028,2.375429,101990,-60,6,290,1.1,283.95,282.05,...,0.0,0,16,45,1,2020,9,1,1,3
201,102007049,48.846028,2.375429,101990,-60,6,290,1.1,283.95,282.05,...,0.0,0,16,45,1,2020,9,1,1,4
782,102007049,48.846028,2.375429,101740,-110,6,40,4.0,293.65,279.95,...,0.0,0,16,45,1,2020,9,1,1,15
925,102007049,48.846028,2.375429,101760,10,3,20,3.0,292.15,280.55,...,0.0,0,16,45,1,2020,9,1,1,18


In [36]:
pipeline.fit(X_train, y_train)

KeyboardInterrupt: 

In [575]:
y_pred = pipeline.predict(X_test)

In [576]:
results = pd.DataFrame(
    dict(
        Id=np.arange(y_pred.shape[0]),
        log_bike_count=y_pred,
    )
)

results.to_csv("submission.csv", index=False)

In [577]:
test = pd.read_csv("submission.csv")
test.head()

Unnamed: 0,Id,log_bike_count
0,0,0.236738
1,1,1.428725
2,2,1.845324
3,3,0.485453
4,4,0.429309


## **<u>Train-test splitting and doing grid_search:</u>**

Because we are dealing with temporal data and our objective is to use historical data to predict future data, we define a new train test split.

In [109]:
def train_test_split_temporal(X, y, delta_threshold="30 days"):
    
    cutoff_date = X["date"].max() - pd.Timedelta(delta_threshold)
    mask = (X["date"] <= cutoff_date)
    X_train, X_valid = X.loc[mask], X.loc[~mask]
    y_train, y_valid = y[mask], y[~mask]

    return X_train, y_train, X_valid, y_valid

In [110]:
X_train_loc, y_train_loc, X_test_loc, y_test_loc = train_test_split_temporal(X_train, y_train)

In [83]:
pipeline.fit(X_train_loc, y_train_loc)

In [84]:
y_pred = pipeline.predict(X_test_loc)

AttributeError: 'FunctionTransformer' object has no attribute 'predict'

In [None]:
mse = mean_squared_error(y_test_loc, y_pred)

# Calculate root mean squared error
rmse = np.sqrt(mse)

print(f"RMSE: {rmse}")

In [None]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

cv = TimeSeriesSplit(n_splits=6)

# When using a scorer in scikit-learn it always needs to be better when smaller, hence the minus sign.
scores = cross_val_score(
    pipeline, X_train_loc, y_train_loc, cv=cv, scoring="neg_root_mean_squared_error"
)
print("RMSE: ", scores)
print(f"RMSE (all folds): {-scores.mean():.3} ± {(-scores).std():.3}")

In [377]:
# Using GridSearch

param_grid = {
    'regressor__max_depth': [15, 20, 25],
    'regressor__n_estimators': [100, 200]
}

grid_search = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=1
)

grid_search.fit(X_train_, y_train_)

print("Best Parameters Found:")
print(grid_search.best_params_)
print('\n')

print("Best Score: ", grid_search.best_score_)
print("Best Parameters: ", grid_search.best_params_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


KeyboardInterrupt: 