# Template used to test a model locally:

### Outline: 

- Import the Dataset using the correct function

- Use the function transformers to do the feature engineering - ```Please comment on any step that you have changed versus the best submission```

- Retrieve the transformed X_train and run ydata_profiling to find insights

- Create pipeline

- Run CV to get a baseline RMSE score

- Print the most important hyperparameters & run the Optuna training 

- Rerun the pipeline to see the impact - ```Please share any relevant information here```

- Update the script with the final model and test submission

## Example with the best-score script, using the `coco` weather feature instead of `['temp', 'rhum', 'wspd', 'prcp']`

### Import the Dataset

In [1]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge
import holidays
from lockdowndates.core import LockdownDates
import haversine as hs
from datetime import datetime
from meteostat import Point, Hourly
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import numpy as np
from pathlib import Path
from datetime import datetime
from meteostat import Point, Hourly

In [2]:
def _read_data():
    _target_column_name = 'log_bike_count'
    data = pd.read_parquet('/Users/ghalichraibi/Desktop/DATA Challenge/Data_Challenge/data/train.parquet')
    # Sort by date first, so that time based cross-validation would produce correct results
    data = data.sort_values(["date", "counter_name"])
    y_array = data[_target_column_name].values
    X_df = data.drop([_target_column_name, "bike_count"], axis=1)
    return X_df, y_array

In [3]:
# X_train / y_train: sorted training features and target returned by _read_data().
# X_final: the final_test.parquet frame, loaded as-is.
# NOTE(review): the path below is absolute and machine-specific — adjust it to your local checkout.
X_train, y_train = _read_data()
X_final = pd.read_parquet('/Users/ghalichraibi/Desktop/DATA Challenge/Data_Challenge/data/final_test.parquet')

### Function Transformers

In [4]:
# Encode dates with Fourier (sin/cos) features + Covid lockdown + holidays (to test performance with weekend + holidays)

def _encode_dates(X):
    """Expand the ``date`` column into calendar, holiday, lockdown and cyclical features.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain a datetime-typed ``date`` column. The frame passed in is
        not modified. The index is expected to be unnamed: the lockdown join
        goes through ``reset_index()`` / ``set_index('index')``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with these columns appended, in this order:

        * ``year``
        * ``date_ws`` – the calendar day as ``datetime64[ns]`` (join key)
        * ``holidays`` – 1 when the day is a French public holiday **or** a
          Saturday/Sunday, else 0
        * ``france_stay_at_home`` – value from the lockdown table for that
          day (NaN when the day is absent from the table, since it is a left
          join)
        * ``sin_*`` / ``cos_*`` pairs for month, day, weekday, hour, weekyear
          and season

        The intermediate integer columns (``month``, ``day``, ...) are removed
        once their sin/cos pair has been computed.
    """
    X = X.copy()  # modify a copy of X
    stamps = X["date"]
    calendar_days = stamps.dt.date  # object Series of datetime.date, reused twice below

    # Raw calendar components extracted from the timestamp.
    X["year"] = stamps.dt.year
    X["date_ws"] = calendar_days.astype('datetime64[ns]')  # To be used for merging
    X["month"] = stamps.dt.month
    X["day"] = stamps.dt.day
    X["weekday"] = stamps.dt.weekday
    X["hour"] = stamps.dt.hour
    X["week_of_year"] = stamps.dt.isocalendar().week
    # Four ~13-week buckets numbered 1..4; ISO weeks 52-53 wrap back into bucket 1.
    X["season"] = X["week_of_year"].apply(lambda w: (w // 13) % 4 + 1)

    # Holiday flag. Passing `years=` pre-populates the holidays mapping, so its
    # keys are the complete set of holiday dates for the years present in X and
    # a vectorised membership test replaces the former per-row lambda.
    fr_holidays = holidays.FR(years=X["year"].unique().tolist())
    is_public_holiday = calendar_days.isin(set(fr_holidays.keys()))
    is_weekend = X["weekday"] >= 5
    X["holidays"] = (is_public_holiday | is_weekend).astype(int)

    # Add covid restrictions (daily table, joined on the calendar day).
    ld = LockdownDates("France", "2020-09-01", "2022-01-01", ("stay_at_home", "masks"))
    lockdown_dates = ld.dates()
    X = (
        X.reset_index()
        .merge(lockdown_dates['france_stay_at_home'], how='left', left_on='date_ws', right_index=True)
        .set_index('index')
    )

    # Cyclical (sin/cos) encodings: (raw column, suffix of the new columns, period).
    # With a period of 52, an ISO week 53 lands on the same angle as week 1.
    cyclical_specs = (
        ("month", "month", 12),
        ("day", "day", 31),
        ("weekday", "weekday", 7),
        ("hour", "hour", 24),
        ("week_of_year", "weekyear", 52),
        ("season", "season", 4),
    )
    for raw_col, suffix, period in cyclical_specs:
        angle = 2 * np.pi * X[raw_col] / period
        X[f"sin_{suffix}"] = np.sin(angle)
        X[f"cos_{suffix}"] = np.cos(angle)
        # The raw integer is dropped right after encoding to keep the original column order.
        X = X.drop(columns=raw_col)

    return X

In [5]:
# Retrieve the closest transport station // Test with number of stations in a radius (Categorical)

def _closest_transport(X, stations_path='/Users/ghalichraibi/Desktop/DATA Challenge/Data_Challenge/data/Stations_IDF.csv'):
    """Add the distance from each counter to its nearest transport station.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``counter_id``, ``latitude`` and ``longitude``. Each
        counter's coordinates are taken from its first row. The frame passed
        in is not modified; its index is expected to be unnamed because the
        join goes through ``reset_index()`` / ``set_index('index')``.
    stations_path : str or path-like, optional
        ``;``-separated, header-less CSV whose columns are read as longitude,
        latitude, station name. Defaults to the path the notebook was written
        against, so ``_closest_transport(X)`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with a float ``closest_metro_distance`` column: the
        smallest ``hs.haversine`` distance between the counter and any station
        (in that function's default unit — kilometres per the haversine
        package documentation).

    Raises
    ------
    ValueError
        If ``X`` has counters but the stations file contains no rows.
    """
    column_names = ['longitude', 'latitude', 'station_name']  # Replace with your actual column names
    idf_stations = pd.read_csv(stations_path, delimiter=';', header=None, names=column_names)

    X = X.copy()

    # (lat, lon) tuples built once instead of re-walking the stations frame per counter.
    station_coords = list(zip(idf_stations['latitude'], idf_stations['longitude']))

    # One row per counter (its first occurrence) instead of two full-frame scans per counter.
    counters = X.drop_duplicates(subset='counter_id')
    if not station_coords and not counters.empty:
        raise ValueError(f"No stations found in {stations_path!r}")

    closest_by_counter = {}
    for counter_id, lat, lon in zip(counters['counter_id'], counters['latitude'], counters['longitude']):
        closest_by_counter[counter_id] = min(
            hs.haversine((lat, lon), station) for station in station_coords
        )

    # Built in one go with an explicit float dtype (no concat-in-a-loop on an empty frame).
    closest = pd.Series(closest_by_counter, name='closest_metro_distance', dtype=float)
    closest.index.name = 'counter_id'

    X = X.reset_index().merge(closest, how='left', left_on='counter_id', right_index=True).set_index('index')

    return X

In [6]:
# Add Weather Data // Only the 'Coco' column this time

def _add_weather_data(X):
    """Attach the hourly ``coco`` weather code to every row and drop helper columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``counter_id``, ``latitude``, ``longitude`` and ``date``,
        plus every column listed in the final ``drop`` (so it is expected to
        have gone through the date-encoding step, which creates ``date_ws``).
        The frame passed in is not modified; its index is expected to be
        unnamed because the join goes through ``reset_index()`` /
        ``set_index('index')``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``X`` with a ``coco`` column joined on (``counter_id``,
        ``date``) and without the raw identifier / location / date columns.
        Rows with no matching weather record get NaN (left join).
    """
    X = X.copy()

    # The same window and column selection apply to every counter, so define them once.
    start = datetime(2020, 8, 1)
    end = datetime(2022, 1, 1)
    selected_columns = ['coco']

    # One row per counter (its first occurrence) carries the coordinates to query.
    counters = X.drop_duplicates(subset='counter_id')

    # Counters that share coordinates would receive the identical series:
    # fetch once per location and reuse the result.
    weather_by_location = {}
    dfs = []
    for counter_id, lat, lon in zip(counters['counter_id'], counters['latitude'], counters['longitude']):
        location = (lat, lon)
        if location not in weather_by_location:
            hourly = Hourly(Point(*location), start, end).fetch()
            # shift(-1): each timestamp receives the code reported for the following hour.
            # NOTE(review): the reason for the one-hour shift is not visible here
            # (possibly a UTC vs. local-time adjustment) — confirm.
            weather_by_location[location] = hourly[selected_columns].astype('int').shift(-1)
        # assign() returns a new frame, so the cached series is never mutated.
        dfs.append(weather_by_location[location].assign(counter_id=counter_id))

    # Concatenate all DataFrames into a single result_df
    result_df = pd.concat(dfs)

    # Expose the time index as a 'date' column for merging
    result_df['date'] = pd.to_datetime(result_df.index)

    # Merge the result DataFrame with the original DataFrame on 'counter_id' and 'date'
    X = X.reset_index().merge(result_df, how='left', on=['counter_id', 'date']).set_index('index')

    return X.drop(columns=["date", "counter_name", "site_id", "site_name",
                           "counter_installation_date", "coordinates", "counter_technical_id",
                           "longitude", "latitude", "date_ws"])

### Call all functions on X train, concatenate with y_train and run ydata_profiling

In [7]:
# Keep the result in X_train_encoded — do not assign it back to X_train. The raw
# X_train must stay untouched because it is what gets passed to the modelling
# pipeline later, which applies these same three functions itself.

X_train_encoded = _encode_dates(X_train)
X_train_encoded = _closest_transport(X_train_encoded)
X_train_encoded = _add_weather_data(X_train_encoded)

Fetching lockdown dates...
Fetched lockdown dates for: France


In [8]:
# Prepare the frame for ydata_profiling: give features and target the same fresh
# 0..n-1 positional index so the column-wise concat pairs them row by row.

y_train_transformed = pd.Series(y_train).reset_index(drop=True)

concatenated_df = pd.concat(
    objs=[X_train_encoded.reset_index(drop=True), y_train_transformed],
    axis="columns",
)

In [9]:
# Preview the first rows of the combined features + target frame.
concatenated_df.head()

Unnamed: 0,counter_id,year,holidays,france_stay_at_home,sin_month,cos_month,sin_day,cos_day,sin_weekday,cos_weekday,sin_hour,cos_hour,sin_weekyear,cos_weekyear,sin_season,cos_season,closest_metro_distance,coco,0
0,100049407-353255860,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,0.258819,0.965926,-0.935016,-0.354605,-1.0,-1.83697e-16,0.26864,1.0,1.609438
1,100049407-353255859,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,0.258819,0.965926,-0.935016,-0.354605,-1.0,-1.83697e-16,0.26864,1.0,1.386294
2,100036719-104036719,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,0.258819,0.965926,-0.935016,-0.354605,-1.0,-1.83697e-16,0.016574,1.0,0.0
3,100036719-103036719,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,0.258819,0.965926,-0.935016,-0.354605,-1.0,-1.83697e-16,0.016574,1.0,0.693147
4,100063175-353277233,2020,0,0.0,-1.0,-1.83697e-16,0.201299,0.97953,0.781831,0.62349,0.258819,0.965926,-0.935016,-0.354605,-1.0,-1.83697e-16,0.197986,5.0,2.079442


In [10]:
#import ydata_profiling

#concatenated_df.profile_report()

**Note: the correlation matrix shows that some of the derived sin/cos features are correlated with each other. Try keeping only some of them and encoding the others as categories.**

### Create Pipeline and CV

In [11]:
def get_estimator():
    """Build the end-to-end model: feature functions, column encoding, then XGBoost.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline that chains ``_encode_dates``, ``_closest_transport``
        and ``_add_weather_data`` (each wrapped in a ``FunctionTransformer``),
        a ``ColumnTransformer`` and a default ``XGBRegressor``.
    """
    # Cyclical encodings handed to the regressor untouched: all sin_* first, then all cos_*.
    # The season pair is not listed here, so it is not forwarded by this transformer.
    cyclical_features = [
        f"{trig}_{unit}"
        for trig in ("sin", "cos")
        for unit in ("month", "day", "hour", "weekyear", "weekday")
    ]

    # Everything below is one-hot encoded; categories unseen at fit time are ignored.
    # The scaled numeric weather branch ('temp', 'rhum', 'wspd', 'prcp') is left out of this variant.
    one_hot_features = [
        "counter_id",
        "closest_metro_distance",
        "holidays",
        "france_stay_at_home",
        "year",
        "coco",
    ]

    column_encoder = ColumnTransformer(
        [
            ("date", 'passthrough', cyclical_features),
            ("cat", OneHotEncoder(handle_unknown="ignore"), one_hot_features),
        ]
    )

    # Feature functions run in this order: dates -> transport distance -> weather.
    feature_steps = [
        FunctionTransformer(step)
        for step in (_encode_dates, _closest_transport, _add_weather_data)
    ]

    return make_pipeline(*feature_steps, column_encoder, XGBRegressor())

In [12]:
pipe = get_estimator()

# cross_val_score works on clones of `pipe`; this explicit fit is what leaves a
# pipeline fitted on the full training set in `pipe`.
pipe.fit(X_train, y_train)

# Time-ordered folds: every validation fold lies after its training data.
cv = TimeSeriesSplit(n_splits=6)

# scikit-learn scorers follow a "greater is better" convention, so error metrics
# are exposed negated ("neg_root_mean_squared_error"). The sign is flipped once
# below so that everything reported is a true (positive) RMSE.
scores = cross_val_score(
    pipe, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
rmse_per_fold = -scores
print("RMSE: ", rmse_per_fold)
print(f"RMSE (all folds): {rmse_per_fold.mean():.3} ± {rmse_per_fold.std():.3}")

Fetching lockdown dates...
Fetched lockdown dates for: France
Learning rate set to 0.109189
0:	learn: 1.5447028	total: 76.8ms	remaining: 1m 16s
1:	learn: 1.4469341	total: 92.6ms	remaining: 46.2s
2:	learn: 1.3627290	total: 109ms	remaining: 36.2s
3:	learn: 1.2915703	total: 124ms	remaining: 31s
4:	learn: 1.2306188	total: 139ms	remaining: 27.7s
5:	learn: 1.1744404	total: 157ms	remaining: 26s
6:	learn: 1.1264022	total: 174ms	remaining: 24.7s
7:	learn: 1.0860563	total: 190ms	remaining: 23.6s
8:	learn: 1.0496096	total: 208ms	remaining: 22.9s
9:	learn: 1.0205137	total: 223ms	remaining: 22.1s
10:	learn: 0.9945802	total: 237ms	remaining: 21.3s
11:	learn: 0.9714565	total: 253ms	remaining: 20.8s
12:	learn: 0.9522651	total: 266ms	remaining: 20.2s
13:	learn: 0.9320198	total: 282ms	remaining: 19.9s
14:	learn: 0.9174434	total: 299ms	remaining: 19.6s
15:	learn: 0.9037098	total: 312ms	remaining: 19.2s
16:	learn: 0.8879186	total: 329ms	remaining: 19s
17:	learn: 0.8710332	total: 344ms	remaining: 18.8s
18:

In [13]:
# Display the pipeline that was built and fitted in the previous cell.
pipe