# Project Objectives:
1. **Analyse Usage and Demand Patterns:** Examine the extensive trip data available in Co-Wheels’ booking system (TripIQ) to identify patterns in vehicle usage and demand across different locations and times.
2. **Design and Develop a Pricing Model and Tool:**
     1. Create a pricing model that incorporates fixed and variable costs, including fuel and electricity, to determine optimal hourly and daily rates for different locations and times.
     2. Develop a straightforward tool that allows Co-Wheels to input various cost factors and receive tailored pricing options based on location, demand, and seasonal variations.   
3. **Evaluate Seasonal and Temporal Variations:** Assess the impact of seasonal changes and time-of-day variations on car-sharing demand and integrate these factors into the pricing model.
4. **Assess Profitability and Utilisation Impact:** Model potential outcomes of different pricing strategies to evaluate their impact on profitability and vehicle utilisation rates in various locations.
5. **Validate the Pricing Tool:** Test the pricing tool with real-world data to ensure its accuracy and effectiveness in optimising Co-Wheels’ pricing strategy.

# 1. Loading libraries and data

In [1]:
pip install category_encoders



In [2]:
# Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import warnings
from scipy import stats
from matplotlib.ticker import FuncFormatter
from category_encoders import BinaryEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
import joblib

# Display all the columns of the dataframe
pd.set_option('display.max_columns', None)

# Ignore all warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation and convergence notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Mount Google Drive so the project folders below can be read and written (Colab only)
from google.colab import drive
drive.mount('/content/drive')

# Data location
DATA_PATH = "/content/drive/MyDrive/MSc Dissertation/data/"
# Encoders location
ENCODERS_PATH = "/content/drive/MyDrive/MSc Dissertation/encoders/"
# Model location
MODEL_PATH = "/content/drive/MyDrive/MSc Dissertation/models/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 1.1. Load data

In [3]:
# Read the two pre-processed CSV exports from the project data folder.
transformed_csv = DATA_PATH + 'transformed_dataset.csv'
scaled_csv = DATA_PATH + 'scaled_dataset.csv'
df_transformed = pd.read_csv(transformed_csv)
df = pd.read_csv(scaled_csv)

# Report the (rows, columns) shape of each frame.
for label, frame in (('Scaled data: ', df), ('Transformed data: ', df_transformed)):
    print(label, frame.shape)

Scaled data:  (378614, 57)
Transformed data:  (378614, 39)


In [4]:
# Preview the first rows of the frame loaded from scaled_dataset.csv
df.head()

Unnamed: 0,vehicle_description,vehicle_registration,location_0,location_1,location_2,location_3,location_4,location_5,location_6,booking_id,booking_actual_start,booking_actual_end,booking_actual_duration,booking_billed_start,booking_billed_start_hour,booking_billed_start_dayofweek,booking_billed_start_month,booking_billed_start_year,booking_billed_end,booking_billed_end_hour,booking_billed_end_dayofweek,booking_billed_end_month,booking_billed_end_year,booking_billed_duration,booking_mileage,booking_rates_hours,booking_rates_24hours,booking_rates_overnight,booking_actual_cost_distance,booking_actual_cost_time,booking_actual_cost_total,booking_created_at,booking_created_at_hour,booking_created_at_dayofweek,booking_created_at_month,booking_created_at_year,season_Winter,season_Autumn,season_Summer,season_Spring,is_holiday_1.0,is_holiday_0.0,Vehicle Type_City,Vehicle Type_Everyday,Vehicle Type_Family,Vehicle Type_Van,Vehicle Type_7 Seater,Fuel Type_Petrol,Fuel Type_EV,Fuel Type_Hydrogen,is_peak_hour_1.0,is_peak_hour_0.0,is_weekend_1.0,is_weekend_0.0,hourly_rate,daily_rate,per_mile
0,White Toyota Aygo XPlay Nav (NL66 NKZ),NL66 NKZ,0,0,0,0,0,0,1,fc59ba9c-2ce9-11ea-8efc-bfa292f04054,2020-01-01 23:06:00,2020-01-02 01:24:00,4.934474,2020-01-01 23:00:00,23,2,1,2020,2020-01-02 08:00:00,8,3,1,2020,6.293419,4.418841,0.0,0.0,1.0,2.757475,2.351375,3.229222,2020-01-01 22:56:00,22,2,1,2020,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,4.75,33.25,0.18
1,Renault Zoe EV (DL68 LNK),DL68 LNK,0,0,0,0,0,0,1,1cf7fae6-2ce7-11ea-a970-fbf7b3786523,2020-01-01 22:46:00,2020-01-02 02:01:00,5.278115,2020-01-01 22:45:00,22,2,1,2020,2020-01-02 02:30:00,2,3,1,2020,5.420535,1.791759,0.0,0.0,1.0,0.0,2.484907,2.484907,2020-01-01 22:36:00,22,2,1,2020,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,1,5.5,38.5,0.18
2,White Toyota Yaris Petrol Hybrid (Automatic) ...,ND19 HWY,0,0,0,0,0,1,0,591b7a98-2cd8-11ea-a341-71665a42c0e8,2020-01-01 21:29:00,2020-01-01 23:28:00,4.787492,2020-01-01 21:30:00,21,2,1,2020,2020-01-01 23:30:00,23,2,1,2020,4.795791,1.94591,2.0,0.0,0.0,0.732368,2.484907,2.571084,2020-01-01 20:50:00,20,2,1,2020,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,5.5,38.5,0.18
3,White Toyota Aygo XPlay (NL68 JVP),NL68 JVP,0,0,0,0,0,1,1,b6177928-2c55-11ea-a2e0-e9685b57c653,2020-01-01 21:22:00,2020-01-01 21:38:00,2.833213,2020-01-01 21:15:00,21,2,1,2020,2020-01-01 21:45:00,21,2,1,2020,3.433987,1.94591,0.5,0.0,0.0,0.732368,1.217876,1.495149,2020-01-01 05:15:00,5,2,1,2020,1,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,1,4.75,33.25,0.18
4,White Toyota Yaris Hybrid NJ68 JHV Automatic 5...,NJ68 JHV,0,0,0,0,1,0,0,9ffdc790-27ba-11ea-a36d-c540c1997f03,2020-01-01 22:02:00,2020-01-02 19:50:00,6.598509,2020-01-01 20:30:00,20,2,1,2020,2020-01-02 20:30:00,20,3,1,2020,6.820016,3.988984,0.0,1.0,0.0,2.355178,3.676301,3.892636,2019-12-26 08:35:00,8,3,12,2019,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,5.5,38.5,0.18


In [5]:
# Preview the first rows of the frame loaded from transformed_dataset.csv
df_transformed.head()

Unnamed: 0,vehicle_description,vehicle_registration,location,booking_id,booking_actual_start,booking_actual_end,booking_actual_duration,booking_billed_start,booking_billed_start_hour,booking_billed_start_dayofweek,booking_billed_start_month,booking_billed_start_year,booking_billed_end,booking_billed_end_hour,booking_billed_end_dayofweek,booking_billed_end_month,booking_billed_end_year,booking_billed_duration,booking_mileage,booking_rates_hours,booking_rates_24hours,booking_rates_overnight,booking_actual_cost_distance,booking_actual_cost_time,booking_actual_cost_total,booking_created_at,booking_created_at_hour,booking_created_at_dayofweek,booking_created_at_month,booking_created_at_year,season,is_holiday,Vehicle Type,Fuel Type,hourly_rate,daily_rate,per_mile,is_weekend,is_peak_hour
0,White Toyota Aygo XPlay Nav (NL66 NKZ),NL66 NKZ,Bristol,fc59ba9c-2ce9-11ea-8efc-bfa292f04054,2020-01-01 23:06:00,2020-01-02 01:24:00,4.934474,2020-01-01 23:00:00,23,2,1,2020,2020-01-02 08:00:00,8,3,1,2020,6.293419,4.418841,0.0,0.0,1.0,2.757475,2.351375,3.229222,2020-01-01 22:56:00,22,2,1,2020,Winter,1,City,Petrol,4.75,33.25,0.18,0,0
1,Renault Zoe EV (DL68 LNK),DL68 LNK,Bristol,1cf7fae6-2ce7-11ea-a970-fbf7b3786523,2020-01-01 22:46:00,2020-01-02 02:01:00,5.278115,2020-01-01 22:45:00,22,2,1,2020,2020-01-02 02:30:00,2,3,1,2020,5.420535,1.791759,0.0,0.0,1.0,0.0,2.484907,2.484907,2020-01-01 22:36:00,22,2,1,2020,Winter,1,Everyday,EV,5.5,38.5,0.18,0,0
2,White Toyota Yaris Petrol Hybrid (Automatic) ...,ND19 HWY,Newbury,591b7a98-2cd8-11ea-a341-71665a42c0e8,2020-01-01 21:29:00,2020-01-01 23:28:00,4.787492,2020-01-01 21:30:00,21,2,1,2020,2020-01-01 23:30:00,23,2,1,2020,4.795791,1.94591,2.0,0.0,0.0,0.732368,2.484907,2.571084,2020-01-01 20:50:00,20,2,1,2020,Winter,1,Everyday,Petrol,5.5,38.5,0.18,0,0
3,White Toyota Aygo XPlay (NL68 JVP),NL68 JVP,Swindon,b6177928-2c55-11ea-a2e0-e9685b57c653,2020-01-01 21:22:00,2020-01-01 21:38:00,2.833213,2020-01-01 21:15:00,21,2,1,2020,2020-01-01 21:45:00,21,2,1,2020,3.433987,1.94591,0.5,0.0,0.0,0.732368,1.217876,1.495149,2020-01-01 05:15:00,5,2,1,2020,Winter,1,City,Petrol,4.75,33.25,0.18,0,0
4,White Toyota Yaris Hybrid NJ68 JHV Automatic 5...,NJ68 JHV,Horsham,9ffdc790-27ba-11ea-a36d-c540c1997f03,2020-01-01 22:02:00,2020-01-02 19:50:00,6.598509,2020-01-01 20:30:00,20,2,1,2020,2020-01-02 20:30:00,20,3,1,2020,6.820016,3.988984,0.0,1.0,0.0,2.355178,3.676301,3.892636,2019-12-26 08:35:00,8,3,12,2019,Winter,1,Everyday,Petrol,5.5,38.5,0.18,0,0


In [6]:
# Convert to pandas datetime object so both frames can be ordered chronologically
df['booking_billed_start'] = pd.to_datetime(df['booking_billed_start'])
df_transformed['booking_billed_start'] = pd.to_datetime(df_transformed['booking_billed_start'])

# Sort data by booking_billed_start to ensure temporal order.
# Both frames are sorted on the same key and re-indexed from 0; the location mask built
# from df_transformed further down is applied to df by index.
df = df.sort_values(by='booking_billed_start').reset_index(drop=True)
df_transformed = df_transformed.sort_values(by='booking_billed_start').reset_index(drop=True)

# Drop unnecessary columns (identifiers and raw timestamp strings).
# A non in-place drop with errors='ignore' keeps this cell re-runnable: executing it a
# second time no longer raises KeyError for columns that are already gone.
df = df.drop(columns=['vehicle_description', 'vehicle_registration', 'booking_id', 'booking_actual_start',
                      'booking_actual_end', 'booking_billed_end', 'booking_created_at'],
             errors='ignore')

df.shape

(378614, 51)

In [7]:
# Preview df after sorting and dropping columns.
# NOTE(review): the saved output of this cell still lists a booking_id column although the
# previous cell drops it — the stored output looks stale; re-run the notebook to refresh it.
df.head()

Unnamed: 0,location_0,location_1,location_2,location_3,location_4,location_5,location_6,booking_id,booking_actual_duration,booking_billed_start,booking_billed_start_hour,booking_billed_start_dayofweek,booking_billed_start_month,booking_billed_start_year,booking_billed_end_hour,booking_billed_end_dayofweek,booking_billed_end_month,booking_billed_end_year,booking_billed_duration,booking_mileage,booking_rates_hours,booking_rates_24hours,booking_rates_overnight,booking_actual_cost_distance,booking_actual_cost_time,booking_actual_cost_total,booking_created_at_hour,booking_created_at_dayofweek,booking_created_at_month,booking_created_at_year,season_Winter,season_Autumn,season_Summer,season_Spring,is_holiday_1.0,is_holiday_0.0,Vehicle Type_City,Vehicle Type_Everyday,Vehicle Type_Family,Vehicle Type_Van,Vehicle Type_7 Seater,Fuel Type_Petrol,Fuel Type_EV,Fuel Type_Hydrogen,is_peak_hour_1.0,is_peak_hour_0.0,is_weekend_1.0,is_weekend_0.0,hourly_rate,daily_rate,per_mile
0,0,0,0,0,1,1,0,63a1866c-0ca2-11e9-b63c-bbf97d8700d8,4.454347,2018-12-31 02:30:00,2,0,12,2018,4,0,12,2018,4.663439,3.367296,1.75,0.0,0.0,1.798404,2.36368,2.751748,2,0,12,2018,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,1,5.5,38.5,0.18
1,0,0,0,0,1,1,0,5da81746-0c78-11e9-8e9e-017b0b6178d0,3.178054,2018-12-31 08:00:00,8,0,12,2018,8,0,12,2018,3.828641,1.609438,0.75,0.0,0.0,0.542324,1.635106,1.766442,21,6,12,2018,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,5.5,38.5,0.18
2,0,0,0,0,0,0,1,ee131264-0c63-11e9-8d53-a129eaa6e810,4.927254,2018-12-31 08:15:00,8,0,12,2018,11,0,12,2018,5.111988,2.772589,2.75,0.0,0.0,1.308333,2.780681,2.935451,18,6,12,2018,0,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,1,5.5,38.5,0.18
3,0,0,0,1,0,1,1,4ced6a56-0c4f-11e9-814f-63c5d39c3910,4.672829,2018-12-31 08:30:00,8,0,12,2018,10,0,12,2018,4.795791,2.397895,2.0,0.0,0.0,0.0,2.484907,2.484907,16,6,12,2018,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,1,5.5,38.5,0.18
4,0,0,0,0,1,1,1,2b4e190c-0bb6-11e9-8b21-79c4cede1e0f,4.553877,2018-12-31 08:30:00,8,0,12,2018,11,0,12,2018,5.01728,2.397895,2.5,0.0,0.0,1.029619,2.555676,2.686486,22,5,12,2018,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,4.75,33.25,0.18


In [8]:
# Preview df_transformed after it has been sorted by booking_billed_start
df_transformed.head()

Unnamed: 0,vehicle_description,vehicle_registration,location,booking_id,booking_actual_start,booking_actual_end,booking_actual_duration,booking_billed_start,booking_billed_start_hour,booking_billed_start_dayofweek,booking_billed_start_month,booking_billed_start_year,booking_billed_end,booking_billed_end_hour,booking_billed_end_dayofweek,booking_billed_end_month,booking_billed_end_year,booking_billed_duration,booking_mileage,booking_rates_hours,booking_rates_24hours,booking_rates_overnight,booking_actual_cost_distance,booking_actual_cost_time,booking_actual_cost_total,booking_created_at,booking_created_at_hour,booking_created_at_dayofweek,booking_created_at_month,booking_created_at_year,season,is_holiday,Vehicle Type,Fuel Type,hourly_rate,daily_rate,per_mile,is_weekend,is_peak_hour
0,White Toyota Yaris Hybrid NJ68 VPA Automatic 5...,NJ68 VPA,Glasgow,63a1866c-0ca2-11e9-b63c-bbf97d8700d8,2018-12-31 02:25:00,2018-12-31 03:50:00,4.454347,2018-12-31 02:30:00,2,0,12,2018,2018-12-31 04:15:00,4,0,12,2018,4.663439,3.367296,1.75,0.0,0.0,1.798404,2.36368,2.751748,2018-12-31 02:18:00,2,0,12,2018,Autumn,1,Everyday,Petrol,5.5,38.5,0.18,0,0
1,Toyota Yaris Hybrid NG18 BMY Automatic 5 Seats,NG18 BMY,Glasgow,5da81746-0c78-11e9-8e9e-017b0b6178d0,2018-12-31 08:11:00,2018-12-31 08:34:00,3.178054,2018-12-31 08:00:00,8,0,12,2018,2018-12-31 08:45:00,8,0,12,2018,3.828641,1.609438,0.75,0.0,0.0,0.542324,1.635106,1.766442,2018-12-30 21:17:00,21,6,12,2018,Autumn,1,Everyday,Petrol,5.5,38.5,0.18,0,1
2,Toyota Yaris Manual (NK17 RYM),NK17 RYM,Bristol,ee131264-0c63-11e9-8d53-a129eaa6e810,2018-12-31 08:18:00,2018-12-31 10:35:00,4.927254,2018-12-31 08:15:00,8,0,12,2018,2018-12-31 11:00:00,11,0,12,2018,5.111988,2.772589,2.75,0.0,0.0,1.308333,2.780681,2.935451,2018-12-30 18:51:00,18,6,12,2018,Autumn,1,Everyday,Petrol,5.5,38.5,0.18,0,1
3,Renault Zoe EV (SD67 GFV),SD67 GFV,Aberdeen,4ced6a56-0c4f-11e9-814f-63c5d39c3910,2018-12-31 08:25:00,2018-12-31 10:11:00,4.672829,2018-12-31 08:30:00,8,0,12,2018,2018-12-31 10:30:00,10,0,12,2018,4.795791,2.397895,2.0,0.0,0.0,0.0,2.484907,2.484907,2018-12-30 16:23:00,16,6,12,2018,Autumn,1,Everyday,EV,5.5,38.5,0.18,0,1
4,Toyota Aygo XPlay Nav (NL66 UBU),NL66 UBU,Oxford,2b4e190c-0bb6-11e9-8b21-79c4cede1e0f,2018-12-31 09:10:00,2018-12-31 10:44:00,4.553877,2018-12-31 08:30:00,8,0,12,2018,2018-12-31 11:00:00,11,0,12,2018,5.01728,2.397895,2.5,0.0,0.0,1.029619,2.555676,2.686486,2018-12-29 22:07:00,22,5,12,2018,Autumn,1,City,Petrol,4.75,33.25,0.18,0,1


## 1.2. Load Encoders

In [9]:
# Load the persisted encoder objects from the encoders folder.
# NOTE(review): joblib.load unpickles arbitrary Python objects — only load files from a trusted location.
binary_encoder_file = ENCODERS_PATH + 'binary_encoder.pkl'
one_hot_encoder_file = ENCODERS_PATH + 'one_hot_encoder.pkl'
binary_encoder = joblib.load(binary_encoder_file)
one_hot_encoder = joblib.load(one_hot_encoder_file)

---

# 2. DecisionTreeRegressor Model

In [None]:
# Run a copy of the original location column through the loaded binary encoder
# (one encoded row per booking in df_transformed).
location_mapping = binary_encoder.transform(df_transformed[['location']].copy())

In [None]:
# Select the rows of df whose location, looked up in df_transformed, is one of the chosen sites.
# The boolean mask is aligned to df by index.
locations_to_filter = ['Bristol', 'Newbury']
filter_mask = df_transformed['location'].isin(locations_to_filter)
filtered_data = df.loc[filter_mask]

In [None]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Train separate models for each vehicle type
vehicle_types = [col for col in df.columns if col.startswith('Vehicle Type_')]
# vehicle_types.remove('Vehicle Type_7 Seater')

# Hyperparameter grid for DecisionTreeRegressor
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

for vehicle_type in vehicle_types:
    # Rows flagged with this vehicle type. Assumes df was sorted by booking_billed_start
    # in the loading section, so the subset is still in chronological order.
    vehicle_data = df[df[vehicle_type] == 1].copy()

    # Define features and target for the current vehicle type.
    # The datetime column booking_billed_start is excluded as well: the estimator needs
    # numeric inputs, and the final training function drops the same column.
    features_vehicle = vehicle_data.drop(columns=['hourly_rate', 'daily_rate', 'booking_billed_start'])
    target_hourly_vehicle = vehicle_data['hourly_rate']
    target_daily_vehicle = vehicle_data['daily_rate']

    # Split the data based on time: first 80% of rows for training, last 20% for testing
    split_ratio = 0.8
    split_index_vehicle = int(len(vehicle_data) * split_ratio)

    X_train_hourly_vehicle, X_test_hourly_vehicle = features_vehicle.iloc[:split_index_vehicle], features_vehicle.iloc[split_index_vehicle:]
    y_train_hourly_vehicle, y_test_hourly_vehicle = target_hourly_vehicle.iloc[:split_index_vehicle], target_hourly_vehicle.iloc[split_index_vehicle:]

    # The daily-rate model uses exactly the same feature split; only the target differs
    X_train_daily_vehicle, X_test_daily_vehicle = X_train_hourly_vehicle, X_test_hourly_vehicle
    y_train_daily_vehicle, y_test_daily_vehicle = target_daily_vehicle.iloc[:split_index_vehicle], target_daily_vehicle.iloc[split_index_vehicle:]

    # Hyperparameter tuning for hourly rate model (fixed seed so repeated runs give the same tree)
    model_hourly = DecisionTreeRegressor(random_state=42)
    grid_search_hourly = GridSearchCV(model_hourly, param_grid, cv=tscv, scoring='neg_mean_absolute_error')
    grid_search_hourly.fit(X_train_hourly_vehicle, y_train_hourly_vehicle)

    # best_estimator_ is the winning configuration refitted on the whole training split
    best_model_hourly = grid_search_hourly.best_estimator_

    # Hyperparameter tuning for daily rate model
    model_daily = DecisionTreeRegressor(random_state=42)
    grid_search_daily = GridSearchCV(model_daily, param_grid, cv=tscv, scoring='neg_mean_absolute_error')
    grid_search_daily.fit(X_train_daily_vehicle, y_train_daily_vehicle)

    best_model_daily = grid_search_daily.best_estimator_

    # Evaluate the models on the held-out (most recent) rows
    hourly_predictions = best_model_hourly.predict(X_test_hourly_vehicle)
    daily_predictions = best_model_daily.predict(X_test_daily_vehicle)

    print(f"Vehicle Type: {vehicle_type.replace('Vehicle Type_', '')}")
    print("Hourly Rate Model - Best Params:", grid_search_hourly.best_params_)
    print("Hourly Rate Model - MAE:", mean_absolute_error(y_test_hourly_vehicle, hourly_predictions))
    print("Hourly Rate Model - RMSE:", np.sqrt(mean_squared_error(y_test_hourly_vehicle, hourly_predictions)))
    print("Daily Rate Model - Best Params:", grid_search_daily.best_params_)
    print("Daily Rate Model - MAE:", mean_absolute_error(y_test_daily_vehicle, daily_predictions))
    print("Daily Rate Model - RMSE:", np.sqrt(mean_squared_error(y_test_daily_vehicle, daily_predictions)))
    print('\n')

    # Save the best models.
    # The tuned, fitted estimators are persisted; model_hourly / model_daily are only the
    # unfitted templates handed to GridSearchCV and cannot predict after loading.
    joblib.dump(best_model_hourly, MODEL_PATH + f'DecisionTreeRegressor_{vehicle_type}_hourly_rate_model.pkl')
    joblib.dump(best_model_daily, MODEL_PATH + f'DecisionTreeRegressor_{vehicle_type}_daily_rate_model.pkl')

# 3. XGBRegressor

In [None]:
from xgboost import XGBRegressor

# Hyperparameter grid for XGBRegressor
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.3]
}

# TimeSeriesSplit for cross-validation
tscv = TimeSeriesSplit(n_splits=5)

for vehicle_type in vehicle_types:
    # Rows flagged with this vehicle type. Assumes df was sorted by booking_billed_start
    # in the loading section, so the subset is still in chronological order.
    vehicle_data = df[df[vehicle_type] == 1].copy()

    # Define features and target for the current vehicle type.
    # The datetime column booking_billed_start is excluded as well: the regressor needs
    # numeric inputs, and the final training function drops the same column.
    features_vehicle = vehicle_data.drop(columns=['hourly_rate', 'daily_rate', 'booking_billed_start'])
    target_hourly_vehicle = vehicle_data['hourly_rate']
    target_daily_vehicle = vehicle_data['daily_rate']

    # Split the data based on time: first 80% of rows for training, last 20% for testing
    split_ratio = 0.8
    split_index_vehicle = int(len(vehicle_data) * split_ratio)

    X_train_hourly_vehicle, X_test_hourly_vehicle = features_vehicle.iloc[:split_index_vehicle], features_vehicle.iloc[split_index_vehicle:]
    y_train_hourly_vehicle, y_test_hourly_vehicle = target_hourly_vehicle.iloc[:split_index_vehicle], target_hourly_vehicle.iloc[split_index_vehicle:]

    # The daily-rate model uses exactly the same feature split; only the target differs
    X_train_daily_vehicle, X_test_daily_vehicle = X_train_hourly_vehicle, X_test_hourly_vehicle
    y_train_daily_vehicle, y_test_daily_vehicle = target_daily_vehicle.iloc[:split_index_vehicle], target_daily_vehicle.iloc[split_index_vehicle:]

    # Hyperparameter tuning for hourly rate model
    model_hourly = XGBRegressor()
    grid_search_hourly = GridSearchCV(model_hourly, param_grid, cv=tscv, scoring='neg_mean_absolute_error')
    grid_search_hourly.fit(X_train_hourly_vehicle, y_train_hourly_vehicle)

    # best_estimator_ is the winning configuration refitted on the whole training split
    best_model_hourly = grid_search_hourly.best_estimator_

    # Hyperparameter tuning for daily rate model
    model_daily = XGBRegressor()
    grid_search_daily = GridSearchCV(model_daily, param_grid, cv=tscv, scoring='neg_mean_absolute_error')
    grid_search_daily.fit(X_train_daily_vehicle, y_train_daily_vehicle)

    best_model_daily = grid_search_daily.best_estimator_

    # Evaluate the models on the held-out (most recent) rows
    hourly_predictions = best_model_hourly.predict(X_test_hourly_vehicle)
    daily_predictions = best_model_daily.predict(X_test_daily_vehicle)

    print(f"Vehicle Type: {vehicle_type.replace('Vehicle Type_', '')}")
    print("Hourly Rate Model - Best Params:", grid_search_hourly.best_params_)
    print("Hourly Rate Model - MAE:", mean_absolute_error(y_test_hourly_vehicle, hourly_predictions))
    print("Hourly Rate Model - RMSE:", np.sqrt(mean_squared_error(y_test_hourly_vehicle, hourly_predictions)))
    print("Daily Rate Model - Best Params:", grid_search_daily.best_params_)
    print("Daily Rate Model - MAE:", mean_absolute_error(y_test_daily_vehicle, daily_predictions))
    print("Daily Rate Model - RMSE:", np.sqrt(mean_squared_error(y_test_daily_vehicle, daily_predictions)))
    print('\n')

    # Save the best models.
    # The tuned, fitted estimators are persisted; model_hourly / model_daily are only the
    # unfitted templates handed to GridSearchCV and cannot predict after loading.
    joblib.dump(best_model_hourly, MODEL_PATH + f'XGBRegressor_{vehicle_type}_hourly_rate_model.pkl')
    joblib.dump(best_model_daily, MODEL_PATH + f'XGBRegressor_{vehicle_type}_daily_rate_model.pkl')

Based on the performance results so far, the selected model is **XGBRegressor**.

# 4. Neural Network Model

## 4.1. Import libraries

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## 4.2. Define Model

In [None]:
# Define the neural network model
def create_model(input_dim):
    """Build and compile a feed-forward regression network.

    Architecture: Dense(128, relu) -> Dropout(0.2) -> Dense(64, relu) -> Dropout(0.2)
    -> Dense(32, relu) -> Dense(1, linear), compiled with the Adam optimizer and
    mean-squared-error loss.

    Parameters
    ----------
    input_dim : int
        Number of input features expected by the first layer.

    Returns
    -------
    A compiled ``Sequential`` model (not yet trained).
    """
    layer_stack = [
        Dense(128, input_dim=input_dim, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1, activation='linear'),  # single linear unit for the regression output
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

## 4.3. Training Model

In [None]:
# Train separate models for each vehicle type
vehicle_types = [col for col in df.columns if col.startswith('Vehicle Type_')]

for vehicle_type in vehicle_types:
    # Rows flagged with this vehicle type. Assumes df was sorted by booking_billed_start
    # in the loading section, so the subset is still in chronological order.
    vehicle_data = df[df[vehicle_type] == 1].copy()

    # Define features and target for the current vehicle type.
    # The datetime column booking_billed_start is excluded as well: MinMaxScaler and the
    # network need numeric inputs, and the final training function drops the same column.
    features_vehicle = vehicle_data.drop(columns=['hourly_rate', 'daily_rate', 'booking_billed_start'])
    target_hourly_vehicle = vehicle_data['hourly_rate']
    target_daily_vehicle = vehicle_data['daily_rate']

    # Split the data based on time: first 80% of rows for training, last 20% for testing
    split_ratio = 0.8
    split_index_vehicle = int(len(vehicle_data) * split_ratio)

    X_train = features_vehicle.iloc[:split_index_vehicle]
    X_test = features_vehicle.iloc[split_index_vehicle:]
    y_train_hourly = target_hourly_vehicle.iloc[:split_index_vehicle]
    y_test_hourly = target_hourly_vehicle.iloc[split_index_vehicle:]
    y_train_daily = target_daily_vehicle.iloc[:split_index_vehicle]
    y_test_daily = target_daily_vehicle.iloc[split_index_vehicle:]

    # Normalize the features; the scaler is fitted on the training rows only and then
    # applied unchanged to the test rows
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Save the scaler
    joblib.dump(scaler, ENCODERS_PATH + f'scaler_{vehicle_type}.pkl')

    # Train the model for hourly rate prediction
    model_hourly = create_model(X_train.shape[1])
    model_hourly.fit(X_train, y_train_hourly, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

    # Evaluate the hourly rate model
    hourly_predictions = model_hourly.predict(X_test)
    hourly_mae = mean_absolute_error(y_test_hourly, hourly_predictions)
    hourly_rmse = np.sqrt(mean_squared_error(y_test_hourly, hourly_predictions))

    print(f"Vehicle Type: {vehicle_type.replace('Vehicle Type_', '')}")
    print("Hourly Rate Model - MAE:", hourly_mae)
    print("Hourly Rate Model - RMSE:", hourly_rmse)

    # Save the hourly rate model
    model_hourly.save(MODEL_PATH + f'NeuralNetwork_{vehicle_type}_hourly_rate_model.h5')

    # Train the model for daily rate prediction (same scaled features, different target)
    model_daily = create_model(X_train.shape[1])
    model_daily.fit(X_train, y_train_daily, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

    # Evaluate the daily rate model
    daily_predictions = model_daily.predict(X_test)
    daily_mae = mean_absolute_error(y_test_daily, daily_predictions)
    daily_rmse = np.sqrt(mean_squared_error(y_test_daily, daily_predictions))

    print("Daily Rate Model - MAE:", daily_mae)
    print("Daily Rate Model - RMSE:", daily_rmse)

    # Save the daily rate model
    model_daily.save(MODEL_PATH + f'NeuralNetwork_{vehicle_type}_daily_rate_model.h5')

# 5. RandomForestRegressor

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import joblib
import numpy as np

# Train separate models for each vehicle type
vehicle_types = [col for col in df.columns if col.startswith('Vehicle Type_')]

for vehicle_type in vehicle_types:
    # Rows flagged with this vehicle type. Assumes df was sorted by booking_billed_start
    # in the loading section, so the subset is still in chronological order.
    vehicle_data = df[df[vehicle_type] == 1].copy()

    # Define features and target for the current vehicle type.
    # The datetime column booking_billed_start is excluded as well: the estimator needs
    # numeric inputs, and the final training function drops the same column.
    features_vehicle = vehicle_data.drop(columns=['hourly_rate', 'daily_rate', 'booking_billed_start'])
    target_hourly_vehicle = vehicle_data['hourly_rate']
    target_daily_vehicle = vehicle_data['daily_rate']

    # Split the data based on time: first 80% of rows for training, last 20% for testing
    split_ratio = 0.8
    split_index_vehicle = int(len(vehicle_data) * split_ratio)

    X_train_hourly_vehicle, X_test_hourly_vehicle = features_vehicle.iloc[:split_index_vehicle], features_vehicle.iloc[split_index_vehicle:]
    y_train_hourly_vehicle, y_test_hourly_vehicle = target_hourly_vehicle.iloc[:split_index_vehicle], target_hourly_vehicle.iloc[split_index_vehicle:]

    # The daily-rate model uses exactly the same feature split; only the target differs
    X_train_daily_vehicle, X_test_daily_vehicle = X_train_hourly_vehicle, X_test_hourly_vehicle
    y_train_daily_vehicle, y_test_daily_vehicle = target_daily_vehicle.iloc[:split_index_vehicle], target_daily_vehicle.iloc[split_index_vehicle:]

    # Train the model for hourly rate
    model_hourly = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
    model_hourly.fit(X_train_hourly_vehicle, y_train_hourly_vehicle)

    # Train the model for daily rate
    model_daily = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
    model_daily.fit(X_train_daily_vehicle, y_train_daily_vehicle)

    # Evaluate the models on the held-out (most recent) rows
    hourly_predictions = model_hourly.predict(X_test_hourly_vehicle)
    daily_predictions = model_daily.predict(X_test_daily_vehicle)

    print(f"Vehicle Type: {vehicle_type.replace('Vehicle Type_', '')}")
    print("Hourly Rate Model - MAE:", mean_absolute_error(y_test_hourly_vehicle, hourly_predictions))
    print("Hourly Rate Model - RMSE:", np.sqrt(mean_squared_error(y_test_hourly_vehicle, hourly_predictions)))
    print("Daily Rate Model - MAE:", mean_absolute_error(y_test_daily_vehicle, daily_predictions))
    print("Daily Rate Model - RMSE:", np.sqrt(mean_squared_error(y_test_daily_vehicle, daily_predictions)))
    print('\n')

    # Save the best models (persistence is intentionally disabled for this experiment)
    # joblib.dump(model_hourly, MODEL_PATH + f'RandomForestRegressor{vehicle_type}_hourly_rate_model.pkl')
    # joblib.dump(model_daily, MODEL_PATH + f'RandomForestRegressor{vehicle_type}_daily_rate_model.pkl')

# 6. Final Model Building Code

In [10]:
# # Define function to create and compile neural network model
# def create_nn_model(input_dim):
#     model = Sequential()
#     model.add(Dense(128, input_dim=input_dim, activation='relu'))
#     model.add(Dropout(0.2))
#     model.add(Dense(64, activation='relu'))
#     model.add(Dense(1))  # Output layer for regression
#     model.compile(loss='mean_squared_error', optimizer='adam')
#     return model

# Define function to create and compile neural network model
def create_nn_model(input_dim):
    """Build and compile the feed-forward regression network used for rate prediction.

    Architecture: Dense(128, relu) -> Dropout(0.2) -> Dense(64, relu) -> Dropout(0.2)
    -> Dense(32, relu) -> Dense(1, linear), compiled with the Adam optimizer and
    mean-squared-error loss.

    Parameters
    ----------
    input_dim : int
        Number of input features expected by the first layer.

    Returns
    -------
    A compiled ``Sequential`` model (not yet trained).
    """
    layer_stack = [
        Dense(128, input_dim=input_dim, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1, activation='linear'),  # single linear unit for the regression output
    ]
    network = Sequential(layer_stack)
    network.compile(optimizer='adam', loss='mean_squared_error')
    return network

In [None]:
# # Function to calculate demand factor based on location popularity and peak usage hours
# def calculate_demand_factor(historical_data, location, hour):
#     # Calculate hourly booking count demand factor
#     location_data = historical_data[historical_data['location'] == location]
#     hourly_demand = location_data.groupby(location_data['booking_billed_start_hour'])['booking_id'].count()
#     print('hourly_demand: ', hourly_demand)
#     hourly_demand_factor = hourly_demand / hourly_demand.mean()

#     # Calculate overall booking count demand factor for location
#     location_demand = historical_data.groupby('location')['booking_id'].count()
#     print('location_demand: ', location_demand)
#     location_demand_factor = location_demand / location_demand.mean()

#     # Calculate peak hour demand factor based on location
#     peak_hours = location_data['booking_billed_start_hour'].value_counts().idxmax()
#     print('peak_hours: ', peak_hours)
#     peak_hour_demand_factor = hourly_demand[peak_hours] / hourly_demand.mean()

#     return hourly_demand_factor.get(hour, 1), location_demand_factor.get(location, 1), peak_hour_demand_factor

In [11]:
# # Function to calculate demand factor based on location popularity and peak usage hours
# def calculate_demand_factor(historical_data, location):
#     # Calculate hourly booking count demand factor
#     location_data = historical_data[historical_data['location'] == location]
#     hourly_demand = location_data.groupby(location_data['booking_billed_start_hour'])['booking_id'].count()
#     hourly_demand_factor = hourly_demand / hourly_demand.mean()

#     # Calculate overall booking count demand factor for location
#     location_demand = historical_data.groupby('location')['booking_id'].count()
#     location_demand_factor = location_demand / location_demand.mean()

#     # Calculate peak hour demand factor based on location
#     peak_hours = location_data['booking_billed_start_hour'].value_counts().idxmax()
#     peak_hour_demand_factor = hourly_demand[peak_hours] / hourly_demand.mean()

#     return hourly_demand_factor, location_demand_factor, peak_hour_demand_factor

In [None]:
# Function to calculate demand factors based on location popularity and hourly usage
def calculate_demand_factor(historical_data, location):
    """Compute relative demand factors from historical bookings.

    Parameters
    ----------
    historical_data : pandas.DataFrame
        Must contain the columns 'location', 'booking_billed_start_hour' and 'booking_id'.
    location : value compared against the 'location' column
        The location whose hourly profile is wanted.

    Returns
    -------
    (hourly_demand_factor, location_demand_factor) : tuple of pandas.Series
        * hourly_demand_factor — indexed by start hour; non-null booking_id count in that
          hour at `location`, divided by the mean count over the hours present.
        * location_demand_factor — indexed by location; non-null booking_id count per
          location, divided by the mean count over all locations.

    A separate peak-hour factor is not computed here.
    """
    # Location popularity across the whole dataset, relative to the average location
    bookings_per_location = historical_data.groupby('location')['booking_id'].count()
    location_demand_factor = bookings_per_location / bookings_per_location.mean()

    # Hourly booking profile restricted to the requested location
    at_location = historical_data.loc[historical_data['location'] == location]
    bookings_per_hour = at_location.groupby('booking_billed_start_hour')['booking_id'].count()
    hourly_demand_factor = bookings_per_hour / bookings_per_hour.mean()

    return hourly_demand_factor, location_demand_factor

In [12]:
# Functions to train and evaluate the per-vehicle-type rate models
def _rounded_errors(y_true, y_pred):
    """Return ``(MAE, RMSE)`` of ``y_pred`` against ``y_true``, each rounded to 5 decimals."""
    mae = np.round(mean_absolute_error(y_true, y_pred), 5)
    rmse = np.round(np.sqrt(mean_squared_error(y_true, y_pred)), 5)
    return mae, rmse


def train_evaluate_models(df):
    """Train, persist and evaluate hourly/daily rate models for each vehicle type.

    For every column of ``df`` whose name starts with ``'Vehicle Type_'`` the
    rows flagged with ``1`` are sorted by ``booking_billed_start`` and split
    chronologically (first 80% train, last 20% test). The model family is
    chosen from the column name:

    * ``City`` / ``7 Seater`` - features are min-max scaled and two networks
      built by ``create_nn_model`` are fitted (hourly and daily);
    * ``Everyday`` / ``Van`` - ``XGBRegressor`` for both rates;
    * ``Family`` - ``XGBRegressor`` (hourly) and ``DecisionTreeRegressor`` (daily).

    Fitted models are written under ``MODEL_PATH`` and scalers under
    ``ENCODERS_PATH``; both constants and ``create_nn_model`` must already be
    defined in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``hourly_rate``, ``daily_rate``, ``booking_billed_start``
        and ``booking_billed_start_hour`` plus the ``Vehicle Type_*`` indicator
        columns. All remaining columns are used as model features.

    Returns
    -------
    results : dict
        ``{vehicle_type_column: {'hourly_rate': {'MAE', 'RMSE'},
        'daily_rate': {'MAE', 'RMSE'}}}``. Metrics are ``None`` for a vehicle
        type that was skipped (no configured model, or too few rows to split).
    predictions_df : pandas.DataFrame
        One row per test sample with ``vehicle_type``,
        ``booking_billed_start_hour`` and the actual/predicted hourly and daily
        rates. Empty when nothing was trained.
    """
    vehicle_types = [col for col in df.columns if col.startswith('Vehicle Type_')]
    results = {}
    prediction_frames = []  # concatenated once at the end instead of growing a frame per iteration

    for vehicle_type in vehicle_types:
        vehicle_label = vehicle_type.replace('Vehicle Type_', '')

        # Rows of this vehicle type, oldest first, so the split below is chronological
        vehicle_data = df[df[vehicle_type] == 1].sort_values(by='booking_billed_start')

        # Define features and targets for the current vehicle type
        features_vehicle = vehicle_data.drop(columns=['hourly_rate', 'daily_rate', 'booking_billed_start'])
        target_hourly_vehicle = vehicle_data['hourly_rate']
        target_daily_vehicle = vehicle_data['daily_rate']

        # Split the data based on time
        split_ratio = 0.8
        split_index_vehicle = int(len(vehicle_data) * split_ratio)

        X_train = features_vehicle.iloc[:split_index_vehicle]
        X_test = features_vehicle.iloc[split_index_vehicle:]
        y_train_hourly = target_hourly_vehicle.iloc[:split_index_vehicle]
        y_test_hourly = target_hourly_vehicle.iloc[split_index_vehicle:]
        y_train_daily = target_daily_vehicle.iloc[:split_index_vehicle]
        y_test_daily = target_daily_vehicle.iloc[split_index_vehicle:]

        # Metrics and predictions stay None when no model is trained for this type
        hourly_mae = hourly_rmse = daily_mae = daily_rmse = None
        y_pred_hourly = y_pred_daily = None

        if len(X_train) == 0 or len(X_test) == 0:
            # Fitting a scaler/model on an empty split would raise; report and move on
            print(f"Vehicle Type: {vehicle_label} - skipped (not enough rows for a train/test split)")

        elif 'City' in vehicle_type or '7 Seater' in vehicle_type:
            # Normalize the features; X_test itself is left untouched because the
            # unscaled DataFrame is still needed for the predictions table below
            scaler = MinMaxScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Save the scaler
            joblib.dump(scaler, ENCODERS_PATH + f'scaler_{vehicle_type}.pkl')

            # Neural network for the hourly rate
            model_hourly = create_nn_model(X_train_scaled.shape[1])
            model_hourly.fit(X_train_scaled, y_train_hourly, epochs=50, batch_size=32, validation_split=0.2, verbose=0)
            model_hourly.save(MODEL_PATH + f'{vehicle_type}_nn_hourly_rate_model.keras')
            y_pred_hourly = model_hourly.predict(X_test_scaled)

            # Neural network for the daily rate
            model_daily = create_nn_model(X_train_scaled.shape[1])
            model_daily.fit(X_train_scaled, y_train_daily, epochs=50, batch_size=32, validation_split=0.2, verbose=0)
            model_daily.save(MODEL_PATH + f'{vehicle_type}_nn_daily_rate_model.keras')
            y_pred_daily = model_daily.predict(X_test_scaled)

        elif 'Everyday' in vehicle_type or 'Van' in vehicle_type:
            # XGBRegressor for both hourly and daily rates
            model_hourly = XGBRegressor()
            model_hourly.fit(X_train, y_train_hourly)
            joblib.dump(model_hourly, MODEL_PATH + f'{vehicle_type}_xgb_hourly_rate_model.pkl')

            model_daily = XGBRegressor()
            model_daily.fit(X_train, y_train_daily)
            joblib.dump(model_daily, MODEL_PATH + f'{vehicle_type}_xgb_daily_rate_model.pkl')

            y_pred_hourly = model_hourly.predict(X_test)
            y_pred_daily = model_daily.predict(X_test)

        elif 'Family' in vehicle_type:
            # XGBRegressor for hourly rate, DecisionTreeRegressor for daily rate
            model_hourly = XGBRegressor()
            model_hourly.fit(X_train, y_train_hourly)
            joblib.dump(model_hourly, MODEL_PATH + f'{vehicle_type}_xgb_hourly_rate_model.pkl')

            model_daily = DecisionTreeRegressor()
            model_daily.fit(X_train, y_train_daily)
            joblib.dump(model_daily, MODEL_PATH + f'{vehicle_type}_dt_daily_rate_model.pkl')

            y_pred_hourly = model_hourly.predict(X_test)
            y_pred_daily = model_daily.predict(X_test)

        else:
            print(f"Vehicle Type: {vehicle_label} - skipped (no model configured for this vehicle type)")

        if y_pred_hourly is not None and y_pred_daily is not None:
            # Evaluate both models on the held-out (most recent) 20%
            hourly_mae, hourly_rmse = _rounded_errors(y_test_hourly, y_pred_hourly)
            daily_mae, daily_rmse = _rounded_errors(y_test_daily, y_pred_daily)

            print(f"Vehicle Type: {vehicle_label}")
            print("Hourly Rate Model - MAE:", hourly_mae)
            print("Hourly Rate Model - RMSE:", hourly_rmse)
            print("Daily Rate Model - MAE:", daily_mae)
            print("Daily Rate Model - RMSE:", daily_rmse)

            # Plain lists everywhere so rows line up by position; the network
            # predictions come back as a 2-D array, hence the flatten
            prediction_frames.append(pd.DataFrame({
                'vehicle_type': vehicle_label,
                'booking_billed_start_hour': X_test['booking_billed_start_hour'].tolist(),
                'actual_hourly': y_test_hourly.tolist(),
                'predicted_hourly': np.asarray(y_pred_hourly).flatten().tolist(),
                'actual_daily': y_test_daily.tolist(),
                'predicted_daily': np.asarray(y_pred_daily).flatten().tolist()
            }))

        # hourly and daily mae and rmse (None when the vehicle type was skipped)
        results[vehicle_type] = {
            'hourly_rate': {
                'MAE': hourly_mae,
                'RMSE': hourly_rmse
            },
            'daily_rate': {
                'MAE': daily_mae,
                'RMSE': daily_rmse
            }
        }

    # pd.concat raises on an empty list, so fall back to an empty frame
    if prediction_frames:
        predictions_df = pd.concat(prediction_frames, axis=0, ignore_index=True)
    else:
        predictions_df = pd.DataFrame()

    return results, predictions_df

In [13]:
# Train, save and evaluate the rate models for every 'Vehicle Type_*' column in df;
# returns the per-vehicle-type metrics dict and the test-set predictions frame.
results, predictions_df = train_evaluate_models(df)

[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Vehicle Type: City
Hourly Rate Model - MAE: 0.40884
Hourly Rate Model - RMSE: 0.55072
Daily Rate Model - MAE: 5.69552
Daily Rate Model - RMSE: 7.13093
Vehicle Type: Everyday
Hourly Rate Model - MAE: 0.64146
Hourly Rate Model - RMSE: 0.83981
Daily Rate Model - MAE: 2.92157
Daily Rate Model - RMSE: 3.53526
Vehicle Type: Family
Hourly Rate Model - MAE: 0.4226
Hourly Rate Model - RMSE: 0.55652
Daily Rate Model - MAE: 0.00046
Daily Rate Model - RMSE: 0.02727
Vehicle Type: Van
Hourly Rate Model - MAE: 0.76837
Hourly Rate Model - RMSE: 1.19642
Daily Rate Model - MAE: 3.10934
Daily Rate Model - RMSE: 4.3839
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step




[1m1/4[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m0s[0m 62ms/step



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Vehicle Type: 7 Seater
Hourly Rate Model - MAE: 0.42823
Hourly Rate Model - RMSE: 0.4517
Daily Rate Model - MAE: 2.69953
Daily Rate Model - RMSE: 3.00991


In [15]:
# Show the combined test-set predictions for all vehicle types.
# NOTE(review): the stored output below includes a `location` column that
# train_evaluate_models, as written, never adds to predictions_df - the output
# looks stale; re-run on a fresh kernel to confirm.
predictions_df

Unnamed: 0,vehicle_type,location,booking_billed_start_hour,actual_hourly,predicted_hourly,actual_daily,predicted_daily
0,City,Oxford,15,4.75,4.697309,33.25,33.253937
1,City,Bristol,15,4.75,4.697309,33.25,33.253761
2,City,Aberdeen,15,4.75,4.697309,33.25,33.253822
3,City,Aberdeen,15,4.75,4.697309,33.25,33.253834
4,City,Gateshead,16,4.75,4.697309,33.25,33.253826
...,...,...,...,...,...,...,...
75720,7 Seater,Gateshead,11,7.50,7.124960,60.00,63.008049
75721,7 Seater,Aberdeen,8,7.50,7.017983,60.00,62.707439
75722,7 Seater,Gateshead,9,7.50,7.059603,60.00,62.271297
75723,7 Seater,Gateshead,8,7.50,6.957718,60.00,61.843082


In [18]:
# # Apply demand factors to predictions
# def apply_demand_factors(predictions_df, historical_data):
#     adjusted_hourly_rates = []
#     adjusted_daily_rates = []

#     for idx, row in predictions_df.iterrows():
#         location = row['location']
#         hour = row['booking_billed_start_hour']
#         hourly_demand_factor, location_demand_factor, peak_hour_demand_factor = calculate_demand_factor(historical_data, location)
#         adjusted_hourly_rate = row['predicted_hourly'] * (1 + hourly_demand_factor) * (1 + location_demand_factor) * peak_hour_demand_factor
#         adjusted_daily_rate = row['predicted_daily'] * (1 + hourly_demand_factor) * (1 + location_demand_factor) * peak_hour_demand_factor
#         adjusted_hourly_rates.append(adjusted_hourly_rate)
#         adjusted_daily_rates.append(adjusted_daily_rate)

#     predictions_df['adjusted_hourly_rate'] = adjusted_hourly_rates
#     predictions_df['adjusted_daily_rate'] = adjusted_daily_rates

#     return predictions_df

In [22]:
# # Inverse transform the location columns
# location_columns = [col for col in historical_data_df.columns if col.startswith('location_')]
# historical_data_df['location'] = binary_encoder.inverse_transform(historical_data_df[location_columns])['location']

In [None]:
# Print the nested metrics dict:
# {vehicle type column: {'hourly_rate' | 'daily_rate': {'MAE': ..., 'RMSE': ...}}}
print(results)

In [None]:
# Preview the first rows of the test-set predictions
predictions_df.head()

---