In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import joblib

# Example dataset creation
data = {
    'ride_id': [1, 2, 3, 4, 5, 6],
    'seat_number': [1, 2, 3, 4, 5, 6],
    'payment_method': ['Mpesa', 'Cash', 'Mpesa', 'Cash', 'Mpesa', 'Cash'],
    'payment_receipt': ['0001', '0002', '0003', '0004', '0005', '0006'],
    'travel_date': ['01/01/2023', '01/01/2023', '01/02/2023', '01/02/2023', '01/03/2023', '01/03/2023'],
    'travel_time': ['08:00', '09:00', '07:30', '10:00', '08:30', '09:30'],
    'travel_from': ['Awendo', 'Homa Bay', 'Kisii', 'Migori', 'Kehancha', 'Rongo'],
    'travel_to': ['Nairobi', 'Nairobi', 'Nairobi', 'Nairobi', 'Nairobi', 'Nairobi'],
    'car_type': ['shuttle', 'bus', 'shuttle', 'bus', 'shuttle', 'bus'],
    'max_capacity': [11, 25, 12, 30, 10, 28]
}

df = pd.DataFrame(data)

# Data preprocessing
# Convert travel_date and travel_time to datetime objects
df['travel_date'] = pd.to_datetime(df['travel_date'], format='%m/%d/%Y')
df['travel_time'] = pd.to_datetime(df['travel_time'], format='%H:%M').dt.time

# Feature engineering
# Assuming `travel_duration` is calculated based on provided information (8-9 hours to outskirts, 2-3 hours to CBD)
df['travel_duration'] = np.random.randint(8, 10, size=len(df)) + np.random.uniform(2, 3, size=len(df))

# Define features and target
features = ['travel_date', 'travel_time', 'travel_from', 'car_type', 'travel_duration']
target = 'seat_number'

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

# Preprocessing pipeline for numeric and categorical features
numeric_features = ['travel_duration']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_features = ['travel_from', 'car_type']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Append regressor to preprocessing pipeline
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', GradientBoostingRegressor())])

# Define parameter grid for hyperparameter tuning
param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__learning_rate': [0.05, 0.1, 0.2],
    'regressor__max_depth': [3, 4, 5]
}

# Grid search CV to find best parameters with 3-fold cross-validation
kf = KFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters found: ", grid_search.best_params_)
print("Best RMSE found: ", np.sqrt(-grid_search.best_score_))

# Predict on test set
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on test set: {mse}')

# Serialize the model for deployment
joblib.dump(grid_search.best_estimator_, 'seat_demand_prediction_model.pkl')

# Example of how to load the model later
# loaded_model = joblib.load('seat_demand_prediction_model.pkl')
# result = loaded_model.predict(new_data)


Best parameters found:  {'regressor__learning_rate': 0.2, 'regressor__max_depth': 5, 'regressor__n_estimators': 300}
Best RMSE found:  1.9135488840491788
Mean Squared Error on test set: 9.055192964860789


['seat_demand_prediction_model.pkl']