In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingRegressor, VotingClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, make_scorer
from xgboost import XGBRegressor, XGBClassifier
import joblib

In [11]:
# Read the raw healthcare dataset from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/wapi_daktari_healthcare_dataset.csv')

In [17]:
# Define features and targets
features = [
    'day_of_week', 'is_weekend', 'is_holiday', 'is_strike_day', 'department', 'time_block',
    'doctors_on_shift', 'expected_patients', 'actual_patients',
    'peak_hour', 'doctor_available', 'doctor_arrival_delay', 'month',
    'day', 'patient_load_ratio', 'doctor_patient_ratio', 'holiday_strike_interaction',
    'expected_walk_ins', 'emergencies', 'seasonal_illnesses', 'public_holidays_events',
    'hour_of_day', 'day_of_month', 'quarter', 'season', 'previous_day_patients',
    'previous_week_patients', 'previous_month_patients', 'temperature', 'humidity', 'rainfall',
    'school_holidays', 'national_events', 'average_waiting_time_last_week', 'average_patients_last_month',
    'previous_day_waiting_time', 'previous_week_waiting_time', 'previous_month_waiting_time',
    'doctors_on_shift_expected_patients', 'doctor_patient_ratio_congestion_level',
    'flu_season', 'malaria_season'
]
target_regression = 'waiting_time_minutes'
target_classification = 'congestion_level'

# NOTE(review): the outputs recorded in this notebook show perfect train/test/CV
# scores for every model, which usually points to target leakage. Judging by its
# name, 'doctor_patient_ratio_congestion_level' looks derived from the
# classification target -- TODO confirm against the feature-engineering step
# before trusting these models.

# Column groups for the preprocessing pipeline: categorical columns are one-hot
# encoded, every other entry of `features` is standardised. The numerical list is
# derived from `features` (order preserved) so the two lists cannot drift apart.
categorical_features = ['department', 'time_block', 'season', 'peak_hour', 'doctor_available']
numerical_features = [col for col in features if col not in categorical_features]

# handle_unknown='ignore' prevents errors at transform time when a category
# was not seen during fit.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', categorical_encoder, categorical_features)
    ])


# Split the data ONCE for both tasks. The regression and classification models
# share the same feature matrix, so a single call guarantees that both targets
# are partitioned on identical rows (this yields the same partition the two
# separate calls with random_state=42 produced).
(X_train_reg, X_test_reg,
 y_train_reg, y_test_reg,
 y_train_class, y_test_class) = train_test_split(
    df[features], df[target_regression], df[target_classification],
    test_size=0.2, random_state=42
)

# The classification feature splits are the same rows as the regression ones.
X_train_class, X_test_class = X_train_reg, X_test_reg

# Integer-encode the classification target; the encoder is fitted on the
# training labels only.
le = LabelEncoder()
y_train_class_encoded = le.fit_transform(y_train_class)
y_test_class_encoded = le.transform(y_test_class)


# Preprocess the data. The preprocessor is fitted exactly once, on the training
# rows, so the fitted object always matches the matrices every model is trained on
# (refitting the shared instance on a second split would silently replace its state).
X_train_reg_scaled = preprocessor.fit_transform(X_train_reg)
X_test_reg_scaled = preprocessor.transform(X_test_reg)
X_train_class_scaled, X_test_class_scaled = X_train_reg_scaled, X_test_reg_scaled

In [18]:
# Helper that fits a regressor and reports hold-out and cross-validated scores
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and score it three ways.

    The model passed in is fitted in place, so it is usable by the caller
    afterwards.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``score``
    X_train, y_train : training features and target
    X_test, y_test : hold-out features and target

    Returns
    -------
    tuple
        ``(train_score, test_score, cv_scores)``. The first two are whatever
        ``model.score`` reports on the train and test sets; ``cv_scores`` are
        the per-fold results of a 5-fold cross-validation on the training set
        scored with ``mean_squared_error`` (raw MSE, so lower is better and the
        values are not on the same scale as the other two).
    """
    mse_scorer = make_scorer(mean_squared_error)
    model.fit(X_train, y_train)
    holdout_scores = tuple(
        model.score(X, y) for X, y in ((X_train, y_train), (X_test, y_test))
    )
    fold_mse = cross_val_score(model, X_train, y_train, cv=5, scoring=mse_scorer)
    return (*holdout_scores, fold_mse)

In [5]:
# RandomForest regressor: 5-fold grid search (selected on negative MSE),
# then evaluation of the best configuration on the train/test split.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
rf_regressor = RandomForestRegressor(random_state=42)
grid_search_rf = GridSearchCV(
    estimator=rf_regressor,
    param_grid=param_grid_rf,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_rf.fit(X_train_reg_scaled, y_train_reg)

# Keep the estimator refitted with the winning hyperparameters.
best_rf_regressor = grid_search_rf.best_estimator_
(train_score_rf_reg,
 test_score_rf_reg,
 cv_scores_rf_reg) = evaluate_model(
    best_rf_regressor,
    X_train_reg_scaled, y_train_reg,
    X_test_reg_scaled, y_test_reg,
)
print(f'RandomForest Regressor - Train Score: {train_score_rf_reg}, Test Score: {test_score_rf_reg}, CV Scores: {cv_scores_rf_reg}')

RandomForest Regressor - Train Score: 1.0, Test Score: 1.0, CV Scores: [0. 0. 0. 0. 0.]


In [6]:
# XGBoost regressor: 5-fold grid search (selected on negative MSE),
# then evaluation of the best configuration on the train/test split.
param_grid_xgb = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5],
    learning_rate=[0.01, 0.1],
)
xgb_regressor = XGBRegressor(random_state=42)
grid_search_xgb = GridSearchCV(
    estimator=xgb_regressor,
    param_grid=param_grid_xgb,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search_xgb.fit(X_train_reg_scaled, y_train_reg)

# Keep the estimator refitted with the winning hyperparameters.
best_xgb_regressor = grid_search_xgb.best_estimator_
(train_score_xgb_reg,
 test_score_xgb_reg,
 cv_scores_xgb_reg) = evaluate_model(
    best_xgb_regressor,
    X_train_reg_scaled, y_train_reg,
    X_test_reg_scaled, y_test_reg,
)
print(f'XGBoost Regressor - Train Score: {train_score_xgb_reg}, Test Score: {test_score_xgb_reg}, CV Scores: {cv_scores_xgb_reg}')

XGBoost Regressor - Train Score: 1.0, Test Score: 1.0, CV Scores: [4.22088614e-10 4.98903918e-10 5.24443711e-10 5.01436392e-10
 4.33945269e-10]


In [7]:
# Hybrid regressor: a VotingRegressor combining the tuned RandomForest and
# XGBoost regressors. evaluate_model fits the ensemble in place.
hybrid_regressor = VotingRegressor(
    estimators=[
        ('rf', best_rf_regressor),
        ('xgb', best_xgb_regressor),
    ]
)
(train_score_hybrid_reg,
 test_score_hybrid_reg,
 cv_scores_hybrid_reg) = evaluate_model(
    hybrid_regressor,
    X_train_reg_scaled, y_train_reg,
    X_test_reg_scaled, y_test_reg,
)
print(f'Hybrid Regressor - Train Score: {train_score_hybrid_reg}, Test Score: {test_score_hybrid_reg}, CV Scores: {cv_scores_hybrid_reg}')

Hybrid Regressor - Train Score: 0.9999999999998135, Test Score: 0.9999999999998115, CV Scores: [1.05522150e-10 1.24725985e-10 1.31110929e-10 1.25359102e-10
 1.08486314e-10]


In [20]:
def evaluate_classification_model(model, X_train, y_train, X_test, y_test):
    """Fit a classifier and report its accuracy on train, test and 5-fold CV.

    The model passed in is fitted in place, so it is usable by the caller
    afterwards.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and labels
    X_test, y_test : hold-out features and labels

    Returns
    -------
    tuple
        ``(train_score, test_score, cv_scores)`` where the first two are
        accuracies of the fitted model on the train and test sets, and
        ``cv_scores`` holds the per-fold accuracies of a 5-fold
        cross-validation run on the training set.
    """
    model.fit(X_train, y_train)

    # Accuracy of the fitted model on the data it saw and on the hold-out set.
    train_score = accuracy_score(y_train, model.predict(X_train))
    test_score = accuracy_score(y_test, model.predict(X_test))

    fold_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    return train_score, test_score, fold_accuracy

In [21]:

# RandomForest classifier: 5-fold grid search selected on accuracy, then
# evaluation of the best configuration on the train/test split.
#
# NOTE(review): this model is fitted on the raw labels (y_train_class), whereas
# the XGBoost and hybrid classifiers in this notebook are fitted on the
# label-encoded targets, so its predictions are in a different label space --
# confirm that whatever consumes the saved model expects that.
param_grid_rf_class = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)
rf_classifier = RandomForestClassifier(random_state=42)

grid_search_rf_class = GridSearchCV(
    estimator=rf_classifier, param_grid=param_grid_rf_class, scoring='accuracy', cv=5
)
grid_search_rf_class.fit(X_train_class_scaled, y_train_class)

# Keep the estimator refitted with the winning hyperparameters.
best_rf_classifier = grid_search_rf_class.best_estimator_

# Classification metrics (accuracy) come from the classification helper.
(train_score_rf_class,
 test_score_rf_class,
 cv_scores_rf_class) = evaluate_classification_model(
    best_rf_classifier,
    X_train_class_scaled, y_train_class,
    X_test_class_scaled, y_test_class,
)

print(f'RandomForest Classifier - Train Score: {train_score_rf_class}, Test Score: {test_score_rf_class}, CV Scores: {cv_scores_rf_class}')

RandomForest Classifier - Train Score: 1.0, Test Score: 1.0, CV Scores: [1. 1. 1. 1. 1.]


In [25]:
# Encode target labels for classification. XGBoost is trained on the integer
# codes produced here. LabelEncoder is already imported in the notebook's import
# cell, so it is not re-imported in this cell. The encoder is fitted on the
# training labels only and is persisted later alongside the models.
label_encoder = LabelEncoder()
y_train_class_encoded = label_encoder.fit_transform(y_train_class)
y_test_class_encoded = label_encoder.transform(y_test_class)

# Train and evaluate an XGBoost Classifier with hyperparameter tuning
# (5-fold grid search selected on accuracy).
xgb_classifier = XGBClassifier(random_state=42, eval_metric='mlogloss')

param_grid_xgb_class = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}

grid_search_xgb_class = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid_xgb_class,
    cv=5,
    scoring='accuracy'
)

grid_search_xgb_class.fit(X_train_class_scaled, y_train_class_encoded)

# Keep the estimator refitted with the winning hyperparameters.
best_xgb_classifier = grid_search_xgb_class.best_estimator_

# Evaluate against the encoded labels, matching what the model was trained on.
train_score_xgb_class, test_score_xgb_class, cv_scores_xgb_class = evaluate_classification_model(
    best_xgb_classifier,
    X_train_class_scaled,
    y_train_class_encoded,
    X_test_class_scaled,
    y_test_class_encoded
)

print(f'XGBoost Classifier - Train Score: {train_score_xgb_class}, Test Score: {test_score_xgb_class}, CV Scores: {cv_scores_xgb_class}')

XGBoost Classifier - Train Score: 1.0, Test Score: 1.0, CV Scores: [1. 1. 1. 1. 1.]


In [27]:
# Hybrid classifier: a VotingClassifier over the tuned RandomForest and XGBoost
# classifiers, fitted and evaluated on the label-encoded targets.
hybrid_classifier = VotingClassifier(
    estimators=[('rf', best_rf_classifier), ('xgb', best_xgb_classifier)]
)

(train_score_hybrid_class,
 test_score_hybrid_class,
 cv_scores_hybrid_class) = evaluate_classification_model(
    hybrid_classifier,
    X_train_class_scaled, y_train_class_encoded,
    X_test_class_scaled, y_test_class_encoded,
)

print(f'Hybrid Classifier - Train Score: {train_score_hybrid_class}, Test Score: {test_score_hybrid_class}, CV Scores: {cv_scores_hybrid_class}')

# Persist the six trained models ...
for _model, _path in (
    (best_rf_regressor, '../src/api/random_forest_regressor.pkl'),
    (best_xgb_regressor, '../src/api/xgboost_regressor.pkl'),
    (hybrid_regressor, '../src/api/hybrid_regressor.pkl'),
    (best_rf_classifier, '../src/api/random_forest_classifier.pkl'),
    (best_xgb_classifier, '../src/api/xgboost_classifier.pkl'),
    (hybrid_classifier, '../src/api/hybrid_classifier.pkl'),
):
    joblib.dump(_model, _path)

# ... plus the fitted preprocessing objects that go with them: the column
# transformer and the label encoder for the classification target.
joblib.dump(preprocessor, '../src/api/preprocessor.pkl')
joblib.dump(label_encoder, '../src/api/label_encoder.pkl')

Hybrid Classifier - Train Score: 1.0, Test Score: 1.0, CV Scores: [1. 1. 1. 1. 1.]


['../src/api/label_encoder.pkl']