# Load data and prepare visualization

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Read the maternal health risk records from the CSV next to the notebook
pre_test_data = pd.read_csv(filepath_or_buffer='Maternal Health Risk Data Set.csv')

# Stratified sample

In [2]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split, stratified on the 'Risk_Level' column.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=11)

# split() yields positional row numbers, so rows are selected with .iloc;
# .loc would only be correct while the frame keeps its default 0..n-1 index.
# n_splits=1 means there is exactly one (train, test) pair to take.
train_index, test_index = next(split.split(pre_test_data, pre_test_data['Risk_Level']))
strat_train_set = pre_test_data.iloc[train_index]
strat_test_set = pre_test_data.iloc[test_index]

# Predictor columns shared by the train and test frames.
feature_columns = ['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'Body Temp', 'Heart Rate']

X_train_strat = strat_train_set[feature_columns]
y_train_strat = strat_train_set['Risk_Level']

X_test_strat = strat_test_set[feature_columns]
y_test_strat = strat_test_set['Risk_Level']

# CatBoost without scaling and with stratified sampling

In [3]:
def catBoost_base(X_train_strat, X_test_strat, y_train_strat, y_test_strat):
    """Fit a fixed-configuration CatBoost classifier and score it on the test split.

    Parameters
    ----------
    X_train_strat, y_train_strat
        Training features and labels.
    X_test_strat, y_test_strat
        Held-out features and labels used only for scoring.

    Returns
    -------
    float
        Accuracy on the test split as returned by ``accuracy_score`` (a fraction,
        not a percentage). The same value is printed as a percentage.
    """
    from catboost import CatBoostClassifier, Pool
    from sklearn.metrics import accuracy_score

    # No column is declared categorical.
    cat_features = []

    # Wrap both splits in CatBoost Pool objects.
    train_CatData = Pool(data=X_train_strat, label=y_train_strat, cat_features=cat_features)
    eval_CatData = Pool(data=X_test_strat, label=y_test_strat, cat_features=cat_features)

    cat_model = CatBoostClassifier(iterations=1111, learning_rate=0.11, depth=11, loss_function='MultiClass', verbose=False)
    cat_model.fit(train_CatData)

    # Only the predicted classes are needed for the accuracy score.
    y_pred = cat_model.predict(eval_CatData)

    accuracy = accuracy_score(y_test_strat, y_pred)
    # The label names the model that was actually trained.
    print(f"Accuracy of CatBoost: {accuracy*100:.4f}%")

    return accuracy

# XGBoost without scaling and with stratified sampling

In [4]:
def XGBoost_base(X_train_strat, X_test_strat, y_train_strat, y_test_strat):
    """Fit a default-configuration XGBoost classifier and score it on the test split.

    Parameters
    ----------
    X_train_strat, y_train_strat
        Training features and labels.
    X_test_strat, y_test_strat
        Held-out features and labels used only for scoring.

    Returns
    -------
    float
        Accuracy on the test split as returned by ``accuracy_score`` (a fraction,
        not a percentage). The same value is printed as a percentage.
    """
    from xgboost import XGBClassifier
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score

    # Training labels are integer-encoded for fitting; predictions are decoded
    # back so they can be compared with the original test labels.
    le = LabelEncoder()
    y_trainXGb = le.fit_transform(y_train_strat)

    # Train the XGBoost classifier
    clf_xgboost = XGBClassifier()
    clf_xgboost.fit(X_train_strat, y_trainXGb)

    # Predict on the test set and map the integer codes back to labels
    y_pred = le.inverse_transform(clf_xgboost.predict(X_test_strat))

    accuracy = accuracy_score(y_test_strat, y_pred)
    # The label names the model that was actually trained.
    print(f"Accuracy of XGBoost: {accuracy*100:.4f}%")

    return accuracy

# Random forest without scaling and with stratified sampling

In [5]:
def rndmforest_base(X_train_strat, X_test_strat, y_train_strat, y_test_strat):
    """Fit a seeded, otherwise default Random Forest and score it on the test split.

    Parameters
    ----------
    X_train_strat, y_train_strat
        Training features and labels.
    X_test_strat, y_test_strat
        Held-out features and labels used only for scoring.

    Returns
    -------
    float
        Accuracy on the test split as returned by ``accuracy_score`` (a fraction,
        not a percentage). The same value is printed as a percentage.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    # Fixed seed so the baseline forest is the same on every run.
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train_strat, y_train_strat)

    # Score the held-out predictions against the true labels.
    test_accuracy = accuracy_score(y_test_strat, forest.predict(X_test_strat))
    print(f"Accuracy of Random Forest: {test_accuracy*100:.4f}%")

    return test_accuracy

# Tuning

In [6]:
def catBoost_tuning(X_train_strat, X_test_strat, y_train_strat, y_test_strat):
    """Randomized search over CatBoost learning rate and tree depth.

    The search is scored by 10-fold cross-validated accuracy on the training
    split only.

    Parameters
    ----------
    X_train_strat, y_train_strat
        Training features and labels the search is fitted on.
    X_test_strat, y_test_strat
        Not used; accepted so the signature matches the other helpers in this
        notebook.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, with the keys 'learning_rate'
        and 'max_depth'. The best parameters and best score are also printed.
    """
    import numpy as np
    import scipy.stats as stats
    from sklearn.model_selection import RandomizedSearchCV
    from catboost import CatBoostClassifier

    # RandomizedSearchCV is fitted on the raw features and labels, so no
    # CatBoost Pool objects are built here.
    cbc = CatBoostClassifier(verbose=False, loss_function='MultiClass', iterations=1111)

    # Search space: ten evenly spaced learning rates and a random integer depth.
    param_dist = {
        "learning_rate": np.linspace(0.01, 0.25, 10),
        # scipy's randint excludes the upper bound, so depths are 10..14
        "max_depth": stats.randint(10, 15),
    }

    rscv = RandomizedSearchCV(cbc, param_dist, scoring='accuracy', cv=10, random_state=11)
    rscv.fit(X_train_strat, y_train_strat)

    # Print the tuned parameters and score
    print(rscv.best_params_)
    print(rscv.best_score_)

    return rscv.best_params_

In [7]:
def XGBoost_tuning(X_train_strat, X_test_strat, y_train_strat, y_test_strat):
    """Randomized search over XGBoost ``max_depth``.

    The search is scored by 10-fold cross-validated accuracy on the training
    split only.

    Parameters
    ----------
    X_train_strat, y_train_strat
        Training features and labels the search is fitted on.
    X_test_strat, y_test_strat
        Not used by this function.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search (key 'max_depth'). The best
        parameters and best score are also printed.
    """
    import warnings
    import scipy.stats as stats
    import xgboost as xgb
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import RandomizedSearchCV

    # Silences all warnings for the rest of the session, not just this call.
    warnings.filterwarnings("ignore")

    # Integer-encode the training labels before handing them to the search.
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(y_train_strat)

    # scipy's randint excludes the upper bound, so depths are 5..14
    depth_space = {'max_depth': stats.randint(5, 15)}

    search = RandomizedSearchCV(
        xgb.XGBClassifier(),
        param_distributions=depth_space,
        n_iter=25,
        cv=10,
        scoring='accuracy',
        random_state=11,
    )
    search.fit(X_train_strat, encoded_targets)

    # Report the winning configuration and its cross-validated score
    print("Best set of hyperparameters: ", search.best_params_)
    print("Best score: ", search.best_score_)

    return search.best_params_

In [8]:
def rndmforest_tuning(X_train_strat, X_test_strat, y_train_strat, y_test_strat):
    """Randomized search over Random Forest ``n_estimators`` and ``max_features``.

    The search is scored by 10-fold cross-validated accuracy on the training
    split only.

    Parameters
    ----------
    X_train_strat, y_train_strat
        Training features and labels the search is fitted on.
    X_test_strat, y_test_strat
        Not used by this function.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, with the keys 'n_estimators'
        and 'max_features'. The best parameters and best score are also printed.
    """
    import warnings
    import scipy.stats as stats
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    # Silences all warnings for the rest of the session, not just this call.
    warnings.filterwarnings("ignore")

    # Estimator to tune. It is seeded (same seed as rndmforest_base) so the
    # search compares hyperparameters rather than random forest initialisations
    # and gives the same answer on every run.
    clf_rf = RandomForestClassifier(random_state=42)

    # Search space
    param_grid = {
        # scipy's randint excludes the upper bound, so 100..249 trees
        'n_estimators': stats.randint(100, 250),
        'max_features': ['sqrt', 'log2', None],
    }

    rscv = RandomizedSearchCV(clf_rf, param_grid, scoring='accuracy', cv=10, random_state=1)
    rscv.fit(X_train_strat, y_train_strat)

    # Print the tuned parameters and score
    print(rscv.best_params_)
    print(rscv.best_score_)

    return rscv.best_params_

# Tuned Model Testing

In [9]:
def catBoost_tuned(X_train_strat, X_test_strat, y_train_strat, y_test_strat, best_hyperparams):
    """Fit CatBoost with tuned hyperparameters and score it on the test split.

    Parameters
    ----------
    X_train_strat, y_train_strat
        Training features and labels.
    X_test_strat, y_test_strat
        Held-out features and labels used only for scoring.
    best_hyperparams : dict
        Must provide 'learning_rate' and 'max_depth'; the depth is cast to int
        before being passed to CatBoost.

    Returns
    -------
    float
        Accuracy on the test split as returned by ``accuracy_score`` (a fraction,
        not a percentage). The same value is printed as a percentage.
    """
    from catboost import CatBoostClassifier, Pool
    from sklearn.metrics import accuracy_score

    # No column is declared categorical.
    cat_features = []

    # Wrap both splits in CatBoost Pool objects.
    train_CatData = Pool(data=X_train_strat, label=y_train_strat, cat_features=cat_features)
    eval_CatData = Pool(data=X_test_strat, label=y_test_strat, cat_features=cat_features)

    cat_model = CatBoostClassifier(
        iterations=1111,
        learning_rate=best_hyperparams['learning_rate'],
        depth=int(best_hyperparams['max_depth']),
        loss_function='MultiClass',
        verbose=False,
    )
    cat_model.fit(train_CatData)

    # Only the predicted classes are needed for the accuracy score.
    y_pred = cat_model.predict(eval_CatData)

    accuracy = accuracy_score(y_test_strat, y_pred)
    # The label names the model that was actually trained.
    print(f"Accuracy of CatBoost: {accuracy*100:.4f}%")

    return accuracy

In [10]:
def XGBoost_tuned(X_train_strat, X_test_strat, y_train_strat, y_test_strat, best_hyperparams):
    """Fit XGBoost with the tuned ``max_depth`` and score it on the test split.

    Parameters
    ----------
    X_train_strat, y_train_strat
        Training features and labels.
    X_test_strat, y_test_strat
        Held-out features and labels used only for scoring.
    best_hyperparams : dict
        Must provide 'max_depth'; it is cast to int before use.

    Returns
    -------
    float
        Accuracy on the test split as returned by ``accuracy_score`` (a fraction,
        not a percentage). The same value is printed as a percentage.
    """
    from xgboost import XGBClassifier
    from sklearn.preprocessing import LabelEncoder
    from sklearn.metrics import accuracy_score

    # Training labels are integer-encoded for fitting; predictions are decoded
    # back so they can be compared with the original test labels.
    le = LabelEncoder()
    y_trainXGb = le.fit_transform(y_train_strat)

    # Train the XGBoost classifier with the tuned depth
    clf_xgboost = XGBClassifier(max_depth=int(best_hyperparams['max_depth']))
    clf_xgboost.fit(X_train_strat, y_trainXGb)

    # Predict on the test set and map the integer codes back to labels
    y_pred = le.inverse_transform(clf_xgboost.predict(X_test_strat))

    accuracy = accuracy_score(y_test_strat, y_pred)
    # The label names the model that was actually trained.
    print(f"Accuracy of XGBoost: {accuracy*100:.4f}%")

    return accuracy

In [11]:
def rndmforest_tuned(X_train_strat, X_test_strat, y_train_strat, y_test_strat, best_hyperparams):
    """Fit a Random Forest with tuned hyperparameters and score it on the test split.

    Parameters
    ----------
    X_train_strat, y_train_strat
        Training features and labels.
    X_test_strat, y_test_strat
        Held-out features and labels used only for scoring.
    best_hyperparams : dict
        Must provide 'n_estimators' and 'max_features'.

    Returns
    -------
    float
        Accuracy on the test split as returned by ``accuracy_score`` (a fraction,
        not a percentage). The same value is printed as a percentage.
    """
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score

    # Same seed as rndmforest_base, so a base-vs-tuned difference reflects the
    # hyperparameters rather than a different random initialisation, and the
    # result is reproducible across runs.
    clf_rf = RandomForestClassifier(n_estimators=best_hyperparams['n_estimators'],
                                    max_features=best_hyperparams['max_features'],
                                    random_state=42)
    clf_rf.fit(X_train_strat, y_train_strat)

    # Predict on test set
    y_pred_rf = clf_rf.predict(X_test_strat)

    # Evaluate the model
    accuracy_rf = accuracy_score(y_test_strat, y_pred_rf)
    print(f"Accuracy of Random Forest: {accuracy_rf*100:.4f}%")

    return accuracy_rf

# Results

In [None]:
# Tune the models
# All three searches receive the same four splits and are independent of each other.
# NOTE(review): these splits come from the single 80/20 stratified split, while the
# comparison cell re-splits the full dataset into 10 folds, so its test folds
# contain rows that were available during tuning - confirm this is intended.
tuning_splits = (X_train_strat, X_test_strat, y_train_strat, y_test_strat)

catBoost_hyperparameters = catBoost_tuning(*tuning_splits)
XGBoost_hyperparameters = XGBoost_tuning(*tuning_splits)
rndmforest_hyperparameters = rndmforest_tuning(*tuning_splits)

In [None]:
# Compare the models
# STRATIFIED K-FOLD CROSS VALIDATION (10-fold)
  
# Import Required Modules.
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold

# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=11)

# Per-fold accuracies (in percent) for each model, before and after tuning
catBoost_accuBase = []
XGBoost_accuBase = []
rndmforest_accuBase = []
catBoost_accuTune = []
XGBoost_accuTune = []
rndmforest_accuTune = []

# Predictor columns used for every fold.
feature_columns = ['Age', 'SystolicBP', 'DiastolicBP', 'BS', 'Body Temp', 'Heart Rate']

# NOTE: this loop rebinds the notebook-level strat_*/X_*/y_* names, so after it
# runs they hold the last fold rather than the original 80/20 split.
for train_index, test_index in skf.split(pre_test_data, pre_test_data['Risk_Level']):
    # split() yields positional row numbers, so rows are selected with .iloc;
    # .loc would only be correct while the frame keeps its default 0..n-1 index.
    strat_train_set = pre_test_data.iloc[train_index]
    strat_test_set = pre_test_data.iloc[test_index]

    X_train_strat = strat_train_set[feature_columns]
    y_train_strat = strat_train_set['Risk_Level']

    X_test_strat = strat_test_set[feature_columns]
    y_test_strat = strat_test_set['Risk_Level']

    # Baseline models on this fold
    cat_base = catBoost_base(X_train_strat, X_test_strat, y_train_strat, y_test_strat)
    xgB_base = XGBoost_base(X_train_strat, X_test_strat, y_train_strat, y_test_strat)
    rdF_base = rndmforest_base(X_train_strat, X_test_strat, y_train_strat, y_test_strat)

    # Same fold, using the hyperparameters found by the tuning cell
    cat_tune = catBoost_tuned(X_train_strat, X_test_strat, y_train_strat, y_test_strat, catBoost_hyperparameters)
    xgB_tune = XGBoost_tuned(X_train_strat, X_test_strat, y_train_strat, y_test_strat, XGBoost_hyperparameters)
    rdF_tune = rndmforest_tuned(X_train_strat, X_test_strat, y_train_strat, y_test_strat, rndmforest_hyperparameters)

    # The helpers return fractions; store percentages for plotting
    catBoost_accuBase.append(cat_base*100)
    XGBoost_accuBase.append(xgB_base*100)
    rndmforest_accuBase.append(rdF_base*100)

    catBoost_accuTune.append(cat_tune*100)
    XGBoost_accuTune.append(xgB_tune*100)
    rndmforest_accuTune.append(rdF_tune*100)

    

In [None]:
import matplotlib.pyplot as plt

# Box plot of per-fold CatBoost accuracy before and after hyperparameter tuning.
data = [catBoost_accuBase, catBoost_accuTune]
labels = ['Base', 'Tuned']

fig, ax = plt.subplots()
ax.boxplot(data, patch_artist=True)
# Name the boxes through the axis instead of boxplot's `labels=` keyword, which
# newer Matplotlib releases deprecate in favour of `tick_labels=`; this form
# works on both. Boxes are drawn at positions 1..N by default.
ax.set_xticks(range(1, len(labels) + 1))
ax.set_xticklabels(labels)
ax.set_xlabel("Model type")
# The accuracy lists hold percentages (fraction * 100).
ax.set_ylabel("Accuracy (%)")
ax.set_title("CatBoost hyperparameter tuning")
fig.savefig("CatBoost_parameter_tuning.png", dpi=600, transparent=True, bbox_inches='tight')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Box plot of per-fold XGBoost accuracy before and after hyperparameter tuning.
data = [XGBoost_accuBase, XGBoost_accuTune]
labels = ['Base', 'Tuned']

fig, ax = plt.subplots()
ax.boxplot(data, patch_artist=True)
# Name the boxes through the axis instead of boxplot's `labels=` keyword, which
# newer Matplotlib releases deprecate in favour of `tick_labels=`; this form
# works on both. Boxes are drawn at positions 1..N by default.
ax.set_xticks(range(1, len(labels) + 1))
ax.set_xticklabels(labels)
ax.set_xlabel("Model type")
# The accuracy lists hold percentages (fraction * 100).
ax.set_ylabel("Accuracy (%)")
ax.set_title("XGBoost hyperparameter tuning")
fig.savefig("XGBoost_parameter_tuning.png", dpi=600, transparent=True, bbox_inches='tight')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Box plot of per-fold Random Forest accuracy before and after hyperparameter tuning.
data = [rndmforest_accuBase, rndmforest_accuTune]
labels = ['Base', 'Tuned']

fig, ax = plt.subplots()
ax.boxplot(data, patch_artist=True)
# Name the boxes through the axis instead of boxplot's `labels=` keyword, which
# newer Matplotlib releases deprecate in favour of `tick_labels=`; this form
# works on both. Boxes are drawn at positions 1..N by default.
ax.set_xticks(range(1, len(labels) + 1))
ax.set_xticklabels(labels)
ax.set_xlabel("Model type")
# The accuracy lists hold percentages (fraction * 100).
ax.set_ylabel("Accuracy (%)")
ax.set_title("Random Forest hyperparameter tuning")
fig.savefig("RandomForest_parameter_tuning.png", dpi=600, transparent=True, bbox_inches='tight')
plt.show()