# Experimentation to obtain models for prediction of diabetes cases

Please refer to **Workflow Sequence and Code Walkthrough** in **README.md** for a walkthrough of the code in this notebook.

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFECV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.metrics import precision_score, recall_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

import pickle

In [18]:
# Load the data
df_raw = pd.read_csv('./data/diabetes_binary_health_indicators_BRFSS2015.csv')

# Check for missing vaues
df_raw.isna().sum()

Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64

In [19]:
# Number of duplicated rows
df_raw.duplicated().sum()

24206

In [20]:
# Drop duplicates to prevent data leakage
df = df_raw.drop_duplicates()

# Remove rows containing younger respondents (age < 30)
df = df[df['Age']>2]

In [21]:
# Shape of df_raw
print(df_raw.shape)

# Shape of df
print(df.shape)

(253680, 22)
(216899, 22)


In [22]:
# Percentage of target that are positive diabetes cases - Imbalanced dataset
print(f"Percentage of positive diabetes cases in df: \
{df['Diabetes_binary'].mean()*100:.2f}%")

Percentage of positive diabetes cases in df: 16.08%


In [23]:
# List the unique values in each column
unique_col_vals = {column: df[column].unique() for column in df.columns}
print('Columns and their values')
print('------------------------')
for column, values in unique_col_vals.items():
    print(f'{column}: {sorted([int(value) for value in values])}')

Columns and their values
------------------------
Diabetes_binary: [0, 1]
HighBP: [0, 1]
HighChol: [0, 1]
CholCheck: [0, 1]
BMI: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 87, 88, 89, 90, 91, 92, 95, 96, 98]
Smoker: [0, 1]
Stroke: [0, 1]
HeartDiseaseorAttack: [0, 1]
PhysActivity: [0, 1]
Fruits: [0, 1]
Veggies: [0, 1]
HvyAlcoholConsump: [0, 1]
AnyHealthcare: [0, 1]
NoDocbcCost: [0, 1]
GenHlth: [1, 2, 3, 4, 5]
MentHlth: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
PhysHlth: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
DiffWalk: [0, 1]
Sex: [0, 1]
Age: [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
Education: [

In [28]:
# Load the data dictionary
data_dict = pd.read_csv('data/data_dictionary.csv')
print(data_dict)

# Define the target variable
target_variable = 'Diabetes_binary'

# Group the feature types in a dictionary
features = {}
nominals = list(data_dict[data_dict['Type']=='Nominal']['Variable'])
nominals.remove(target_variable)
features['nominal'] = nominals.copy()
features['ordinal'] = list(
    data_dict[data_dict['Type']=='Ordinal']['Variable']
)
features['numerical'] = list(
    data_dict[data_dict['Type']=='Numerical']['Variable']
)

                Variable    Source       Type  \
0        Diabetes_binary       NaN    Nominal   
1                 HighBP  _RFHYPE5    Nominal   
2               HighChol   TOLDHI2    Nominal   
3              CholCheck  _CHOLCHK    Nominal   
4                    BMI     _BMI5  Numerical   
5                 Smoker  SMOKE100    Nominal   
6                 Stroke  CVDSTRK3    Nominal   
7   HeartDiseaseorAttack    _MICHD    Nominal   
8           PhysActivity  _TOTINDA    Nominal   
9                 Fruits   _FRTLT1    Nominal   
10               Veggies    VEGLT1    Nominal   
11     HvyAlcoholConsump  _RFDRHV5    Nominal   
12         AnyHealthcare  HLTHPLN1    Nominal   
13           NoDocbcCost   MEDCOST    Nominal   
14               GenHlth   GENHLTH    Ordinal   
15              MentHlth  MENTHLTH  Numerical   
16              PhysHlth  PHYSHLTH  Numerical   
17              DiffWalk  DIFFWALK    Nominal   
18                   Sex       SEX    Nominal   
19                  

In [29]:
# Set random state
# For large dataset, split in the ratio [0.8, 0.1, 0.1]
X = df.drop(target_variable, axis=1)
y = df[target_variable]

random_state = 1

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state
)

In [30]:
# Split again to create a validation dataset. Validation and test datasets
# contain more than 20,000 rows each
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    stratify=y_test,
    random_state=random_state
)

# Shape of X_val and X_test
X_val.shape, X_test.shape

((21690, 21), (21690, 21))

In [8]:
# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        # Using 'passthrough' as the nominal variables are already  encoded
        (
            'nominal',
            'passthrough',
            features['nominal']
        ),
        # No OrdinalEncoder as the ordinal variables are already  encoded
        (
            'ordinal_numerical',
            StandardScaler(),
            features['ordinal'] + features['numerical']
        )
    ]
)

In [9]:
# Define the base models
logistic = LogisticRegression(
    solver='saga',
    max_iter=10000,
    random_state=1
)
random_forest = RandomForestClassifier(
    random_state=1
)
xgboost = XGBClassifier(
    eval_metric='logloss',
    seed=1
)
    
algorithms = [
    ('Logistic', logistic),
    ('Random Forest', random_forest),
    ('XGBoost', xgboost)
]

In [10]:
# Define the rebalancing strategies
balancing_strategies = {
    'Class-Weighted': 'balanced',
    'Undersample': RandomUnderSampler(random_state=random_state),
    'SMOTE': SMOTE(random_state=random_state)
}

# Calculate class weight
scale_pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()

In [11]:
# Define the parameters for the grid search
param_grids = {
    'Logistic': {
        'classifier__C': 10.0**np.arange(-2,3) # 0.1, 1, 10
    },
    'Random Forest': {
        'classifier__n_estimators': [50, 100, 250],
        'classifier__max_depth': [5, 10, 15]
    },
    'XGBoost': {
        'classifier__eta': [0.01, 0.1, 0.3],
        'classifier__max_depth': [5, 10, 15],
        'classifier__subsample': [0.7, 1],
    }
}

In [12]:
# Set up cross-validation
cv = StratifiedKFold(n_splits=5)

# Initialize a DataFrame to store the results
results = pd.DataFrame(
    columns=[
        'Balancing Strategy',
        'Classifier',
        'Parameters',
        'Accuracy',
        'Recall',
        'Neg. Class Recall',
        'Precision',
        'F1',
        'AUC-ROC',
        'Confusion Matrix',
        'No. of Features',
        'Features List',
        'Model'
    ]
)

In [16]:
# Select features for each classifier using RFECV
selected_features_dict = {}
print('Recursive feature selection for')
for strategy_name, strategy in balancing_strategies.items():
    for algorithm_name, algorithm in algorithms:
        print(f'Balancing Strategy: {strategy_name:<14}   Algorithm: {algorithm_name}')
        
        # Set steps and parameters for RFECV
        steps = []
        if strategy_name == 'Class-Weighted':
            if algorithm_name in ['Logistic', 'Random Forest']:
                algorithm.set_params(class_weight=strategy)
            elif algorithm_name == 'XGBoost':
                algorithm.set_params(scale_pos_weight=scale_pos_weight)
        else:
            # For other balancing strategies, add the balancing step
            steps.append(('balance', strategy))
        
        rfecv = RFECV(
            estimator=algorithm,
            step=1,
            cv=cv,
            scoring='f1',
            n_jobs=-1
        )
        
        steps.append(('preprocessor', preprocessor))
        steps.append(('rfecv', rfecv))
        
        # Fit RFECV
        rfecv_pipeline = Pipeline(steps)
        rfecv_pipeline.fit(X_train, y_train)
        
        # Store results
        support = rfecv_pipeline.named_steps['rfecv'].support_
        key = f'{strategy_name}_{algorithm_name}'
        selected_features_dict[key] = {
            'Balancing Strategy': strategy_name,
            'Classifier Algorithm': algorithm_name,
            'Features': X_train.columns[support],
            'Estimator': rfecv.estimator_
        }

Recursive feature selection for
Balancing Strategy: Class-Weighted   Algorithm: Logistic
Balancing Strategy: Class-Weighted   Algorithm: Random Forest
Balancing Strategy: Class-Weighted   Algorithm: XGBoost
Balancing Strategy: Undersample      Algorithm: Logistic
Balancing Strategy: Undersample      Algorithm: Random Forest
Balancing Strategy: Undersample      Algorithm: XGBoost
Balancing Strategy: SMOTE            Algorithm: Logistic
Balancing Strategy: SMOTE            Algorithm: Random Forest
Balancing Strategy: SMOTE            Algorithm: XGBoost


In [25]:
# Selected features for each balancing strategy and classification algorithm
pd.DataFrame(selected_features_dict).T

Unnamed: 0,Balancing Strategy,Classifier Algorithm,Features,Estimator
Class-Weighted_Logistic,Class-Weighted,Logistic,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","LogisticRegression(class_weight='balanced', ma..."
Class-Weighted_Random Forest,Class-Weighted,Random Forest,"Index(['MentHlth', 'Age'], dtype='object')","(DecisionTreeClassifier(max_features='sqrt', r..."
Class-Weighted_XGBoost,Class-Weighted,XGBoost,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","XGBClassifier(base_score=None, booster=None, c..."
Undersample_Logistic,Undersample,Logistic,"Index(['HighBP', 'HighChol', 'CholCheck', 'Smo...","LogisticRegression(class_weight='balanced', ma..."
Undersample_Random Forest,Undersample,Random Forest,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","(DecisionTreeClassifier(max_features='sqrt', r..."
Undersample_XGBoost,Undersample,XGBoost,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","XGBClassifier(base_score=None, booster=None, c..."
SMOTE_Logistic,SMOTE,Logistic,"Index(['HighBP', 'HighChol', 'CholCheck', 'Str...","LogisticRegression(class_weight='balanced', ma..."
SMOTE_Random Forest,SMOTE,Random Forest,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","(DecisionTreeClassifier(max_features='sqrt', r..."
SMOTE_XGBoost,SMOTE,XGBoost,"Index(['HighBP', 'HighChol', 'BMI', 'GenHlth',...","XGBClassifier(base_score=None, booster=None, c..."


In [None]:
# Number of features
pd.DataFrame(selected_features_dict).T['Features'].apply(len)

Class-Weighted_Logistic         21
Class-Weighted_Random Forest     2
Class-Weighted_XGBoost          18
Undersample_Logistic            18
Undersample_Random Forest       21
Undersample_XGBoost             21
SMOTE_Logistic                  16
SMOTE_Random Forest             21
SMOTE_XGBoost                    5
Name: Features, dtype: int64

In [18]:
# Loop over each balancing strategy and classifier algorithm
for strategy_name, strategy in balancing_strategies.items():
    for algorithm_name, algorithm in algorithms:
        print(f'Balancing Strategy: {strategy_name:<14}   Algorithm: {algorithm_name}')
        
        # Add steps to the pipeline and define classifier hyperparameters
        steps = []
        if strategy_name == 'Class-Weighted':
            param_grid = param_grids[algorithm_name].copy()
            # Set the relevant parameter for class-weighted balancing
            if algorithm_name == 'Logistic':
                param_grid['classifier__class_weight'] = [strategy]
            elif algorithm_name == 'Random Forest':
                param_grid['classifier__class_weight'] = [strategy]
            elif algorithm_name == 'XGBoost':
                param_grid['classifier__scale_pos_weight'] = [scale_pos_weight]
        else:
            # For other balancing strategies, add the balancing step
            steps.append(('balance', strategy))
            param_grid = param_grids[algorithm_name]
        
        key = f'{strategy_name}_{algorithm_name}'
        selected_features = selected_features_dict[key]['Features']
        
        # Customise preprocessor to selected features
        preprocessor = ColumnTransformer(
            transformers=[
                # 'passthrough' as nominal variables are already encoded
                (
                    'nominal',
                    'passthrough',
                    [x for x in features['nominal'] if x in selected_features]
                ),
                # No OrdinalEncoder as ordinal variables are already  encoded
                (
                    'ordinal_numerical',
                    StandardScaler(),
                    [x for x in features['ordinal'] + features['numerical'] \
                        if x in selected_features]
                )
            ]
        )
        
        steps.append(('preprocessor', preprocessor))
        steps.append(('classifier', algorithm))
        
        pipeline = Pipeline(steps)
        
        # Perform GridSearch
        gs = GridSearchCV(
            pipeline,
            param_grid,
            cv=cv,
            scoring='f1',
            n_jobs=-1,
            verbose=3
        )
        
        X_train_rfecv = X_train[selected_features]
        gs.fit(X_train_rfecv, y_train)
        print()
        
        # Get the best model and its selected features
        best_model = gs.best_estimator_
        
        # Evaluate the best model
        X_val_rfecv = X_val[selected_features]
        y_pred = best_model.predict(X_val)
        y_prob = best_model.predict_proba(X_val)[:, 1]
        
        # Record the model details and evaluation results
        current_result = pd.DataFrame([
            {
                'Balancing Strategy': strategy_name,
                'Classifier': algorithm_name,
                'Parameters': gs.best_params_,
                'Accuracy': accuracy_score(y_val, y_pred),
                'Recall': recall_score(y_val, y_pred),
                'Neg. Class Recall': recall_score(y_val, y_pred, pos_label=0),
                'Precision': precision_score(y_val, y_pred),
                'F1': f1_score(y_val, y_pred),
                'AUC-ROC': roc_auc_score(y_val, y_prob),
                'Confusion Matrix': confusion_matrix(y_val, y_pred).tolist(),
                'No. of Features': len(selected_features),
                'Features List': selected_features,
                'Model': best_model
            }
        ])

        results = pd.concat([results, current_result], ignore_index=True)

Balancing Strategy: Class-Weighted   Algorithm: Logistic
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 4/5] END classifier__C=0.01, classifier__class_weight=balanced;, score=0.454 total time=   4.2s
[CV 1/5] END classifier__C=0.01, classifier__class_weight=balanced;, score=0.460 total time=   4.5s
[CV 3/5] END classifier__C=0.01, classifier__class_weight=balanced;, score=0.456 total time=   4.4s
[CV 2/5] END classifier__C=0.01, classifier__class_weight=balanced;, score=0.451 total time=   4.5s
[CV 5/5] END classifier__C=0.01, classifier__class_weight=balanced;, score=0.459 total time=   4.8s
[CV 1/5] END classifier__C=0.1, classifier__class_weight=balanced;, score=0.460 total time=   4.3s
[CV 3/5] END classifier__C=0.1, classifier__class_weight=balanced;, score=0.456 total time=   4.1s
[CV 2/5] END classifier__C=0.1, classifier__class_weight=balanced;, score=0.451 total time=   4.3s
[CV 2/5] END classifier__C=1.0, classifier__class_weight=balanced;, score=0.451 total 

  results = pd.concat([results, current_result], ignore_index=True)


Balancing Strategy: Class-Weighted   Algorithm: Random Forest
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 3/5] END classifier__class_weight=balanced, classifier__max_depth=5, classifier__n_estimators=50;, score=0.330 total time=   8.4s
[CV 4/5] END classifier__class_weight=balanced, classifier__max_depth=5, classifier__n_estimators=50;, score=0.329 total time=   8.5s
[CV 2/5] END classifier__class_weight=balanced, classifier__max_depth=5, classifier__n_estimators=50;, score=0.330 total time=   8.6s
[CV 5/5] END classifier__class_weight=balanced, classifier__max_depth=5, classifier__n_estimators=50;, score=0.332 total time=   8.7s
[CV 1/5] END classifier__class_weight=balanced, classifier__max_depth=5, classifier__n_estimators=50;, score=0.330 total time=   8.9s
[CV 3/5] END classifier__class_weight=balanced, classifier__max_depth=5, classifier__n_estimators=100;, score=0.330 total time=  17.1s
[CV 1/5] END classifier__class_weight=balanced, classifier__max_depth=5, 



[CV 3/5] END classifier__eta=0.01, classifier__max_depth=15, classifier__scale_pos_weight=5.218650324337885, classifier__subsample=0.7;, score=0.451 total time= 1.0min
[CV 4/5] END classifier__eta=0.01, classifier__max_depth=15, classifier__scale_pos_weight=5.218650324337885, classifier__subsample=0.7;, score=0.455 total time= 1.0min
[CV 1/5] END classifier__eta=0.01, classifier__max_depth=15, classifier__scale_pos_weight=5.218650324337885, classifier__subsample=1;, score=0.441 total time=  56.7s
[CV 2/5] END classifier__eta=0.01, classifier__max_depth=15, classifier__scale_pos_weight=5.218650324337885, classifier__subsample=1;, score=0.434 total time=  52.9s
[CV 5/5] END classifier__eta=0.01, classifier__max_depth=15, classifier__scale_pos_weight=5.218650324337885, classifier__subsample=0.7;, score=0.453 total time= 1.0min
[CV 1/5] END classifier__eta=0.1, classifier__max_depth=5, classifier__scale_pos_weight=5.218650324337885, classifier__subsample=0.7;, score=0.459 total time=  13.6



[CV 4/5] END .................classifier__C=0.1;, score=0.434 total time=   1.9s
[CV 1/5] END .................classifier__C=1.0;, score=0.442 total time=   1.9s
[CV 5/5] END .................classifier__C=0.1;, score=0.439 total time=   2.0s
[CV 3/5] END .................classifier__C=1.0;, score=0.435 total time=   2.3s
[CV 4/5] END .................classifier__C=1.0;, score=0.434 total time=   2.3s
[CV 2/5] END .................classifier__C=1.0;, score=0.435 total time=   2.4s
[CV 5/5] END .................classifier__C=1.0;, score=0.439 total time=   2.5s
[CV 2/5] END ................classifier__C=10.0;, score=0.435 total time=   2.0s
[CV 3/5] END ................classifier__C=10.0;, score=0.435 total time=   1.9s
[CV 4/5] END ................classifier__C=10.0;, score=0.434 total time=   2.0s
[CV 1/5] END ...............classifier__C=100.0;, score=0.442 total time=   2.7s
[CV 2/5] END ...............classifier__C=100.0;, score=0.435 total time=   2.8s
[CV 5/5] END ...............

In [33]:
# Sort by recall, recall_neg_class, and num_features to select the best model
sorted_results = results.sort_values(
    by=['F1', 'Recall', 'Neg. Class Recall', 'No. of Features'],
    ascending=[False, False, False, True]
)

# Save results dataframe for subsequent retrieval without having to retrain
results_file = f'./model/sorted_results.pkl'
with open(results_file, 'wb') as f:
    pickle.dump(sorted_results, f)

print(f"The results dataframe has been saved to '{results_file}'.")

sorted_results

The results dataframe has been saved to './model/sorted_results.pkl'.


Unnamed: 0,Balancing Strategy,Classifier,Parameters,Accuracy,Recall,Neg. Class Recall,Precision,F1,AUC-ROC,Confusion Matrix,No. of Features,Features List,Model
2,Class-Weighted,XGBoost,"{'classifier__eta': 0.01, 'classifier__max_dep...",0.716551,0.722477,0.715416,0.327273,0.450483,0.797125,"[[13022, 5180], [968, 2520]]",18,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","(ColumnTransformer(transformers=[('nominal', '..."
0,Class-Weighted,Logistic,"{'classifier__C': 1.0, 'classifier__class_weig...",0.710143,0.728211,0.706681,0.322376,0.446908,0.796227,"[[12863, 5339], [948, 2540]]",21,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","(ColumnTransformer(transformers=[('nominal', '..."
4,Undersample,Random Forest,"{'classifier__max_depth': 10, 'classifier__n_e...",0.700231,0.75086,0.690529,0.317378,0.446167,0.797927,"[[12569, 5633], [869, 2619]]",21,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","(RandomUnderSampler(random_state=1), ColumnTra..."
7,SMOTE,Random Forest,"{'classifier__max_depth': 5, 'classifier__n_es...",0.764915,0.584576,0.799473,0.358411,0.444372,0.780028,"[[14552, 3650], [1449, 2039]]",21,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","(SMOTE(random_state=1), ColumnTransformer(tran..."
6,SMOTE,Logistic,{'classifier__C': 0.01},0.697142,0.71445,0.693825,0.308989,0.431403,0.773584,"[[12629, 5573], [996, 2492]]",16,"Index(['HighBP', 'HighChol', 'CholCheck', 'Str...","(SMOTE(random_state=1), ColumnTransformer(tran..."
3,Undersample,Logistic,{'classifier__C': 10.0},0.69396,0.71961,0.689045,0.307222,0.430606,0.774769,"[[12542, 5660], [978, 2510]]",18,"Index(['HighBP', 'HighChol', 'CholCheck', 'Smo...","(RandomUnderSampler(random_state=1), ColumnTra..."
5,Undersample,XGBoost,"{'classifier__eta': 0.3, 'classifier__max_dept...",0.612633,0.808486,0.575102,0.267197,0.401652,0.759944,"[[10468, 7734], [668, 2820]]",21,"Index(['HighBP', 'HighChol', 'CholCheck', 'BMI...","(RandomUnderSampler(random_state=1), ColumnTra..."
8,SMOTE,XGBoost,"{'classifier__eta': 0.1, 'classifier__max_dept...",0.413693,0.929472,0.314856,0.206326,0.337691,0.75816,"[[5731, 12471], [246, 3242]]",5,"Index(['HighBP', 'HighChol', 'BMI', 'GenHlth',...","(SMOTE(random_state=1), ColumnTransformer(tran..."
1,Class-Weighted,Random Forest,"{'classifier__class_weight': 'balanced', 'clas...",0.512494,0.739106,0.469069,0.210587,0.327781,0.639253,"[[8538, 9664], [910, 2578]]",2,"Index(['MentHlth', 'Age'], dtype='object')","(ColumnTransformer(transformers=[('nominal', '..."


In [31]:
# Select the 'best' model
# The definition of 'best' can change based on the desired outcome. The choice 
# is up to the data scientist. The row index in 
# ----------------------
# selected_model_idx = 0
# ----------------------
# can be changed to get whichever model suits the situation.
selected_model_idx = 0
selected_model = sorted_results.iloc[selected_model_idx]
best_model = selected_model['Model']
print('Model performance on validation data')
print(f"Model Algorithm: {selected_model['Classifier']}")
print(f"Balancing Strategy: {selected_model['Balancing Strategy']}")
print(f"Parameters: {selected_model['Parameters']}")
print(f"Number of Features Used: {len(selected_model['Features List'])}")
print(f"Features Used: {selected_model['Features List'].tolist()}")

# Print the classification report for the best model
X_val_rfecv = X_val[selected_model['Features List']]
y_pred = best_model.predict(X_val_rfecv)
print()
print('Confusion Matrix')
print(confusion_matrix(y_val, y_pred))
print()
print('Classification Report')
print(classification_report(y_val, y_pred))

# Save best model for use in production
best_model_file = f'./model/best_model.pkl'
with open(best_model_file, 'wb') as f:
    pickle.dump(best_model, f)

print(f"The 'best model' has been saved to '{best_model_file}'.")

Model performance on validation data
Model Algorithm: XGBoost
Balancing Strategy: Class-Weighted
Parameters: {'classifier__eta': 0.01, 'classifier__max_depth': 10, 'classifier__scale_pos_weight': 5.218650324337885, 'classifier__subsample': 0.7}
Number of Features Used: 18
Features Used: ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education']

Confusion Matrix
[[13022  5180]
 [  968  2520]]

Classification Report
              precision    recall  f1-score   support

         0.0       0.93      0.72      0.81     18202
         1.0       0.33      0.72      0.45      3488

    accuracy                           0.72     21690
   macro avg       0.63      0.72      0.63     21690
weighted avg       0.83      0.72      0.75     21690

The 'best model' has been saved to './model/best_model.pkl'.
