In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from typing import List, Dict, Tuple, Any

In [3]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

In [4]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    accuracy_score
)

In [5]:
def evaluate_binary_classification(y_val, y_pred):
    """
    Evaluate a binary classification model's performance using sklearn metrics.
    
    Parameters:
    y_val (array-like): Ground truth binary labels (0 or 1).
    y_pred (array-like): Predicted binary labels (0 or 1).
    
    Prints:
    - Confusion Matrix components (TP, TN, FP, FN)
    - Specificity (True Negative Rate)
    - Negative Predictive Value (NPV)
    - False Positive Rate (FPR)
    - False Discovery Rate (FDR)
    - Accuracy
    - Precision
    - Recall (True Positive Rate)
    - F1 Score
    """
    # Confusion matrix components
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred).ravel()

    # Metrics calculations
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # True Negative Rate
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0  # Negative Predictive Value
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0  # False Positive Rate
    fdr = fp / (fp + tp) if (fp + tp) > 0 else 0  # False Discovery Rate
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # Print results
    print(f"True Positive (TP): {tp}")
    print(f"True Negative (TN): {tn}")
    print(f"False Positive (FP): {fp}")
    print(f"False Negative (FN): {fn}")
    print(f"True Negative Rate (Specificity): {specificity:.4f}")
    print(f"Negative Predictive Value (NPV): {npv:.4f}")
    print(f"False Positive Rate (FPR): {fpr:.4f}")
    print(f"False Discovery Rate (FDR): {fdr:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall (True Positive Rate): {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")

In [6]:
X_train = pd.read_parquet('train_binarized.parquet')
X_val = pd.read_parquet('val_binarized.parquet')

In [7]:
y_train = pd.read_parquet('y_train.parquet')
y_val = pd.read_parquet('y_val.parquet')

In [8]:
X_train

Unnamed: 0,pregnancies_ge_1,pregnancies_le_1,pregnancies_ge_3,pregnancies_le_3,pregnancies_ge_7,pregnancies_le_7,pregnancies_ge_9,pregnancies_le_9,glucose_ge_79,glucose_le_79,...,age_ge_24,age_le_24,age_ge_29,age_le_29,age_ge_40,age_le_40,age_ge_50,age_le_50,age_ge_60,age_le_60
357,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,1,0,0,1,0,1
73,1,0,1,0,0,1,0,1,1,0,...,0,1,0,1,0,1,0,1,0,1
352,1,0,1,1,0,1,0,1,0,1,...,1,0,1,0,1,0,0,1,0,1
497,1,0,0,1,0,1,0,1,1,0,...,1,0,0,1,0,1,0,1,0,1
145,0,1,0,1,0,1,0,1,1,0,...,0,1,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,1,0,1,0,0,1,0,1,1,0,...,1,0,0,1,0,1,0,1,0,1
106,1,1,0,1,0,1,0,1,1,0,...,1,0,0,1,0,1,0,1,0,1
270,1,0,1,0,1,0,1,0,1,0,...,1,0,1,0,0,1,0,1,0,1
435,0,1,0,1,0,1,0,1,1,0,...,1,0,1,1,0,1,0,1,0,1


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC 
from xgboost import XGBClassifier

In [10]:
log_reg_clf = LogisticRegression()

In [11]:
%%time
log_reg_clf.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 87.3 ms


  y = column_or_1d(y, warn=True)


In [17]:
log_reg_clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [15]:
dir(log_reg_clf)

['C',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_predict_proba_lr',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 '_validate_params',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'feature_names_in_

In [18]:
%%time
y_pred_log_reg = log_reg_clf.predict(X_val)

CPU times: total: 0 ns
Wall time: 2 ms


In [19]:
evaluate_binary_classification(y_val, y_pred_log_reg)

True Positive (TP): 44
True Negative (TN): 96
False Positive (FP): 27
False Negative (FN): 25
True Negative Rate (Specificity): 0.7805
Negative Predictive Value (NPV): 0.7934
False Positive Rate (FPR): 0.2195
False Discovery Rate (FDR): 0.3803
Accuracy: 0.7292
Precision: 0.6197
Recall (True Positive Rate): 0.6377
F1 Score: 0.6286


In [20]:
svc_clf = SVC()

In [21]:
%%time
svc_clf.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 11 ms


  y = column_or_1d(y, warn=True)


In [22]:
dir(svc_clf)

['C',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_proba',
 '_compute_kernel',
 '_decision_function',
 '_dense_decision_function',
 '_dense_fit',
 '_dense_predict',
 '_dense_predict_proba',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_dual_coef_',
 '_estimator_type',
 '_gamma',
 '_get_coef',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_impl',
 '_intercept_',
 '_more_tags

In [23]:
svc_clf.n_features_in_

78

In [17]:
%%time
y_pred_svc = svc_clf.predict(X_val)

CPU times: total: 0 ns
Wall time: 6.44 ms


In [19]:
evaluate_binary_classification(y_val, y_pred_svc)

True Positive (TP): 43
True Negative (TN): 100
False Positive (FP): 23
False Negative (FN): 26
True Negative Rate (Specificity): 0.8130
Negative Predictive Value (NPV): 0.7937
False Positive Rate (FPR): 0.1870
False Discovery Rate (FDR): 0.3485
Accuracy: 0.7448
Precision: 0.6515
Recall (True Positive Rate): 0.6232
F1 Score: 0.6370


In [24]:
rf_clf = RandomForestClassifier()

In [25]:
%%time 
rf_clf.fit(X_train, y_train)

CPU times: total: 78.1 ms
Wall time: 102 ms


  return fit_method(estimator, *args, **kwargs)


In [27]:
dir(rf_clf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_compute_oob_predictions',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_get_default_requests',
 '_get_doc_link',
 '_get_estimators_indices',
 '_get_metadata_request',
 '_get_oob_predictions',
 '_get_param_names',
 '_get_tags',
 '_make_estimator',
 '_more_tags',
 '_n_samples',
 '_n_samples_bootstrap',
 '_parameter_constraints',
 '_repr_

In [28]:
rf_clf.n_estimators

100

In [26]:
%%time 
y_pred_rf = rf_clf.predict(X_val)

CPU times: total: 0 ns
Wall time: 6 ms


In [29]:
evaluate_binary_classification(y_val, y_pred_rf)

True Positive (TP): 48
True Negative (TN): 97
False Positive (FP): 26
False Negative (FN): 21
True Negative Rate (Specificity): 0.7886
Negative Predictive Value (NPV): 0.8220
False Positive Rate (FPR): 0.2114
False Discovery Rate (FDR): 0.3514
Accuracy: 0.7552
Precision: 0.6486
Recall (True Positive Rate): 0.6957
F1 Score: 0.6713


In [30]:
gb_clf = GradientBoostingClassifier()

In [31]:
%%time 
gb_clf.fit(X_train, y_train)

CPU times: total: 78.1 ms
Wall time: 122 ms


  y = column_or_1d(y, warn=True)


In [32]:
gb_clf.n_estimators

100

In [26]:
%%time
y_pred_gb = gb_clf.predict(X_val)

CPU times: total: 0 ns
Wall time: 13.4 ms


In [27]:
evaluate_binary_classification(y_val, y_pred_gb)

True Positive (TP): 47
True Negative (TN): 90
False Positive (FP): 33
False Negative (FN): 22
True Negative Rate (Specificity): 0.7317
Negative Predictive Value (NPV): 0.8036
False Positive Rate (FPR): 0.2683
False Discovery Rate (FDR): 0.4125
Accuracy: 0.7135
Precision: 0.5875
Recall (True Positive Rate): 0.6812
F1 Score: 0.6309


In [28]:
xgb_clf = XGBClassifier()

In [29]:
%%time 
xgb_clf.fit(X_train, y_train)

CPU times: total: 422 ms
Wall time: 87.4 ms


In [30]:
%%time
y_pred_xgb = xgb_clf.predict(X_val)

CPU times: total: 0 ns
Wall time: 12.8 ms


In [31]:
evaluate_binary_classification(y_val, y_pred_xgb)

True Positive (TP): 44
True Negative (TN): 93
False Positive (FP): 30
False Negative (FN): 25
True Negative Rate (Specificity): 0.7561
Negative Predictive Value (NPV): 0.7881
False Positive Rate (FPR): 0.2439
False Discovery Rate (FDR): 0.4054
Accuracy: 0.7135
Precision: 0.5946
Recall (True Positive Rate): 0.6377
F1 Score: 0.6154


In [33]:
from catboost import CatBoostClassifier

In [34]:
cb_clf = CatBoostClassifier()

In [35]:
%%time
cb_clf.fit(X_train, y_train, verbose=0)

CPU times: total: 1.12 s
Wall time: 2.22 s


<catboost.core.CatBoostClassifier at 0x1d805a52db0>

In [36]:
dir(cb_clf)

['__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_base_calc_leaf_indexes',
 '_base_drop_unused_features',
 '_base_eval_metrics',
 '_base_predict',
 '_base_shrink',
 '_base_virtual_ensembles_predict',
 '_calc_fstr',
 '_calc_leaf_indexes',
 '_calc_ostr',
 '_check_is_compatible_loss',
 '_convert_to_asymmetric_representation',
 '_dataset_train_eval_split',
 '_deserialize_model',
 '_estimator_type',
 '_eval_metrics',
 '_fit',
 '_get_borders',
 '_get_cat_feature_indices',
 '_get_embedding_feature_indices',
 '_get_float_feature_indices',
 '_get_nan_treatments',
 '_get_params',
 '_get_tags',
 '_get_text_feature_in

In [37]:
cb_clf.tree_count_

1000

In [35]:
%%time
y_pred_cb = cb_clf.predict(X_val)

CPU times: total: 0 ns
Wall time: 5.6 ms


In [36]:
evaluate_binary_classification(y_val, y_pred_cb)

True Positive (TP): 49
True Negative (TN): 96
False Positive (FP): 27
False Negative (FN): 20
True Negative Rate (Specificity): 0.7805
Negative Predictive Value (NPV): 0.8276
False Positive Rate (FPR): 0.2195
False Discovery Rate (FDR): 0.3553
Accuracy: 0.7552
Precision: 0.6447
Recall (True Positive Rate): 0.7101
F1 Score: 0.6759


In [38]:
tree_clf = DecisionTreeClassifier()

In [39]:
%%time 
tree_clf.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 5 ms


In [40]:
%%time
y_pred_tree = tree_clf.predict(X_val)

CPU times: total: 0 ns
Wall time: 2 ms


In [41]:
dir(tree_clf)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_compute_missing_values_in_feature_mask',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_fit',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_prune_tree',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_support_missing_values',
 '_validate_X_predict',
 '_validate

In [45]:
tree_clf.get_depth()

20

In [40]:
evaluate_binary_classification(y_val, y_pred_tree)

True Positive (TP): 35
True Negative (TN): 91
False Positive (FP): 32
False Negative (FN): 34
True Negative Rate (Specificity): 0.7398
Negative Predictive Value (NPV): 0.7280
False Positive Rate (FPR): 0.2602
False Discovery Rate (FDR): 0.4776
Accuracy: 0.6562
Precision: 0.5224
Recall (True Positive Rate): 0.5072
F1 Score: 0.5147


In [41]:
knn_clf = KNeighborsClassifier()

In [42]:
%%time
knn_clf.fit(X_train, y_train)

CPU times: total: 0 ns
Wall time: 4.14 ms


  return self._fit(X, y)


In [43]:
%%time
y_pred_knn = knn_clf.predict(X_val)

CPU times: total: 0 ns
Wall time: 16.6 ms


In [44]:
evaluate_binary_classification(y_val, y_pred_knn)

True Positive (TP): 36
True Negative (TN): 93
False Positive (FP): 30
False Negative (FN): 33
True Negative Rate (Specificity): 0.7561
Negative Predictive Value (NPV): 0.7381
False Positive Rate (FPR): 0.2439
False Discovery Rate (FDR): 0.4545
Accuracy: 0.6719
Precision: 0.5455
Recall (True Positive Rate): 0.5217
F1 Score: 0.5333
