In [31]:
import time
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import sklearn.model_selection as cv
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    VotingClassifier,    
)

%matplotlib inline

In [32]:
# Load dataset
diabetes_df = pd.read_csv('../preprocessing_scripts/Data_preprocessed_10000.csv')
X, y = diabetes_df.drop(columns="Diabetes_binary"), diabetes_df["Diabetes_binary"]

print(X)
print(y)

      HighBP  HighChol  CholCheck       BMI  Smoker  Stroke  \
0        0.0       1.0        1.0 -1.071593     1.0     0.0   
1        0.0       0.0        1.0  0.628607     1.0     0.0   
2        1.0       1.0        1.0 -0.319133     1.0     0.0   
3        0.0       1.0        1.0  0.017644     1.0     0.0   
4        1.0       0.0        1.0  2.311887     0.0     0.0   
...      ...       ...        ...       ...     ...     ...   
9934     0.0       0.0        1.0 -0.319133     1.0     0.0   
9935     1.0       1.0        1.0  1.041359     1.0     0.0   
9936     1.0       1.0        1.0  0.177702     0.0     0.0   
9937     0.0       0.0        1.0 -1.723404     0.0     0.0   
9938     1.0       1.0        1.0 -1.279341     0.0     0.0   

      HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  ...  AnyHealthcare  \
0                      0.0           1.0     0.0      1.0  ...            1.0   
1                      0.0           0.0     1.0      1.0  ...            1.0   


In [33]:
# Define a function to measure execution's time. It will be use as a 'decorator'
# The idea behind this is just to use it to estimate the time it will take for the
# full code to run if you want to know before training with the complete dataset
# using a subset of size known in relationship with the full dataset.
def compute_executions_time(function):
    def wrapper():
        start_time = time.time()  # init measuring time
        function()  # execute function
        print(f"\n{time.time() - start_time} seconds")  # print execution time
    return wrapper

## **Random Forest**

In [34]:
#determinación de posibles valores de n_estimators

@compute_executions_time
def random_forest_with_different_n_estimators():
    prev_score = float('-inf')

    for n_trees in range(1,200,1):
        random_forest_classifier = RandomForestClassifier(
            n_estimators = n_trees
        )
        scores = cross_val_score(
            estimator = random_forest_classifier,
            X = X,
            y = y,
            cv=10,
            scoring="f1",
            n_jobs=-1
        )

        print(f"F1-score: {scores.mean():.5f} [nº estimators (trees): {n_trees}]")

        if(abs(scores.mean() - prev_score)  < 0.0001): 
            print("inefficiency breakpoint reached")
            break
    
        prev_score = scores.mean()
        
random_forest_with_different_n_estimators();

F1-score: 0.64272 [nº estimators (trees): 1]
F1-score: 0.55973 [nº estimators (trees): 2]
F1-score: 0.68273 [nº estimators (trees): 3]
F1-score: 0.63934 [nº estimators (trees): 4]
F1-score: 0.70763 [nº estimators (trees): 5]
F1-score: 0.67478 [nº estimators (trees): 6]
F1-score: 0.71235 [nº estimators (trees): 7]
F1-score: 0.69259 [nº estimators (trees): 8]
F1-score: 0.72024 [nº estimators (trees): 9]
F1-score: 0.70243 [nº estimators (trees): 10]
F1-score: 0.72781 [nº estimators (trees): 11]
F1-score: 0.70859 [nº estimators (trees): 12]
F1-score: 0.73372 [nº estimators (trees): 13]
F1-score: 0.71903 [nº estimators (trees): 14]
F1-score: 0.72286 [nº estimators (trees): 15]
F1-score: 0.71744 [nº estimators (trees): 16]
F1-score: 0.72206 [nº estimators (trees): 17]
F1-score: 0.72834 [nº estimators (trees): 18]
F1-score: 0.73666 [nº estimators (trees): 19]
F1-score: 0.72409 [nº estimators (trees): 20]
F1-score: 0.73443 [nº estimators (trees): 21]
F1-score: 0.72741 [nº estimators (trees): 2

In [35]:
#determinacion de mejor combinación de n_estimators y max_features con GridSearch

params = {
    "n_estimators" : range(1,201,10),
    "max_features": ["sqrt", "log2", 0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
}

clf = GridSearchCV(estimator=RandomForestClassifier(),
            param_grid=params,
            scoring="f1",
            n_jobs=-1,
            cv=10
            )
clf.fit(X,y)

print(clf.best_params_)
best_n_trees_rf = clf.best_params_["n_estimators"]
best_max_features_rf = clf.best_params_["max_features"]
print(clf.best_score_)

{'max_features': 'log2', 'n_estimators': 171}
0.749383223289077


In [36]:
#determinación de mejor criterion

for criteria in ["gini", "entropy", "log_loss"]:
        random_forest_classifier = RandomForestClassifier(
            n_estimators = best_n_trees_rf,
            max_features = best_max_features_rf,
            criterion = criteria
        )
        scores = cross_val_score(
            estimator = random_forest_classifier,
            X = X,
            y = y,
            cv=10,
            scoring="f1",
            n_jobs=-1
        )

        print(f"F1-score: {scores.mean():.5f} [criterion: {criteria}]")


F1-score: 0.74843 [criterion: gini]
F1-score: 0.74710 [criterion: entropy]
F1-score: 0.74766 [criterion: log_loss]


In [49]:
#determinación de mejor max_depth

for depth in [None, 5,10,11,12,13,14,15,16,17,18,19,20,25,30,35,40,45,50]:
        random_forest_classifier = RandomForestClassifier(
            n_estimators = best_n_trees_rf,
            max_features = best_max_features_rf,
            max_depth = depth
        )
        scores = cross_val_score(
            estimator = random_forest_classifier,
            X = X,
            y = y,
            cv=10,
            scoring="f1",
            n_jobs=-1
        )

        print(f"F1-score: {scores.mean():.5f} [max_depth: {depth}]")

best_depth_rf = 10

F1-score: 0.74947 [max_depth: None]
F1-score: 0.74980 [max_depth: 5]
F1-score: 0.75557 [max_depth: 10]
F1-score: 0.75319 [max_depth: 11]
F1-score: 0.75288 [max_depth: 12]
F1-score: 0.75370 [max_depth: 13]
F1-score: 0.75185 [max_depth: 14]
F1-score: 0.75206 [max_depth: 15]
F1-score: 0.75260 [max_depth: 16]
F1-score: 0.74818 [max_depth: 17]
F1-score: 0.74660 [max_depth: 18]
F1-score: 0.74875 [max_depth: 19]
F1-score: 0.74991 [max_depth: 20]
F1-score: 0.74842 [max_depth: 25]
F1-score: 0.74611 [max_depth: 30]
F1-score: 0.74444 [max_depth: 35]
F1-score: 0.74937 [max_depth: 40]
F1-score: 0.74704 [max_depth: 45]
F1-score: 0.74753 [max_depth: 50]


In [50]:
#determinación de mejor min_samples_split

for min in range(2,20):
        random_forest_classifier = RandomForestClassifier(
            n_estimators = best_n_trees_rf,
            max_features = best_max_features_rf,
            max_depth = best_depth_rf,
            min_samples_split = min
        )
        scores = cross_val_score(
            estimator = random_forest_classifier,
            X = X,
            y = y,
            cv=10,
            scoring="f1",
            n_jobs=-1
        )

        print(f"F1-score: {scores.mean():.5f} [min_samples_split: {min}]")

F1-score: 0.75475 [min_samples_split: 2]
F1-score: 0.75519 [min_samples_split: 3]
F1-score: 0.75472 [min_samples_split: 4]
F1-score: 0.75303 [min_samples_split: 5]
F1-score: 0.75422 [min_samples_split: 6]
F1-score: 0.75610 [min_samples_split: 7]
F1-score: 0.75589 [min_samples_split: 8]
F1-score: 0.75617 [min_samples_split: 9]
F1-score: 0.75507 [min_samples_split: 10]
F1-score: 0.75768 [min_samples_split: 11]
F1-score: 0.75441 [min_samples_split: 12]
F1-score: 0.75410 [min_samples_split: 13]
F1-score: 0.75420 [min_samples_split: 14]
F1-score: 0.75405 [min_samples_split: 15]
F1-score: 0.75529 [min_samples_split: 16]
F1-score: 0.75186 [min_samples_split: 17]
F1-score: 0.75334 [min_samples_split: 18]
F1-score: 0.75406 [min_samples_split: 19]


In [51]:
#determinación de mejor min_samples_leaf

for min in range(1,20):
        random_forest_classifier = RandomForestClassifier(
            n_estimators = best_n_trees_rf,
            max_features = best_max_features_rf,
            max_depth = best_depth_rf,
            min_samples_leaf = min
        )
        scores = cross_val_score(
            estimator = random_forest_classifier,
            X = X,
            y = y,
            cv=10,
            scoring="f1",
            n_jobs=-1
        )

        print(f"F1-score: {scores.mean():.5f} [min_samples_leaf: {min}]")

F1-score: 0.75405 [min_samples_leaf: 1]
F1-score: 0.75480 [min_samples_leaf: 2]
F1-score: 0.75416 [min_samples_leaf: 3]
F1-score: 0.75438 [min_samples_leaf: 4]
F1-score: 0.75509 [min_samples_leaf: 5]
F1-score: 0.75465 [min_samples_leaf: 6]
F1-score: 0.75375 [min_samples_leaf: 7]
F1-score: 0.75489 [min_samples_leaf: 8]
F1-score: 0.75319 [min_samples_leaf: 9]
F1-score: 0.75393 [min_samples_leaf: 10]
F1-score: 0.75420 [min_samples_leaf: 11]
F1-score: 0.75465 [min_samples_leaf: 12]
F1-score: 0.75422 [min_samples_leaf: 13]
F1-score: 0.75345 [min_samples_leaf: 14]
F1-score: 0.75457 [min_samples_leaf: 15]
F1-score: 0.75485 [min_samples_leaf: 16]
F1-score: 0.75200 [min_samples_leaf: 17]
F1-score: 0.75486 [min_samples_leaf: 18]
F1-score: 0.75381 [min_samples_leaf: 19]


## **Bagging**

In [None]:
# determinación de base estimator

for est in [DecisionTreeClassifier(), KNeighborsClassifier()]:
    scores = cross_val_score(
        BaggingClassifier(
            estimator = est,
        ), 
        X, 
        y, 
        cv = 10, 
        scoring="f1",
        n_jobs=-1
    )
    print(f"F1-score: {scores.mean():.5f} [estimator: {est}]")

F1-score: 0.696 [estimator: DecisionTreeClassifier()]
F1-score: 0.706 [estimator: KNeighborsClassifier()]


In [57]:
# determinación de valores de n_estimators
prev_score = float('-inf')

for nest in range(1,51,2):
    scores = cross_val_score(
        BaggingClassifier(
            estimator = DecisionTreeClassifier(),
            n_estimators = nest
        ), 
        X, 
        y, 
        cv = 10, 
        scoring="f1",
        n_jobs=-1
    )
    print(f"F1-score: {scores.mean():.5f} [nº estimators: {nest}]")

    if(abs(scores.mean() - prev_score)  < 0.0001): 
            print("inefficiency breakpoint reached")
            break
    
    prev_score = scores.mean()

best_n_est_bg = 32

F1-score: 0.64672 [nº estimators: 1]
F1-score: 0.67973 [nº estimators: 3]
F1-score: 0.69335 [nº estimators: 5]
F1-score: 0.70235 [nº estimators: 7]
F1-score: 0.70763 [nº estimators: 9]
F1-score: 0.71839 [nº estimators: 11]
F1-score: 0.71516 [nº estimators: 13]
F1-score: 0.72060 [nº estimators: 15]
F1-score: 0.72549 [nº estimators: 17]
F1-score: 0.72298 [nº estimators: 19]
F1-score: 0.72436 [nº estimators: 21]
F1-score: 0.72667 [nº estimators: 23]
F1-score: 0.72505 [nº estimators: 25]
F1-score: 0.72929 [nº estimators: 27]
F1-score: 0.72809 [nº estimators: 29]
F1-score: 0.72950 [nº estimators: 31]
F1-score: 0.72696 [nº estimators: 33]
F1-score: 0.73052 [nº estimators: 35]
F1-score: 0.73063 [nº estimators: 37]
F1-score: 0.73051 [nº estimators: 39]
F1-score: 0.73078 [nº estimators: 41]
F1-score: 0.73295 [nº estimators: 43]
F1-score: 0.72661 [nº estimators: 45]
F1-score: 0.73147 [nº estimators: 47]
F1-score: 0.73007 [nº estimators: 49]


In [56]:
# determinación de valores de max_samples
for max in range(11,101,2):
    scores = cross_val_score(
        BaggingClassifier(
            estimator = DecisionTreeClassifier(),
            n_estimators = best_n_est_bg,
            max_samples = max
        ), 
        X, 
        y, 
        cv = 10, 
        scoring="f1",
        n_jobs=-1
    )
    print(f"F1-score: {scores.mean():.5f} [max_samples: {max}]")

best_max_samples_bg = 45

F1-score: 0.71842 [max_samples: 11]
F1-score: 0.72010 [max_samples: 13]
F1-score: 0.71038 [max_samples: 15]
F1-score: 0.71312 [max_samples: 17]
F1-score: 0.72107 [max_samples: 19]
F1-score: 0.71852 [max_samples: 21]
F1-score: 0.72406 [max_samples: 23]
F1-score: 0.72343 [max_samples: 25]
F1-score: 0.71840 [max_samples: 27]
F1-score: 0.72067 [max_samples: 29]
F1-score: 0.72416 [max_samples: 31]
F1-score: 0.72636 [max_samples: 33]
F1-score: 0.72836 [max_samples: 35]
F1-score: 0.72255 [max_samples: 37]
F1-score: 0.73415 [max_samples: 39]
F1-score: 0.72304 [max_samples: 41]
F1-score: 0.72495 [max_samples: 43]
F1-score: 0.73279 [max_samples: 45]
F1-score: 0.72451 [max_samples: 47]
F1-score: 0.72535 [max_samples: 49]
F1-score: 0.72839 [max_samples: 51]
F1-score: 0.72973 [max_samples: 53]
F1-score: 0.73428 [max_samples: 55]
F1-score: 0.72462 [max_samples: 57]
F1-score: 0.72166 [max_samples: 59]
F1-score: 0.73788 [max_samples: 61]
F1-score: 0.73325 [max_samples: 63]
F1-score: 0.72980 [max_sampl

In [None]:
# determinación de valores de max_features
for max in range(1,22,1):
    scores = cross_val_score(
        BaggingClassifier(
            estimator = KNeighborsClassifier(n_neighbors=30),
            n_estimators = best_n_est_bg,
            max_samples = max
        ), 
        X, 
        y, 
        cv = 10, 
        scoring="f1",
        n_jobs=-1
    )
    print(f"F1-score: {scores.mean():.5f} [max_samples: {max}]")

best_max_features_bg = 21

F1-score: 0.265 [max_samples: 1]
F1-score: 0.453 [max_samples: 2]
F1-score: 0.541 [max_samples: 3]
F1-score: 0.592 [max_samples: 4]
F1-score: 0.681 [max_samples: 5]
F1-score: 0.678 [max_samples: 6]
F1-score: 0.710 [max_samples: 7]
F1-score: 0.686 [max_samples: 8]
F1-score: 0.682 [max_samples: 9]
F1-score: 0.691 [max_samples: 10]
F1-score: 0.703 [max_samples: 11]
F1-score: 0.695 [max_samples: 12]
F1-score: 0.705 [max_samples: 13]
F1-score: 0.705 [max_samples: 14]
F1-score: 0.708 [max_samples: 15]
F1-score: 0.715 [max_samples: 16]
F1-score: 0.710 [max_samples: 17]
F1-score: 0.718 [max_samples: 18]
F1-score: 0.715 [max_samples: 19]
F1-score: 0.719 [max_samples: 20]
F1-score: 0.713 [max_samples: 21]


## **Extra Trees Classifier**

In [None]:
@compute_executions_time
def execute_extra_trees_classifier_with_different_estimators_and_cv(cv=10):
    for n_trees in [1, 2, 5, 10, 20, 50, 100, 200]:
        extra_trees_classifier = ExtraTreesClassifier(
            n_estimators = n_trees
        )
        scores = cross_val_score(
            estimator = extra_trees_classifier,
            X = X,
            y = y,
            cv = cv,
            scoring = "f1",
            n_jobs=-1
        )
        print(f"F1-score: {scores.mean():.3f} [nº estimators (trees): {n_trees}]")

execute_extra_trees_classifier_with_different_estimators_and_cv()

## **Voting Scheme**

In [48]:
@compute_executions_time
def execute_voting_scheme_different_estimators_grid_search_and_cv(cv = 10):
    naive_bayes = GaussianNB()
    k_neighbors = KNeighborsClassifier()
    params_space = {
        "n_neighbors": list(range(1, 51, 1)),
        "weights": ["distance", "uniform"]
    }

    #search for best metaparameters for knn
    clf = GridSearchCV(
        k_neighbors, 
        param_grid = params_space,
        cv = cv,
        n_jobs = -1
    )
    clf.fit(X, y)
    best_parameters_for_knn = clf.best_params_
    print(f"Best Params fo Knn: {clf.best_params_} - Accuracy: {clf.best_score_}")

    clf2 = KNeighborsClassifier(
        n_neighbors = best_parameters_for_knn["n_neighbors"],
        weights = best_parameters_for_knn["weights"]
    )

    clf3 = DecisionTreeClassifier()

    for clf, label in zip([naive_bayes, clf2, clf3], ["Naive Bayes","Knn (3)", "Dec. Tree", ]):
        scores = cross_val_score(
            clf, 
            X, 
            y, 
            cv = cv, 
            n_jobs=-1,
            scoring = "f1"
        )
        print(f"F1-score: {scores.mean():.3f} [{label}]")

    warnings.filterwarnings("ignore", category=DeprecationWarning)  # Just to avoid warnings

    for vot in ["hard", "soft"]:
        voting_classifier = VotingClassifier(
            estimators=[
                ("nb", naive_bayes),
                ("knn3", clf2),
                ("dt", clf3)
            ],
            voting = vot
        )
        scores = cross_val_score(
            voting_classifier,
            X,
            y,
            cv = cv,
            n_jobs=-1,
            scoring = "f1")
        print("F1-score: %0.3f [%s]" % (scores.mean() , f"Majority Voting with {vot} voting"))

execute_voting_scheme_different_estimators_grid_search_and_cv()

Best Params fo Knn: {'n_neighbors': 30, 'weights': 'uniform'} - Accuracy: 0.7249186964688432
F1-score: 0.713 [Naive Bayes]
F1-score: 0.733 [Knn (3)]
F1-score: 0.647 [Dec. Tree]
F1-score: 0.735 [Majority Voting with hard voting]
F1-score: 0.730 [Majority Voting with soft voting]

11.430985927581787 seconds


## **Ada Boost Classifier**

In [None]:
@compute_executions_time
def execute_ada_boost_classifier_for_different_classifiers(cv=50):
    for n_estimators in [1, 2, 5, 10, 20, 50, 100, 200]:
        ada_boos_classifier = AdaBoostClassifier(
            n_estimators=n_estimators
        )
        scores = cross_val_score(
            ada_boos_classifier,
            X,
            y,
            cv = cv,
            jobs=-1,
            scoring = "f1"
        )
        print(f"F1-score: {scores.mean():.3f} [nº estimators: {n_estimators}]")

execute_ada_boost_classifier_for_different_classifiers()

F1-score: 0.694 [nº estimators: 1]
F1-score: 0.694 [nº estimators: 2]
F1-score: 0.736 [nº estimators: 5]
F1-score: 0.746 [nº estimators: 10]
F1-score: 0.746 [nº estimators: 20]
F1-score: 0.750 [nº estimators: 50]
F1-score: 0.751 [nº estimators: 100]
F1-score: 0.753 [nº estimators: 200]

73.57335209846497 seconds


In [None]:
#determinacion de mejor combinación de n_estimators y learning_rate con GridSearch

params = {
    "n_estimators" : range(1,201,10),
    "learning_rate": [0.1,0.5,1.1,1.6,2.1,2.6,3.1,3.6,4.1,4.6,5.1]
}

clf = GridSearchCV(estimator=AdaBoostClassifier(),
            param_grid=params,
            scoring="f1",
            n_jobs=-1,
            cv=10
            )
clf.fit(X,y)

print(clf.best_params_)
best_n_est_ada = clf.best_params_["n_estimators"]
best_learning_rate_ada = clf.best_params_["learning_rate"]
print(clf.best_score_)

{'learning_rate': 1.6, 'n_estimators': 81}
0.755560393312526


In [None]:
# determinación de valores de max_features
for est in [DecisionTreeClassifier(max_depth=1),GaussianNB()]:
    scores = cross_val_score(
        AdaBoostClassifier(
            estimator = est,
            n_estimators = best_n_est_ada,
            learning_rate=best_learning_rate_ada
        ), 
        X, 
        y, 
        cv = 10, 
        scoring="f1",
        n_jobs=-1
    )
    print(f"F1-score: {scores.mean():.3f} [max_samples: {est}]")

F1-score: 0.756 [max_samples: DecisionTreeClassifier(max_depth=1)]
F1-score: 0.704 [max_samples: GaussianNB()]


## **Comparison between classfiers**