In [None]:
import os
import numpy as np
import pandas as pd
import gc
import time
import logging
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


In [None]:
logging.basicConfig(filename='malaria_ml_features.log', level=logging.ERROR)

In [None]:
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [None]:

# Configuração dos paths
path_features = '/media/williancarddd/NVME/projects/malaria-pibiti/1_entrada/'
dataset_names = ['Dataset01_100', 'Dataset01_95.0', 'Dataset01_90.0', 'Dataset01_85.0', 'Dataset01_80.0',
                 'Dataset01_75.0', 'Dataset01_70.0', 'Dataset01_65.0', 'Dataset01_60.0', 'Dataset01_55.0',
                 'Dataset01_50.0', 'Dataset01_45.0', 'Dataset01_40.0', 'Dataset01_35.0', 'Dataset01_30.0',
                 'Dataset01_25.0', 'Dataset01_20.0', 'Dataset01_15.0', 'Dataset01_10.0', 'Dataset01_5.0']
dataset_names = dataset_names[::-1]
paths_datasets = {dataset_names[i]: os.path.join(path_features, dataset_names[i], "features/features.csv") for i in range(len(dataset_names))}
path_results = '/media/williancarddd/NVME/projects/malaria-pibiti/6_resultados'

# Nome dos métodos
methodsNames = ['GradientBoosting', 'KNN', 'NBayes', 'RandomForest']


In [None]:
# Função para criar diretórios para salvar resultados
def make_results_folders(path_results, dataset_name, method):
    path_dataset = os.path.join(path_results, dataset_name)
    if not os.path.exists(path_dataset):
        os.mkdir(path_dataset)
    path_method = os.path.join(path_dataset, method)
    if not os.path.exists(path_method):
        os.mkdir(path_method)
    path_metrics = os.path.join(path_method, 'metrics')
    if not os.path.exists(path_metrics):
        os.mkdir(path_metrics)
    path_csvs = os.path.join(path_method, 'csvs')
    if not os.path.exists(path_csvs):
        os.mkdir(path_csvs)
    path_test = os.path.join(path_method, 'test')
    if not os.path.exists(path_test):
        os.mkdir(path_test)
    return path_metrics, path_csvs, path_test

In [None]:
# config video 
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
os.environ['CUDA_VISIBLE_DEVICES'] = "1"

In [None]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import  StandardScaler

# get dynamic cores
import multiprocessing
cores = multiprocessing.cpu_count()
# use 1/2 of the cores
cores = cores - 1

def train_ml_algorithm(X_train, y_train, methodName):

    search = None
    # KNN
    # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
    if (methodName == 'KNN'): 
        standardized_data = StandardScaler()
        var_filter = VarianceThreshold()
        knn = KNeighborsClassifier()
        pipe = Pipeline([('standardized_data', standardized_data),
                        ('var_filter', var_filter),
                        ('knn', knn)])
        # parameters = {
        #     "n_neighbors" : [1, 2, 3, 4, 5],
        #     "weights": ['uniform', 'distance'],
        #     "algorithm": ['ball_tree', 'kd_tree', 'brute'],
        #     "leaf_size": [5, 15, 25, 35, 45, 55],
        #     'p': [10, 20, 40],
        #     "metric": ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
        # }
        parameters = [{'knn__n_neighbors':[1,2,3,4,5], 'knn__algorithm':['brute'], 'knn__metric':['euclidean']}]
        search  = GridSearchCV(
            estimator=pipe,
            param_grid=parameters,
            cv=ShuffleSplit(test_size=0.01, n_splits=1, random_state=0),
            scoring='accuracy',
            n_jobs=cores,
            verbose=1
        )
    
    # AdaBoost
    # https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC
    elif (methodName == 'AdaBoost'): 
        parameters = {
            # 'base_estimator': [None],
            "n_estimators" : [100],
            'algorithm': ['SAMME'],
            'random_state': [0],
        }
        search  = GridSearchCV( 
            estimator=AdaBoostClassifier(),
            cv=ShuffleSplit(test_size=0.01, n_splits=1, random_state=0),
            scoring='accuracy',
            param_grid=parameters,
            n_jobs=-1
        )
   
    # GradientBoosting
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
    elif (methodName == 'GradientBoosting'): 
        parameters = {
            'loss': ['deviance', 'exponential'],
            "n_estimators" : [10, 50, 100, 150, 100],
            'criterion': ["squared_error" ],
            "max_features" : ['sqrt', 'log2'],
            'verbose': [0],
        }
        search  = GridSearchCV( 
            estimator=GradientBoostingClassifier(),
            param_grid=parameters,
            cv=ShuffleSplit(test_size=0.01, n_splits=1, random_state=0),
            scoring='accuracy',
            n_jobs=-1
        )

    # Naive Bayes
    # https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB
    elif (methodName == 'NBayes'): 
        parameters = {
            # 'priors': None,
            "var_smoothing" : np.logspace(0,-9, num=100),
        }
        search  = GridSearchCV( 
            estimator=GaussianNB(),
            param_grid=parameters,
            cv=ShuffleSplit(test_size=0.01, n_splits=1, random_state=0),
            scoring='accuracy',
            n_jobs=cores
        )

    # RandomForest
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html?highlight=random#sklearn.ensemble.RandomForestClassifier
    elif (methodName == 'RandomForest'): 
        parameters = {
            "n_estimators" : [10, 100, 1000],
            'criterion': ["entropy"],
            'max_depth': [None],
            "max_features" : ['sqrt', 'log2'],
            'verbose': [0],
            'class_weight': ['balanced','balanced_subsample'],
        }
        search  = GridSearchCV( 
            estimator=RandomForestClassifier(),
            param_grid=parameters,
            cv=ShuffleSplit(test_size=0.01, n_splits=1, random_state=0),
            scoring='accuracy',
            n_jobs=cores
        )
    else:
        results = None
    
    
    if (search != None):
        results = search.fit(X_train, y_train)

    return results
    

In [None]:

# Configure personal metrics
def specificity(tn, fp):
    return tn / (tn + fp)

# Negative Predictive Error
def npv(tn, fn):
    return tn / (tn + fn + 1e-7)

# Matthews Correlation_Coefficient
def mcc(tp, tn, fp, fn):
    num = tp * tn - fp * fn
    den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    return num / np.sqrt(den + 1e-7)


def calculateMeasures(dataset, Y_pred, Y_true, Yscores, y_pred, y_true, yscores, folder, methodName, thresh, save_metrics_path, runtimeTrain, runtimeTest):
    metrics = pd.DataFrame()
    tn, fp, fn, tp = confusion_matrix(Y_true, Y_pred, labels=[0,1]).ravel()
    #fpr, tpr, _ = roc_curve(y_true, scores, pos_label=2)
    auc_val = roc_auc_score(Y_true, Yscores)

    metrics['dataset'] = [dataset]
    metrics['network'] = [methodName]
    metrics['partition'] = [folder]

    

    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    #fpr, tpr, _ = roc_curve(y_true, scores, pos_label=2)
    auc_val = roc_auc_score(y_true, yscores)

    # Train RESULTS
    metrics['accuracy'] = [accuracy_score(Y_true, Y_pred)]
    metrics['precision'] = [precision_score(Y_true, Y_pred)]
    metrics['sensitivity'] = [recall_score(Y_true, Y_pred)]
    metrics['specificity'] = [specificity(tn,fp)]
    metrics['f1_score'] = [f1_score(Y_true, Y_pred)]
    metrics['npv'] = [npv(tn, fn)]
    metrics['mcc'] = [mcc(tp, tn, fp, fn)]
    metrics['auc'] = [auc_val]
    metrics['TP'] = [tp]
    metrics['TN'] = [tn]
    metrics['FP'] = [fp]
    metrics['FN'] = [fn]
    metrics['runtime'] = [runtimeTrain]

    # Test RESULTS
    metrics['val_accuracy'] = [accuracy_score(y_true, y_pred)]
    metrics['val_precision'] = [precision_score(y_true, y_pred)]
    metrics['val_sensitivity'] = [recall_score(y_true, y_pred)]
    metrics['val_specificity'] = [specificity(tn,fp)]
    metrics['val_f1_score'] =[f1_score(y_true, y_pred)]
    metrics['val_npv'] = [npv(tn, fn)]
    metrics['val_mcc'] = [mcc(tp, tn, fp, fn)]
    metrics['val_auc'] = [auc_val]
    metrics['val_TP'] = [tp]
    metrics['val_TN'] = [tn]
    metrics['val_FP'] = [fp]
    metrics['val_FN'] = [fn]
    metrics['val_runtime'] = [runtimeTest]



    print(bcolors.FAIL + 'ACC: %.2f' %(100*metrics['val_accuracy'][0]) + ' AUC: %.2f' %(100*metrics['val_auc'][0]) + bcolors.ENDC)

    if os.path.exists(os.path.join(save_metrics_path, methodName + '.csv')):
        metrics.to_csv(os.path.join(save_metrics_path, methodName + '.csv'), sep=',', mode='a', index=False, header=False)
    else:
        metrics.to_csv(os.path.join(save_metrics_path, methodName + '.csv'), sep=',', index=False)  

In [90]:

from sklearn.model_selection import train_test_split

def load_dataset(csv_path: str) -> tuple:
    features = []
    """_summary_
    # feature header histogram_Mean,histogram_Standard Deviation,histogram_Energy,histogram_Skewness,histogram_Entropy,histogram_Smoothness,histogram_Kurtosis,glcm_Contrast,glcm_Dissimilarity,glcm_Homogeneity,glcm_Energy,glcm_Correlation,image
    # image 6-66-53-0.bmp 0 is the label

    For each features file, the last column is the image name with the label in your name
    """

    pd_data = pd.read_csv(csv_path)

    # Get the features
    features = pd_data.iloc[:, :-1].values
 
    # Get the label  (last column), which is the image name with the label in your name, apply lambda to get only the label
    labels = pd_data.iloc[:, -1].apply(lambda x: int(x.split('-')[-1].split('.')[0])).values

    # remove column image from features

    features = np.delete(features, 0, axis=1)

    return features, labels


all_datasets = {}
for dataset_name, path in paths_datasets.items():
    X, y = load_dataset(path)
    all_datasets[dataset_name] = (X, y)


splited_datasets = {}
for dataset_name, (X, y) in all_datasets.items():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    splited_datasets[dataset_name] = (X_train, X_test, y_train, y_test)
    


[[ 0.06204914  0.9895307  15.90607122 ...  0.99509313  0.99391009
   0.89674086]
 [ 0.06225552  0.9960983  15.90608004 ...  0.99822709  0.99741641
   0.83978684]
 [ 0.06220966  0.9946368  15.90608164 ...  0.99739309  0.99667822
   0.86922134]
 ...
 [ 0.06227845  0.99682915 15.90608595 ...  0.99833793  0.99784699
   0.8409174 ]
 [ 0.06231667  0.99804825 15.9060863  ...  0.99887713  0.99852367
   0.80066593]
 [ 0.06220582  0.99451476 15.90608764 ...  0.99701703  0.99655518
   0.82892093]]


In [91]:


def main():
    for network in methodsNames:
        for dataset_name, (X, y) in all_datasets.items():
            X_train, X_test, y_train, y_test = splited_datasets[dataset_name]
            path_metrics, path_csvs, path_test = make_results_folders(path_results, dataset_name, network)
            print(f"Training {network} on {dataset_name}")
            print(f"X_train shape: {X_train.shape} y_train shape: {y_train.shape}")
            print(f"X_test shape: {X_test.shape} y_test shape: {y_test.shape}")
            start = time.time()
            results = train_ml_algorithm(X_train, y_train, network)
            runtimeTrain = time.time() - start
            print(f"Training time: {runtimeTrain}")
            start = time.time()
            y_pred = results.predict(X_test)
            y_scores = results.predict_proba(X_test)[:, 1]
            runtimeTest = time.time() - start
            print(f"Test time: {runtimeTest}")
            calculateMeasures(dataset_name, results.predict(X_train), y_train, results.predict_proba(X_train)[:, 1], y_pred, y_test, y_scores, 'test', network + "f", 0.5, path_metrics, runtimeTrain, runtimeTest)
            print(f"Finished {network} on {dataset_name}")
            del results
            gc.collect()
            print("Memory cleaned")

if __name__ == '__main__':
    main()


Training GradientBoosting on Dataset01_5.0
X_train shape: (4801, 11) y_train shape: (4801,)
X_test shape: (1201, 11) y_test shape: (1201,)


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/home/williancarddd/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/williancarddd/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/williancarddd/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/williancarddd/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", li

Training time: 1.8318872451782227
Test time: 0.0031549930572509766
[91mACC: 83.51 AUC: 90.63[0m
Finished GradientBoosting on Dataset01_5.0
Memory cleaned
Training GradientBoosting on Dataset01_10.0
X_train shape: (4801, 11) y_train shape: (4801,)
X_test shape: (1201, 11) y_test shape: (1201,)


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "/home/williancarddd/miniconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/williancarddd/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/home/williancarddd/miniconda3/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/williancarddd/miniconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", li

Training time: 1.2021982669830322
Test time: 0.003930568695068359
[91mACC: 87.26 AUC: 93.56[0m
Finished GradientBoosting on Dataset01_10.0
Memory cleaned
Training GradientBoosting on Dataset01_15.0
X_train shape: (4801, 11) y_train shape: (4801,)
X_test shape: (1201, 11) y_test shape: (1201,)


KeyboardInterrupt: 