In [None]:
%%bash
pip -q install pydicom opencv-python scikit-image pyradiomics

wget -q http://www.inf.ufpr.br/lferrari/imagens_ihq.tar.gz && tar -xf imagens_ihq.tar.gz

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 12.2 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 34.5/34.5 MB 17.5 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 34.5/34.5 MB 12.1 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 52.7/52.7 MB 11.9 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.4/116.4 kB 15.5 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 526.7/526.7 kB 50.1 MB/s eta 0:00:00


In [2]:
import cv2 as cv
import numpy as np
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from joblib import dump, load
# import pydicom as dicom
# import radiomics
# from radiomics import featureextractor
# import SimpleITK as sitk
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from joblib import load

In [None]:
def cut_images(input_path, new_width, new_height, output_path=None):
    """
    Cut images into the desired size and save the output images

    Params:
    input_path: path to the original images
    output_path: path to save the cut images
    new_width: width of the cut images
    new_height: height of the cut images

    Return:

    images_data: dictionary with images names as keys
    and images as values
    patients: list of patients IDs
    classes = list of labels
    """
    images_data = {}
    classes = []
    patients = []

    n = 0
    # Browse input path
    for class_dir in os.listdir(input_path):
        class_path = os.path.join(input_path, class_dir)

        # If it is a directory
        if os.path.isdir(class_path):

            # Save image id
            image_id = 1

            # Go through images
            for image_file in os.listdir(class_path):
                image_path = os.path.join(class_path, image_file)

                # Save patient id
                patient = image_file.split("_")[0]
                patients.append(patient)

                image = cv.imread(image_path, cv.IMREAD_GRAYSCALE)

                # If image exists
                if image is not None:

                    # Save subimage id
                    sub_id = 1

                    for i in range(0, image.shape[0], new_height):
                        for j in range(0, image.shape[1], new_width):

                            # Cut image into subimage
                            sub_image = image[i:i+new_height, j:j+new_width]

                            # Image name identifier
                            image_name = f"{patient}-img{image_id}-{sub_id}"

                            # Append image and its label to the dictionary
                            # if image_name not in images_data:
                            #     image_name = f"{patient}2-img{image_id}-{sub_id}"
                            #     images_data[image_name] = sub_image
                            # else:
                            images_data[image_name] = sub_image

                            classes.append(int(class_dir))

                            # Write subimage if an output path was given
                            if output_path != None:
                                # Create dir if it doesn't exist
                                os.makedirs(os.path.join(output_path, class_dir), exist_ok=True)
                                # Output file path
                                output_file = f"{image_name}.png"
                                output_file = os.path.join(output_path, class_dir, output_file)
                                # Save subimage
                                cv.imwrite(output_file, sub_image)

                            sub_id += 1
                            n += 1

                image_id += 1

    return images_data, patients, classes

In [None]:
def divide_folds(image_names, patients, classes):
    """
    Divides a dataset into folds for stratified k-fold cross-validation.

    Params:
    images_names: list of all images names
    patients: list with patient id for each image
    classes: list with class for each image

    Return:
    folds: list of tuples, each tuple is one fold containing (imgs_names, labels)
    """
    # Create a list of unique indexes for patients
    unique_patients = list(set(patients))

    # Number of folds
    n_folds = 5

    patients_per_fold = len(unique_patients)//n_folds
    left = [(len(unique_patients)-i) for i in range(1, (len(unique_patients)%n_folds)+1)]

    images_classes = dict(zip(images_names, classes))
    assigned_patients = []
    folds = []

    for i in range(n_folds):

        n_patients = 0

        x = []
        y = []

        if i == n_folds-1:
            for j in left:

                patient = patients[-j]

                imgs_patient = [name for name in image_names if patient in name]
                x = x + imgs_patient
                y = y + [images_classes[key] for key in imgs_patient]
                n_patients += 1
                assigned_patients.append(patient)

        for k in range(len(unique_patients)):

            patient = unique_patients[k]

            if n_patients < patients_per_fold and patient not in assigned_patients:
                imgs_patient = [name for name in image_names if patient in name]
                x = x + imgs_patient
                y = y + [images_classes[key] for key in imgs_patient]
                n_patients += 1
                assigned_patients.append(patient)

        folds.append((x, y))

    return folds


In [None]:
def preprocess(images_data):
    """
    Apply normalization, blur and sharpening to the images

    Params:
    images_data: dictionary with images names as keys
    and images as values

    Return:
    new_data: dictionary with images names as keys
    and processed images as values
    """

    new_data = images_data.copy()

    for key, value in new_data.items():

        img = value

        # Normalize between 0 and 1
        norm = cv.normalize(img, None, 0, 1.0, cv.NORM_MINMAX, dtype=cv.CV_32F)

        # Gaussian blur
        blur = cv.GaussianBlur(norm, (3, 3), 1)

        # Sharpen the image
        kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
        sharp = cv.filter2D(blur, -1, kernel)

        new_data[key] = sharp

    return new_data

In [None]:
def apply_thresholds(images_data):
    """
    Params:
    images_data: dictionary with images names as keys
    and images as values

    Params:
    imgs: list of raw images

    Return:
    new_data: dictionary with images names as keys
    and (images, Otsu's, Adaptative) as values
    extractor: pyradiomics extractor
    """

    new_data = images_data.copy()

    for key, value in new_data.items():

        # Otsu's thresholding
        _, th1 = cv.threshold(value, 100, 1, cv.THRESH_BINARY+cv.THRESH_OTSU)

        # Adaptative gaussian thresholding
        #th2 = cv.adaptiveThreshold(img,255,cv.ADAPTIVE_THRESH_MEAN_C, cv.THRESH_BINARY,11,2)
        th3 = cv.adaptiveThreshold(value, 1, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 11, 2)

        new_data[key] = (value, th1, th3)

    return new_data

In [None]:
# Cut images into 40x30
images_data, patients, classes = cut_images("imagens_ihq_er", 40, 30)

# Divide images into folds
images_names = list(images_data.keys())
folds = divide_folds(images_names, patients, classes, 0.7)
# Save folds variable
# dump(folds, 'folds.joblib')

# Preprocess images_data
images_data = preprocess(images_data)
# # Update images_data, applying Otsu's and adaptive thresholds
images_data = apply_thresholds(images_data)


Extract features with PyRadiomics

In [None]:
def run_extractor(images_data, extractor):
    """
    Extract features using sitk and pyradiomics

    Params:
    images_data: dictionary with images names as keys
    and (images, Otsu's, Adaptative) as values
    extractor: pyradiomics extractor

    Returns:
    features_otsu: features for otsu mask
    features_adapt: features for adaptative mask
    """

    data_spacing=[1,1,1]
    features_otsu = {}
    features_adapt = {}

    for key, value in images_data.items():

        # Get raw, Otsu's and adaptive images
        img = value[0]
        img_otsu = value[1]
        img_adapt = value[2]

        sitk_img = sitk.GetImageFromArray(img)
        sitk_img.SetSpacing((1, 1, 1))
        sitk_img = sitk.JoinSeries(sitk_img)

        sitk_otsu = sitk.GetImageFromArray(img_otsu)
        sitk_otsu.SetSpacing((1, 1, 1))
        sitk_otsu = sitk.JoinSeries(sitk_otsu)
        sitk_otsu = sitk.Cast(sitk_otsu, sitk.sitkInt32)

        sitk_adapt = sitk.GetImageFromArray(img_adapt)
        sitk_adapt.SetSpacing((1, 1, 1))
        sitk_adapt = sitk.JoinSeries(sitk_adapt)
        sitk_adapt = sitk.Cast(sitk_otsu, sitk.sitkInt32)

        # Extract features and append them to the proper list
        try:
            ft_otsu = extractor.execute(sitk_img, sitk_otsu)
            features_otsu[key] = ft_otsu

            ft_adapt = extractor.execute(sitk_img, sitk_adapt)
            features_adapt[key] = ft_adapt

        except:
            print(f"{key}, ", end="")
            pass

    return features_otsu, features_adapt



In [None]:
def conditional_append(element, dest):
    """
    Append element to the list destiny, if element is not in destiny

    Params:
    element: an element of any kind
    dest: a destination list

    Returns:
    dest: list with appended element if the element was not in there
    """
    if element not in dest:
        dest.append(element)

    return dest

def process_features(feats_o, feats_a):
    """
    Process features, in a way that:
    - features that are dictionaries and strings are removed
    - features that are tuples are separated and each element
    of the tuple is considered one feature
    - other types are converted to float

    Params:
    feats_o: list of Otsu's threshold features
    feats_a: list of adaptativa threshold features

    Returns:
    all_feats_o: Otsu's features processed
    all_feats_a: adaptative features processed
    names = feature names processed
    """

    all_feats_o = {}
    all_feats_a = {}
    names = []

    # For each image in one of the features list
    for key in feats_o:

        # Get features for Otsu's and adaptive for this sample
        sample_o = feats_o[key]
        sample_a = feats_a[key]

        values_o = []
        values_a = []

        # For each feature in the list
        for ft in sample_o:

            # Get the feature's value
            value_o = sample_o[ft]
            value_a = sample_a[ft]

            # If the value is str or dict, ignore it
            if type(value_o) == str or type(value_o) == dict:
                continue
            # If it's a tuple
            elif type(value_o) == tuple:
                for e in range(len(value_o)):
                    # Add and index to the feature name
                    conditional_append(f'{ft}_{e}', names)
                    # Append float values to the lists
                    values_o.append(float(value_o[e]))
                    values_a.append(float(value_a[e]))
            # For other data types, just append the name and float values
            else:
                conditional_append(ft, names)
                values_o.append(float(value_o))
                values_a.append(float(value_a))

        # Append processed features to the general list
        all_feats_o[key] = values_o
        all_feats_a[key] = values_a

    return all_feats_o, all_feats_a, names

def extract_features(images_data):
    """
    Process features, in a way that:
    - features that are dictionaries and strings are removed
    - features that are tuples are separated and each element
    of the tuple is considered one feature
    - other types are converted to float
    Get the features' names, with tuple features indexed

    Params:
    images_data: dictionary with images names as keys
    and (images, Otsu's, Adaptative) as values

    Returns:
    all_folds_feats: dictionary containing Otsu's features
    and adaptive features for each fold
    names: feature names
    """

    # Create feature extractor
    !wget -c https://raw.githubusercontent.com/AIM-Harvard/pyradiomics/master/examples/exampleSettings/Params.yaml
    params = 'Params.yaml'
    settings = {'label': 1, 'correctMask': True}
    extractor = featureextractor.RadiomicsFeatureExtractor(params, additionalInfo=True, **settings)

    # Extract features from Otsu's and adaptative
    feats_o, feats_a = run_extractor(images_data, extractor)

    # Process features and get feature names
    all_feats_o, all_feats_a, names = process_features(feats_o, feats_a)

    return all_feats_o, all_feats_a, names

In [None]:
# Division of images_data dictionary in 4 parts
# to extract features easily

# lista = list(images_data.keys())
# p = lista[:10000]
# s = lista[10000:20000]
# t = lista[20000:30000]
# q = lista[30000:40000]

# pp = {k:images_data[k] for k in p}
# ss = {k:images_data[k] for k in s}
# tt = {k:images_data[k] for k in t}
# qq = {k:images_data[k] for k in q}

In [None]:
# Extract all features
all_feats_o, all_feats_a, ft_names = extract_features(images_data)

--2023-11-26 23:10:38--  https://raw.githubusercontent.com/AIM-Harvard/pyradiomics/master/examples/exampleSettings/Params.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 416 Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



INFO:radiomics.featureextractor:Loading parameter file Params.yaml
INFO:radiomics.featureextractor:Applying custom setting overrides: {'additionalInfo': True, 'label': 1, 'correctMask': True}
INFO:radiomics.featureextractor:Calculating features with label: 1
INFO:radiomics.featureextractor:Loading image and mask
INFO:radiomics.featureextractor:Computing shape
INFO:radiomics.featureextractor:Adding image type "Original" with custom settings: {}
INFO:radiomics.featureextractor:Calculating features for original image
INFO:radiomics.featureextractor:Computing firstorder
INFO:radiomics.featureextractor:Computing glcm
INFO:radiomics.featureextractor:Computing glrlm
INFO:radiomics.featureextractor:Computing glszm
INFO:radiomics.featureextractor:Computing gldm
INFO:radiomics.featureextractor:Calculating features with label: 1
INFO:radiomics.featureextractor:Loading image and mask
INFO:radiomics.featureextractor:Computing shape
INFO:radiomics.featureextractor:Adding image type "Original" with c

In [None]:
def save_features(all_feats_o, all_feats_a):
    """
    Save features of all images in .txt files

    Params:
    all_feats_o:

    Returns:
    all_folds_feats = dictionary containing Otsu's features
    and adaptive features for each fold
    names = feature names
    """

    out_o = 'features_o/'
    out_a = 'features_a/'

    os.makedirs(out_o, exist_ok=True)
    os.makedirs(out_a, exist_ok=True)

    for key in all_feats_o:

        ft_o = all_feats_o[key]
        ft_a = all_feats_a[key]

        filename_o = f'{key}_o.txt'
        filename_a = f'{key}_a.txt'

        with open(os.path.join(out_o, filename_o), 'w') as f:
            for elem in ft_o:
                f.write(f'{elem}\n')

        with open(os.path.join(out_a, filename_a), 'w') as f:
            for elem in ft_a:
                f.write(f'{elem}\n')

In [None]:
# Save features and feature_names
save_features(all_feats_o, all_feats_a)
with open('ft_names.txt', 'w') as f:
    f.write('\n'.join(ft_names))

In [None]:
# Download features
!zip -r /content/features_o.zip /content/features_o
!zip -r /content/features_a.zip /content/features_a

from google.colab import files
files.download("/content/features_o.zip")
files.download("/content/features_a.zip")

Read features and folds and prepare for training the model

In [7]:
def read_files(ft_names_path, folds_path):
    """
    Read feature names and fold files

    Params:
    ft_names_path: path to the feature names .txt file
    folds_path: path to the folds .joblib file

    Returns:
    ft_names: list of radiomics feature names
    folds: list of tuples, each tuple is one fold containing (imgs_names, labels)
    """

    ft_names = []

    # Open file and read the content in a list
    with open(ft_names_path, 'r') as f:
        for line in f:

            # Remove linebreak
            x = line[:-1]

            # Add feature name to the list
            ft_names.append(str(x))

    # Load folds
    folds = load(folds_path)

    return ft_names, folds

In [100]:
def read_features(ft_path, fold_images, label):
    """
    Read features from files

    Params:
    ft_path: path to the feature names .txt file
    fold_images: list of image names for a fold
    label

    Returns:
    features: list of features
    names: list of names
    labels: list of labels
    """

    # Lists to return
    features = []
    names = []
    labels = []

    # For feature file in the path
    for ft_file in os.listdir(ft_path):

        # List of features for this image
        ft_o = []

        # Get patient name from the file
        patient = ft_file.split("-")[0]

        # Look for the patient in the fold images_names
        for k, image_name in enumerate(fold_images):
            if patient in image_name:
                # Try to open file and read the content to a list
                try:
                    #print(f'achei o {patient} no fold')
                    with open(os.path.join(ft_path, ft_file), 'r') as f:
                        for line in f:

                            # Remove linebreak
                            x = line[:-1]

                            # Add current feature to the list
                            ft_o.append(float(x))

                    names.append(image_name)
                    labels.append(label[k])
                except:
                    pass

                # Break if patient found
                break

        if len(ft_o) != 0:
            features.append(ft_o)

    return features, names, labels

def create_dfs(path_o, path_a, fold, ft_names):
    """
    Create dataframes for the data

    Params:
    fold: tuple of (imgs_names, labels)

    Returns:
    ft_names: list of radiomics feature names
    folds: list of tuples, each tuple is one fold containing (imgs, labels)
    """

    x = fold[0]
    y = fold[1]

    # Dataframe for Otsu's features
    ft_o, index_o, label_o = read_features(path_o, x, y)
    df_o = pd.DataFrame(ft_o, columns = ft_names, index = index_o)
    df_o.insert(loc=1, column='label', value=label_o)

    # Dataframe for adaptive features
    ft_a, index_a, label_a = read_features(path_a, x, y)
    df_a = pd.DataFrame(ft_a, columns = ft_names, index = index_a)
    df_a.insert(loc=1, column='label', value=label_a)

    return df_o, df_a


In [97]:
%%bash
wget -q https://github.com/vitoriastavis/ufpr-medical-images/raw/main/features_o.tar.gz
wget -q https://github.com/vitoriastavis/ufpr-medical-images/raw/main/features_a.tar.gz
wget -q https://github.com/vitoriastavis/ufpr-medical-images/raw/main/features_ap.tar.gz
wget -q https://github.com/vitoriastavis/ufpr-medical-images/raw/main/features_op.tar.gz
wget -q https://github.com/vitoriastavis/ufpr-medical-images/raw/main/folds.tar.gz
wget -q https://raw.githubusercontent.com/vitoriastavis/ufpr-medical-images/main/ft_names.txt

tar -xf features_o.tar.gz
tar -xf features_a.tar.gz
tar -xf features_op.tar.gz
tar -xf features_ap.tar.gz
tar -xf folds.tar.gz

In [None]:
# The variable 'folds' is a list of tuples
# folds[0] is the first fold
# in each fold, there are (image_names, labels)
# which are used to build the dataframes below
# also using ft_names, which are names of the features to use as column names
ft_names, folds = read_files('ft_names.txt', 'folds.joblib')

In [30]:
# Create dfs for non processed images
dataframes_np = []
for i in range(len(folds)):
  df_o, df_a = create_dfs(False, folds[i], ft_names)
  dataframes_np.append((df_o, df_a))

# Create dfs for processed images
dataframes_p = []
for i in range(len(folds)):
  df_o, df_a = create_dfs(True, folds[i], ft_names)
  dataframes_p.append((df_o, df_a))

In [53]:
def classifier(method, dataframes=None, threshold='', verbose=True):
    """
    classify features using knn

    Params:
    threshold: name of threshold filter
    data: optional dictionary data

    Returns:
    f1_list: list of f1 scores to create a dataframe
    """

    f1_list = []

    ans = {"accuracy": [], "precision": [], "recall": [], "f1_score": []}

    if verbose:
        print(f'Classifying with {method} - {threshold} \n')

    # 5-fold cross validation
    for i in range(len(dataframes)):
        # Create dataframe from current folder
        (df_o, df_a) = dataframes[i]
        if (threshold == 'otsu'):
            df = df_o
        elif (threshold == 'adaptive'):
            df = df_a

        # Predictor variables
        X = df.drop(['label'], axis=1).values

        # Target variables
        y = df['label'].values

        # Creates classifier with selected parameters
        if method == 'knn':
          clf = KNeighborsClassifier(n_neighbors=7, metric='euclidean')
        elif method == 'mlp':
          clf = MLPClassifier(max_iter=300, activation='logistic', solver='adam')
        else:
          print('invalid method, try knn or mlp')
          exit()

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Standardize features by removing the mean and scaling to unit variance
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Fit the model
        clf.fit(X_train, y_train)

        # Get the score
        score = clf.score(X_test, y_test)

        # Predicting
        y_pred = clf.predict(X_test)

        # Creates confusion matrix
        cm = confusion_matrix(y_test, y_pred)

        if verbose:
          print(f'\t\t {method} - Fold {i+1} | threshold {threshold}')
          print (f'Confusion Matrix \n {cm}')
          print(classification_report(y_test, y_pred))
          print()

        # Get classification report
        report = classification_report(y_test, y_pred, output_dict=True)
        precision = report['macro avg']['precision']
        recall = report['macro avg']['recall']
        f1 = report['macro avg']['f1-score']

        # List of metrics
        ans["accuracy"].append(round(score, 3))
        ans["precision"].append(round(precision, 3))
        ans["recall"].append(round(recall), 3)
        ans["f1_score"].append(round(f1), 3)

        f1_list.append(f1)

    # Print the results
    results = {"accuracy": round(np.mean(ans["accuracy"]), 3),
               "precision": round(np.mean(ans["precision"]), 3),
               "recall": round(np.mean(ans["recall"]), 3),
               "f1_score": round(np.mean(ans["f1_score"]), 3)}

    if verbose:
      print("Classification done! Average metrics:")
      print(results)
      print()

    return f1_list

In [None]:
f1_knn_otsu_np = classifier('knn', dataframes_np, 'otsu')
f1_knn_adapt_np = classifier('knn', dataframes_np, 'adaptive')
f1_mlp_otsu_np = classifier('mlp', dataframes_np, 'otsu')
f1_mlp_adapt_np = classifier('mlp', dataframes_np, 'adaptive')

In [62]:
header = ['Otsu (NP)', 'Adaptive (NP)']

In [87]:
results_np_knn = pd.DataFrame(np.transpose([f1_knn_otsu_np, f1_knn_adapt_np]), columns = header)
results_np_knn.index = [i+1 for i in results_np_knn.index]
results_np_knn.index.name = 'Fold'

In [93]:
results_np_mlp = pd.DataFrame(np.transpose([f1_mlp_otsu_np, f1_mlp_adapt_np]), columns = header)
results_np_mlp.index = [i+1 for i in results_np_mlp.index]
results_np_mlp.index.name = 'Fold'

In [104]:
f1_knn_otsu_p = classifier('knn', dataframes_p, 'otsu')
f1_knn_adapt_p = classifier('knn', dataframes_p, 'adaptive')
f1_mlp_otsu_p = classifier('mlp', dataframes_p, 'otsu')
f1_mlp_adapt_p = classifier('mlp', dataframes_p, 'adaptive')

Classifying with knn - otsu 

		 knn - Fold 1 | threshold otsu
Confusion Matrix 
 [[174  53   3  10]
 [ 66 360  16  26]
 [  8  32 267  88]
 [ 33  96 148 411]]
              precision    recall  f1-score   support

           0       0.62      0.72      0.67       240
           1       0.67      0.77      0.71       468
           2       0.62      0.68      0.64       395
           3       0.77      0.60      0.67       688

    accuracy                           0.68      1791
   macro avg       0.67      0.69      0.67      1791
weighted avg       0.69      0.68      0.68      1791


		 knn - Fold 2 | threshold otsu
Confusion Matrix 
 [[143   7  19  19]
 [  5 214  17  66]
 [ 15  25 106  30]
 [ 30 103  36 558]]
              precision    recall  f1-score   support

           0       0.74      0.76      0.75       188
           1       0.61      0.71      0.66       302
           2       0.60      0.60      0.60       176
           3       0.83      0.77      0.80       727

    



		 mlp - Fold 1 | threshold otsu
Confusion Matrix 
 [[162  53   0  16]
 [ 43 422  18  45]
 [  2  25 264  95]
 [ 25  77  82 462]]
              precision    recall  f1-score   support

           0       0.70      0.70      0.70       231
           1       0.73      0.80      0.76       528
           2       0.73      0.68      0.70       386
           3       0.75      0.72      0.73       646

    accuracy                           0.73      1791
   macro avg       0.73      0.72      0.72      1791
weighted avg       0.73      0.73      0.73      1791






		 mlp - Fold 2 | threshold otsu
Confusion Matrix 
 [[146   2   7  22]
 [  3 236  23  58]
 [  6  13 115  34]
 [ 13  56  24 635]]
              precision    recall  f1-score   support

           0       0.87      0.82      0.85       177
           1       0.77      0.74      0.75       320
           2       0.68      0.68      0.68       168
           3       0.85      0.87      0.86       728

    accuracy                           0.81      1393
   macro avg       0.79      0.78      0.79      1393
weighted avg       0.81      0.81      0.81      1393






		 mlp - Fold 3 | threshold otsu
Confusion Matrix 
 [[521 117  44  30]
 [ 88 408  15  17]
 [ 59  39 112  34]
 [ 47  34  18 249]]
              precision    recall  f1-score   support

           0       0.73      0.73      0.73       712
           1       0.68      0.77      0.72       528
           2       0.59      0.46      0.52       244
           3       0.75      0.72      0.73       348

    accuracy                           0.70      1832
   macro avg       0.69      0.67      0.68      1832
weighted avg       0.70      0.70      0.70      1832






		 mlp - Fold 4 | threshold otsu
Confusion Matrix 
 [[602  12 103]
 [ 47  23  10]
 [ 94   9 419]]
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       717
           2       0.52      0.29      0.37        80
           3       0.79      0.80      0.80       522

    accuracy                           0.79      1319
   macro avg       0.71      0.64      0.66      1319
weighted avg       0.78      0.79      0.79      1319






		 mlp - Fold 5 | threshold otsu
Confusion Matrix 
 [[379  19  72  37]
 [ 48  54  17  17]
 [ 79  13 402  39]
 [ 45   9  50 156]]
              precision    recall  f1-score   support

           0       0.69      0.75      0.72       507
           1       0.57      0.40      0.47       136
           2       0.74      0.75      0.75       533
           3       0.63      0.60      0.61       260

    accuracy                           0.69      1436
   macro avg       0.66      0.62      0.64      1436
weighted avg       0.69      0.69      0.69      1436


Classification done!
{'accuracy': 0.75, 'precision': 0.71, 'recall': 0.69, 'f1_score': 0.7}
Classifying with mlp - adaptive 





		 mlp - Fold 1 | threshold adaptive
Confusion Matrix 
 [[164  55   3  18]
 [ 49 403  17  54]
 [  0  15 271  94]
 [ 14  69  89 476]]
              precision    recall  f1-score   support

           0       0.72      0.68      0.70       240
           1       0.74      0.77      0.76       523
           2       0.71      0.71      0.71       380
           3       0.74      0.73      0.74       648

    accuracy                           0.73      1791
   macro avg       0.73      0.73      0.73      1791
weighted avg       0.73      0.73      0.73      1791






		 mlp - Fold 2 | threshold adaptive
Confusion Matrix 
 [[132   3   7  19]
 [  1 233  18  45]
 [  2  24 122  30]
 [ 15  59  31 652]]
              precision    recall  f1-score   support

           0       0.88      0.82      0.85       161
           1       0.73      0.78      0.76       297
           2       0.69      0.69      0.69       178
           3       0.87      0.86      0.87       757

    accuracy                           0.82      1393
   macro avg       0.79      0.79      0.79      1393
weighted avg       0.82      0.82      0.82      1393






		 mlp - Fold 3 | threshold adaptive
Confusion Matrix 
 [[522 114  52  26]
 [ 94 378  24  26]
 [ 57  31 107  51]
 [ 44  16  40 250]]
              precision    recall  f1-score   support

           0       0.73      0.73      0.73       714
           1       0.70      0.72      0.71       522
           2       0.48      0.43      0.46       246
           3       0.71      0.71      0.71       350

    accuracy                           0.69      1832
   macro avg       0.65      0.65      0.65      1832
weighted avg       0.68      0.69      0.68      1832






		 mlp - Fold 4 | threshold adaptive
Confusion Matrix 
 [[591  12 102]
 [ 45  26  18]
 [ 99   7 419]]
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       705
           2       0.58      0.29      0.39        89
           3       0.78      0.80      0.79       525

    accuracy                           0.79      1319
   macro avg       0.72      0.64      0.67      1319
weighted avg       0.78      0.79      0.78      1319


		 mlp - Fold 5 | threshold adaptive
Confusion Matrix 
 [[379  35  62  23]
 [ 39  56  25  14]
 [ 62  12 389  42]
 [ 54  13  59 172]]
              precision    recall  f1-score   support

           0       0.71      0.76      0.73       499
           1       0.48      0.42      0.45       134
           2       0.73      0.77      0.75       505
           3       0.69      0.58      0.63       298

    accuracy                           0.69      1436
   macro avg       0.65      0.63      0.64      1436
wei



In [105]:
results_p_knn = pd.DataFrame(np.transpose([f1_knn_otsu_p, f1_knn_adapt_p]), columns = header)
results_p_knn.index = [i+1 for i in results_p_knn.index]
results_p_knn.index.name = 'Fold'

In [106]:
results_p_mlp = pd.DataFrame(np.transpose([f1_mlp_otsu_p, f1_mlp_adapt_p]), columns = header)
results_p_mlp.index = [i+1 for i in results_p_mlp.index]
results_p_mlp.index.name = 'Fold'

In [108]:
levels = ['Not preprocessed', 'Preprocessed']

In [112]:
results_knn = pd.concat([results_np_knn, results_p_knn], axis=1, keys =levels)

In [113]:
results_knn

Unnamed: 0_level_0,Not preprocessed,Not preprocessed,Preprocessed,Preprocessed
Unnamed: 0_level_1,Otsu (NP),Adaptive (NP),Otsu (NP),Adaptive (NP)
Fold,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,0.67449,0.668885,0.674448,0.677209
2,0.704497,0.702514,0.70103,0.712923
3,0.569883,0.562232,0.590148,0.578843
4,0.602405,0.558448,0.61227,0.572136
5,0.528268,0.524298,0.53696,0.532198


In [None]:
results_mlp = pd.concat([results_np_mlp, results_p_mlp], axis=1, keys=levels)

In [None]:
if __name__ == '__main__':

    # Check if at least one argument is provided
    if len(sys.argv) < 3 or len(sys.argv) > 4:
        print("Usage: python classification.py path_o path_a \n Options: -v")
        sys.exit(1)

    elif (len(sys.argv)) == 3:
        path_o = sys.argv[1]
        path_a = sys.argv[2]
        verbose = False

    elif (len(sys.argv)) == 4 and sys.argv[1] == '-v':
        path_o = sys.argv[2]
        path_a = sys.argv[3]
        verbose = True


    # The variable 'folds' is a list of tuples
    # folds[0] is the first fold
    # in each fold, there are (image_names, labels)
    # which are used to build the dataframes below
    # also using ft_names, which are names of the features to use as column names
    ft_names, folds = read_files('ft_names.txt', 'folds.joblib')

    # Create dfs for non processed images
    dataframes_np = []
    for i in range(len(folds)):
      df_o, df_a = create_dfs(sys.argv[1], folds[i], ft_names)
      dataframes_np.append((df_o, df_a))

    # Create dfs for processed images
    dataframes_p = []
    for i in range(len(folds)):
      df_o, df_a = create_dfs(sys.argv[2], folds[i], ft_names)
      dataframes_p.append((df_o, df_a))

    # Classification for non processed images
    f1_knn_otsu_np = classifier('knn', dataframes_np, 'otsu', verbose)
    f1_knn_adapt_np = classifier('knn', dataframes_np, 'adaptive', verbose)
    f1_mlp_otsu_np = classifier('mlp', dataframes_np, 'otsu', verbose)
    f1_mlp_adapt_np = classifier('mlp', dataframes_np, 'adaptive', verbose)

    # Classification for processed images
    f1_knn_otsu_p = classifier('knn', dataframes_p, 'otsu', verbose)
    f1_knn_adapt_p = classifier('knn', dataframes_p, 'adaptive', verbose)
    f1_mlp_otsu_p = classifier('mlp', dataframes_p, 'otsu', verbose)
    f1_mlp_adapt_p = classifier('mlp', dataframes_p, 'adaptive', verbose)

    header = ['Otsu (NP)', 'Adaptive (NP)']
    levels = ['Not preprocessed', 'Preprocessed']

    # Result dataframes
    results_np_knn = pd.DataFrame(np.transpose([f1_knn_otsu_np, f1_knn_adapt_np]), columns = header)
    results_np_mlp = pd.DataFrame(np.transpose([f1_mlp_otsu_np, f1_mlp_adapt_np]), columns = header)

    results_p_knn = pd.DataFrame(np.transpose([f1_knn_otsu_p, f1_knn_adapt_p]), columns = header)
    results_p_mlp = pd.DataFrame(np.transpose([f1_mlp_otsu_p, f1_mlp_adapt_p]), columns = header)

    results_knn = pd.concat([results_np_knn, results_p_knn], axis=1, keys =levels)
    results_mlp = pd.concat([results_np_mlp, results_p_mlp], axis=1, keys=levels)

    results_knn.index = [i+1 for i in results_knn.index]
    results_knn.index.name = 'Fold'

    results_mlp.index = [i+1 for i in results_mlp.index]
    results_mlp.index.name = 'Fold'

    print('Final dataframes: KNN - F1-score')
    print(results_knn)

    print('Final dataframes: MLP - F1-score')
    print(results_knn)