In [3]:
%%bash
pip -q install pydicom opencv-python scikit-image pyradiomics

wget -q http://www.inf.ufpr.br/lferrari/imagens_ihq.tar.gz && tar -xf imagens_ihq.tar.gz

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 12.2 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 34.5/34.5 MB 17.5 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 34.5/34.5 MB 12.1 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 52.7/52.7 MB 11.9 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.4/116.4 kB 15.5 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 526.7/526.7 kB 50.1 MB/s eta 0:00:00


In [4]:

import cv2 as cv
import numpy as np
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from joblib import dump, load
import pydicom as dicom
import radiomics
from radiomics import featureextractor
import SimpleITK as sitk
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from joblib import load

In [5]:
def cut_images(input_path, new_width, new_height, output_path=None):
    """
    Cut images into the desired size and save the output images

    Params:
    input_path: path to the original images
    output_path: path to save the cut images
    new_width: width of the cut images
    new_height: height of the cut images

    Return:

    images_data: dictionary with images names as keys
    and images as values
    patients: list of patients IDs
    classes = list of labels
    """
    images_data = {}
    classes = []
    patients = []

    n = 0
    # Browse input path
    for class_dir in os.listdir(input_path):
        class_path = os.path.join(input_path, class_dir)

        # If it is a directory
        if os.path.isdir(class_path):

            # Save image id
            image_id = 1

            # Go through images
            for image_file in os.listdir(class_path):
                image_path = os.path.join(class_path, image_file)

                # Save patient id
                patient = image_file.split("_")[0]
                patients.append(patient)

                image = cv.imread(image_path, cv.IMREAD_GRAYSCALE)

                # If image exists
                if image is not None:

                    # Save subimage id
                    sub_id = 1

                    for i in range(0, image.shape[0], new_height):
                        for j in range(0, image.shape[1], new_width):

                            # Cut image into subimage
                            sub_image = image[i:i+new_height, j:j+new_width]

                            # Image name identifier
                            image_name = f"{patient}-img{image_id}-{sub_id}"

                            # Append image and its label to the dictionary
                            # if image_name not in images_data:
                            #     image_name = f"{patient}2-img{image_id}-{sub_id}"
                            #     images_data[image_name] = sub_image
                            # else:
                            images_data[image_name] = sub_image

                            classes.append(int(class_dir))

                            # Write subimage if an output path was given
                            if output_path != None:
                                # Create dir if it doesn't exist
                                os.makedirs(os.path.join(output_path, class_dir), exist_ok=True)
                                # Output file path
                                output_file = f"{image_name}.png"
                                output_file = os.path.join(output_path, class_dir, output_file)
                                # Save subimage
                                cv.imwrite(output_file, sub_image)

                            sub_id += 1
                            n += 1

                image_id += 1

    return images_data, patients, classes

In [None]:
def divide_folds(image_names, patients, classes):
    """
    Divides a dataset into folds for stratified k-fold cross-validation.

    Params:
    images_names: list of all images names
    patients: list with patient id for each image
    classes: list with class for each image

    Return:
    folds: list of tuples, each tuple is one fold containing (imgs_names, labels)
    """
    # Create a list of unique indexes for patients
    unique_patients = list(set(patients))

    # Number of folds
    n_folds = 5

    patients_per_fold = len(unique_patients)//n_folds
    left = [(len(unique_patients)-i) for i in range(1, (len(unique_patients)%n_folds)+1)]

    images_classes = dict(zip(images_names, classes))
    assigned_patients = []
    folds = []

    for i in range(n_folds):

        n_patients = 0

        x = []
        y = []

        if i == n_folds-1:
            for j in left:

                patient = patients[-j]

                imgs_patient = [name for name in image_names if patient in name]
                x = x + imgs_patient
                y = y + [images_classes[key] for key in imgs_patient]
                n_patients += 1
                assigned_patients.append(patient)

        for k in range(len(unique_patients)):

            patient = unique_patients[k]

            if n_patients < patients_per_fold and patient not in assigned_patients:
                imgs_patient = [name for name in image_names if patient in name]
                x = x + imgs_patient
                y = y + [images_classes[key] for key in imgs_patient]
                n_patients += 1
                assigned_patients.append(patient)

        folds.append((x, y))

    return folds


In [47]:
def preprocess(images_data):
    """
    Apply normalization, blur and sharpening to the images

    Params:
    images_data: dictionary with images names as keys
    and images as values

    Return:
    new_data: dictionary with images names as keys
    and processed images as values
    """

    new_data = images_data.copy()

    for key, value in new_data.items():

        img = value

        # Normalize between 0 and 1
        norm = cv.normalize(img, None, 0, 1.0, cv.NORM_MINMAX, dtype=cv.CV_32F)

        # Gaussian blur
        blur = cv.GaussianBlur(norm, (3, 3), 1)

        # Sharpen the image
        kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
        sharp = cv.filter2D(blur, -1, kernel)

        new_data[key] = sharp

    return new_data

In [7]:
def apply_thresholds(images_data):
    """
    Params:
    images_data: dictionary with images names as keys
    and images as values

    Params:
    imgs: list of raw images

    Return:
    new_data: dictionary with images names as keys
    and (images, Otsu's, Adaptative) as values
    extractor: pyradiomics extractor
    """

    new_data = images_data.copy()

    for key, value in new_data.items():

        # Otsu's thresholding
        _, th1 = cv.threshold(value, 100, 1, cv.THRESH_BINARY+cv.THRESH_OTSU)

        # Adaptative gaussian thresholding
        #th2 = cv.adaptiveThreshold(img,255,cv.ADAPTIVE_THRESH_MEAN_C, cv.THRESH_BINARY,11,2)
        th3 = cv.adaptiveThreshold(value, 1, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 11, 2)

        new_data[key] = (value, th1, th3)

    return new_data

In [68]:
# Cut images into 40x30
images_data, patients, classes = cut_images("imagens_ihq_er", 40, 30)

# Divide images into folds
images_names = list(images_data.keys())
folds = divide_folds(images_names, patients, classes, 0.7)
# Save folds variable
# dump(folds, 'folds.joblib')

# Preprocess images_data
images_data = preprocess(images_data)
# # Update images_data, applying Otsu's and adaptive thresholds
images_data = apply_thresholds(images_data)


Extract features with PyRadiomics

In [63]:
def run_extractor(images_data, extractor):
    """
    Extract features using sitk and pyradiomics

    Params:
    images_data: dictionary with images names as keys
    and (images, Otsu's, Adaptative) as values
    extractor: pyradiomics extractor

    Returns:
    features_otsu: features for otsu mask
    features_adapt: features for adaptative mask
    """

    data_spacing=[1,1,1]
    features_otsu = {}
    features_adapt = {}

    for key, value in images_data.items():

        # Get raw, Otsu's and adaptive images
        img = value[0]
        img_otsu = value[1]
        img_adapt = value[2]

        sitk_img = sitk.GetImageFromArray(img)
        sitk_img.SetSpacing((1, 1, 1))
        sitk_img = sitk.JoinSeries(sitk_img)

        sitk_otsu = sitk.GetImageFromArray(img_otsu)
        sitk_otsu.SetSpacing((1, 1, 1))
        sitk_otsu = sitk.JoinSeries(sitk_otsu)
        sitk_otsu = sitk.Cast(sitk_otsu, sitk.sitkInt32)

        sitk_adapt = sitk.GetImageFromArray(img_adapt)
        sitk_adapt.SetSpacing((1, 1, 1))
        sitk_adapt = sitk.JoinSeries(sitk_adapt)
        sitk_adapt = sitk.Cast(sitk_otsu, sitk.sitkInt32)

        # Extract features and append them to the proper list
        try:
            ft_otsu = extractor.execute(sitk_img, sitk_otsu)
            features_otsu[key] = ft_otsu

            ft_adapt = extractor.execute(sitk_img, sitk_adapt)
            features_adapt[key] = ft_adapt

        except:
            print(f"{key}, ", end="")
            pass

    return features_otsu, features_adapt



In [66]:
def conditional_append(element, dest):
    """
    Append element to the list destiny, if element is not in destiny

    Params:
    element: an element of any kind
    dest: a destination list

    Returns:
    dest: list with appended element if the element was not in there
    """
    if element not in dest:
        dest.append(element)

    return dest

def process_features(feats_o, feats_a):
    """
    Process features, in a way that:
    - features that are dictionaries and strings are removed
    - features that are tuples are separated and each element
    of the tuple is considered one feature
    - other types are converted to float

    Params:
    feats_o: list of Otsu's threshold features
    feats_a: list of adaptativa threshold features

    Returns:
    all_feats_o: Otsu's features processed
    all_feats_a: adaptative features processed
    names = feature names processed
    """

    all_feats_o = {}
    all_feats_a = {}
    names = []

    # For each image in one of the features list
    for key in feats_o:

        # Get features for Otsu's and adaptive for this sample
        sample_o = feats_o[key]
        sample_a = feats_a[key]

        values_o = []
        values_a = []

        # For each feature in the list
        for ft in sample_o:

            # Get the feature's value
            value_o = sample_o[ft]
            value_a = sample_a[ft]

            # If the value is str or dict, ignore it
            if type(value_o) == str or type(value_o) == dict:
                continue
            # If it's a tuple
            elif type(value_o) == tuple:
                for e in range(len(value_o)):
                    # Add and index to the feature name
                    conditional_append(f'{ft}_{e}', names)
                    # Append float values to the lists
                    values_o.append(float(value_o[e]))
                    values_a.append(float(value_a[e]))
            # For other data types, just append the name and float values
            else:
                conditional_append(ft, names)
                values_o.append(float(value_o))
                values_a.append(float(value_a))

        # Append processed features to the general list
        all_feats_o[key] = values_o
        all_feats_a[key] = values_a

    return all_feats_o, all_feats_a, names

def extract_features(images_data):
    """
    Process features, in a way that:
    - features that are dictionaries and strings are removed
    - features that are tuples are separated and each element
    of the tuple is considered one feature
    - other types are converted to float
    Get the features' names, with tuple features indexed

    Params:
    images_data: dictionary with images names as keys
    and (images, Otsu's, Adaptative) as values

    Returns:
    all_folds_feats: dictionary containing Otsu's features
    and adaptive features for each fold
    names: feature names
    """

    # Create feature extractor
    !wget -c https://raw.githubusercontent.com/AIM-Harvard/pyradiomics/master/examples/exampleSettings/Params.yaml
    params = 'Params.yaml'
    settings = {'label': 1, 'correctMask': True}
    extractor = featureextractor.RadiomicsFeatureExtractor(params, additionalInfo=True, **settings)

    # Extract features from Otsu's and adaptative
    feats_o, feats_a = run_extractor(images_data, extractor)

    # Process features and get feature names
    all_feats_o, all_feats_a, names = process_features(feats_o, feats_a)

    return all_feats_o, all_feats_a, names

In [None]:
# Division of images_data dictionary in 4 parts
# to extract features easily

# lista = list(images_data.keys())
# p = lista[:10000]
# s = lista[10000:20000]
# t = lista[20000:30000]
# q = lista[30000:40000]

# pp = {k:images_data[k] for k in p}
# ss = {k:images_data[k] for k in s}
# tt = {k:images_data[k] for k in t}
# qq = {k:images_data[k] for k in q}

In [69]:
# Extract all features
all_feats_o, all_feats_a, ft_names = extract_features(images_data)

--2023-11-26 23:10:38--  https://raw.githubusercontent.com/AIM-Harvard/pyradiomics/master/examples/exampleSettings/Params.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 416 Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



INFO:radiomics.featureextractor:Loading parameter file Params.yaml
INFO:radiomics.featureextractor:Applying custom setting overrides: {'additionalInfo': True, 'label': 1, 'correctMask': True}
INFO:radiomics.featureextractor:Calculating features with label: 1
INFO:radiomics.featureextractor:Loading image and mask
INFO:radiomics.featureextractor:Computing shape
INFO:radiomics.featureextractor:Adding image type "Original" with custom settings: {}
INFO:radiomics.featureextractor:Calculating features for original image
INFO:radiomics.featureextractor:Computing firstorder
INFO:radiomics.featureextractor:Computing glcm
INFO:radiomics.featureextractor:Computing glrlm
INFO:radiomics.featureextractor:Computing glszm
INFO:radiomics.featureextractor:Computing gldm
INFO:radiomics.featureextractor:Calculating features with label: 1
INFO:radiomics.featureextractor:Loading image and mask
INFO:radiomics.featureextractor:Computing shape
INFO:radiomics.featureextractor:Adding image type "Original" with c

In [None]:
def save_features(all_feats_o, all_feats_a):
    """
    Save features of all images in .txt files

    Params:
    all_feats_o:

    Returns:
    all_folds_feats = dictionary containing Otsu's features
    and adaptive features for each fold
    names = feature names
    """

    out_o = 'features_o/'
    out_a = 'features_a/'

    os.makedirs(out_o, exist_ok=True)
    os.makedirs(out_a, exist_ok=True)

    for key in all_feats_o:

        ft_o = all_feats_o[key]
        ft_a = all_feats_a[key]

        filename_o = f'{key}_o.txt'
        filename_a = f'{key}_a.txt'

        with open(os.path.join(out_o, filename_o), 'w') as f:
            for elem in ft_o:
                f.write(f'{elem}\n')

        with open(os.path.join(out_a, filename_a), 'w') as f:
            for elem in ft_a:
                f.write(f'{elem}\n')

In [None]:
# Save features and feature_names
save_features(all_feats_o, all_feats_a)
with open('ft_names.txt', 'w') as f:
    f.write('\n'.join(ft_names))

In [None]:
# Download features
!zip -r /content/features_o.zip /content/features_o
!zip -r /content/features_a.zip /content/features_a

from google.colab import files
files.download("/content/features_o.zip")
files.download("/content/features_a.zip")

Read features and folds and prepare for training the model

In [None]:
def read_files(ft_names_path, folds_path):
    """
    Read feature names and fold files

    Params:
    ft_names_path: path to the feature names .txt file
    folds_path: path to the folds .joblib file

    Returns:
    ft_names: list of radiomics feature names
    folds: list of tuples, each tuple is one fold containing (imgs_names, labels)
    """

    ft_names = []

    # Open file and read the content in a list
    with open(ft_names_path, 'r') as f:
        for line in f:

            # Remove linebreak
            x = line[:-1]

            # Add feature name to the list
            ft_names.append(str(x))

    # Load folds
    folds = load(folds_path)

    return ft_names, folds

In [None]:
def read_features(ft_path, fold_images, label):
    """
    Read features from files

    Params:
    ft_path: path to the feature names .txt file
    fold_images: list of image names for a fold
    label

    Returns:
    features: list of features
    names: list of names
    labels: list of labels
    """

    # Lists to return
    features = []
    names = []
    labels = []

    # For feature file in the path
    for ft_file in os.listdir(ft_path):

        # List of features for this image
        ft_o = []

        # Get patient name from the file
        patient = ft_file.split("-")[0]

        # Look for the patient in the fold images_names
        for k, image_name in enumerate(data):
            if patient in image_name:
                # Try to open file and read the content to a list
                try:
                    #print(f'achei o {patient} no fold')
                    with open(os.path.join(ft_path, ft_file), 'r') as f:
                        for line in f:

                            # Remove linebreak
                            x = line[:-1]

                            # Add current feature to the list
                            ft_o.append(float(x))

                    names.append(image_name)
                    labels.append(label[k])
                except:
                    pass

                # Break if patient found
                break

        if len(ft_o) != 0:
            features.append(ft_o)

    return features, names, labels

def create_dfs(fold, ft_names):
    """
    Create dataframes for the data

    Params:
    fold: tuple of (imgs_names, labels)

    Returns:
    ft_names: list of radiomics feature names
    folds: list of tuples, each tuple is one fold containing (imgs, labels)
    """

    x = fold[0]
    y = fold[1]

    # Dataframe for Otsu's features
    ft_o, index_o, label_o = read_features('features_o', x, y)
    df_o = pd.DataFrame(ft_o, columns = ft_names, index = index_o)
    df_o.insert(loc=1, column='label', value=label_o)

    # Dataframe for adaptive features
    ft_a, index_a, label_a = read_features('features_a', x, y)
    df_a = pd.DataFrame(ft_a, columns = ft_names, index = index_a)
    df_a.insert(loc=1, column='label', value=label_a)

    return df_o, df_a


In [None]:
%%bash
wget -q https://github.com/vitoriastavis/ufpr-medical-images/raw/main/features_o.tar.gz
wget -q https://github.com/vitoriastavis/ufpr-medical-images/raw/main/features_a.tar.gz
wget -q https://github.com/vitoriastavis/ufpr-medical-images/raw/main/folds.tar.gz
wget -q https://raw.githubusercontent.com/vitoriastavis/ufpr-medical-images/main/ft_names.txt

tar -xf features_o.tar.gz
tar -xf features_a.tar.gz
tar -xf folds.tar.gz

In [None]:
# The variable 'folds' is a list of tuples
# folds[0] is the first fold
# in each fold, there are (image_names, labels)
# which are used to build the dataframes below
# also using ft_names, which are names of the features to use as column names
ft_names, folds = read_files('ft_names.txt', 'folds.joblib')

In [None]:
def knn_classifier(threshold='', data={}):
    """
    classify features using knn

    Params:
    threshold: name of threshold filter
    data: opcional dictionary data

    Returns:
    void
    """

    ans = {"accuracy": [], "recall": [], "f1_score": []}
    print("Classifying...")

    # 5-fold cross validation
    for i in range(5):
        # Create dataframe from current folder
        df_o, df_a = create_dfs(folds[i], ft_names)
        if (threshold == 'otsu'):
            df = df_o
        elif (threshold == 'adaptive'):
            df = df_a
        else:
            df = data

        # Predictor variables
        X = df.drop(['label'], axis=1).values

        # Target variables
        y = df['label'].values

        # Creates classifier with selected parameters
        neigh = KNeighborsClassifier(n_neighbors=7, metric='euclidean')

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Standardize features by removing the mean and scaling to unit variance
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Fit the model
        neigh.fit(X_train, y_train)

        # Get the score
        score = neigh.score(X_test, y_test)

        # Predicting
        y_pred = neigh.predict(X_test)

        # Creates confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print (cm)
        print(classification_report(y_test, y_pred))

        # Get classification report
        report = classification_report(y_test, y_pred, output_dict=True)
        recall = report['macro avg']['recall']
        f1 = report['macro avg']['f1-score']

        # List of metrics
        ans["accuracy"].append(score)
        ans["recall"].append(recall)
        ans["f1_score"].append(f1)

    print("Classification done!")

    # Print the results
    results = {"accuracy": round(np.median(ans["accuracy"]), 2), "recall": round(np.median(ans["recall"]), 2), "f1_score": round(np.median(ans["f1_score"]), 2)}
    print(results)

In [None]:
def mlp_classifier(threshold='', data={}):
    """
    classify features using mlp

    Params:
    threshold: name of threshold filter
    data: opcional data to use as dataframe

    Returns:
    void
    """

    ans = {"accuracy": [], "recall": [], "f1_score": []}
    print("Classifying...")

    # 5-fold cross validation
    for i in range(5):
        # Create dataframe from current folder
        df_o, df_a = create_dfs(folds[i], ft_names)
        if (threshold == 'otsu'):
            df = df_o
        elif (threshold == 'adaptive'):
            df = df_a
        else:
            df = data

        # Predictor variables
        X = df.drop(['label'], axis=1).values

        # Target variables
        y = df['label'].values

        # Creates classifier with selected parameters
        mlp = MLPClassifier(max_iter=300, activation='logistic', solver='adam')

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Standardize features by removing the mean and scaling to unit variance
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Fit the model
        mlp.fit(X_train, y_train)

        # Get the score
        score = mlp.score(X_test, y_test)

        # Predicting
        y_pred = mlp.predict(X_test)

        # Creates confusion matrix
        cm = confusion_matrix(y_test, y_pred)
        print (cm)
        print(classification_report(y_test, y_pred))

        # Get classification report
        report = classification_report(y_test, y_pred, output_dict=True)
        recall = report['macro avg']['recall']
        f1 = report['macro avg']['f1-score']

        # List of metrics
        ans["accuracy"].append(score)
        ans["recall"].append(recall)
        ans["f1_score"].append(f1)

    print("Classification done!")

    # Print the results
    results = {"accuracy": round(np.median(ans["accuracy"]), 2), "recall": round(np.median(ans["recall"]), 2), "f1_score": round(np.median(ans["f1_score"]), 2)}
    print(results)

In [None]:
knn_classifier('otsu')

In [None]:
knn_classifier('adaptive')

In [None]:
mlp_classifier('otsu')

In [None]:
mlp_classifier('adaptive')