In [80]:
%%bash
#pip -q install pydicom opencv-python scikit-image pyradiomics

wget -q http://www.inf.ufpr.br/lferrari/imagens_ihq.tar.gz && tar -xf imagens_ihq.tar.gz

In [2]:
import cv2 as cv
import numpy as np
import os
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from joblib import dump, load
import pydicom as dicom
import radiomics
from radiomics import featureextractor
import SimpleITK as sitk
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt

In [77]:
def cut_images(input_path, new_width, new_height, output_path=None):
    """
    Cut images into the desired size and save the output images

    Params:
    input_path = path to the original images
    output_path = path to save the cut images
    new_width = width of the cut images
    new_height = height of the cut images

    Return:
    None
    """

    images_data = {}
    classes = []
    patients = []

    n = 0
    # Browse input path
    for class_dir in os.listdir(input_path):
        class_path = os.path.join(input_path, class_dir)

        # If it is a directory
        if os.path.isdir(class_path):

            # Save image id
            image_id = 1

            # Go through images
            for image_file in os.listdir(class_path):
                image_path = os.path.join(class_path, image_file)

                # Save patient id
                patient = image_file.split("_")[0]
                patients.append(patient)

                image = cv.imread(image_path, cv.IMREAD_GRAYSCALE)

                # If image exists
                if image is not None:

                    # Save subimage id
                    sub_id = 1

                    for i in range(0, image.shape[0], new_height):
                        for j in range(0, image.shape[1], new_width):

                            # Cut image into subimage
                            sub_image = image[i:i+new_height, j:j+new_width]

                            # Image name identifier
                            image_name = f"{patient}-img{image_id}-{sub_id}"

                            # Append image and its label to the dictionary
                            if image_name in images_data:
                                print(f'repetiu {image_name}{class_dir}')

                            images_data[image_name] = sub_image

                            # images_names.append(image_name)
                            # images.append(sub_image)

                            classes.append(int(class_dir))

                            # Write subimage if an output path was given
                            if output_path != None:
                                # Create dir if it doesn't exist
                                os.makedirs(os.path.join(output_path, class_dir), exist_ok=True)
                                # Output file path
                                output_file = f"{image_name}.png"
                                output_file = os.path.join(output_path, class_dir, output_file)
                                # Save subimage
                                cv.imwrite(output_file, sub_image)

                            sub_id += 1
                            n += 1

                image_id += 1

    print(n)

    return (images_data, patients, classes)

In [4]:
def divide_folds(images, patients, classes):
    """
    Divides a dataset into folds for stratified k-fold cross-validation.

    Params:
    images = list of all images
    patients = list with patient id for each image
    classes = list with class for each image

    Return:
    folds = list of tuples, each tuple is one folder
    """
    # Create a list of unique indexes for patients
    unique_patients = list(set(patients))

    # Shuffle the list of unique indexes
    random.shuffle(unique_patients)

    # Divide patients into groups
    n_folds = 4 # since it's not an exact division, there will be 5 folds
    fold_size = len(unique_patients) // n_folds
    patients_folds = [unique_patients[i:i+fold_size] for i in range(0, len(unique_patients), fold_size)]

    # List to save folds
    folds = []

    # Divide images into folds based on patients
    for i, patients_folds in enumerate(patients_folds):
        train_patients = [p for p in unique_patients if p not in patients_folds]
        test_patients = patients_folds

        train_indices = [i for i, patient in enumerate(patients) if patient in train_patients]
        test_indices = [i for i, patient in enumerate(patients) if patient in test_patients]

        train_images = [images[i] for i in train_indices]
        test_images = [images[i] for i in test_indices]
        train_classes = [classes[i] for i in train_indices]
        test_classes = [classes[i] for i in test_indices]

        folds.append((train_images, test_images, train_classes, test_classes))

    return folds


In [5]:
def apply_thresholds(imgs):
    """
    Apply Otsu's and Adaptative thresholds to images

    Params:
    imgs = list of raw images

    Return:
    imgs_otsu = Otsu's thresholded images
    imgs_adapt = Adaptative thresholded images
    """

    for key, value in images_data.items():

        # Otsu's thresholding
        _, th1 = cv.threshold(value, 100, 1, cv.THRESH_BINARY+cv.THRESH_OTSU)

        # Adaptative gaussian thresholding
        #th2 = cv.adaptiveThreshold(img,255,cv.ADAPTIVE_THRESH_MEAN_C, cv.THRESH_BINARY,11,2)
        th3 = cv.adaptiveThreshold(value, 1, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 11, 2)

        images_data[key] = (value, th1, th3)

    return images_data

In [81]:
# Cut images into 40x30
images_data, patients, classes = cut_images("/content/imagens_ihq_er", 40, 30)

# # Divide images into folds
# images_names = list(images_data.keys())
# folds = divide_folds(images_names, patients, classes)

# # Update images_data, applying Otsu's and adaptive thresholds
# images_data = apply_thresholds(images_data)



repetiu 363RE-img45-1
repetiu 363RE-img45-2
repetiu 363RE-img45-3
repetiu 363RE-img45-4
repetiu 363RE-img45-5
repetiu 363RE-img45-6
repetiu 363RE-img45-7
repetiu 363RE-img45-8
repetiu 363RE-img45-9
repetiu 363RE-img45-10
repetiu 363RE-img45-11
repetiu 363RE-img45-12
repetiu 363RE-img45-13
repetiu 363RE-img45-14
repetiu 363RE-img45-15
repetiu 363RE-img45-16
repetiu 363RE-img45-17
repetiu 363RE-img45-18
repetiu 363RE-img45-19
repetiu 363RE-img45-20
repetiu 363RE-img45-21
repetiu 363RE-img45-22
repetiu 363RE-img45-23
repetiu 363RE-img45-24
repetiu 363RE-img45-25
repetiu 363RE-img45-26
repetiu 363RE-img45-27
repetiu 363RE-img45-28
repetiu 363RE-img45-29
repetiu 363RE-img45-30
repetiu 363RE-img45-31
repetiu 363RE-img45-32
repetiu 363RE-img45-33
repetiu 363RE-img45-34
repetiu 363RE-img45-35
repetiu 363RE-img45-36
repetiu 363RE-img45-37
repetiu 363RE-img45-38
repetiu 363RE-img45-39
repetiu 363RE-img45-40
repetiu 363RE-img45-41
repetiu 363RE-img45-42
repetiu 363RE-img45-43
repetiu 363RE-img45-

In [71]:
len(images_data)

39900

In [58]:
import itertools

lista = list(images_data.keys())
primeira_metade = lista[:len(lista)/2]


TypeError: ignored

In [56]:
itertools.islice(images_data.items(), 2)

<itertools.islice at 0x7aeef7356160>

In [52]:
ff = [['638RE-img1-1', '638RE-img1-2'], ['638RE-img1-3', '638RE-img1-6'], [1, 2], [1, 2]]

Extract features with PyRadiomics

In [28]:
def run_extractor(images_data, extractor):
    """
    Extract features using sitk and pyradiomics

    Params:
    imgs = raw images
    otsu = masked images with otsu thresholding
    adapt = masked images with adaptative thresholding
    extractor = pyradiomics extractor

    Returns:
    features_otsu = features for otsu mask
    features_adapt = features for adaptative mask
    """

    data_spacing=[1,1,1]
    features_otsu = {}
    features_adapt = {}

    for key, value in images_data.items():

        # Get raw, Otsu's and adaptive images
        img = value[0]
        img_otsu = value[1]
        img_adapt = value[2]

        sitk_img = sitk.GetImageFromArray(img)
        sitk_img.SetSpacing((1, 1, 1))
        sitk_img = sitk.JoinSeries(sitk_img)

        sitk_otsu = sitk.GetImageFromArray(img_otsu)
        sitk_otsu.SetSpacing((1, 1, 1))
        sitk_otsu = sitk.JoinSeries(sitk_otsu)
        sitk_otsu = sitk.Cast(sitk_otsu, sitk.sitkInt32)

        sitk_adapt = sitk.GetImageFromArray(img_adapt)
        sitk_adapt.SetSpacing((1, 1, 1))
        sitk_adapt = sitk.JoinSeries(sitk_adapt)
        sitk_adapt = sitk.Cast(sitk_otsu, sitk.sitkInt32)

        # Extract features and append them to the proper list
        try:
            ft_otsu = extractor.execute(sitk_img, sitk_otsu)
            features_otsu[key] = ft_otsu

            ft_adapt = extractor.execute(sitk_img, sitk_adapt)
            features_adapt[key] = ft_adapt

        except:
            print(f"{key}, ", end="")
            pass

    return (features_otsu, features_adapt)



In [19]:
def conditional_append(element, dest):
    """
    Append element to the list destiny, if element is not in destiny

    Params:
    element = an element of any kind
    dest = a destination list

    Returns:
    destiny = list with appended element if the element was not in there
    """
    if element not in dest:
        dest.append(element)

    return dest

def process_features(feats_o, feats_a):
    """
    Process features, in a way that:
    - features that are dictionaries and strings are removed
    - features that are tuples are separated and each element
    of the tuple is considered one feature
    - other types are converted to float

    Params:
    feats_o = list of Otsu's threshold features
    feats_a = list of adaptativa threshold features

    Returns:
    all_feats_o = Otsu's features processed
    all_feats_a = adaptative features processed
    names = feature names processed
    """

    all_feats_o = {}
    all_feats_a = {}
    names = []

    # For each image in one of the features list
    for key in feats_o:

        # Get features for Otsu's and adaptive for this sample
        sample_o = feats_o[key]
        sample_a = feats_a[key]

        values_o = []
        values_a = []

        # For each feature in the list
        for ft in sample_o:

            # Get the feature's value
            value_o = sample_o[ft]
            value_a = sample_a[ft]

            # If the value is str or dict, ignore it
            if type(value_o) == str or type(value_o) == dict:
                continue
            # If it's a tuple
            elif type(value_o) == tuple:
                for e in range(len(value_o)):
                    # Add and index to the feature name
                    conditional_append(f'{ft}_{e}', names)
                    # Append float values to the lists
                    values_o.append(float(value_o[e]))
                    values_a.append(float(value_a[e]))
            # For other data types, just append the name and float values
            else:
                conditional_append(ft, names)
                values_o.append(float(value_o))
                values_a.append(float(value_a))

        # Append processed features to the general list
        all_feats_o[key] = values_o
        all_feats_a[key] = values_a

    return (all_feats_o, all_feats_a, names)

def extract_features(images_data):
    """
    Process features, in a way that:
    - features that are dictionaries and strings are removed
    - features that are tuples are separated and each element
    of the tuple is considered one feature
    - other types are converted to float
    Get the features' names, with tuple features indexed

    Params:
    folds = list of tuples containing x_train raw and with thresholds

    Returns:
    all_folds_feats = dictionary containing Otsu's features and adaptive
    features for each fold
    names = feature names
    """

    # Create feature extractor
    !wget -c https://raw.githubusercontent.com/AIM-Harvard/pyradiomics/master/examples/exampleSettings/Params.yaml
    params = 'Params.yaml'
    settings = {'label': 1, 'correctMask': True}
    extractor = featureextractor.RadiomicsFeatureExtractor(params, additionalInfo=True, **settings)

    # Extract features from Otsu's and adaptative
    feats_o, feats_a = run_extractor(images_data, extractor)

    # Process features and get feature names
    all_feats_o, all_feats_a, names = process_features(feats_o, feats_a)

    # Save features in the dictionary
    # keys = list(images_data.keys())
    # for i in range(len(images_data)):
    #     key = key[i]
    #     value = images_data[key]
    #     images_data[key] = (value[0], value[1], value[2], all_feats_o[i], all_feats_a[i])

    return (all_feats_o, all_feats_a, names)

In [29]:
all_feats_o, all_feats_a, names = extract_features(blu)

--2023-11-23 23:47:24--  https://raw.githubusercontent.com/AIM-Harvard/pyradiomics/master/examples/exampleSettings/Params.yaml
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 416 Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



INFO:radiomics.featureextractor:Loading parameter file Params.yaml
INFO:radiomics.featureextractor:Applying custom setting overrides: {'additionalInfo': True, 'label': 1, 'correctMask': True}
INFO:radiomics.featureextractor:Calculating features with label: 1
INFO:radiomics.featureextractor:Loading image and mask
INFO:radiomics.featureextractor:Computing shape
INFO:radiomics.featureextractor:Adding image type "Original" with custom settings: {}
INFO:radiomics.featureextractor:Calculating features for original image
INFO:radiomics.featureextractor:Computing firstorder
INFO:radiomics.featureextractor:Computing glcm
INFO:radiomics.featureextractor:Computing glrlm
INFO:radiomics.featureextractor:Computing glszm
INFO:radiomics.featureextractor:Computing gldm
INFO:radiomics.featureextractor:Calculating features with label: 1
INFO:radiomics.featureextractor:Loading image and mask
INFO:radiomics.featureextractor:Computing shape
INFO:radiomics.featureextractor:Adding image type "Original" with c

638RE-img1-4, 638RE-img1-5, 

In [49]:
def save_features(all_feats_o, all_feats_a, names):

    out_o = 'features_o/'
    out_a = 'features_a/'

    os.makedirs(out_o, exist_ok=True)
    os.makedirs(out_a, exist_ok=True)

    for key in all_feats_o:

        ft_o = all_feats_o[key]
        ft_a = all_feats_a[key]

        filename_o = f'{key}_o.txt'
        filename_a = f'{key}_a.txt'

        with open(os.path.join(out_o, filename_o), 'w') as f:
            for elem in ft_o:
                f.write(f'{elem}\n')

        with open(os.path.join(out_a, filename_a), 'w') as f:
            for elem in ft_a:
                f.write(f'{elem}\n')

    with open('ft_names.txt', 'w') as f:
        f.write('\n'.join(names))

In [50]:
save_features(all_feats_o, all_feats_a, names)

In [44]:
def read_features(data):

    features_o = []
    features_a = []

    out_o = 'features_o/'
    out_a = 'features_a/'

    for image_name in data:

        filename_o = f'{image_name}_o.txt'
        filename_a = f'{image_name}_a.txt'

        ft_o = []
        # open file and read the content in a list
        with open(os.path.join(out_o, filename_o), 'r') as f:
            for line in f:
                # remove linebreak from a current name
                # linebreak is the last character of each line
                x = line[:-1]

                # add current item to the list
                ft_o.append(float(x))

        ft_a = []
        # open file and read the content in a list
        with open(os.path.join(out_a, filename_a), 'r') as f:
            for line in f:
                # remove linebreak from a current name
                # linebreak is the last character of each line
                x = line[:-1]

                # add current item to the list
                ft_a.append(float(x))

        features_o.append(ft_o)
        features_a.append(ft_a)

    return (features_o, features_a)

def create_dfs(fold, names):

    x_train = fold[0]
    x_test = fold[1]
    y_train = fold[2]
    y_test = fold[3]

    # ft_o_train, ft_a_train = read_features(x_train)
    # ft_o_test, ft_a_test = read_features(x_test)

    # df_o_train = pd.DataFrame(ft_o_train, columns = names, index = x_train)
    # df_a_train = pd.DataFrame(ft_a_train, columns = names, index = x_train)
    # df_o_test = pd.DataFrame(ft_o_test, columns = names, index = x_test)
    # df_a_test = pd.DataFrame(ft_a_test, columns = names, index = x_test)

    # return  (df_o_train, df_a_train, df_o_test, df_a_test)




In [53]:
df_o_train, df_a_train, df_o_test, df_a_test = create_dfs(ff, names)

In [54]:
df_o_train

Unnamed: 0,diagnostics_Image-original_Spacing_0,diagnostics_Image-original_Spacing_1,diagnostics_Image-original_Spacing_2,diagnostics_Image-original_Size_0,diagnostics_Image-original_Size_1,diagnostics_Image-original_Size_2,diagnostics_Image-original_Mean,diagnostics_Image-original_Minimum,diagnostics_Image-original_Maximum,diagnostics_Mask-original_Spacing_0,...,original_gldm_GrayLevelNonUniformity,original_gldm_GrayLevelVariance,original_gldm_HighGrayLevelEmphasis,original_gldm_LargeDependenceEmphasis,original_gldm_LargeDependenceHighGrayLevelEmphasis,original_gldm_LargeDependenceLowGrayLevelEmphasis,original_gldm_LowGrayLevelEmphasis,original_gldm_SmallDependenceEmphasis,original_gldm_SmallDependenceHighGrayLevelEmphasis,original_gldm_SmallDependenceLowGrayLevelEmphasis
638RE-img1-1,1.0,1.0,1.0,40.0,30.0,1.0,244.448333,221.0,255.0,1.0,...,428.505251,0.249997,2.494749,57.408401,151.066511,33.993874,0.626313,0.026167,0.062094,0.017185
638RE-img1-2,1.0,1.0,1.0,40.0,30.0,1.0,242.6975,185.0,255.0,1.0,...,505.500519,0.237539,2.834891,66.775701,196.872274,34.251558,0.541277,0.019117,0.052406,0.010795
