### This code was taken from Assignment 1 (Week 1) of the Coursera "AI for Medicine" Specialization

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import class_weight
from tensorflow.keras.preprocessing.image import ImageDataGenerator

2021-11-18 12:09:48.301620: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
def compute_class_freqs(labels):
    """
    Compute positive and negative frequencies for each class.

    A label counts as positive only when it equals 1 and as negative only when
    it equals 0. Any other value (for example an uncertainty marker such as -1)
    is counted in neither group, so the two frequencies of a class add up to 1
    only when every label of that class is 0 or 1.

    Args:
        labels (np.array): matrix of labels, size (num_examples, num_classes).
            Array-likes such as nested lists are accepted as well.
    Returns:
        positive_frequencies (np.array): array of positive frequencies for each
                                         class, size (num_classes)
        negative_frequencies (np.array): array of negative frequencies for each
                                         class, size (num_classes)
    Raises:
        ValueError: if labels contains no examples, since the frequencies
                    would be undefined (0 / 0).
    """
    # Accept lists / tuples in addition to arrays; no copy is made for arrays.
    labels = np.asarray(labels)

    # total number of examples (rows).
    N = labels.shape[0]
    if N == 0:
        raise ValueError("labels must contain at least one example")

    # Count explicit matches on both sides so that positives and negatives are
    # treated symmetrically; summing the raw values would let any label other
    # than 0/1 distort the positive frequency.
    positive_frequencies = np.sum(labels == 1, axis=0) / N
    negative_frequencies = np.sum(labels == 0, axis=0) / N

    return positive_frequencies, negative_frequencies

### 1- All 14 labels - Only the train set

In [3]:
# Names of the 14 label columns read from the training CSV.
label_names = (
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Pleural Effusion",
    "Pleural Other",
    "Pneumonia",
    "Pneumothorax",
    "Enlarged Cardiomediastinum",
    "Lung Opacity",
    "Lung Lesion",
    "Fracture",
    "Support Devices",
    "No Finding",
)

# Setting labels to type np.int32 was necessary for conversion to tf.Tensor object;
# the image path column stays a plain string.
column_dtypes = {"Path": str}
column_dtypes.update({name: np.int32 for name in label_names})

train_df = pd.read_csv(
    filepath_or_buffer="../../labels/train_validation_split_data/train_u-zeroes.csv",
    dtype=column_dtypes,
)

In [4]:
# The first column is 'Path'; every column after it is used as a label column.
list_columns = train_df.columns.tolist()
y_cols = list_columns[1:]

In [5]:
# Generator created without arguments, i.e. with the library's default settings.
train_datagen = ImageDataGenerator()

In [6]:
# Build a dataframe-backed iterator over the training set. In the cells shown
# here it is only used for its `.labels` attribute, so the image settings
# (target_size, color_mode, batch_size) should not affect the computed
# frequencies.
train_datagenerator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory="../../dataset/",
    x_col='Path',
    y_col=y_cols,
    weight_col=None,
    target_size=(512, 512),
    color_mode='grayscale',
    class_mode='raw',  # label values are taken from the y_col columns (see the int32 matrix printed below)
    batch_size=16,
    shuffle=True,  # row order has no effect on the column-wise sums computed later
    # NOTE(review): presumably rows whose image file is missing are dropped, so
    # `.labels` may contain fewer rows than train_df — confirm against the
    # "Found N validated image filenames" message.
    validate_filenames=True
)

Found 178731 validated image filenames.


In [7]:
# Inspect the label matrix held by the generator (built from the y_col columns).
train_datagenerator.labels

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [8]:
# Per-class positive and negative frequencies over the generator's label matrix.
freq_pos, freq_neg = compute_class_freqs(train_datagenerator.labels)

In [9]:
# Display the positive frequency of each label column.
freq_pos

array([0.14991803, 0.12124925, 0.06616088, 0.23491728, 0.38565218,
       0.0156716 , 0.02695671, 0.08675607, 0.04840235, 0.47166972,
       0.04108409, 0.04054697, 0.51889152, 0.09955744])

In [10]:
# Display the negative frequency of each label column.
freq_neg

array([0.85008197, 0.87875075, 0.93383912, 0.76508272, 0.61434782,
       0.9843284 , 0.97304329, 0.91324393, 0.95159765, 0.52833028,
       0.95891591, 0.95945303, 0.48110848, 0.90044256])

In [11]:
# Cross-assign the frequencies: the weight of the positive term of a class is
# its negative frequency and vice versa, so that
# freq_pos * pos_weights == freq_neg * neg_weights for every class.
pos_weights, neg_weights = freq_neg, freq_pos

In [12]:
# Display the positive-class weights (identical to freq_neg).
pos_weights

array([0.85008197, 0.87875075, 0.93383912, 0.76508272, 0.61434782,
       0.9843284 , 0.97304329, 0.91324393, 0.95159765, 0.52833028,
       0.95891591, 0.95945303, 0.48110848, 0.90044256])

In [13]:
# Display the negative-class weights (identical to freq_pos).
neg_weights

array([0.14991803, 0.12124925, 0.06616088, 0.23491728, 0.38565218,
       0.0156716 , 0.02695671, 0.08675607, 0.04840235, 0.47166972,
       0.04108409, 0.04054697, 0.51889152, 0.09955744])

In [14]:
# Persist both weight vectors as .npy files; the relative file names mean they
# are written to the current working directory.
np.save("chexpert_positive_weights_full_labels.npy", pos_weights)
np.save("chexpert_negative_weights_full_labels.npy", neg_weights)

### 2- Only the 5 CheXpert competition labels - Only the train set

In [15]:
# Names of the 5 label columns read from the CheXpert-competition training CSV.
label_names = (
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Pleural Effusion",
)

# Setting labels to type np.int32 was necessary for conversion to tf.Tensor object;
# the image path column stays a plain string.
column_dtypes = {"Path": str}
column_dtypes.update({name: np.int32 for name in label_names})

train_df = pd.read_csv(
    filepath_or_buffer="../../labels/train_validation_split_data/train_u-zeroes_chexpert.csv",
    dtype=column_dtypes,
)

In [16]:
# The first column is 'Path'; every column after it is used as a label column.
list_columns = train_df.columns.tolist()
y_cols = list_columns[1:]

In [17]:
# Generator created without arguments, i.e. with the library's default settings.
train_datagen = ImageDataGenerator()

In [18]:
# Build a dataframe-backed iterator over the 5-label training set. In the cells
# shown here it is only used for its `.labels` attribute, so the image settings
# (target_size, color_mode, batch_size) should not affect the computed
# frequencies.
train_datagenerator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory="../../dataset/",
    x_col='Path',
    y_col=y_cols,
    weight_col=None,
    target_size=(512, 512),
    color_mode='grayscale',
    class_mode='raw',  # label values are taken from the y_col columns (see the int32 matrix printed below)
    batch_size=16,
    shuffle=True,  # row order has no effect on the column-wise sums computed later
    # NOTE(review): presumably rows whose image file is missing are dropped, so
    # `.labels` may contain fewer rows than train_df — confirm against the
    # "Found N validated image filenames" message.
    validate_filenames=True
)

Found 178731 validated image filenames.


In [19]:
# Inspect the label matrix held by the generator (built from the y_col columns).
train_datagenerator.labels

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1]], dtype=int32)

In [20]:
# Per-class positive and negative frequencies over the generator's label matrix.
freq_pos, freq_neg = compute_class_freqs(train_datagenerator.labels)

In [21]:
# Display the positive frequency of each label column.
freq_pos

array([0.14991803, 0.12124925, 0.06616088, 0.23491728, 0.38565218])

In [22]:
# Display the negative frequency of each label column.
freq_neg

array([0.85008197, 0.87875075, 0.93383912, 0.76508272, 0.61434782])

In [23]:
# Cross-assign the frequencies: the weight of the positive term of a class is
# its negative frequency and vice versa, so that
# freq_pos * pos_weights == freq_neg * neg_weights for every class.
pos_weights, neg_weights = freq_neg, freq_pos

In [24]:
# Display the positive-class weights (identical to freq_neg).
pos_weights

array([0.85008197, 0.87875075, 0.93383912, 0.76508272, 0.61434782])

In [25]:
# Display the negative-class weights (identical to freq_pos).
neg_weights

array([0.14991803, 0.12124925, 0.06616088, 0.23491728, 0.38565218])

In [26]:
# Persist both weight vectors as .npy files; the relative file names mean they
# are written to the current working directory.
np.save("chexpert_positive_weights_chexpert_labels.npy", pos_weights)
np.save("chexpert_negative_weights_chexpert_labels.npy", neg_weights)