In [1]:
import sys
import os
import pandas as pd

# Path to the src directory that is added to the import search path below.
# NOTE(review): absolute, user-specific path; the notebook only runs on a
# machine where this directory exists. Consider making it configurable.
src_path = '/cluster/home/taheeraa/code/master-thesis/01-multi-label/src'

# Guarded so that re-running this cell does not append duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)
    
# Project-local import, placed after the sys.path update, which presumably is
# what makes the `utils` package resolvable (confirm).
from utils.handle_class_imbalance import generate_class_weights

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Root folder of the ChestX-ray14 dataset.
data_path = "/cluster/home/taheeraa/datasets/chestxray-14"

# Label names; they are assigned, in this order, to the label columns of the
# split files via `columns` below.
labels = [
    "Atelectasis", "Cardiomegaly", "Effusion", "Infiltration",
    "Mass", "Nodule", "Pneumonia", "Pneumothorax",
    "Consolidation", "Edema", "Emphysema", "Fibrosis",
    "Pleural_Thickening", "Hernia",
]

# Locations of the train / validation / test split listings.
file_path_train = f"{data_path}/train_official.txt"
file_path_val = f"{data_path}/val_official.txt"
file_path_test = f"{data_path}/test_official.txt"

# Column names for the split files: the image filename followed by one column per label.
columns = ['Image Filename', *labels]

# Read the three split listings. Fields are separated by runs of whitespace and
# column names are supplied explicitly through `names`.
# The separator is a raw string: in a normal string literal '\s' is an invalid
# escape sequence, which newer Python versions warn about.
df_train = pd.read_csv(file_path_train, sep=r'\s+', names=columns)
df_val = pd.read_csv(file_path_val, sep=r'\s+', names=columns)
df_test = pd.read_csv(file_path_test, sep=r'\s+', names=columns)

# Index every file found under 'images_001/images' ... 'images_012/images'
# (relative to data_path) as filename -> full path.
subfolders = ["images_{:03}/images".format(idx) for idx in range(1, 13)]
path_mapping = {}
for subfolder in subfolders:
    full_folder_path = os.path.join(data_path, subfolder)
    # If the same filename appears in several folders, the later folder wins.
    path_mapping.update(
        (img_name, os.path.join(full_folder_path, img_name))
        for img_name in os.listdir(full_folder_path)
    )

def _with_full_image_path(df, mapping):
    """Return a copy of `df` with 'Image Filename' replaced by 'Full Image Path'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'Image Filename' column.
    mapping : dict
        Maps an image filename to its full path on disk.

    Returns
    -------
    pandas.DataFrame
        A new frame whose first column is 'Full Image Path', followed by the
        remaining columns of `df` in their original order; 'Image Filename'
        is removed.

    Notes
    -----
    Filenames absent from `mapping` get NaN as their path. That is kept as-is,
    but a warning is emitted so the gap is visible here rather than later.
    """
    import warnings

    full_paths = df['Image Filename'].map(mapping)

    n_missing = int(full_paths.isna().sum())
    if n_missing:
        warnings.warn(
            f"{n_missing} image filename(s) have no matching file on disk; "
            "their 'Full Image Path' is NaN."
        )

    result = df.drop(columns=['Image Filename'])
    # Insert at position 0 so the path is the leading column.
    result.insert(0, 'Full Image Path', full_paths)
    return result


# Apply the same transformation to every split.
df_train = _with_full_image_path(df_train, path_mapping)
df_val = _with_full_image_path(df_val, path_mapping)
df_test = _with_full_image_path(df_test, path_mapping)

In [10]:
# Display the training frame's column index to check the resulting layout.
df_train.columns

Index(['Full Image Path', 'Atelectasis', 'Cardiomegaly', 'Effusion',
       'Infiltration', 'Mass', 'Nodule', 'Pneumonia', 'Pneumothorax',
       'Consolidation', 'Edema', 'Emphysema', 'Fibrosis', 'Pleural_Thickening',
       'Hernia'],
      dtype='object')

In [17]:
# Input for the class-weight computation: every column of the training frame
# except the image path, converted to a NumPy array.
df_train_calculate_weights = df_train.drop('Full Image Path', axis='columns').to_numpy()

In [21]:
# Define `class_weights` in this cell so the display below works on a fresh
# kernel (Restart & Run All) instead of relying on a name left over from an
# earlier session. Same call as the one that builds `class_weights_dict`.
class_weights = generate_class_weights(df_train_calculate_weights, multi_class=False, one_hot_encoded=True)
class_weights

{0: 0.7466243679984138,
 1: 3.583896449985724,
 2: 0.7032852100181163,
 3: 0.446944879646774,
 4: 1.5051562874730193,
 5: 1.3249824067558058,
 6: 6.986270871985158,
 7: 2.397249809014515,
 8: 2.1691244239631335,
 9: 4.445808736717828,
 10: 4.303542857142857,
 11: 4.967154728927582,
 12: 2.791607976870042,
 13: 42.69387755102041}

In [31]:
# Weights returned by the project helper, keyed by integer class index.
class_weights_dict = generate_class_weights(
    df_train_calculate_weights,
    multi_class=False,
    one_hot_encoded=True,
)

# Same weights re-keyed by label name (index k -> labels[k]) for readability.
label_weights_dict = {name: class_weights_dict[pos] for pos, name in enumerate(labels)}
label_weights_dict

{'Atelectasis': 0.7466243679984138,
 'Cardiomegaly': 3.583896449985724,
 'Effusion': 0.7032852100181163,
 'Infiltration': 0.446944879646774,
 'Mass': 1.5051562874730193,
 'Nodule': 1.3249824067558058,
 'Pneumonia': 6.986270871985158,
 'Pneumothorax': 2.397249809014515,
 'Consolidation': 2.1691244239631335,
 'Edema': 4.445808736717828,
 'Emphysema': 4.303542857142857,
 'Fibrosis': 4.967154728927582,
 'Pleural_Thickening': 2.791607976870042,
 'Hernia': 42.69387755102041}

In [29]:
# Imported here because this cell is the first one to use torch; without it the
# cell raises NameError when the notebook is run top to bottom on a fresh kernel.
import torch

# Order the weights by class index explicitly, so that position k of the tensor
# corresponds to class k regardless of the dict's insertion order.
class_weights_list = [class_weights_dict[idx] for idx in sorted(class_weights_dict)]
class_weights_tensor = torch.tensor(class_weights_list, dtype=torch.float32)

In [30]:
import torch

# Binary cross-entropy on logits, with the per-class weights passed as `weight`.
# NOTE(review): `weight` presumably rescales the whole loss term of each class
# (positive and negative targets alike); if the intent is to up-weight only the
# positive examples of rare classes, `pos_weight` is the argument for that.
# Confirm against the PyTorch documentation and the intended behaviour.
criterion = torch.nn.BCEWithLogitsLoss(weight=class_weights_tensor)