In [1]:
import os
import torch
import pandas as pd

from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

In [None]:
class ImageDataFrameDataset(Dataset):
    def __init__(self, data_path):
        self.df = pd.read_csv(os.path.join(data_path, "metadata.csv"))
        self.target_col = 'diagnostic'
        
        self.img_size = (224, 224)

        # Define categorical column names explicitly
        self.bool_cats = ['smoke', 'drink', 'pesticide', 'skin_cancer_history', 'cancer_history','has_piped_water', 'has_sewage_system', 'itch', 'grew', 'hurt','bleed', 'elevation', 'biopsed', 'changed']
        self.string_cats = ['background_father', 'background_mother', 'gender', 'region']
        self.numeric_cols = ['age', 'diameter_1', 'diameter_2']
        # self.drop_cols = ['patient_id', 'lesion_id', 'img_id']

        # Combine all to one categorical column list (excluding image_path & target)
        self.cat_cols = self.bool_cats + self.string_cats

        self.encoder = OrdinalEncoder()
        self.x = self.encoder.fit_transform(self.df[self.cat_cols])
        self.x = pd.concat([self.df[self.numeric_cols], pd.DataFrame(self.x, columns=self.cat_cols)], axis=1)
        
        self.target_encoder = LabelEncoder()
        self.y = self.target_encoder.fit_transform(self.df[self.target_col])

        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        x = self.x.iloc[idx]
        image = Image.open(os.path.join(self.data_path, "imgs", row['img_id'])).convert('RGB').resize(self.img_size)
        y = self.y.iloc[idx]

        return x, image, y

In [None]:
data_path = os.path.join(os.getenv('HOME'), "data/PAD_UFES_20")
dataset = ImageDataFrameDataset(data_path)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


In [5]:
data_path = os.path.join(os.getenv('HOME'), "data/PAD-UFES-20")

In [4]:
data_path

'/home/wall/data/PAD_UFES_20'

In [6]:
df = pd.read_csv(os.path.join(data_path, "metadata.csv"))

In [10]:
df.columns

Index(['patient_id', 'lesion_id', 'smoke', 'drink', 'background_father',
       'background_mother', 'age', 'pesticide', 'gender',
       'skin_cancer_history', 'cancer_history', 'has_piped_water',
       'has_sewage_system', 'fitspatrick', 'region', 'diameter_1',
       'diameter_2', 'diagnostic', 'itch', 'grew', 'hurt', 'changed', 'bleed',
       'elevation', 'img_id', 'biopsed'],
      dtype='object')