In [1]:
cd /home/ubuntu/fedatk_unl_tj

/home/ubuntu/fedatk_unl_tj


In [2]:
import gc
import os
import pickle
import re

import numpy as np
import torch
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import Compose, ToTensor, Normalize


def batch_labels(targets):
    """Pack per-attribute label tensors into one integer code per sample.

    ``targets`` is a sequence of equally shaped tensors, one per attribute.
    They are stacked and transposed so each row holds one sample's values,
    the values are written side by side as a string, and that string is
    parsed as a base-2 number. The first tensor in ``targets`` therefore
    supplies the most significant bit.

    Returns:
        A list of Python ints, one per sample.
    """
    per_sample_rows = torch.stack(targets).T.tolist()

    codes = []
    for bits in per_sample_rows:
        bit_string = "".join(str(bit) for bit in bits)
        codes.append(int(bit_string, 2))
    return codes

def transform_target(x, required_labels):
    """Select the entries of ``x`` at the positions listed in ``required_labels``.

    Returns:
        A list with ``x[label]`` for each ``label``, in the order given.
    """
    selected = []
    for label in required_labels:
        selected.append(x[label])
    return selected


def save_numpy_batch(file_path, data, labels):
    """Pickle ``(data, labels)`` as a single tuple into ``file_path``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    payload = (data, labels)
    with open(file_path, "wb") as handle:
        pickle.dump(payload, handle)


def load_numpy_batches(file_dir):
    """Load every pickled batch in ``file_dir`` and concatenate them in batch order.

    Each file is expected to hold a ``(data, labels)`` tuple, the format
    written by ``save_numpy_batch``.

    Files are visited in natural (numeric-aware) filename order, so
    ``train_batch_2.pkl`` comes before ``train_batch_10.pkl``. A plain
    lexicographic sort would visit them as 0, 1, 10, 11, ..., 2, ... and the
    concatenated rows would no longer follow the order they were written in.
    Directory entries that are not regular files (for example a
    ``.ipynb_checkpoints`` folder) are skipped.

    Args:
        file_dir: Directory containing the batch files.

    Returns:
        ``(data, labels)``: the per-batch data and labels, each concatenated
        along axis 0 into one NumPy array.

    Raises:
        ValueError: If ``file_dir`` contains no batch files.
    """
    def natural_key(name):
        # re.split with a capturing group alternates text / digit-run pieces
        # (odd positions are the digit runs), so keys always compare
        # str-with-str and int-with-int.
        pieces = re.split(r"(\d+)", name)
        return [int(piece) if pos % 2 else piece for pos, piece in enumerate(pieces)]

    data_list, label_list = [], []
    for file_name in sorted(os.listdir(file_dir), key=natural_key):
        file_path = os.path.join(file_dir, file_name)
        if not os.path.isfile(file_path):
            continue
        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only point this at directories produced by this notebook.
        with open(file_path, "rb") as f:
            data, labels = pickle.load(f)
        data_list.append(data)
        label_list.append(labels)

    if not data_list:
        raise ValueError(f"No batch files found in {file_dir!r}")

    return np.concatenate(data_list, axis=0), np.concatenate(label_list, axis=0)


def _export_celeba_split(celeba_path, split, indices, transform, required_labels,
                         out_dir, batch_size):
    """Write the selected samples of one CelebA split to ``out_dir`` as pickled batches."""
    dataset = datasets.CelebA(
        root=celeba_path,
        split=split,
        download=False,
        transform=transform,
        # Keep only the requested attribute entries of each target.
        target_transform=lambda x: transform_target(x, required_labels=required_labels),
    )
    subset = torch.utils.data.Subset(dataset, indices)
    # shuffle=False keeps the samples in the order given by `indices`.
    loader = DataLoader(subset, batch_size=batch_size, shuffle=False)

    os.makedirs(out_dir, exist_ok=True)

    # Remove batch files left by an earlier run. A previous run with a smaller
    # batch_size writes more files than this one will overwrite, and the
    # leftovers would otherwise be merged into the combined result.
    prefix = f"{split}_batch_"
    for name in os.listdir(out_dir):
        if name.startswith(prefix) and name.endswith(".pkl"):
            os.remove(os.path.join(out_dir, name))

    total = len(loader)
    for batch_idx, (data, targets) in enumerate(loader):
        print(f"Saving {split} batch {batch_idx + 1}/{total}")
        save_numpy_batch(
            os.path.join(out_dir, f"{prefix}{batch_idx}.pkl"),
            data.half().numpy(),  # stored as float16
            batch_labels(targets),
        )
        # Release this batch before the loader produces the next one.
        del data, targets
        gc.collect()


def get_celeba_new(batch_size=2048):
    """Preprocess CelebA into cached float16 batches and return the combined tensors.

    The train and test splits are read from ``data/celeba/raw_data``,
    restricted to the sample indices stored in ``data/celeba/train_idx.npy``
    and ``data/celeba/test_idx.npy``, and written batch by batch to
    ``celeba_pickle/train_batches`` and ``celeba_pickle/test_batches``.
    Existing ``<split>_batch_*.pkl`` files in those directories are deleted
    first. The batches are then read back and concatenated, train first.

    Each target is reduced to four attribute entries and packed into a single
    integer by ``batch_labels``.

    Args:
        batch_size: Number of samples per saved batch file.

    Returns:
        ``(x, y)``: ``x`` is the float16 image tensor and ``y`` the tensor of
        packed integer labels, train samples followed by test samples.

    Raises:
        AssertionError: If the raw CelebA directory does not exist.
    """
    celeba_path = os.path.join("data", "celeba", "raw_data")
    assert os.path.isdir(celeba_path), "Download CelebA dataset!"

    # NOTE(review): these mean/std values look like the commonly quoted CIFAR-10
    # channel statistics rather than CelebA-specific ones - confirm intentional.
    transform = Compose([
        ToTensor(),
        Normalize(
            (0.4914, 0.4822, 0.4465),
            (0.2023, 0.1994, 0.2010)
        )
    ])

    # NOTE(review): presumably Smiling, Male, Eyeglasses, Wearing_Hat in
    # torchvision's attribute order - verify against the dataset's attr_names.
    required_labels = [31, 20, 15, 35]

    # Load train and test indices
    train_idx = np.load('data/celeba/train_idx.npy', allow_pickle=True)
    test_idx = np.load('data/celeba/test_idx.npy', allow_pickle=True)

    train_dir = "celeba_pickle/train_batches"
    test_dir = "celeba_pickle/test_batches"

    print("Processing train set...")
    _export_celeba_split(celeba_path, "train", train_idx, transform,
                         required_labels, train_dir, batch_size)

    print("Processing test set...")
    _export_celeba_split(celeba_path, "test", test_idx, transform,
                         required_labels, test_dir, batch_size)

    # Load and combine all batches
    print("Combining train batches...")
    x_train, y_train = load_numpy_batches(train_dir)
    print("Combining test batches...")
    x_test, y_test = load_numpy_batches(test_dir)

    # Convert to PyTorch tensors
    x = torch.from_numpy(np.concatenate([x_train, x_test], axis=0))
    y = torch.from_numpy(np.concatenate([y_train, y_test], axis=0))

    return x, y


In [None]:
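# One-off preprocessing: uncomment to rebuild the cached batches under celeba_pickle/
# from the raw dataset. get_celeba() only reads that cache.
# x, y = get_celeba_new()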

In [4]:
def get_celeba(pickle_dir="celeba_pickle"):
    """Rebuild the combined CelebA tensors from the cached batch files.

    Reads the pickled batches under ``<pickle_dir>/train_batches`` and
    ``<pickle_dir>/test_batches`` with ``load_numpy_batches`` and
    concatenates them, train samples first.

    Args:
        pickle_dir: Directory holding the ``train_batches`` and
            ``test_batches`` folders. Defaults to ``"celeba_pickle"``.

    Returns:
        ``(x, y)``: tensors built from the concatenated data and label arrays.
    """
    print("Combining train batches...")
    x_train, y_train = load_numpy_batches(os.path.join(pickle_dir, "train_batches"))

    print("Combining test batches...")
    x_test, y_test = load_numpy_batches(os.path.join(pickle_dir, "test_batches"))

    # Convert to PyTorch tensors
    x = torch.from_numpy(np.concatenate([x_train, x_test], axis=0))
    y = torch.from_numpy(np.concatenate([y_train, y_test], axis=0))

    return x, y

Combining train batches...
Combining test batches...


In [6]:
# No cell shown above assigns `x` or `y`, so on a fresh kernel (Restart & Run All)
# the shape checks would raise NameError. Load the cached dataset here.
x, y = get_celeba()
x.shape

torch.Size([182732, 3, 55, 45])

In [7]:
# `y` is expected to hold one packed label per image, so its length should
# equal x.shape[0].
y.size()

torch.Size([182732])

In [8]:
# Ask PyTorch to release its cached CUDA memory.
# NOTE(review): none of the cells shown here move tensors to the GPU, so this
# presumably only matters for cells outside this view - confirm it is needed.
torch.cuda.empty_cache()