In [1]:
# Switch the kernel's working directory to the project root; later cells use
# relative paths such as "data/celeba/raw_data".
cd /home/ubuntu/fedatk_unl_tj

/home/ubuntu/fedatk_unl_tj


In [2]:
import os
import pickle
from PIL import Image
import numpy as np
import gc

def load_indices(task_x_path):
    """
    Read the pickled train and test index collections of one task directory.

    Args:
        task_x_path: Directory that contains ``train.pkl`` and ``test.pkl``.

    Returns:
        A ``(train_indices, test_indices)`` tuple holding whatever objects
        the two pickle files contain.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # only point this at task files you trust.
    loaded = []
    for split_file in ('train.pkl', 'test.pkl'):
        with open(os.path.join(task_x_path, split_file), 'rb') as handle:
            loaded.append(pickle.load(handle))

    train_indices, test_indices = loaded
    return train_indices, test_indices

def create_placeholder_image(image_path):
    """
    Overwrite the file at image_path with a 1x1 black RGB image.

    The image is always encoded as JPEG, regardless of the extension that
    image_path carries. The previous file content is lost.
    """
    single_pixel = (1, 1)
    black = (0, 0, 0)
    Image.new("RGB", single_pixel, color=black).save(image_path, format="JPEG")

def generate_indices_to_keep(task_x_path, client_dirs, test_offset = 182637):
    """
    Gather the train and (offset) test indices of the given client tasks.

    Args:
        task_x_path: Directory that holds the ``task_<i>`` sub-directories.
        client_dirs: Iterable of task numbers ``i`` to visit.
        test_offset: Value added to every test index before it is stored.
            NOTE(review): 182637 equals 162770 + 19867, presumably the
            combined size of CelebA's train and validation partitions --
            TODO confirm.

    Returns:
        One flat list: for each existing task directory, in iteration order,
        its train indices followed by its shifted test indices. Nothing is
        de-duplicated, and task directories that do not exist are skipped
        silently.
    """
    kept = []

    for client_id in client_dirs:
        client_path = os.path.join(task_x_path, f'task_{client_id}')
        if not os.path.isdir(client_path):
            continue

        train_part, test_part = load_indices(client_path)
        kept.extend(train_part)
        # Shift test indices so they do not share the train index range
        kept.extend(idx + test_offset for idx in test_part)

    return kept

def process_images(data_dir, indices_to_keep):
    """
    Replace every CelebA image whose position is not kept with a placeholder.

    WARNING: this is destructive. Files under
    ``<data_dir>/celeba/img_align_celeba`` that are not kept are overwritten
    in place by ``create_placeholder_image``.

    Args:
        data_dir: Root directory that contains ``celeba/img_align_celeba``.
        indices_to_keep: Iterable of integer positions (into the
            alphabetically sorted directory listing) that must stay untouched.
            Duplicates are allowed.

    Returns:
        List of ``(position, file_name)`` tuples for the images that were
        kept, in ascending position order.

    Raises:
        AssertionError: If the image directory does not exist.
    """
    img_dir = os.path.join(data_dir, "celeba/img_align_celeba")
    assert os.path.isdir(img_dir), "Image directory not found!"

    # Build the lookup once: testing membership against the raw list would
    # rescan it for every file in the directory.
    keep = set(indices_to_keep)

    stored = []
    for i, img_file in enumerate(sorted(os.listdir(img_dir))):
        if i in keep:
            stored.append((i, img_file))
        else:
            create_placeholder_image(os.path.join(img_dir, img_file))
    return stored

def load_celeba_subset(data_dir, indices_to_keep):
    """
    Wrap the CelebA "train" split in a Subset limited to indices_to_keep.

    Placeholder replacement (``process_images``) is NOT performed here; the
    dataset is opened exactly as it exists on disk.

    Args:
        data_dir: Root directory handed to torchvision's ``CelebA``.
        indices_to_keep: Positions passed straight to ``Subset``.

    Returns:
        A ``torch.utils.data.Subset`` over the transformed CelebA dataset.

    NOTE(review): the split is fixed to "train", while ``main`` builds
    indices that include test indices shifted by a large offset -- confirm
    that all indices are valid positions in this split.
    """
    from torchvision.datasets import CelebA
    from torchvision.transforms import Compose, ToTensor, Normalize

    # NOTE(review): these values look like the widely used CIFAR-10 channel
    # statistics -- confirm they are intended for CelebA.
    channel_means = (0.4914, 0.4822, 0.4465)
    channel_stds = (0.2023, 0.1994, 0.2010)
    preprocessing = Compose([ToTensor(), Normalize(channel_means, channel_stds)])

    def pick_attributes(attrs):
        # Resolved at call time, so transform_target may be defined later in the notebook
        return transform_target(attrs, required_labels = [31, 20, 15, 35])

    full_split = CelebA(
        root=data_dir,
        split="train",
        download=False,
        transform=preprocessing,
        target_transform=pick_attributes,
    )

    from torch.utils.data import Subset
    return Subset(full_split, indices_to_keep)

# Main function to process and load the dataset
def main(
    data_dir="/home/ubuntu/fedatk_unl_tj/data/celeba/raw_data",
    task_x_path="/home/ubuntu/fedatk_unl_tj/data/celeba/all_data/train",
    client_dirs=range(50),
):
    """
    Build the list of dataset indices referenced by the client task files.

    Args:
        data_dir: CelebA raw-data root. Only needed by the optional steps
            listed at the end of the body, which are currently disabled.
        task_x_path: Directory that holds the ``task_<i>`` sub-directories.
        client_dirs: Iterable of task numbers to include (default: 0..49).

    Returns:
        The list produced by ``generate_indices_to_keep`` (train indices plus
        offset test indices; duplicates are not removed).
    """
    indices_to_keep = generate_indices_to_keep(task_x_path, client_dirs)

    # Optional follow-up steps, intentionally left disabled:
    #   stored = process_images(data_dir, indices_to_keep)   # destructive: overwrites unused images
    #   celeba_subset = load_celeba_subset(data_dir, indices_to_keep)
    #   print(f"Loaded {len(celeba_subset)} images from CelebA dataset.")
    return indices_to_keep

# if __name__ == "__main__":
#     main()


In [3]:
# Collect the kept indices for all client tasks and show how many entries
# the list has (it is a plain list, so duplicates are counted too).
itk = main()
len(itk)

63873

In [4]:

from collections import Counter
# Tally how often each kept index occurs
element_counts = Counter(itk)

# Indices that occur more than once, in ascending order
repeated_elements = sorted(
    item for item, count in element_counts.items() if count > 1
)
print(f"Repeated elements: {repeated_elements}")

Repeated elements: [182667, 182685, 182693, 182702, 182723, 182727]


In [5]:
# Sort the kept indices in place (this mutates the list returned by main()).
itk.sort()
# Keep only indices strictly above 182637, the default test_offset of
# generate_indices_to_keep.
# NOTE(review): the comparison is strict, so an index equal to 182637 (test
# index 0 after offsetting) is excluded -- confirm this is intended.
filtered_itk = [x for x in itk if x > 182637]

In [6]:
import os
import pickle

# Root directory that holds one task_<n> folder per client
base_path = '/home/ubuntu/fedatk_unl_tj/data/celeba/all_data/train'

# Accumulators: every index read, and (task number, index) pairs for the
# indices that were found to be repeated
all_test_indices = []
flag = []

# Visit tasks 0 through 49
for task_num in range(50):
    task_path = os.path.join(base_path, f'task_{task_num}')
    # Despite the variable name, this cell inspects train.pkl
    test_pkl_path = os.path.join(task_path, 'train.pkl')

    # Tasks without the pickle file are skipped
    if not os.path.isfile(test_pkl_path):
        continue

    with open(test_pkl_path, 'rb') as f:
        indices = pickle.load(f)

    all_test_indices.extend(indices)

    # Remember which task contains each repeated index
    for element in repeated_elements:
        if element in indices:
            flag.append((task_num, element))

# Print the total number of indices gathered
print(f"Total indices from chosen pkl files: {len(all_test_indices)}")


Total indices from chosen pkl files: 51079


In [7]:
# Show the (task number, index) pairs recorded for the repeated indices
flag

[(5, 182723),
 (8, 182685),
 (15, 182702),
 (20, 182667),
 (25, 182693),
 (47, 182727)]

In [8]:
# Inspect one of the flagged tasks (task 5) in detail.
task_path = os.path.join(base_path, f'task_{5}')
# Despite the variable name, the file being opened is train.pkl.
test_pkl_path = os.path.join(task_path, 'train.pkl')

with open(test_pkl_path,'rb') as f:
    aa = pickle.load(f)

# Sort in place and display the whole collection.
aa.sort()
aa

[380,
 404,
 601,
 1739,
 1797,
 2010,
 2091,
 2218,
 2231,
 2399,
 2768,
 3326,
 3333,
 3381,
 3740,
 3906,
 3935,
 4188,
 4606,
 4639,
 4645,
 4816,
 4935,
 5412,
 5538,
 5703,
 5819,
 6952,
 6974,
 7016,
 7100,
 8520,
 8599,
 8608,
 9375,
 9603,
 9659,
 9674,
 9913,
 10034,
 10103,
 10117,
 10272,
 10398,
 11089,
 11560,
 11795,
 11829,
 11904,
 11960,
 12064,
 12526,
 12933,
 13018,
 13416,
 13600,
 13625,
 13859,
 13931,
 14454,
 14585,
 14889,
 15998,
 16161,
 16336,
 16408,
 16450,
 16474,
 16515,
 16538,
 16541,
 16608,
 16651,
 16688,
 17154,
 17308,
 17327,
 17671,
 18293,
 18581,
 18707,
 18745,
 18858,
 19091,
 19267,
 19606,
 19782,
 19838,
 19911,
 20001,
 20368,
 20542,
 20569,
 20686,
 20895,
 21214,
 21357,
 21380,
 21638,
 21666,
 21877,
 22069,
 22228,
 22316,
 22514,
 22543,
 22955,
 23231,
 23315,
 23489,
 24278,
 24457,
 24936,
 25217,
 25222,
 25288,
 25545,
 25720,
 25964,
 26061,
 26122,
 26126,
 26260,
 26333,
 26415,
 26760,
 26909,
 27114,
 27558,
 27666,
 2

In [9]:
import os
import torch
import numpy as np
from torchvision.datasets import CelebA
from torchvision.transforms import Compose, ToTensor, Normalize
from torch.utils.data import Subset

import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import gc
import os

def transform_target(x, required_labels):
    """
    Pick the entries of x located at the positions in required_labels.

    Args:
        x: Any indexable object (for example an attribute vector).
        required_labels: Iterable of positions to extract.

    Returns:
        A list with ``x[label]`` for each label, in the order given.
    """
    selected = []
    for label in required_labels:
        selected.append(x[label])
    return selected

def _collect_celeba_split(samples):
    """Split an iterable of (image_tensor, attribute_bits) pairs into an image list and an int label list."""
    images, labels = [], []
    for image, bits in samples:
        # Keep the tensor itself so the caller can torch.stack the result
        images.append(image)
        # Read the bits as a binary number, first attribute = most significant bit
        labels.append(int(sum(bit.item() * (2 ** idx) for idx, bit in enumerate(reversed(bits)))))
    return images, labels


def get_celeba():
    """
    Load the complete CelebA train and test splits into two tensors.

    Every image of both splits is transformed and held in memory at once, so
    this needs a very large amount of RAM.

    Train and test samples are processed the same way: images stay
    ``torch`` tensors, and the four selected attributes are folded into a
    single integer label. Index subsetting is disabled, so the
    ``train_idx.npy`` / ``test_idx.npy`` files are not read.

    Returns:
        ``(celeba_data_X, celeba_data_y)``: the stacked image tensor (train
        samples first, then test samples) and a float tensor with one encoded
        label per image.

    Raises:
        AssertionError: If ``data/celeba/raw_data`` does not exist relative
            to the current working directory.
    """
    celeba_path = os.path.join("data", "celeba", "raw_data")
    assert os.path.isdir(celeba_path), "Download celeba dataset!"

    transform = Compose([
        ToTensor(),
        Normalize(
            (0.4914, 0.4822, 0.4465),
            (0.2023, 0.1994, 0.2010)
        )
    ])

    def select_labels(x):
        # Smiling, Male, Eyeglasses, Wearing Hat
        return transform_target(x, required_labels=[31, 20, 15, 35])

    print("loading train set")
    celeba_train = CelebA(
        root=celeba_path,
        split='train',
        download=False,
        transform=transform,
        target_transform=select_labels
    )
    print("got train set processing. ..")
    celeba_data_X_train, celeba_data_y_train = _collect_celeba_split(celeba_train)
    print("made new list ...")

    # Release the dataset object before opening the next split
    del celeba_train
    gc.collect()

    print("loading test set")
    celeba_test = CelebA(
        root=celeba_path,
        split='test',
        download=False,
        transform=transform,
        target_transform=select_labels
    )
    celeba_data_X_test, celeba_data_y_test = _collect_celeba_split(celeba_test)

    del celeba_test
    gc.collect()
    torch.cuda.empty_cache()

    # Combine train and test data (train first)
    celeba_data_X = torch.stack(celeba_data_X_train + celeba_data_X_test)
    celeba_data_y = torch.Tensor(celeba_data_y_train + celeba_data_y_test)

    return celeba_data_X, celeba_data_y

def _collect_loader_batches(loader, progress_prefix):
    """Drain a DataLoader into per-batch numpy image arrays and encoded label lists, printing progress."""
    features, labels = [], []
    total_batches = len(loader)
    for batch_idx, (data, targets) in enumerate(loader):
        print(f"{progress_prefix} {batch_idx + 1}/{total_batches}")
        # Every batch is retained in the lists, so there is nothing to free
        # per iteration; explicit del / gc.collect() calls would only slow
        # the loop down.
        features.append(data.numpy())
        labels.append(batch_labels(targets))
    return features, labels


def get_celeba_new(batch_size=2048):
    """
    Load the indexed subset of CelebA (train and test) in batches.

    The positions stored in ``data/celeba/train_idx.npy`` and
    ``data/celeba/test_idx.npy`` select which samples of each split are read.
    All selected images end up in memory as one numpy array.

    Args:
        batch_size: Number of samples fetched per DataLoader batch.

    Returns:
        ``(x, y)`` numpy arrays: the images (train samples first, then test
        samples) and one integer label per image as produced by
        ``batch_labels``.

    Raises:
        AssertionError: If ``data/celeba/raw_data`` does not exist relative
            to the current working directory.
    """
    celeba_path = os.path.join("data", "celeba", "raw_data")
    assert os.path.isdir(celeba_path), "Download CelebA dataset!"

    transform = Compose([
        ToTensor(),
        Normalize(
            (0.4914, 0.4822, 0.4465),
            (0.2023, 0.1994, 0.2010)
        )
    ])

    # NOTE: allow_pickle=True lets np.load unpickle object arrays, which can
    # execute code -- only load index files you trust.
    train_idx = np.load('data/celeba/train_idx.npy', allow_pickle=True)
    test_idx = np.load('data/celeba/test_idx.npy', allow_pickle=True)

    def select_labels(x):
        # Smiling, Male, Eyeglasses, Wearing Hat
        return transform_target(x, required_labels=[31, 20, 15, 35])

    print("Loading train set...")
    celeba_train = datasets.CelebA(
        root=celeba_path,
        split='train',
        download=False,
        transform=transform,
        target_transform=select_labels
    )
    # Restrict the split to the requested positions, then read it batch-wise
    celeba_train = torch.utils.data.Subset(celeba_train, train_idx)
    train_loader = DataLoader(celeba_train, batch_size=batch_size, shuffle=False)
    celeba_data_X_train, celeba_data_y_train = _collect_loader_batches(
        train_loader, "Processed batch"
    )

    print("Loading test set...")
    celeba_test = datasets.CelebA(
        root=celeba_path,
        split='test',
        download=False,
        transform=transform,
        target_transform=select_labels
    )
    celeba_test = torch.utils.data.Subset(celeba_test, test_idx)
    test_loader = DataLoader(celeba_test, batch_size=batch_size, shuffle=False)
    celeba_data_X_test, celeba_data_y_test = _collect_loader_batches(
        test_loader, "Processed test batch"
    )

    print("Data loading complete.")
    x = np.concatenate(celeba_data_X_train + celeba_data_X_test, axis = 0)
    y = np.concatenate(celeba_data_y_train + celeba_data_y_test, axis = 0)

    return x, y

def batch_labels(targets):
    """
    Encode a batch of per-attribute label tensors as one integer per sample.

    Args:
        targets: Sequence of 1-D tensors, one per attribute, each holding
            that attribute's value for every sample in the batch.

    Returns:
        A list with one int per sample: the sample's attribute values written
        side by side as a binary string (first attribute = most significant
        digit) and parsed in base 2.
    """
    # (attributes, batch) -> (batch, attributes) as nested Python lists
    per_sample_bits = torch.stack(targets).T.tolist()

    encoded = []
    for bits in per_sample_bits:
        bit_string = "".join(str(bit) for bit in bits)
        encoded.append(int(bit_string, 2))
    return encoded
    

In [10]:
# Run the batched loader with its default batch size; x receives the image
# array and y the encoded label array.
x,y=get_celeba_new()

Loading train set...
Processed batch 1/80
Processed batch 2/80
Processed batch 3/80
Processed batch 4/80
Processed batch 5/80
Processed batch 6/80
Processed batch 7/80
Processed batch 8/80
Processed batch 9/80
Processed batch 10/80
Processed batch 11/80
Processed batch 12/80
Processed batch 13/80
Processed batch 14/80
Processed batch 15/80
Processed batch 16/80
Processed batch 17/80
Processed batch 18/80
Processed batch 19/80
Processed batch 20/80
Processed batch 21/80
Processed batch 22/80
Processed batch 23/80
Processed batch 24/80
Processed batch 25/80
Processed batch 26/80
Processed batch 27/80
Processed batch 28/80
Processed batch 29/80
Processed batch 30/80
Processed batch 31/80
Processed batch 32/80
Processed batch 33/80
Processed batch 34/80
Processed batch 35/80
Processed batch 36/80
Processed batch 37/80
Processed batch 38/80
Processed batch 39/80
Processed batch 40/80
Processed batch 41/80
Processed batch 42/80
Processed batch 43/80
Processed batch 44/80
Processed batch 45/8

KeyboardInterrupt: 

In [11]:
# Preprocessing shared by both splits: tensor conversion, then per-channel normalisation
transform = Compose([
    ToTensor(),
    Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Attribute columns kept as labels: Smiling, Male, Eyeglasses, Wearing Hat
train_data = CelebA(
    root='data/celeba/raw_data/',
    split='train',
    download=False,
    transform=transform,
    target_transform=lambda x: transform_target(x, required_labels = [31, 20, 15, 35]),
)
test_data = CelebA(
    root='data/celeba/raw_data/',
    split='test',
    download=False,
    transform=transform,
    target_transform=lambda x: transform_target(x, required_labels = [31, 20, 15, 35]),
)

In [65]:
# Smoke test: pull only the first two batches of the full train split
train_loader = DataLoader(train_data, batch_size=512, shuffle=False)
celeba_data_X_train, celeba_data_y_train = [], []

for batch_idx, (data, targets) in enumerate(train_loader):
    print(f"Processed batch {batch_idx + 1}/{len(train_loader)}")

    # Images as a numpy array, labels as one encoded integer per sample
    celeba_data_X_train.append(data.numpy())
    celeba_data_y_train.append(batch_labels(targets))

    # Drop the loop references and run the collectors before the next batch
    del data, targets
    torch.cuda.empty_cache()
    gc.collect()

    # batch_idx counts up from 0, so this stops after the second batch
    if batch_idx >= 1:
        break


Processed batch 1/318
Processed batch 2/318


In [75]:
# Join the collected image batches along the sample axis.
# NOTE(review): the list is concatenated with itself, so every collected
# sample appears twice in x -- confirm this is intended.
x = np.concatenate(celeba_data_X_train + celeba_data_X_train, axis = 0)

In [73]:
# Flatten the per-batch label lists into a single 1-D array.
y = np.concatenate(celeba_data_y_train, axis = 0)

In [74]:
# Peek at the first five encoded labels
y[:5]

array([8, 8, 4, 0, 0])

In [76]:
# Check the dimensions of the concatenated image array
x.shape

(2048, 3, 55, 45)