In [1]:
import os

# CIFAR10 class names; a sample's integer label is its position in this tuple
classes = (
    'airplane', 'automobile', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

# Dataset root and the annotation lists stored inside it
root_dir = "/workspace/sync/SSL-Backdoor/data/CIFAR10"
cifar10_train_file = os.path.join(root_dir, "cifar10_trainset.txt")
cifar10_test_file = os.path.join(root_dir, "cifar10_testset.txt")
ten_percent_train_file = os.path.join(root_dir, "ten_percent_trainset.txt")
one_percent_train_file = os.path.join(root_dir, "one_percent_trainset.txt")

# 存储 CIFAR10 数据集为 PNG 

In [2]:
import torchvision
import torchvision.transforms as transforms
from PIL import Image

def save_cifar10_as_png(root_dir='cifar10_png'):
    """Export the CIFAR10 train and test splits as PNG files grouped by class.

    Images are written to ``<root_dir>/train/<class>/<index>.png`` and
    ``<root_dir>/val/<class>/<index>.png``. ``<index>`` is the sample's
    zero-padded position within its split and ``<class>`` is looked up in the
    module-level ``classes`` tuple.

    Args:
        root_dir: Output directory; created if it does not exist.
    """
    os.makedirs(root_dir, exist_ok=True)

    # Load CIFAR10 from the shared dataset directory (download=True is passed,
    # so torchvision fetches the archive there when it is missing)
    transform = transforms.Compose([transforms.ToTensor()])
    trainset = torchvision.datasets.CIFAR10(root='/workspace/sync/dataset', train=True, download=True, transform=transform)
    testset = torchvision.datasets.CIFAR10(root='/workspace/sync/dataset', train=False, download=True, transform=transform)

    # Build the tensor -> PIL converter once rather than once per image
    to_pil = transforms.ToPILImage()

    def save_images(dataset, dataset_type):
        """Write every sample of ``dataset`` below ``<root_dir>/<dataset_type>``."""
        dataset_dir = os.path.join(root_dir, dataset_type)
        os.makedirs(dataset_dir, exist_ok=True)

        # Remember which class folders already exist so the filesystem is
        # queried once per class instead of once per image.
        ready_folders = set()
        for i, (image, label) in enumerate(dataset):
            folder_path = os.path.join(dataset_dir, classes[label])
            if folder_path not in ready_folders:
                os.makedirs(folder_path, exist_ok=True)
                ready_folders.add(folder_path)

            # Convert tensor to PIL Image and save
            to_pil(image).save(os.path.join(folder_path, f'{i:05d}.png'))

    # The test split is stored under the 'val' folder
    save_images(trainset, 'train')
    save_images(testset, 'val')

    print("CIFAR10 dataset has been saved as PNG images in separate train and test folders.")


# Export CIFAR10 as PNG files under the dataset root configured in the first cell
save_cifar10_as_png(root_dir=root_dir)


  warn(


Files already downloaded and verified
Files already downloaded and verified
CIFAR10 dataset has been saved as PNG images in separate train and test folders.


# 为 CIFAR10 数据集创建训练配置文件

In [2]:
import os
import random

def create_config_files(root_dir):
    """Write the CIFAR10 annotation lists (full train/test plus 1% and 10% train subsets).

    For every name in the module-level ``classes`` tuple, the PNG files in
    ``<root_dir>/train/<class>`` and ``<root_dir>/val/<class>`` are listed and
    turned into ``<path> <label>`` lines, where ``<label>`` is the class's
    position in ``classes``.

    The output locations are NOT derived from ``root_dir``: they are read from
    the module-level variables ``cifar10_train_file``, ``cifar10_test_file``,
    ``one_percent_train_file`` and ``ten_percent_train_file``.

    The subsets are drawn with the global ``random`` state, so seed it before
    calling if the subsets need to be reproducible.

    Args:
        root_dir: Directory that contains the ``train`` and ``val`` PNG trees.
    """
    def get_file_paths(directory, cls_index):
        """Return one '<path> <label>' line per PNG found directly in ``directory``."""
        # os.listdir yields entries in arbitrary order; sorting keeps the
        # generated lists identical from run to run.
        return [
            f'{os.path.join(directory, filename)} {cls_index}'
            for filename in sorted(os.listdir(directory))
            if filename.endswith('.png')
        ]

    def write_config_file(file_path, paths):
        """Write ``paths`` to ``file_path``, one entry per line."""
        with open(file_path, 'w') as f:
            f.writelines(f'{path}\n' for path in paths)

    # Collect the entries of both splits, class by class
    train_paths = []
    test_paths = []
    for cls_index, cls in enumerate(classes):
        train_paths.extend(get_file_paths(os.path.join(root_dir, 'train', cls), cls_index))
        test_paths.extend(get_file_paths(os.path.join(root_dir, 'val', cls), cls_index))

    # Write full train and test config files
    write_config_file(cifar10_train_file, train_paths)
    write_config_file(cifar10_test_file, test_paths)

    # Two independent draws: the 1% list is not necessarily a subset of the 10% list
    one_percent_paths = random.sample(train_paths, int(len(train_paths) * 0.01))
    ten_percent_paths = random.sample(train_paths, int(len(train_paths) * 0.1))

    # Write sampled config files
    write_config_file(one_percent_train_file, one_percent_paths)
    write_config_file(ten_percent_train_file, ten_percent_paths)

    print("Config files for CIFAR10 dataset have been created.")

# Generate the CIFAR10 annotation lists from the PNG folders under root_dir
create_config_files(root_dir=root_dir)


Config files for CIFAR10 dataset have been created.


# 提取CIFAR100, 并且为CIFAR100数据集创建训练配置文件

In [5]:
import os
import tarfile
import random
import torchvision
import torchvision.transforms as transforms
from PIL import Image

def extract_cifar100_tar(tar_path, extract_path):
    """Unpack a gzip-compressed tar archive into ``extract_path``.

    Args:
        tar_path: Path to the ``.tar.gz`` archive.
        extract_path: Destination directory; created together with any
            missing parents.

    Raises:
        tarfile.TarError: If the archive cannot be read, or (when extraction
            filters are available) if a member would be written outside
            ``extract_path``.
        OSError: If the archive or the destination cannot be accessed.
    """
    os.makedirs(extract_path, exist_ok=True)
    with tarfile.open(tar_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # Refuse members that escape the destination (absolute paths,
            # '..' components, unsafe links) on interpreters that support
            # extraction filters.
            tar.extractall(path=extract_path, filter="data")
        else:
            # Older interpreters have no filter support; keep the plain call.
            tar.extractall(path=extract_path)

def save_cifar100_as_png(root_dir='/workspace/sync/SSL-Backdoor/data/CIFAR100'):
    """Export the CIFAR100 train and test splits as PNG files grouped by class.

    The dataset is loaded from ``root_dir`` with ``download=False``, so the
    extracted CIFAR100 files must already be there. Images are written to
    ``<root_dir>/train/<class>/<index>.png`` and
    ``<root_dir>/val/<class>/<index>.png``; ``<class>`` comes from the
    training set's ``classes`` attribute and ``<index>`` is the sample's
    zero-padded position within its split.

    Args:
        root_dir: Directory holding the extracted dataset; also the output root.
    """
    transform = transforms.Compose([transforms.ToTensor()])
    trainset = torchvision.datasets.CIFAR100(root=root_dir, train=True, download=False, transform=transform)
    testset = torchvision.datasets.CIFAR100(root=root_dir, train=False, download=False, transform=transform)
    classes = trainset.classes

    # Build the tensor -> PIL converter once rather than once per image
    to_pil = transforms.ToPILImage()

    def save_images(dataset, dataset_type):
        """Write every sample of ``dataset`` below ``<root_dir>/<dataset_type>``."""
        dataset_dir = os.path.join(root_dir, dataset_type)
        os.makedirs(dataset_dir, exist_ok=True)

        # Remember which class folders already exist so the filesystem is
        # queried once per class instead of once per image.
        ready_folders = set()
        for i, (image, label) in enumerate(dataset):
            folder_path = os.path.join(dataset_dir, classes[label])
            if folder_path not in ready_folders:
                os.makedirs(folder_path, exist_ok=True)
                ready_folders.add(folder_path)
            to_pil(image).save(os.path.join(folder_path, f'{i:05d}.png'))

    # The test split is stored under the 'val' folder
    save_images(trainset, 'train')
    save_images(testset, 'val')
    print("CIFAR100 dataset has been saved as PNG images.")

def create_config_files(root_dir='/workspace/sync/SSL-Backdoor/data/CIFAR100'):
    """Write the CIFAR100 annotation lists (full train/test plus 1% and 10% train subsets).

    Class names are read from the CIFAR100 training set stored in
    ``root_dir``. For each class, the PNG files in ``<root_dir>/train/<class>``
    and ``<root_dir>/val/<class>`` become ``<path> <label>`` lines, where
    ``<label>`` is the class's position in the dataset's ``classes`` list.

    Files written inside ``root_dir``: ``cifar100_train.txt``,
    ``cifar100_test.txt``, ``cifar100_train_1.txt`` and
    ``cifar100_train_10.txt``.

    The subsets are drawn with the global ``random`` state, so seed it before
    calling if the subsets need to be reproducible.

    NOTE: this function shares its name with the CIFAR10 helper defined in an
    earlier cell and replaces it once this cell has run.

    Args:
        root_dir: Directory holding the extracted dataset and the PNG trees.
    """
    dataset = torchvision.datasets.CIFAR100(root_dir, train=True, download=False)
    classes = dataset.classes

    cifar100_train_file = os.path.join(root_dir, 'cifar100_train.txt')
    cifar100_test_file = os.path.join(root_dir, 'cifar100_test.txt')
    one_percent_train_file = os.path.join(root_dir, 'cifar100_train_1.txt')
    ten_percent_train_file = os.path.join(root_dir, 'cifar100_train_10.txt')

    def get_file_paths(directory, cls_index):
        """Return one '<path> <label>' line per PNG found directly in ``directory``."""
        # os.listdir yields entries in arbitrary order; sorting keeps the
        # generated lists identical from run to run.
        return [
            f"{os.path.join(directory, f)} {cls_index}"
            for f in sorted(os.listdir(directory))
            if f.endswith('.png')
        ]

    def write_config_file(file_path, paths):
        """Write ``paths`` to ``file_path``, one entry per line."""
        with open(file_path, 'w') as f:
            f.writelines(f'{path}\n' for path in paths)

    def subset_size(fraction):
        """At least one entry, but never more than are available."""
        # Without the upper bound, an empty train list would make
        # random.sample raise ValueError (sample larger than population).
        return min(len(train_paths), max(1, int(len(train_paths) * fraction)))

    train_paths, test_paths = [], []
    for cls_index, cls in enumerate(classes):
        train_paths.extend(get_file_paths(os.path.join(root_dir, 'train', cls), cls_index))
        test_paths.extend(get_file_paths(os.path.join(root_dir, 'val', cls), cls_index))

    write_config_file(cifar100_train_file, train_paths)
    write_config_file(cifar100_test_file, test_paths)

    # Two independent draws: the 1% list is not necessarily a subset of the 10% list
    one_percent_paths = random.sample(train_paths, subset_size(0.01))
    ten_percent_paths = random.sample(train_paths, subset_size(0.1))

    write_config_file(one_percent_train_file, one_percent_paths)
    write_config_file(ten_percent_train_file, ten_percent_paths)
    print("Config files for CIFAR100 dataset have been created.")


# Unpack the CIFAR100 archive, export both splits as PNGs, then write the annotation lists.
# The archive is extracted into the same directory that the two later steps read from.
tar_path = "/workspace/sync/dataset/cifar-100-python.tar.gz"
extract_path = "/workspace/sync/SSL-Backdoor/data/CIFAR100"
extract_cifar100_tar(tar_path, extract_path)
save_cifar100_as_png(root_dir=extract_path)
create_config_files(root_dir=extract_path)

CIFAR100 dataset has been saved as PNG images.
Config files for CIFAR100 dataset have been created.


## 为 STL-10 数据集创建训练配置文件

In [4]:
import os
import random
import numpy as np
from PIL import Image

# STL10 class names
stl10_classes = (
    'airplane', 'bird', 'car', 'cat', 'deer',
    'dog', 'horse', 'monkey', 'ship', 'truck',
)

# STL10 dataset root
stl10_root_dir = "/workspace/sync/SSL-Backdoor/data/STL-10"

# Raw unlabeled split and the folder that receives its PNG export
stl10_unlabeled_bin_file = os.path.join(stl10_root_dir, "stl10_binary", "unlabeled_X.bin")
stl10_unlabeled_dir = os.path.join(stl10_root_dir, "unlabeled")

# Annotation lists stored directly in the dataset root
stl10_train_file = os.path.join(stl10_root_dir, "trainset.txt")
stl10_test_file = os.path.join(stl10_root_dir, "testset.txt")
stl10_ten_percent_train_file = os.path.join(stl10_root_dir, "10percent_trainset.txt")
stl10_one_percent_train_file = os.path.join(stl10_root_dir, "1percent_trainset.txt")
stl10_unlabeled_file = os.path.join(stl10_root_dir, "unlabeledset.txt")

def create_stl10_config_files(root_dir):
    """Create the STL-10 subset lists and export the unlabeled split as PNGs.

    All paths are derived from ``root_dir`` (with the same file names as the
    module-level ``stl10_*`` variables), so the argument is actually honoured:

    * reads  ``<root_dir>/trainset.txt``
    * writes ``<root_dir>/1percent_trainset.txt`` and
      ``<root_dir>/10percent_trainset.txt`` (random 1% / 10% of the train lines)
    * reads  ``<root_dir>/stl10_binary/unlabeled_X.bin``
    * writes ``<root_dir>/unlabeled/unlabeled_<i>.png`` and lists those files
      in ``<root_dir>/unlabeledset.txt``

    The subsets are drawn with the global ``random`` state, so seed it before
    calling if the subsets need to be reproducible.

    Args:
        root_dir: STL-10 dataset root.
    """
    train_file = os.path.join(root_dir, "trainset.txt")
    one_percent_train_file = os.path.join(root_dir, "1percent_trainset.txt")
    ten_percent_train_file = os.path.join(root_dir, "10percent_trainset.txt")
    unlabeled_bin_file = os.path.join(root_dir, "stl10_binary", "unlabeled_X.bin")
    unlabeled_dir = os.path.join(root_dir, "unlabeled")
    unlabeled_file = os.path.join(root_dir, "unlabeledset.txt")

    def write_config_file(file_path, paths):
        """Write ``paths`` to ``file_path``, one whitespace-stripped entry per line."""
        with open(file_path, 'w') as f:
            for path in paths:
                f.write(f'{path.strip()}\n')

    # Read lines directly from trainset.txt
    with open(train_file, 'r') as f:
        train_lines = f.readlines()

    # Two independent draws: the 1% list is not necessarily a subset of the 10% list
    one_percent_paths = random.sample(train_lines, int(len(train_lines) * 0.01))
    ten_percent_paths = random.sample(train_lines, int(len(train_lines) * 0.1))

    # Write sampled config files
    write_config_file(one_percent_train_file, one_percent_paths)
    write_config_file(ten_percent_train_file, ten_percent_paths)

    # Create directory for unlabeled images if it doesn't exist
    os.makedirs(unlabeled_dir, exist_ok=True)

    # Load unlabeled data from the bin file: tightly packed uint8 images of
    # 3 x 96 x 96 values each.
    unlabeled_data = np.fromfile(unlabeled_bin_file, dtype=np.uint8)
    unlabeled_data = unlabeled_data.reshape(-1, 3, 96, 96)
    # STL-10 stores each channel in column-major order, so after the reshape
    # the axes are (image, channel, column, row). Reorder to
    # (image, row, column, channel); using (0, 2, 3, 1) here would save every
    # picture mirrored along its diagonal.
    unlabeled_data = np.transpose(unlabeled_data, (0, 3, 2, 1))

    # Save unlabeled images as PNG and create config file
    unlabeled_paths = []
    for i, img in enumerate(unlabeled_data):
        img_path = os.path.join(unlabeled_dir, f'unlabeled_{i}.png')
        Image.fromarray(img).save(img_path)
        unlabeled_paths.append(img_path)

    write_config_file(unlabeled_file, unlabeled_paths)

    print("Config files for STL10 dataset have been created.")

# Build the STL-10 subset lists and export the unlabeled images
create_stl10_config_files(root_dir=stl10_root_dir)

Config files for STL10 dataset have been created.


# 分别采样 25%、5% 的ImageNet-100数据集用作蒸馏

In [2]:

import os
import random

# ImageNet-100 data root and the full training annotation list that is subsampled below
data_root = "/workspace/sync/SSL-Backdoor/data/ImageNet-100"
ImageNet100_train_file = os.path.join(data_root, "ImageNet100_trainset.txt")

# train_file format: one "<image path> <integer class label>" entry per line, e.g.
# /workspace/sync/SSL-Backdoor/data/ImageNet-100/train/n01558993/n01558993_10224.JPEG 0
# /workspace/sync/SSL-Backdoor/data/ImageNet-100/train/n01558993/n01558993_10835.JPEG 0
# /workspace/sync/SSL-Backdoor/data/ImageNet-100/train/n01558993/n01558993_10351.JPEG 0


def sample_dataset(file_path, sample_fraction):
    """Return a random subset of the entries listed in an annotation file.

    Blank lines are ignored and every returned entry ends with exactly one
    newline, so the result can be written with ``writelines`` even when the
    source file's last line has no trailing newline (the sample is in random
    order, so that line could otherwise be glued to the entry after it).

    Args:
        file_path: Text file with one entry per line.
        sample_fraction: Fraction of the entries to draw;
            ``int(n_entries * sample_fraction)`` entries are returned.

    Returns:
        list[str]: The sampled entries, drawn without replacement using the
        global ``random`` state.

    Raises:
        ValueError: If the requested sample size is negative or larger than
            the number of entries (raised by ``random.sample``).
    """
    with open(file_path, 'r') as file:
        data = [line.rstrip('\n') + '\n' for line in file if line.strip()]

    sample_size = int(len(data) * sample_fraction)
    return random.sample(data, sample_size)

# Draw the 25% subset first, then the 5% subset, from the full training list
sampled_25_percent = sample_dataset(ImageNet100_train_file, 0.25)
sampled_5_percent = sample_dataset(ImageNet100_train_file, 0.05)

# Destination annotation files, stored next to the full list
sampled_25_file_path = os.path.join(data_root, "25percent_trainset.txt")
sampled_5_file_path = os.path.join(data_root, "5percent_trainset.txt")

# Persist each subset; the sampled lines are written verbatim
for subset_path, subset_lines in (
    (sampled_25_file_path, sampled_25_percent),
    (sampled_5_file_path, sampled_5_percent),
):
    with open(subset_path, 'w') as file:
        file.writelines(subset_lines)



# 采样另外的ImageNet类别模拟 label shift

In [10]:
import os
import random

# File locations
existing_classes_file = "/workspace/sync/SSL-Backdoor/poison-generation/scripts/imagenet100_classes.txt"
imagenet_1k_dir = "/workspace/sync/imagenet-1k/train"
new_classes_file = "/workspace/sync/SSL-Backdoor/poison-generation/scripts/new_n03085013_imagenet100_classes.txt"

# Class that must always be part of the new 100-class split
target_class = "n03085013"

# Read the class IDs of the existing ImageNet-100 split
with open(existing_classes_file, 'r') as file:
    existing_classes = set(file.read().splitlines())

# Every class folder of the ImageNet-1k training set
all_classes = set(os.listdir(imagenet_1k_dir))
print(len(all_classes))
# Keep only the classes that are not part of the existing split
new_classes_candidates = list(all_classes - existing_classes)
print(len(new_classes_candidates))
# Randomly pick 99 of them. The target class is removed from the pool so that
# appending it below can never produce a duplicate entry, and the pool is
# sorted because set iteration order is arbitrary (a seeded `random` would
# otherwise still give a different selection on every run).
sampling_pool = sorted(c for c in new_classes_candidates if c != target_class)
selected_classes = random.sample(sampling_pool, 99)

# Add the target class n03085013
selected_classes.append(target_class)

# Write the new class list, one class ID per line
with open(new_classes_file, 'w') as file:
    for cls in selected_classes:
        file.write(cls + '\n')


1000
900


## 从新的transferring_ImageNet-100中采样1%和10%的数据用作linear probe

In [None]:
import os
import random

def sample_images_from_folder(folder_path, sample_ratio):
    """
    Draw a random subset of the entries found directly in a folder.

    Returns ``int(n_entries * sample_ratio)`` entry names (not full paths),
    chosen without replacement.
    """
    entries = os.listdir(folder_path)
    how_many = int(len(entries) * sample_ratio)
    return random.sample(entries, how_many)
def create_config_file_for_training_and_validation_sorted(training_folder, validation_folder, output_file_train_10, output_file_train_1, output_file_val):
    """
    Write the 10% and 1% training lists and the full validation list.

    Class folders are taken from ``training_folder`` and ordered by name; a
    class's label is its rank in that order. Every output line has the form
    ``<folder>/<image> <label>``. The validation list covers every file in
    ``<validation_folder>/<class>`` for each training class, so each class
    folder must exist in both trees.
    """
    # Class directories ordered by name, and the label assigned to each
    class_dirs = [entry for entry in os.scandir(training_folder) if entry.is_dir()]
    class_dirs.sort(key=lambda entry: entry.name)
    label_of = {os.path.basename(entry.path): idx for idx, entry in enumerate(class_dirs)}

    # Training lists: two independent draws per class, 10% first and then 1%
    with open(output_file_train_10, 'w') as file_10, open(output_file_train_1, 'w') as file_1:
        for entry in class_dirs:
            label = label_of[os.path.basename(entry.path)]
            for ratio, sink in ((0.10, file_10), (0.01, file_1)):
                for image in sample_images_from_folder(entry.path, ratio):
                    sink.write(f"{entry.path}/{image} {label}\n")

    # Validation list: every file of every class, same label mapping
    with open(output_file_val, 'w') as file_val:
        for entry in class_dirs:
            class_name = os.path.basename(entry.path)
            label = label_of[class_name]
            val_dir = os.path.join(validation_folder, class_name)
            for image in os.listdir(val_dir):
                file_val.write(f"{val_dir}/{image} {label}\n")

# Paths: train/val image folders of the transferring ImageNet-100 split and the annotation files to generate
training_folder_path = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/train"
validation_folder_path = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/val"
output_file_train_10_percent = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/10percent_trainset.txt"
output_file_train_1_percent = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/1percent_trainset.txt"
output_file_validation = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/ImageNet100_valset.txt"

# Write the 10% / 1% training lists and the full validation list.
# NOTE(review): a later cell redefines this function with an additional
# output_file_train_all parameter; this 5-argument call only matches the
# definition given in this cell.
create_config_file_for_training_and_validation_sorted(training_folder_path, validation_folder_path, output_file_train_10_percent, output_file_train_1_percent, output_file_validation)

# Note: The class folders are sorted by their names. This code still assumes the same class folders exist in both training and validation directories.



# 从ImageNet100构造数据集配置文件

In [1]:
import os
import random

def sample_images_from_folder(folder_path, sample_ratio):
    """
    Draw a random subset of the entries found directly in a folder.

    Returns ``int(n_entries * sample_ratio)`` entry names (not full paths),
    chosen without replacement.
    """
    entries = os.listdir(folder_path)
    how_many = int(len(entries) * sample_ratio)
    return random.sample(entries, how_many)

# NOTE(review): this cell does not define
# create_config_file_for_training_and_validation_sorted itself; it relies on
# the 5-argument definition from another cell already being in the kernel, and
# it repeats the same paths and call as the transferring ImageNet-100 cell above.
training_folder_path = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/train"
validation_folder_path = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/val"
output_file_train_10_percent = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/10percent_trainset.txt"
output_file_train_1_percent = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/1percent_trainset.txt"
output_file_validation = "/workspace/sync/SSL-Backdoor/data/transferring_ImageNet-100_n03085013/ImageNet100_valset.txt"

# Write the 10% / 1% training lists and the full validation list
create_config_file_for_training_and_validation_sorted(training_folder_path, validation_folder_path, output_file_train_10_percent, output_file_train_1_percent, output_file_validation)

# Note: The class folders are sorted by their names. This code still assumes the same class folders exist in both training and validation directories.



# 从ImageNet100构造数据集配置文件

In [1]:
import os
import random

def sample_images_from_folder(folder_path, sample_ratio):
    """
    Draw a random subset of the entries found directly in a folder.

    Returns ``int(n_entries * sample_ratio)`` entry names (not full paths),
    chosen without replacement.
    """
    entries = os.listdir(folder_path)
    how_many = int(len(entries) * sample_ratio)
    return random.sample(entries, how_many)

def create_config_file_for_training_and_validation_sorted(training_folder, validation_folder, output_file_train_all, output_file_train_10, output_file_train_1, output_file_val):
    """
    Write the full, 10% and 1% training lists and the full validation list.

    Class folders are taken from ``training_folder`` and ordered by name; a
    class's label is its rank in that order. Every output line has the form
    ``<folder>/<image> <label>``. The validation list covers every file in
    ``<validation_folder>/<class>`` for each training class, so each class
    folder must exist in both trees.
    """
    def write_entries(sink, folder, images, label):
        # One "<folder>/<image> <label>" line per image
        for image in images:
            sink.write(f"{folder}/{image} {label}\n")

    # Class directories ordered by name, and the label assigned to each
    class_dirs = [entry for entry in os.scandir(training_folder) if entry.is_dir()]
    class_dirs.sort(key=lambda entry: entry.name)
    label_of = {os.path.basename(entry.path): idx for idx, entry in enumerate(class_dirs)}

    # Training lists: everything, then two independent draws (10% first, then 1%)
    with open(output_file_train_all, 'w') as file_all, open(output_file_train_10, 'w') as file_10, open(output_file_train_1, 'w') as file_1:
        for entry in class_dirs:
            label = label_of[os.path.basename(entry.path)]
            write_entries(file_all, entry.path, os.listdir(entry.path), label)
            write_entries(file_10, entry.path, sample_images_from_folder(entry.path, 0.10), label)
            write_entries(file_1, entry.path, sample_images_from_folder(entry.path, 0.01), label)

    # Validation list: every file of every class, same label mapping
    with open(output_file_val, 'w') as file_val:
        for entry in class_dirs:
            class_name = os.path.basename(entry.path)
            val_dir = os.path.join(validation_folder, class_name)
            write_entries(file_val, val_dir, os.listdir(val_dir), label_of[class_name])

# Paths: train/val image folders of the ImageNet-100-A split and the annotation files to generate
training_folder_path = "/workspace/sync/SSL-Backdoor/data/ImageNet-100-A/train"
validation_folder_path = "/workspace/sync/SSL-Backdoor/data/ImageNet-100-A/val"
output_file_train_all_images = "/workspace/sync/SSL-Backdoor/data/ImageNet-100-A/trainset.txt"
output_file_train_10_percent = "/workspace/sync/SSL-Backdoor/data/ImageNet-100-A/10percent_trainset.txt"
output_file_train_1_percent = "/workspace/sync/SSL-Backdoor/data/ImageNet-100-A/1percent_trainset.txt"
output_file_validation = "/workspace/sync/SSL-Backdoor/data/ImageNet-100-A/valset.txt"

# Write the full, 10% and 1% training lists plus the full validation list
create_config_file_for_training_and_validation_sorted(training_folder_path, validation_folder_path, output_file_train_all_images, output_file_train_10_percent, output_file_train_1_percent, output_file_validation)
