## [Process](https://huggingface.co/docs/datasets/process)

### [Format transform](https://huggingface.co/docs/datasets/process#format-transform)

In [1]:
from datasets import load_dataset

# Note: regardless of the `drop_label` value, when a `metadata.csv` exists,
# any file that is not listed in it causes an error.
dataset = load_dataset("data/animal")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 2 examples [00:00, 102.12 examples/s]


In [2]:
dataset["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=652x515>,
 'label': 0}

In [3]:
dataset["train"][0:]

{'image': [<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=652x515>,
  <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=660x660>],
 'label': [0, 0]}

In [4]:
from torchvision import transforms

def transform(batch: dict[str, list[any]]) -> dict[str, list[any]]:
    """Center-crop every image of a batch with size 100 and pass labels through.

    Note that the values of ``batch`` are lists. The author observed that
    ``dataset["train"][0]`` gives ``len(batch["image"]) == 1``, while
    ``dataset["train"][0:]`` seems to give the length of the underlying data.
    """
    center_crop = transforms.functional.center_crop
    pictures = []
    for picture in batch["image"]:
        pictures.append(center_crop(picture, 100))
    labels = batch["label"]
    return {"image": pictures, "label": labels}

dataset.set_transform(transform, ["image", "label"])

In [5]:
dataset["train"][0:]

{'image': [<PIL.Image.Image image mode=RGB size=100x100>,
  <PIL.Image.Image image mode=RGB size=100x100>],
 'label': [0, 0]}

### [Format](https://huggingface.co/docs/datasets/process#format)

In [6]:
from datasets import load_dataset

dataset = load_dataset("data/animal")

In [7]:
for data in dataset["train"]:
    print(f"{data=}")
    break

data={'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=652x515 at 0x2D2DFE892B0>, 'label': 0}


実験: PIL.Imageを含むデータセットをDataLoaderで処理する場合、PyTorch、HuggingFaceともにTensorへの変換が必要なことを確認。

In [8]:
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder

# I assumed the PyTorch Dataset would convert PIL.Image to Tensor internally,
# but that was a complete misunderstanding: the conversion has to be done
# explicitly with a transform.
torch_dataset = ImageFolder("data/animal")
torch_dataloader = DataLoader(torch_dataset)

# Iterating raises:
# TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>
for data in torch_dataloader:
    print(f"{data=}")
    break

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>

In [None]:
from torch.utils.data import DataLoader

dataloader = DataLoader(dataset["train"])

# TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.JpegImagePlugin.JpegImageFile'>
for data in dataloader:
    print(f"{data=}")
    break

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.JpegImagePlugin.JpegImageFile'>

実験: PyTorch,HuggingFaceのデータセットと、HuggingFaceのデータセットにtorchのフォーマットを設定した場合の3種類で、DataLoaderから読みだした値の型を確認。

In [9]:
# Pattern 1: using a PyTorch Dataset
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import transforms

transform = transforms.Compose([
    transforms.Resize((16, 16)),
    transforms.ToTensor()
])
torch_dataset_with_transform = ImageFolder("data/animal", transform=transform)

# Key point: loading with PyTorch's ImageFolder yields a Dataset, not a DatasetDict
# (i.e. no "train" split has to be selected).
# Also, each individual item is a tuple like (image, label).
print(f"dataset= {torch_dataset_with_transform},\ndataset[0]= {torch_dataset_with_transform[0]}")

torch_dataloader_with_transform = DataLoader(torch_dataset_with_transform, batch_size=2)

for data in torch_dataloader_with_transform:
    print(f"{data=}") # becomes list[torch.Tensor[batch_size, value]]
    break

# Because each batch is a list, it can be unpacked directly into images and labels.
for images, labels in torch_dataloader_with_transform:
    print(f"{images=}, {labels=}")
    break

dataset= Dataset ImageFolder
    Number of datapoints: 2
    Root location: data/animal
    StandardTransform
Transform: Compose(
               Resize(size=(16, 16), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
           ),
dataset[0]= (tensor([[[0.1020, 0.0863, 0.0980, 0.1686, 0.2039, 0.3059, 0.4353, 0.4275,
          0.4196, 0.4706, 0.5412, 0.3725, 0.3961, 0.5098, 0.5529, 0.5373],
         [0.1412, 0.1373, 0.1333, 0.1922, 0.2784, 0.4314, 0.4471, 0.4314,
          0.4706, 0.5333, 0.5686, 0.4980, 0.7608, 0.7686, 0.6078, 0.4745],
         [0.2157, 0.2118, 0.2314, 0.2784, 0.3647, 0.4000, 0.4275, 0.4824,
          0.5176, 0.5490, 0.4980, 0.6235, 0.7961, 0.7961, 0.7255, 0.4353],
         [0.2471, 0.2902, 0.4000, 0.4784, 0.5098, 0.4784, 0.4980, 0.5333,
          0.5216, 0.4824, 0.5020, 0.7882, 0.8235, 0.6549, 0.5216, 0.3804],
         [0.3686, 0.5412, 0.5569, 0.5020, 0.4745, 0.4706, 0.4510, 0.5216,
          0.5059, 0.6039, 0.8118, 0.7804, 0.7176, 0.529

In [10]:
# Pattern 2: using a Hugging Face Dataset as-is
from datasets import load_dataset
from torchvision import transforms

huggingface_datasets_with_transform = load_dataset("imagefolder", data_dir="data/animal")

# Key point: loading with the Hugging Face Datasets "imagefolder" builder yields a DatasetDict.
# Also, each individual item is a dict like {image: any, label: any}.
print("Before set_transform")
print(f"dataset= {huggingface_datasets_with_transform},\ndataset['train']= {huggingface_datasets_with_transform['train']},\ndataset['train'][0]= {huggingface_datasets_with_transform['train'][0]}")

# Same resize-and-convert pipeline as in pattern 1.
transform = transforms.Compose([
    transforms.Resize((16, 16)),
    transforms.ToTensor()
])
def transform_to_set(batch: dict[str, list[any]]) -> dict[str, list[any]]:
    """Apply the module-level ``transform`` to each image of a batch.

    Returns a new batch dict holding the transformed images under "image"
    and the incoming "label" list unchanged.
    """
    images = list(map(transform, batch["image"]))
    labels = batch["label"]
    return {"image": images, "label": labels}

huggingface_datasets_with_transform.set_transform(transform_to_set)
print("After set_transform")
print(f"dataset= {huggingface_datasets_with_transform},\ndataset['train']= {huggingface_datasets_with_transform['train']},\ndataset['train'][0]= {huggingface_datasets_with_transform['train'][0]}")

huggingface_dataloader_with_transform = DataLoader(huggingface_datasets_with_transform["train"], batch_size=2)

for data in huggingface_dataloader_with_transform:
    print(f"{data=}")  # becomes dict[str, any]
    break

# Pitfall: unpacking the batch dict iterates over its keys.
for images, labels in huggingface_dataloader_with_transform:
    print(f"{images=}, {labels=}")  # these hold the strings "image" and "label"
    break

Generating train split: 2 examples [00:00, 782.81 examples/s]

Before set_transform
dataset= DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 2
    })
}),
dataset['train']= Dataset({
    features: ['image', 'label'],
    num_rows: 2
}),
dataset['train'][0]= {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=652x515 at 0x2D2E1BE2B10>, 'label': 0}
After set_transform
dataset= DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 2
    })
}),
dataset['train']= Dataset({
    features: ['image', 'label'],
    num_rows: 2
}),
dataset['train'][0]= {'image': tensor([[[0.1020, 0.0863, 0.0980, 0.1686, 0.2039, 0.3059, 0.4353, 0.4275,
          0.4196, 0.4706, 0.5412, 0.3725, 0.3961, 0.5098, 0.5529, 0.5373],
         [0.1412, 0.1373, 0.1333, 0.1922, 0.2784, 0.4314, 0.4471, 0.4314,
          0.4706, 0.5333, 0.5686, 0.4980, 0.7608, 0.7686, 0.6078, 0.4745],
         [0.2157, 0.2118, 0.2314, 0.2784, 0.3647, 0.4000, 0.4275, 0.4824,
          0.5176, 0.5490, 0.4980, 0.6235,




In [11]:
# Pattern 3: giving a Hugging Face Dataset the "torch" format via with_format()
from datasets import load_dataset

huggingface_datasets = load_dataset("imagefolder", data_dir="data/animal")
huggingface_datasets_to_torch = huggingface_datasets["train"].with_format("torch")

# Key point: even with the format set to torch, each individual item is a dict, not a tuple.
print(f"dataset= {huggingface_datasets_to_torch},\ndataset[0]= {huggingface_datasets_to_torch[0]}")


dataset= Dataset({
    features: ['image', 'label'],
    num_rows: 2
}),
dataset[0]= {'image': tensor([[[  5,   5,   5],
         [  4,   4,   4],
         [  3,   3,   1],
         ...,
         [120,  80,  31],
         [107,  65,  15],
         [138,  96,  46]],

        [[  5,   5,   3],
         [  5,   5,   3],
         [  6,   6,   4],
         ...,
         [113,  70,  19],
         [116,  73,  22],
         [147, 104,  53]],

        [[  3,   3,   1],
         [  4,   4,   2],
         [  5,   6,   1],
         ...,
         [122,  75,  23],
         [121,  75,  25],
         [153, 107,  57]],

        ...,

        [[ 97, 110, 100],
         [ 40,  54,  41],
         [ 39,  53,  38],
         ...,
         [ 96, 107, 101],
         [ 35,  45,  37],
         [ 91, 101,  93]],

        [[ 58,  71,  61],
         [ 43,  57,  44],
         [ 52,  66,  51],
         ...,
         [107, 118, 112],
         [146, 156, 148],
         [124, 134, 126]],

        [[ 42,  55,  45],
     

実験結果も踏まえた、正しいDataLoaderでの読み込み方

In [13]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from torchvision import transforms

dataset_v2 = load_dataset("imagefolder", data_dir="data/animal")
transform = transforms.Compose([
    transforms.Resize((16, 16)),
    transforms.ToTensor()
])
def transform_to_set(batch: dict[str, list[any]]) -> dict[str, list[any]]:
    """Run the module-level ``transform`` over a batch's images.

    The result keeps the batch layout: transformed images under "image",
    the original "label" list under "label".
    """
    converted = []
    for picture in batch["image"]:
        converted.append(transform(picture))
    return {"image": converted, "label": batch["label"]}
dataset_v2.set_transform(transform_to_set)

dataloader_v2 = DataLoader(dataset_v2["train"], batch_size=2)
for batch in dataloader_v2:
    images, labels = batch["image"], batch["label"]
    print(f"{images=}, {labels=}")
    break

images=tensor([[[[0.1020, 0.0863, 0.0980,  ..., 0.5098, 0.5529, 0.5373],
          [0.1412, 0.1373, 0.1333,  ..., 0.7686, 0.6078, 0.4745],
          [0.2157, 0.2118, 0.2314,  ..., 0.7961, 0.7255, 0.4353],
          ...,
          [0.2000, 0.3216, 0.3725,  ..., 0.3725, 0.3529, 0.3922],
          [0.2745, 0.3961, 0.3922,  ..., 0.3451, 0.3922, 0.4431],
          [0.2941, 0.3569, 0.3843,  ..., 0.4196, 0.4157, 0.4078]],

         [[0.0824, 0.0706, 0.0784,  ..., 0.3765, 0.3569, 0.3569],
          [0.1333, 0.1137, 0.1098,  ..., 0.6392, 0.4667, 0.3373],
          [0.2157, 0.1961, 0.2000,  ..., 0.6588, 0.6000, 0.3647],
          ...,
          [0.2980, 0.3765, 0.4078,  ..., 0.4196, 0.4078, 0.4392],
          [0.3412, 0.4196, 0.4118,  ..., 0.3804, 0.4275, 0.4745],
          [0.3529, 0.3961, 0.4157,  ..., 0.4353, 0.4353, 0.4235]],

         [[0.0471, 0.0353, 0.0392,  ..., 0.2275, 0.1843, 0.1882],
          [0.0863, 0.0627, 0.0510,  ..., 0.4941, 0.3216, 0.2000],
          [0.1686, 0.1451, 0.1294, 

### Map

[map()](https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Dataset.map)はCallableなら何でも渡せる。

しかし、注意しないとDataLoaderでバッチ処理を行う際、バッチ次元を加えたテンソルではなく、テンソルのリストが返ってきてしまう。

In [34]:
from datasets import load_dataset
dataset = load_dataset("xhiroga/MiniAlbum")
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 8
    })
})

In [10]:
# With map() and batched=False, the transformed value comes back as a list.
# This is presumably by design, so that Hugging Face Datasets can handle
# diverse data (including non-tensor data).
from torch.utils.data import DataLoader
from torchvision import transforms

compose = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])
def transform(example: dict[str, any]) -> dict[str, any]:
    """Run ``compose`` on a single example's image.

    Only the "image" key is present in the returned dict.
    """
    converted = compose(example["image"])
    return {"image": converted}
transformed = dataset.map(transform, batched=False)

dataloader = DataLoader(transformed["train"], batch_size=2)
batch = next(iter(dataloader))
print(f"{type(batch["image"])=}") # <class 'list'>

type(batch["image"])=<class 'list'>


In [35]:
import torch
from torch.utils.data import DataLoader
from torchvision import transforms

compose = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor()
])
def transform(batch: dict[str, list[any]]) -> dict[str, any]:
    """Convert each image of a batch with ``compose`` and stack the results.

    ``type(batch["image"])`` is ``list``, but for transforms.Compose() to
    process a whole batch its input would have to be a tensor, so the images
    are converted one by one and stacked afterwards.
    See: [Can transforms.Compose handle a batch of Images? - vision - PyTorch Forums](https://discuss.pytorch.org/t/can-transforms-compose-handle-a-batch-of-images/4850/4)
    """
    per_image = []
    for picture in batch["image"]:
        per_image.append(compose(picture))
    return {"image": torch.stack(per_image)}
# Even though transform() returns a stacked tensor, the column read back
# after map() is still a list.
transformed = dataset.map(transform, batched=True)
print(f"{type(transformed['train']['image'])}") # <class 'list'>

dataloader = DataLoader(transformed["train"], batch_size=2)
batch = next(iter(dataloader))
print(f"{type(batch["image"])=}") # <class 'list'>

# Calling set_format() after map() turns the lists into tensors.
transformed.set_format("torch")
print(f"{type(transformed['train']['image'])=}") # <class 'torch.Tensor'>

dataloader_torch = DataLoader(transformed["train"], batch_size=2)
batch_torch = next(iter(dataloader_torch))
print(f"{type(batch_torch["image"])=}") # <class 'torch.Tensor'>


<class 'list'>
type(batch["image"])=<class 'list'>
type(transformed['train']['image'])=<class 'torch.Tensor'>
type(batch_torch["image"])=<class 'torch.Tensor'>


データ拡張も map() で行うのがよい。

[How to augment data ? · Issue #365 · huggingface/datasets](https://github.com/huggingface/datasets/issues/365)

英文の一部の単語を置き換えることで、データのバリエーションを増やす例。

In [51]:
from datasets import load_dataset
dataset = load_dataset("glue", "mrpc", split="train")   # [General Language Understanding Evaluation](https://note.com/npaka/n/n5086fc19c5fc)

Downloading readme: 100%|██████████| 35.3k/35.3k [00:00<00:00, 11.8MB/s]
Downloading data: 100%|██████████| 649k/649k [00:00<00:00, 921kB/s]
Downloading data: 100%|██████████| 75.7k/75.7k [00:00<00:00, 357kB/s]
Downloading data: 100%|██████████| 308k/308k [00:00<00:00, 1.44MB/s]
Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 203283.61 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 135988.24 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 431384.12 examples/s]


In [62]:
from random import randint
from transformers import pipeline

fillmask = pipeline("fill-mask", model="roberta-base")
mask_token = fillmask.tokenizer.mask_token
smaller_dataset = dataset.filter(lambda e, i: i<10, with_indices=True)

Filter: 100%|██████████| 3668/3668 [00:00<00:00, 76417.65 examples/s]


In [63]:
smaller_dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 10
})

In [64]:
def augment_data(examples):
    """Augment a batch of sentences by masking one word and refilling it.

    For every sentence in ``examples["sentence1"]`` one word is replaced by
    the module-level ``mask_token`` and the masked sentence is passed to the
    module-level ``fillmask`` callable. The original sentence followed by the
    "sequence" of the first three predictions is appended to the output, so
    each input sentence contributes four entries.

    Args:
        examples: Batch mapping with a "sentence1" list of strings.

    Returns:
        ``{"data": [...]}`` holding the originals and their augmented variants.
    """
    outputs = []
    for sentence in examples["sentence1"]:
        words = sentence.split(' ')
        last_index = len(words) - 1
        # randint(1, 0) raises ValueError for a one-word sentence, so mask its
        # only word in that case; longer sentences keep the 1..last range.
        mask_index = randint(1, last_index) if last_index >= 1 else 0
        masked_sentence = " ".join(
            words[:mask_index] + [mask_token] + words[mask_index + 1:]
        )
        predictions = fillmask(masked_sentence)
        augmented_sequences = [predictions[i]["sequence"] for i in range(3)]
        outputs += [sentence] + augmented_sequences
    return {"data": outputs}

In [66]:
augmented_dataset = smaller_dataset.map(augment_data, batched=True, remove_columns=dataset.column_names, batch_size=8)
augmented_dataset[:9]["data"]

Map: 100%|██████████| 10/10 [00:00<00:00, 19.76 examples/s]


['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'Amrozi accused his brother, whom he called " the witness ", of deliberately distorting his evidence.',
 'Amrozi accused his brother, whom he called " the witness " " of deliberately distorting his evidence.',
 'Amrozi accused his brother, whom he called " the witness "," of deliberately distorting his evidence.',
 "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion .",
 "Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $1 billion.",
 "Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2 billion.",
 "Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $3 billion.",
 'They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .']

### [Batch processing](https://huggingface.co/docs/datasets/process#batch-processing)

#### [Data augmentation](https://huggingface.co/docs/datasets/process#data-augmentation)

In [111]:
# Sample of data augmentation with map(). However, with tens of thousands of
# images, I believe it crashed with OOM when run in Cursor's Jupyter Notebook.
from datasets import load_dataset
from torchvision import transforms

dataset = load_dataset("xhiroga/MiniAlbum", split="train")

# Resize, apply random flips and a random rotation, then convert to a tensor.
compose = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ToTensor()
])

def augmentation(batch: any):
    """Triple a batch by running ``compose`` three times on every image.

    Each input image yields three consecutive output entries, each produced
    by its own ``compose(image)`` call, and its label is repeated alongside.
    """
    augmented_images = []
    repeated_labels = []
    for picture, target in zip(batch["image"], batch["label"]):
        augmented_images.extend(compose(picture) for _ in range(3))
        repeated_labels.extend([target] * 3)
    return {"image": augmented_images, "label": repeated_labels}

mapped = dataset.map(batched=True, function=augmentation)
mapped

Dataset({
    features: ['image', 'label'],
    num_rows: 39
})

In [112]:
# To avoid OOM, one option is to add up datasets that have set_transform() applied.
from datasets import load_dataset
from torchvision import transforms
from torch.utils.data import DataLoader
from torch.utils.data import ConcatDataset

dataset = load_dataset("xhiroga/MiniAlbum", split="train")

# Resize, apply random flips and a random rotation, then convert to a tensor.
compose = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ToTensor(),
])

def transform(batch: dict[str, list[any]]) -> dict[str, list[any]]:
    """Run ``compose`` on every image of a batch, keeping the other columns.

    Returning only "image" would leave "label" (and any other column) out of
    the result, unlike the other set_transform() callbacks in this file, which
    pass "label" through. All other keys of ``batch`` are therefore copied
    unchanged.
    NOTE(review): assumes set_transform() hands the callback every column and
    returns exactly the dict produced here — confirm against the datasets docs.

    Args:
        batch: Mapping of column name to list of values; must contain "image".

    Returns:
        A new dict with the same keys as ``batch`` where "image" holds the
        composed images.
    """
    composed = [compose(image) for image in batch["image"]]
    return {**batch, "image": composed}

dataset.set_transform(transform)

# Open question: is it really OK to pass a Hugging Face Dataset straight into
# PyTorch's ConcatDataset...?
multiplied_dataset = ConcatDataset([dataset]*3)
multiplied_dataset

<torch.utils.data.dataset.ConcatDataset at 0x179021fb500>