In [1]:
import transformers
import datasets
import torch
import random
import numpy as np

from dataclasses import dataclass
from typing import Any, Optional, Union
from prepare_dataset import DataCollatorForCausalLM

In [2]:
class MockArgs:
    """Hard-coded argument namespace used by this notebook.

    Presumably mirrors the arguments of the training script so the same
    tokenizer / dataset / collator code can be exercised here (TODO confirm).
    """

    # Debug switch.
    debug = True

    # Local Llama-2-7b-hf snapshot directory; the tokenizer is read from the
    # very same directory as the model.
    model_name_or_path = "/data/users/zhangjunlei/tyx/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9"
    tokenizer_name = model_name_or_path
    use_slow_tokenizer = True

    # Dataset that has already been encoded and saved to disk.
    encoded_datasets_name_or_path = "/data/users/zhangjunlei/tyx/reward-by-prm800k/datasets/encoded-datasets-direct-prediction"

    # Sequence length handed to the collator, and the DataLoader batch size.
    max_seq_length = 4096
    per_device_train_batch_size = 2


args = MockArgs()

def print_dict_of_tensors(d):
    """Print one ``name: shape`` line for every entry of a mapping.

    Args:
        d: Mapping whose values expose a ``.shape`` attribute.
    """
    for name, tensor in d.items():
        print(name, tensor.shape, sep=": ")

In [4]:
@dataclass
class DebugDataCollatorForSeq2Seq:
    """
    Debugging variant of a seq2seq data collator: it dynamically pads the inputs received, as well as the labels,
    and prints the tensor shapes of the first example before padding and of the whole batch after padding.

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]):
            The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
            prepare the *decoder_input_ids*

            This is useful when using *label_smoothing* to avoid calculating loss twice.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
        return_tensors (`str`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: transformers.tokenization_utils_base.PreTrainedTokenizerBase
    model: Optional[Any] = None
    padding: Union[bool, str, transformers.utils.PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    return_tensors: str = "pt"

    def __call__(self, features, return_tensors=None):
        """Pad a list of examples into one batch, printing shapes along the way.

        Args:
            features: List of per-example dicts. Each example's ``"labels"`` entry (if present) is
                replaced in place by its padded tensor version.
            return_tensors: Overrides ``self.return_tensors`` for this call when not ``None``.

        Returns:
            Whatever ``tokenizer.pad`` returns for the padded batch, with ``decoder_input_ids`` added
            when labels are present and the model exposes ``prepare_decoder_input_ids_from_labels``.
        """
        if return_tensors is None:
            return_tensors = self.return_tensors
        labels = [feature["labels"] for feature in features] if "labels" in features[0].keys() else None
        # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
        # same length to return tensors.
        if labels is not None:
            max_label_length = max(len(label) for label in labels)
            if self.pad_to_multiple_of is not None:
                # Round the target length up to the next multiple of `pad_to_multiple_of`.
                max_label_length = (
                    (max_label_length + self.pad_to_multiple_of - 1)
                    // self.pad_to_multiple_of
                    * self.pad_to_multiple_of
                )

            padding_side = self.tokenizer.padding_side
            for feature in features:
                remainder = [self.label_pad_token_id] * (max_label_length - len(feature["labels"]))
                if isinstance(feature["labels"], list):
                    feature["labels"] = (
                        feature["labels"] + remainder if padding_side == "right" else remainder + feature["labels"]
                    )
                elif padding_side == "right":
                    feature["labels"] = np.concatenate([feature["labels"], remainder]).astype(np.int64)
                else:
                    feature["labels"] = np.concatenate([remainder, feature["labels"]]).astype(np.int64)
                # Convert the labels of *every* example to a tensor. Doing this after the loop would
                # only touch the last example and would raise NameError when there are no labels.
                feature["labels"] = torch.tensor(feature["labels"])

        # Debug output: shapes of the first example before `tokenizer.pad`.
        print("features[0]:")
        print_dict_of_tensors(features[0])

        features = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=return_tensors,
        )

        # Debug output: shapes of the whole batch after `tokenizer.pad`.
        print("features:")
        print_dict_of_tensors(features)

        # prepare decoder_input_ids
        if (
            labels is not None
            and self.model is not None
            and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
        ):
            decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(labels=features["labels"])
            features["decoder_input_ids"] = decoder_input_ids

        return features

In [4]:
# model
# No model is instantiated in this notebook; only the tokenizer and collator are exercised.
model = None

# tokenizer
# Prefer the dedicated tokenizer path and fall back to the model path.
tokenizer_source = args.tokenizer_name or args.model_name_or_path
if not tokenizer_source:
    raise ValueError(
        "You are instantiating a new tokenizer from scratch. This is not supported by this script."
        "You can do it from another script, save it, and load it from here, using --tokenizer_name."
    )
tokenizer = transformers.AutoTokenizer.from_pretrained(
    tokenizer_source, use_fast=not args.use_slow_tokenizer
)

# no default pad token for llama!
# here we add all special tokens again, because the default ones are not in the special_tokens_map
if isinstance(tokenizer, transformers.LlamaTokenizer):
    llama_special_tokens = {
        "bos_token": "<s>",
        "eos_token": "</s>",
        "unk_token": "<unk>",
        "pad_token": "<pad>",
    }
    num_added_tokens = tokenizer.add_special_tokens(llama_special_tokens)
    assert num_added_tokens in (
        0,
        1,
    ), "LlamaTokenizer should only add one special token - the pad_token, or no tokens if pad token present."
elif isinstance(tokenizer, transformers.GPTNeoXTokenizerFast):
    # GPT-NeoX tokenizers only get a pad token added.
    num_added_tokens = tokenizer.add_special_tokens({"pad_token": "<pad>"})
    assert (
        num_added_tokens == 1
    ), "GPTNeoXTokenizer should only add one special token - the pad_token."
elif isinstance(tokenizer, transformers.GPT2Tokenizer) and isinstance(model, transformers.OPTForCausalLM):
    # OPT models paired with a GPT-2 tokenizer get an unk token added.
    num_added_tokens = tokenizer.add_special_tokens({"unk_token": "<unk>"})

In [5]:
# debug
# Show the tokenizer class and the input names it declares before they are overridden.
print(type(tokenizer), tokenizer.model_input_names, sep="\n")
# Add "labels" to the tokenizer's declared model inputs
# (presumably so `tokenizer.pad` treats it like the other inputs — TODO confirm).
tokenizer.model_input_names = ["input_ids", "labels", "attention_mask"]

<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>
['input_ids', 'attention_mask']


In [6]:
# Load the pre-encoded dataset from disk when a path is configured; when it is not,
# `lm_datasets` and `train_dataset` are left undefined by this cell.
encoded_datasets_path = args.encoded_datasets_name_or_path
if encoded_datasets_path:
    lm_datasets = datasets.load_from_disk(encoded_datasets_path)
    train_dataset = lm_datasets["train"]

In [7]:
train_dataset = lm_datasets["train"]

# Pick one training example at random (no seed is set, so the choice differs between runs).
last_index = len(train_dataset) - 1
example_index = random.randint(0, last_index)
train_example = train_dataset[example_index]

# Report the shape of every field of the sampled example.
for field_name, field_value in train_example.items():
    print(field_name, field_value.shape)

# Dump the first example of the split in full.
print(train_dataset[0])

input_ids torch.Size([138])
labels torch.Size([138])
attention_mask torch.Size([138])
{'input_ids': tensor([    1,  1128,  1784,  6923,   526,   297, 29871, 29955, 29889, 29947,
         6233, 29973,    13, 29955, 29889, 29947,  6233,   338,   278,  1021,
          408, 29871, 29955,  6233,   322, 29871, 29900, 29889, 29947,  6233,
        29889,    13,  7341, 29892,   322,  1951,   727,   526, 29871, 29953,
        29900,  6923,   297,   263, 11015, 29892,   769,   727,   526, 29871,
        29953, 29900,   334, 29871, 29955,   353, 29871, 29946, 29906, 29900,
         6923,   297, 29871, 29955,  6233, 29889,    13,  2855,  1951,   727,
          526, 29871, 29953, 29900,  6923,   297,   263, 11015, 29892,   769,
          727,   526, 29871, 29953, 29900,   334, 29871, 29900, 29889, 29947,
          353, 29871, 29946, 29947,  6923,   297, 29871, 29900, 29889, 29947,
         6233, 29889,    13,  6295, 29892,   297,  3001, 29892,   727,   526,
        29871, 29946, 29906, 29900,   718,

In [8]:

# Padding strategy handed to the data collator below: pad every batch to the collator's
# `max_length` instead of to the longest sequence in the batch.
padding_strategy = transformers.utils.PaddingStrategy.MAX_LENGTH

# Alternative: pad only up to the longest sequence in each batch.
# padding_strategy = transformers.utils.PaddingStrategy.LONGEST

In [12]:
# Keyword arguments shared by every collator variant, so the variants can be swapped for comparison.
collator_kwargs = dict(
    tokenizer=tokenizer,
    model=model,
    padding=padding_strategy,
    max_length=args.max_seq_length,
)
# data_collator = DebugDataCollatorForSeq2Seq(**collator_kwargs)
# data_collator = transformers.DataCollatorForSeq2Seq(**collator_kwargs)
data_collator = DataCollatorForCausalLM(**collator_kwargs)

# Collate the single sampled example as a batch of one.
collated_example = data_collator([train_example])

# Shapes first, then the full collated batch.
print_dict_of_tensors(collated_example)
print(collated_example)

input_ids: torch.Size([1, 4096])
labels: torch.Size([1, 4096])
attention_mask: torch.Size([1, 4096])
{'input_ids': tensor([[    1,  3118,   714,  ..., 32000, 32000, 32000]]), 'labels': tensor([[-100, -100, -100,  ..., -100, -100, -100]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]])}


In [13]:
# Shuffled training loader; every batch is assembled by `data_collator`.
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=args.per_device_train_batch_size,
    collate_fn=data_collator,
    shuffle=True,
)

In [15]:
# Inspect only the first batch produced by the loader: print its index and the shape of each field.
for step, batch in enumerate(train_dataloader):
    print(f"batch = train_dataloader[{step}]:")
    print_dict_of_tensors(batch)
    break

batch = train_dataloader[0]:
input_ids: torch.Size([2, 4096])
labels: torch.Size([2, 4096])
attention_mask: torch.Size([2, 4096])
