##### Example 1

In [None]:
from datasets import Dataset, load_dataset

In [None]:
# Hub identifier of the dataset that the load_dataset call below streams.
dataset_name = "transformersbook/codeparrot-train"

In [None]:
# Keywords handed to filter_streaming_dataset: a sample is kept when its
# "content" contains at least one of them.
filters = ["pandas", "sklearn", "matplotlib"]

In [None]:
# Name of the dataset split (presumably meant for the load_dataset call — confirm).
split="train"

In [None]:
# Stream the split chosen in the `split` cell instead of repeating the literal,
# so changing that one variable is enough to switch splits.
data = load_dataset(dataset_name, split=split, streaming=True)

Using custom data configuration transformersbook--codeparrot-train-ba60c789679753de


In [None]:
from collections import defaultdict
from tqdm import tqdm

In [None]:
def any_keyword_in_string(string, keywords):
    """Report whether at least one of ``keywords`` occurs in ``string``.

    Each keyword is tested with the ``in`` operator, so for ``str`` inputs this
    is a case-sensitive substring match. An empty ``keywords`` iterable gives
    ``False``.
    """
    return any(kw in string for kw in keywords)

In [None]:
def filter_streaming_dataset(dataset, filters, max_samples=100):
    """Collect the samples of a streamed dataset whose content mentions a keyword.

    Args:
        dataset: Iterable of mapping-like samples; each must provide a
            "content" entry.
        filters: Keywords forwarded to ``any_keyword_in_string``.
        max_samples: Upper bound on how many samples are examined. Defaults to
            100, the cap the loop is meant to apply; pass ``None`` to scan the
            whole stream.

    Returns:
        The result of ``Dataset.from_dict`` on the kept samples, one column per
        sample key.
    """
    filtered_dict = defaultdict(list)
    total = 0
    for sample in tqdm(iter(dataset)):
        # The cap has to end the scan itself. Breaking inside the per-key loop
        # would only cut the append short and leave columns of unequal length.
        if max_samples is not None and total >= max_samples:
            break
        total += 1
        if any_keyword_in_string(sample["content"], filters):
            for k, v in sample.items():
                filtered_dict[k].append(v)
    # Indexing the defaultdict keeps a "content" column even when nothing matched.
    kept = len(filtered_dict["content"])
    # An empty stream (or max_samples <= 0) must not divide by zero.
    ratio = kept / total if total else 0.0
    print(f"{ratio:.2%} of data after filtering.")
    return Dataset.from_dict(filtered_dict)

In [None]:
# Run the keyword filter over the streamed dataset.
filtered_data = filter_streaming_dataset(data, filters)

##### Example 2

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

In [None]:
# Load the distilgpt2 tokenizer and reuse its end-of-sequence token as the
# padding token (tokenize_function below calls it with padding=True).
# NOTE(review): presumably this checkpoint defines no pad token of its own — confirm.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# No split is requested here; later cells select the "train" split by key.
dataset = load_dataset("onestop_english")

Found cached dataset onestop_english (/Users/education/.cache/huggingface/datasets/onestop_english/default/1.1.0/6b19eec5680862ad1cf1990e98b06a98d1fa4c85f3585dc4dfab93f52b89d9cf)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Maximum number of tokens per window; `tokenize` keeps only windows of
# exactly this length.
context_length = 128

In [None]:
# Tokenize the first two training texts, truncating at context_length.
# return_overflowing_tokens=True asks for the tokens beyond max_length as
# additional sequences; the mapping shown in the next cell has several entries
# per input text.
outputs = tokenizer(
    dataset["train"][:2]["text"],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

In [None]:
# For each returned sequence, the index of the input text it came from.
outputs["overflow_to_sample_mapping"]

[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

In [None]:
def tokenize(element):
    """Tokenize ``element["text"]`` into windows and keep only the full ones.

    The module-level ``tokenizer`` is called with truncation at
    ``context_length`` and ``return_overflowing_tokens=True``. Any returned id
    sequence whose length differs from ``context_length`` is discarded.

    Returns:
        dict with a single key, "input_ids", holding the retained sequences.
    """
    encoded = tokenizer(
        element["text"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )
    full_windows = [
        ids for ids in encoded["input_ids"] if len(ids) == context_length
    ]
    return {"input_ids": full_windows}

In [None]:
def tokenize_function(x):
    """Run the shared ``tokenizer`` on ``x["text"]``.

    Padding and truncation are enabled and PyTorch tensors are requested
    (``return_tensors="pt"``). The tokenizer's return value is passed through
    unchanged.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(x["text"], **tokenizer_options)

In [None]:
# Apply `tokenize` to every example.
# NOTE(review): the next cell rebinds tokenized_dataset with
# map(tokenize_function), so this result is not used afterwards.
tokenized_dataset = dataset.map(tokenize)

  0%|          | 0/567 [00:00<?, ?ex/s]

In [None]:
# Tokenize the dataset with tokenize_function. This rebinds tokenized_dataset,
# replacing whatever the name held before.
tokenized_dataset = dataset.map(tokenize_function)

Loading cached processed dataset at /Users/education/.cache/huggingface/datasets/onestop_english/default/1.1.0/6b19eec5680862ad1cf1990e98b06a98d1fa4c85f3585dc4dfab93f52b89d9cf/cache-e341da0daf36ec09.arrow


In [None]:
# Drop the "text" and "label" columns (remove_columns result is rebound to the same name).
tokenized_dataset = tokenized_dataset.remove_columns(["text", "label"])

In [None]:
# First three rows of the train split. The slice supports .keys() (see the
# next cell), i.e. it is a column-keyed mapping rather than a Dataset.
train_dataset = tokenized_dataset["train"][:3]

In [None]:
# concatenated_examples = {
#     k: sum(train_dataset[k], []) for k in train_dataset.keys()
# }

In [None]:
# NOTE(review): the recorded output still lists 'text' and 'label' although an
# earlier cell removes those columns; the output looks stale — re-run to confirm.
train_dataset.keys()

dict_keys(['text', 'label', 'input_ids', 'attention_mask'])

In [None]:
# Number of items per chunk produced by group_texts.
chunk_size = 128


In [None]:
def group_texts(examples):
    """Flatten every column of ``examples`` and re-cut it into fixed-size chunks.

    Each column (a list of lists) is concatenated into one list. The length of
    the first column, rounded down to a multiple of the module-level
    ``chunk_size``, decides how much is kept, so a trailing partial chunk is
    dropped. Every column is then sliced into consecutive ``chunk_size`` pieces.

    Returns:
        dict with the same keys as ``examples`` plus "labels", a shallow copy
        of the chunked "input_ids" list.
    """
    # Join the per-example lists of each column into a single flat list.
    flattened = {}
    for column in examples.keys():
        flattened[column] = sum(examples[column], [])

    # The first column's length, truncated to whole chunks, sets the cut-off.
    first_column = list(examples.keys())[0]
    column_length = len(flattened[first_column])
    usable_length = column_length - column_length % chunk_size

    # Slice every column at the same chunk boundaries.
    chunk_starts = range(0, usable_length, chunk_size)
    result = {}
    for column, tokens in flattened.items():
        result[column] = [
            tokens[start : start + chunk_size] for start in chunk_starts
        ]

    # Labels mirror the inputs.
    result["labels"] = list(result["input_ids"])
    return result

In [None]:
# Re-chunk the train split with group_texts (map is called with its default
# arguments, i.e. without batched=True).
lm_datasets = tokenized_dataset["train"].map(group_texts)

Loading cached processed dataset at /Users/education/.cache/huggingface/datasets/onestop_english/default/1.1.0/6b19eec5680862ad1cf1990e98b06a98d1fa4c85f3585dc4dfab93f52b89d9cf/cache-d849d0c2414e3ff3.arrow


In [None]:
# Last item of the first chunk of the first row in "labels"; compare with the
# same lookup on "input_ids" in the next cell.
lm_datasets["labels"][0][0][-1]

4800

In [None]:
# Same lookup on "input_ids"; group_texts copies input_ids into labels, so the
# two values are expected to match.
lm_datasets["input_ids"][0][0][-1]

4800

##### Example 3

In [None]:
from transformers import DataCollatorForLanguageModeling

In [None]:
# Collator for language modeling; mlm=False turns masked-LM mode off.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Collate the first five training examples into a single batch.
out = data_collator([tokenized_dataset["train"][i] for i in range(5)])

##### Example 4

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

In [None]:
# Hub identifier of the dataset loaded in the next cell.
DEFAULT_TRAINING_DATASET = "databricks/databricks-dolly-15k"

In [None]:
# Load the dataset and keep only its "train" split.
# NOTE(review): this rebinds `dataset`, which Example 2 uses for the
# onestop_english data; cells of Example 2 re-run after this one see this dataset.
dataset = load_dataset(DEFAULT_TRAINING_DATASET)["train"]

Downloading readme:   0%|          | 0.00/7.70k [00:00<?, ?B/s]

Using custom data configuration databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08


Downloading and preparing dataset json/databricks--databricks-dolly-15k to /Users/education/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /Users/education/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-6e0f9ea7eaa0ee08/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Preamble substituted into the {intro} slot at the top of the prompt template.
INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."

In [None]:
# Section markers substituted into the prompt template
# (PROMPT_WITH_INPUT_FORMAT) ahead of each part of a record.
INSTRUCTION_KEY = "### Instruction:"
INPUT_KEY = "Input:"
RESPONSE_KEY = "### Response:"
END_KEY = "### End"

In [None]:
# Prompt template for one record. The fixed parts (intro and section markers)
# are filled in here; {instruction}, {input} and {response} are substituted
# with themselves so they survive as placeholders for a later .format() call.
PROMPT_WITH_INPUT_FORMAT = """{intro}

{instruction_key}
{instruction}

{input_key}
{input}

{response_key}
{response}

{end_key}""".format(
    intro=INTRO_BLURB,
    instruction_key=INSTRUCTION_KEY,
    instruction="{instruction}",
    input_key=INPUT_KEY,
    input="{input}",
    response_key=RESPONSE_KEY,
    response="{response}",
    end_key=END_KEY,
)

In [None]:
def _add_text(x):
    instruction = rec

In [None]:
# Display the dataset summary (features and row count).
dataset

Dataset({
    features: ['instruction', 'context', 'response', 'category'],
    num_rows: 15011
})