## Data preprocessing for token classification

#### Download and extract the two columns we need from the conll dataset

In [5]:
from datasets import load_dataset

# Load every split of CoNLL-2003 and show the original schema.
raw_datasets = load_dataset("conll2003")
print(raw_datasets)
# Keep only the word list and the NER tags, renamed to "words" / "labels".
raw_datasets = raw_datasets.remove_columns(["chunk_tags", "id", "pos_tags"])
raw_datasets = raw_datasets.rename_columns({"tokens": "words", "ner_tags":"labels"})
print(raw_datasets["train"])
# First training example: one integer label per word.
print(raw_datasets["train"][0]["words"], raw_datasets["train"][0]["labels"])

# The label feature carries the tag names; a label id is an index into this list.
label_names = raw_datasets["train"].features["labels"].feature.names
print(label_names)

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})
Dataset({
    features: ['words', 'labels'],
    num_rows: 14041
})
['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.'] [3, 0, 7, 0, 0, 0, 7, 0, 0]
['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


#### B-XXX labels are used at the beginning of an entity while I-XXX labels are used for the following words
#### To tokenize text already split into words, we just need to set ***is_split_into_words=True***

In [6]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used for the token-classification examples.
model_ckpt = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
 
# The input is already a list of words, so tell the tokenizer not to split it again.
inputs = tokenizer(raw_datasets["train"][0]["words"], is_split_into_words=True)
# Show the resulting tokens (special tokens and sub-word pieces included).
inputs.tokens()



['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

#### Once tokenized, we get more inputs than labels, so we need to do the alignment
<img src="image/data_preprocessing2.png" width=800>

#### The tokenizer provides word IDs, which we use to match each token with the label of its word
<img src="image/data_preprocessing1.png" width=800>

In [13]:
def shift_label(label):
    """Return the label to use for a continuation token of the same word.

    Odd label ids are bumped to the next id; even ids are returned unchanged.
    With the label list printed earlier in this notebook, the odd ids are the
    B-XXX tags and the id right after each one is the matching I-XXX tag.
    """
    return label + 1 if label % 2 == 1 else label

def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: label id for each word, indexed by word id.
        word_ids: for each token, the id of the word it came from, or None.

    Returns:
        A list with one entry per token: -100 where the word id is None, the
        word's own label for the first token of a word, and the shifted label
        (see shift_label) for the following tokens of that same word.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # No source word for this token: give it the -100 label.
            aligned.append(-100)
            continue

        label = labels[word_id]
        if word_id == previous_word:
            # Same word as the previous token: continuation piece.
            label = shift_label(label)
        else:
            previous_word = word_id
        aligned.append(label)
    return aligned

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align their labels.

    Args:
        examples: batch with a "words" column (list of word lists) and a
            "labels" column (list of per-word label lists).

    Returns:
        The tokenizer output for the batch, with "labels" set to the
        token-level labels produced by align_labels_with_tokens.
    """
    encoded = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    # word_ids(row) gives the word id of every token in that batch row.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["labels"])
    ]
    return encoded

# Apply the tokenization/alignment to every split, one batch at a time.
# No columns are removed here, so "words" stays next to the new tokenizer columns.
tokenized_datasets =raw_datasets.map(tokenize_and_align_labels, batched=True)
# Print the datasets before and after mapping to compare their columns.
print(raw_datasets, "\n", tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['words', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['words', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['words', 'labels'],
        num_rows: 3453
    })
}) 
 DatasetDict({
    train: Dataset({
        features: ['words', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['words', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['words', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3453
    })
})


#### The last step is batching the inputs together; the inputs need to be padded to the same length
#### It can be done by a data collator designed for token classification

In [15]:
from transformers import DataCollatorForTokenClassification

# Collator used to batch the token-classification examples, with padding enabled.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

## Data preprocessing for masked language modeling

#### The dataset should be set in a format with just one column of texts, we will need to batch them together

In [16]:
from datasets import load_dataset

# Load the raw WikiText-2 configuration; the output below shows a single "text" column.
raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
raw_datasets["train"]

Dataset({
    features: ['text'],
    num_rows: 36718
})

#### The first way to group sentences is just to pad or truncate the texts to the length we set

In [19]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load IMDB and drop the label column so only the "text" column is left.
raw_datasets = load_dataset("imdb")
raw_datasets = raw_datasets.remove_columns("label")

# Tokenizer checkpoint and context length shared by the following cells.
model_ckpt = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
context_length = 128

def tokenzie_pad_and_truncate(text):
    """Tokenize a batch of texts to exactly context_length tokens each.

    Args:
        text: batch with a "text" column.

    Returns:
        The tokenizer output, truncated and padded to context_length.
    """
    batch_texts = text["text"]
    return tokenizer(
        batch_texts,
        max_length=context_length,
        padding="max_length",
        truncation=True,
    )

# Tokenize every split in batches; the "text" column is kept next to the new columns.
tokenized_datasets = raw_datasets.map(tokenzie_pad_and_truncate, batched=True)
print(tokenized_datasets)

Map: 100%|██████████| 25000/25000 [00:02<00:00, 9848.36 examples/s] 

DatasetDict({
    train: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})





#### However, if the text is very long, a lot of it is lost in truncation
#### Instead, we can generate several blocks of context length from that very long text by asking the tokenizer to return the overflowing tokens with ***return_overflowing_tokens=True***

In [20]:
def tokenize_and_chunk(texts):
    """Tokenize a batch of texts, keeping the overflow as extra chunks.

    Args:
        texts: batch with a "text" column.

    Returns:
        The tokenizer output, truncated to context_length with overflowing
        tokens returned as well (so the output can have more rows than the
        input batch, as the lengths printed after this cell show).
    """
    chunking_options = {
        "truncation": True,
        "max_length": context_length,
        "return_overflowing_tokens": True,
    }
    return tokenizer(texts["text"], **chunking_options)

# "text" is removed because the mapped output has a different number of rows
# than the input batch, so the original column can no longer line up with it.
tokenized_datasets = raw_datasets.map(
    tokenize_and_chunk, batched=True, remove_columns="text"
)

# Compare the number of texts with the number of chunks produced.
len(raw_datasets["train"]), len(tokenized_datasets["train"])

Map: 100%|██████████| 25000/25000 [00:04<00:00, 5444.33 examples/s]
Map: 100%|██████████| 25000/25000 [00:04<00:00, 5764.58 examples/s]
Map: 100%|██████████| 50000/50000 [00:08<00:00, 5813.22 examples/s]


(25000, 76257)

#### If your texts have various lengths, one option is to concatenate them all, then take chunks of context length
<img src="image/data_preprocessing4.png" width=800>

In [23]:
def tokenize_and_chunk(texts):
    """Concatenate the tokenized texts of a batch and cut them into chunks.

    Each text is tokenized without truncation, its ids are appended to one
    long list followed by a separator id, and that list is then sliced into
    consecutive pieces of context_length ids.

    Args:
        texts: batch with a "text" column.

    Returns:
        {"input_ids": chunks}, where every chunk has context_length ids except
        possibly the last one, which holds whatever is left over.
    """
    # Tokenizers that define no EOS token (BERT-style checkpoints such as the
    # one loaded above) report eos_token_id as None. Appending that None would
    # put a non-integer into input_ids, so fall back to the SEP id instead.
    separator_id = tokenizer.eos_token_id
    if separator_id is None:
        separator_id = tokenizer.sep_token_id

    all_input_ids = []
    for input_ids in tokenizer(texts["text"])["input_ids"]:
        all_input_ids.extend(input_ids)
        # Skip the separator entirely if the tokenizer offers neither id.
        if separator_id is not None:
            all_input_ids.append(separator_id)

    chunks = [
        all_input_ids[idx:idx + context_length]
        for idx in range(0, len(all_input_ids), context_length)
    ]
    return {"input_ids": chunks}

# The chunked output has its own row count, so the original "text" column is dropped.
tokenized_datasets = raw_datasets.map(tokenize_and_chunk, batched=True, remove_columns=["text"])
# Compare the number of texts with the number of chunks produced.
len(raw_datasets["train"]), len(tokenized_datasets["train"])

Map: 100%|██████████| 25000/25000 [00:02<00:00, 9945.00 examples/s] 


(25000, 63257)

#### The masking in itself is done by the data collator designed for language modeling

In [24]:
from transformers import DataCollatorForLanguageModeling

# Collator that performs the masking at batch time; mlm_probability is set to 0.15.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

## Data preprocessing for translation

#### Download and extract the two columns we need from the kde4 dataset

In [6]:
from datasets import load_dataset, load_metric

# Load the English/French pair of KDE4 (train split only, so this is a single Dataset).
raw_datasets = load_dataset("kde4", lang1="en", lang2="fr", trust_remote_code=True, split="train")
print(raw_datasets)

def extract_language(examples):
    """Split a batch of translation pairs into source and target lists.

    Args:
        examples: batch with a "translation" column whose items are dicts
            holding an "en" and a "fr" entry.

    Returns:
        {"inputs": [...], "targets": [...]} with the "en" texts as inputs and
        the "fr" texts as targets, in batch order.
    """
    inputs, targets = [], []
    for pair in examples["translation"]:
        inputs.append(pair["en"])
        targets.append(pair["fr"])
    return {"inputs": inputs, "targets": targets}

# Replace the original "id" / "translation" columns with flat "inputs" / "targets" columns.
raw_datasets = raw_datasets.map(extract_language, batched=True, remove_columns=["id", "translation"])
print(raw_datasets)

Dataset({
    features: ['id', 'translation'],
    num_rows: 210173
})
Dataset({
    features: ['inputs', 'targets'],
    num_rows: 210173
})


#### We need to tokenize both the inputs and the targets. Because the inputs are in English and the targets are in French, we have to tell the tokenizer when it is processing targets by wrapping that call in ***with tokenizer.as_target_tokenizer():***

In [16]:
from transformers import AutoTokenizer

# English-to-French translation checkpoint; its tokenizer handles both languages.
model_ckpt = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize one sample: the source text normally, the target text inside the
# target-tokenizer context.
sample = raw_datasets[12]
inputs = tokenizer(sample["inputs"])
with tokenizer.as_target_tokenizer():
    targets = tokenizer(sample["targets"])

# Show the tokens of both sides for comparison.
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]), tokenizer.convert_ids_to_tokens(targets["input_ids"]))

['▁Using', '▁the', '▁B', 'abel', 'fish', '▁plugin', '</s>'] ['▁Utilisation', '▁du', '▁module', '▁externe', '▁B', 'abel', 'f', 'ish', '</s>']




In [17]:
# Truncation limits (in tokens) read by preprocess_function for the source and target texts.
max_input_length = 128
max_target_length = 128

def preprocess_function(example):
    """Tokenize a batch of translation pairs into model inputs and labels.

    Args:
        example: batch with an "inputs" column (source texts) and a "targets"
            column (target texts).

    Returns:
        The tokenizer output for the source texts, with "labels" set to the
        input ids of the target texts. Sources are truncated to
        max_input_length and targets to max_target_length.
    """
    # NOTE(review): recent transformers releases appear to deprecate
    # as_target_tokenizer() in favour of the text_target= argument - confirm
    # against the installed version before migrating.
    source_encoding = tokenizer(
        example["inputs"], truncation=True, max_length=max_input_length
    )
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(
            example["targets"], truncation=True, max_length=max_target_length
        )

    source_encoding["labels"] = target_encoding["input_ids"]
    return source_encoding

# Tokenize the whole dataset in batches and drop the raw text columns.
tokenized_datasets = raw_datasets.map(
    preprocess_function, batched=True, remove_columns=["inputs", "targets"]
)
print(tokenized_datasets)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 210173
})


#### As the maximum length of the input and target may be different, we need to pad the input and target separately
#### The input is padded with the pad token, and the target with the -100 index to make sure the padded positions are not taken into account in the loss computation
<img src="image/data_preprocessing6.png" width=800>

#### It can be implemented by using the ***DataCollatorForSeq2Seq***

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# The collator is handed the model, but no earlier cell of this notebook
# defines `model`. Load the seq2seq model for the same checkpoint as the
# tokenizer so this cell runs on its own and the two stay matched.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Preprocessing dataset for summarization

#### Download and extract the two columns we need from the xsum dataset

In [19]:
from datasets import load_dataset, load_metric

# Load the XSum train split and drop "id", leaving "document" and "summary".
raw_datasets = load_dataset("xsum", split="train")
raw_datasets = raw_datasets.remove_columns(["id"])
# Show the schema followed by one example.
print(raw_datasets, "\n", raw_datasets[1])

Dataset({
    features: ['document', 'summary'],
    num_rows: 204045
}) 
 {'document': 'A fire alarm went off at the Holiday Inn in Hope Street at about 04:20 BST on Saturday and guests were asked to leave the hotel.\nAs they gathered outside they saw the two buses, parked side-by-side in the car park, engulfed by flames.\nOne of the tour groups is from Germany, the other from China and Taiwan. It was their first night in Northern Ireland.\nThe driver of one of the buses said many of the passengers had left personal belongings on board and these had been destroyed.\nBoth groups have organised replacement coaches and will begin their tour of the north coast later than they had planned.\nPolice have appealed for information about the attack.\nInsp David Gibson said: "It appears as though the fire started under one of the buses before spreading to the second.\n"While the exact cause is still under investigation, it is thought that the fire was started deliberately."', 'summary': 'Two tou

#### Tokenize both inputs and targets
#### As the special tokens we add might be different for the inputs and the targets, ***with tokenizer.as_target_tokenizer():*** needs to be used for the targets

In [21]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used for the summarization examples.
model_ckpt = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize one sample: the document normally, the summary inside the
# target-tokenizer context.
sample = raw_datasets[1]
inputs = tokenizer(sample["document"])
with tokenizer.as_target_tokenizer():
    targets = tokenizer(sample["summary"])

# Show the tokens of both sides for comparison.
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]), "\n", tokenizer.convert_ids_to_tokens(targets["input_ids"]))


['▁A', '▁fire', '▁alarm', '▁went', '▁off', '▁at', '▁the', '▁Holiday', '▁In', 'n', '▁in', '▁Hope', '▁Street', '▁at', '▁about', '▁04', ':', '20', '▁B', 'ST', '▁on', '▁Saturday', '▁and', '▁guests', '▁were', '▁asked', '▁to', '▁leave', '▁the', '▁hotel', '.', '▁As', '▁they', '▁', 'gathered', '▁outside', '▁they', '▁saw', '▁the', '▁two', '▁buses', ',', '▁', 'parked', '▁side', '-', 'by', '-', 'side', '▁in', '▁the', '▁car', '▁park', ',', '▁', 'en', 'gul', 'fed', '▁by', '▁flame', 's', '.', '▁One', '▁of', '▁the', '▁tour', '▁groups', '▁is', '▁from', '▁Germany', ',', '▁the', '▁other', '▁from', '▁China', '▁and', '▁Taiwan', '.', '▁It', '▁was', '▁their', '▁first', '▁night', '▁in', '▁Northern', '▁Ireland', '.', '▁The', '▁driver', '▁of', '▁one', '▁of', '▁the', '▁buses', '▁said', '▁many', '▁of', '▁the', '▁passengers', '▁had', '▁left', '▁personal', '▁belonging', 's', '▁on', '▁board', '▁and', '▁these', '▁had', '▁been', '▁destroyed', '.', '▁Both', '▁groups', '▁have', '▁organised', '▁replacement', '▁coaches',



#### Processing the whole dataset

In [24]:
# Truncation limits (in tokens) read by preprocess_function: documents get a
# much larger budget than summaries.
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of documents and summaries into inputs and labels.

    Args:
        examples: batch with a "document" column and a "summary" column.

    Returns:
        The tokenizer output for the documents, with "labels" set to the
        input ids of the summaries. Documents are truncated to
        max_input_length and summaries to max_target_length.
    """
    # NOTE(review): recent transformers releases appear to deprecate
    # as_target_tokenizer() in favour of the text_target= argument - confirm
    # against the installed version before migrating.
    document_encoding = tokenizer(
        examples["document"], truncation=True, max_length=max_input_length
    )
    with tokenizer.as_target_tokenizer():
        summary_encoding = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True
        )

    document_encoding["labels"] = summary_encoding["input_ids"]
    return document_encoding

# Tokenize the whole dataset in batches and drop the raw text columns.
tokenized_datasets = raw_datasets.map(
    preprocess_function, batched=True, remove_columns=["document", "summary"]
)
print(tokenized_datasets)

Map: 100%|██████████| 204045/204045 [00:54<00:00, 3775.38 examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 204045
})





#### We pad the inputs and targets separately, as their maximum lengths are completely different
#### The input is padded with the pad token and the target is padded with the -100 index to make sure the padded positions are not taken into account in the loss computation
<img src="image/data_preprocessing5.png" width=800>

#### It can be accomplished by using the ***DataCollatorForSeq2Seq***

In [26]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# The collator is handed the model, but no cell of this notebook defines
# `model`. Load the seq2seq model for the same checkpoint as the tokenizer so
# the collator is not paired with a model left over from another section.
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Data preprocessing for causal language modeling
#### The inputs also serve as the labels in causal language modeling

#### We need to chunk the input sequence into context size pieces
<img src="image/data_preprocessing7.png" width=800>

In [29]:
def tokenize(element):
    """Tokenize a batch into chunks and keep only the full-length ones.

    Args:
        element: batch with a "content" column.

    Returns:
        {"input_ids": [...]} holding the chunks whose reported length equals
        context_length; shorter chunks are dropped.
    """
    encoded = tokenizer(
        element["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    full_chunks = [
        chunk_ids
        for n_tokens, chunk_ids in zip(encoded["length"], encoded["input_ids"])
        if n_tokens == context_length
    ]
    return {"input_ids": full_chunks}

# All original columns are removed because the chunked output has its own row count.
# NOTE(review): tokenize() reads a "content" column, but no cell in this section
# loads a dataset with that column - confirm which dataset this is meant to run on.
tokenized_datasets = raw_datasets.map(tokenize, batched=True, remove_columns=raw_datasets.column_names)

#### When dealing with long contexts or short sequences we should concatenate first, as short sequences would otherwise be discarded for being shorter than the context length
#### First tokenize each sample without truncation, then concatenate the tokenized samples with an EOS token in between
<img src="image/data_preprocessing8.png" width=800>

#### Aligning labels is handled inside the model and we can just pass the inputs

In [None]:
# Illustrative snippet: the same ids are passed as both inputs and labels, and
# the loss is read from the model output.
# NOTE(review): neither `model` nor `batch` is defined in the cells shown in
# this notebook - they must be created before this cell can run.
output = model(input_ids=batch["input_ids"], labels=batch["input_ids"])
loss = output.loss

## Data preprocessing for question answering

#### Download and extract the four columns we need from the SQuAD dataset

In [61]:
from datasets import load_dataset

# Load the SQuAD train split and keep only context, question and answers.
raw_datasets = load_dataset("squad", split="train")
raw_datasets = raw_datasets.remove_columns(["id", "title"])
print(raw_datasets, "\n", raw_datasets.features)
# "answers" holds parallel lists: the answer texts and their start offsets in the context.
print(raw_datasets[0]["answers"])

def prepare_data(example):
    """Add the character span of the first answer to an example.

    Args:
        example: a single example whose "answers" entry holds a "text" list
            and an "answer_start" list.

    Returns:
        The same example object, with "answer_start" set to the first start
        offset and "answer_end" set to that offset plus the length of the
        first answer text.
    """
    answers = example["answers"]
    first_text = answers["text"][0]
    first_start = answers["answer_start"][0]

    example["answer_start"] = first_start
    example["answer_end"] = first_start + len(first_text)
    return example

# Replace the nested "answers" column with flat "answer_start" / "answer_end" columns.
raw_datasets = raw_datasets.map(prepare_data, remove_columns=["answers"])
print(raw_datasets)
# Sanity check: slicing the context with the stored span should give back the answer text.
print(f"Context: {raw_datasets[0]['context']}\nQuestion: {raw_datasets[0]['question']}\nAnswer: {raw_datasets[0]['context'][raw_datasets[0]['answer_start']:raw_datasets[0]['answer_end']]}")

Dataset({
    features: ['context', 'question', 'answers'],
    num_rows: 87599
}) 
 {'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}
{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}
Dataset({
    features: ['context', 'question', 'answer_start', 'answer_end'],
    num_rows: 87599
})
Context: Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to

#### When the context is very long it may be truncated by the tokenizer, instead, we create several features for different pieces of the context
#### But to avoid truncating the answer, we allow some overlap
<img src="image/data_preprocessing10.png" width=800>

#### This is done automatically by the tokenizer if we use ***return_overflowing_tokens=True*** together with a ***stride***

In [71]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used for the question-answering examples.
model_ckpt = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenize one question/context pair. The question is passed first and the
# context second; 'only_second' restricts truncation to the context.
# Overflowing tokens are returned as extra features, with stride=128 set for
# the overlap described above, and the offset mapping is requested as well.
example = raw_datasets[0]
inputs = tokenizer(
    example["question"],
    example["context"],
    truncation='only_second',
    padding="max_length",
    max_length=384,
    stride=128,
    return_overflowing_tokens=True,
    return_offsets_mapping=True
)
# Number of offset pairs in the first feature (equals max_length because of the padding).
print(len(inputs["offset_mapping"][0]))

384


#### Once we have tokenized our inputs, we assign labels like this
<img src="image/data_preprocessing11.png" width=800>

#### When context does not contain the answer, we set the labels to the CLS token index

In [101]:
# Inspect what the tokenizer returned for the example.
print(inputs.keys())
# Sequence id of every token in the first feature (the values are printed below).
print(inputs.sequence_ids(0))
# Offset pairs of every token, one list per feature.
print(inputs["offset_mapping"])
# For each feature, the index of the example it was produced from.
print(inputs["overflow_to_sample_mapping"])


def find_labels(offsets, answer_start, answer_end, sequence_ids):
    """Locate the answer span, as token indices, inside one tokenized feature.

    Args:
        offsets: per-token (start_char, end_char) pairs for the feature; the
            pairs of the context tokens are compared with the answer span.
        answer_start: character index in the context where the answer starts.
        answer_end: character index in the context just past the answer.
        sequence_ids: per-token sequence id; tokens whose id is 1 are the
            context tokens, everything else is skipped.

    Returns:
        (start_position, end_position). start_position is the index of the
        first answer token and end_position is one past the last answer token,
        so input_ids[start_position:end_position] covers the answer. (0, 0) is
        returned when the feature has no context tokens or when the answer is
        not fully contained in this feature's context.
    """
    n_tokens = len(sequence_ids)

    # Find the contiguous run of context tokens (sequence id 1). Both scans are
    # bounded so a feature without context tokens, or one whose context runs
    # to the very end of the list, cannot index past the end.
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == n_tokens:
        return (0, 0)
    context_start = idx
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # The answer must lie entirely inside this feature's context: it may not
    # start before the first context character nor end after the last one.
    if offsets[context_start][0] > answer_start or offsets[context_end][1] < answer_end:
        return (0, 0)

    # Walk forward to the last context token that starts at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= answer_start:
        idx += 1
    start_position = idx - 1

    # Walk backward past every context token that starts at or after the answer
    # end; the token after the stopping point is the exclusive end index.
    idx = context_end
    while idx >= context_start and offsets[idx][0] >= answer_end:
        idx -= 1
    end_position = idx + 1
    return start_position, end_position
    
# Find the answer's token span in the first feature of the example.
start, end = find_labels(
    # offset_mapping holds one list of offsets per feature (overflow can create
    # several features), so index 0 selects the first feature
    inputs["offset_mapping"][0],
    example["answer_start"],
    example["answer_end"],
    inputs.sequence_ids(0)
)
# Decode the tokens in [start, end) to check that they spell out the answer.
tokenizer.decode(inputs["input_ids"][0][start: end])


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping'])
[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, No

'Saint Bernadette Soubirous'

#### preprocessing the dataset

In [103]:
def preprocess_training_examples(examples):
    """Tokenize a batch of question/context pairs and label the answer spans.

    Args:
        examples: batch with "question", "context", "answer_start" and
            "answer_end" columns.

    Returns:
        The tokenizer output (one row per feature, so possibly more rows than
        examples), without "offset_mapping" and "overflow_to_sample_mapping",
        and with "start_positions" / "end_positions" set to the values
        returned by find_labels for each feature.
    """
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    # Both mappings are only needed to compute the labels, so take them out of
    # the returned features.
    feature_offsets = encoded.pop("offset_mapping")
    feature_to_sample = encoded.pop("overflow_to_sample_mapping")

    # NOTE(review): find_labels returns an end index one past the last answer
    # token; confirm the downstream model/loss expects that convention.
    start_positions, end_positions = [], []
    for feature_idx, feature_offset in enumerate(feature_offsets):
        # Each feature inherits the answer span of the example it came from.
        source_idx = feature_to_sample[feature_idx]
        span_start, span_end = find_labels(
            feature_offset,
            examples["answer_start"][source_idx],
            examples["answer_end"][source_idx],
            encoded.sequence_ids(feature_idx),
        )
        start_positions.append(span_start)
        end_positions.append(span_end)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded

# Build the training features in batches. All original columns are removed
# because one example can yield several features, so the row counts differ.
tokenized_datasets = raw_datasets.map(preprocess_training_examples,
                                      batched=True,
                                      remove_columns=raw_datasets.column_names)
print(tokenized_datasets)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 88729
})
