In [1]:
import torch

# 2. From Text to Tokens

### 2.1 Subword Tokenization

##### Example 1

In [9]:
model_ckpt = "gpt2"

Load the `gpt2` tokenizer and tokenize the text `persistence is all you need`

In [10]:
from transformers import AutoTokenizer

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [13]:
output = tokenizer("Persistence is all you need.")

In [14]:
output

{'input_ids': [30946, 13274, 318, 477, 345, 761, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

##### Example 2

In [14]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [38]:
text = "Persistence is all you need. Tokenizing"

In [35]:
encoded_text = tokenizer("Persistence is all you need. Tokenizing")

In [36]:
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)

In [44]:
text

'Persistence is all you need. Tokenizing'

`tokens` is a list of tokens of the `text` from `distilbert-base-uncased`

In [45]:
tokens

['[CLS]',
 'persistence',
 'is',
 'all',
 'you',
 'need',
 '.',
 'token',
 '##izing',
 '[SEP]']

**Question 1**: What is `[CLS]` and `[SEP]`?

These're tokens that indicate the start and the end of the sequence

**Question 2**: What does the prefix `##` in `##izing` mean?

It's means that that the preceding string is not whitespace

**Question 3**: How does the tokenizer differentiate between tokens with and without the `##` prefix when converting tokens to text?

- Tokens with the `##` prefix are subwords that have been combined to form a single token during the tokenization process. When the Tokenizer converts these tokens back to text, it removes the `##` prefix and combines the subwords into a single word.

- Tokens without the `##` prefix are treated as regular words that are present in the model's vocabulary, and they are not modified during the conversion process.

##### Example 3

In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset("DDSC/angry-tweets")

Using custom data configuration DDSC--angry-tweets-37d8dddb9469d99e


Downloading and preparing dataset parquet/DDSC--angry-tweets to /root/.cache/huggingface/datasets/DDSC___parquet/DDSC--angry-tweets-37d8dddb9469d99e/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/120k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/269k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/DDSC___parquet/DDSC--angry-tweets-37d8dddb9469d99e/0.0.0/7328ef7ee03eaf3f86ae40594d46a1cec86161704e02dd19f232d81eee72ade8. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [19]:
batch = dataset['train'][:3]

In [25]:
batch

{'text': ['Et stort tillykke til @USER og vinderne af årets Cavlingpris 💪🏼 [LINK]',
  '@USER Jeg lukkede den faktisk ned inden et møde 😬',
  '@USER Así es, para jugar un partido se requieren dos equipos.'],
 'label': ['positiv', 'neutral', 'neutral']}

Write a function `tokenize` a batch of text as bellow given `tokenizer` is a transformer's tokenizer. Explain each parameters in the function

**Hint**: There's one parameter

In [26]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True)

**Explain**

Because the tokenized text all have the same length => set `padding=True` to add special padding tokens to the end of shorter sequences until they reach the maximum input length

In [27]:
encoded_text = tokenize(batch)

In [28]:
list(map(len, encoded_text.input_ids))

[27, 27, 27]

In [29]:
encoded_text

{'input_ids': [[101, 3802, 2358, 11589, 6229, 15922, 3489, 18681, 1030, 5310, 13958, 19354, 25888, 2063, 21358, 2024, 3215, 6187, 2615, 2989, 18098, 2483, 100, 1031, 4957, 1033, 102], [101, 1030, 5310, 15333, 2290, 11320, 19658, 14728, 7939, 6904, 22462, 6711, 12311, 27427, 2368, 3802, 1049, 16415, 3207, 100, 102, 0, 0, 0, 0, 0, 0], [101, 1030, 5310, 2004, 2072, 9686, 1010, 11498, 26536, 2906, 4895, 2112, 13820, 7367, 2128, 15549, 7869, 2078, 9998, 1041, 15549, 6873, 2015, 1012, 102, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]}

##### Example 3.1

In [15]:
from transformers import AutoTokenizer

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [17]:
output = tokenizer("Persistence is all you need.")

In [19]:
input_ids = output.input_ids

In [29]:
type(tokenizer)

transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast

Convert a list of `input_ids` representing a sequence back into the original text given `tokenizer`

In [30]:
input_ids

[30946, 13274, 318, 477, 345, 761, 13]

In [31]:
tokens = tokenizer.convert_ids_to_tokens(input_ids)

In [32]:
tokens

['Pers', 'istence', 'Ġis', 'Ġall', 'Ġyou', 'Ġneed', '.']

In [33]:
text = tokenizer.convert_tokens_to_string(tokens)

In [34]:
text

'Persistence is all you need.'

##### Example 4

In [111]:
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [115]:
#dataset_encoded['train'][1]

In [133]:
from transformers import AutoModel

In [134]:
model_ckpt = "distilbert-base-uncased"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModel.from_pretrained(model_ckpt).to(device)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [135]:
text = "this is a test"

In [139]:
inputs = tokenizer(text, return_tensors="pt")

In [140]:
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [141]:
inputs = {k:v.to(device) for k,v in inputs.items()}

In [142]:
inputs

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [144]:
with torch.no_grad():
    outputs = model(**inputs)

In [145]:
outputs

BaseModelOutput(last_hidden_state=tensor([[[-0.1565, -0.1862,  0.0528,  ..., -0.1188,  0.0662,  0.5470],
         [-0.3575, -0.6484, -0.0618,  ..., -0.3040,  0.3508,  0.5221],
         [-0.2772, -0.4459,  0.1818,  ..., -0.0948, -0.0076,  0.9958],
         [-0.2841, -0.3917,  0.3753,  ..., -0.2151, -0.1173,  1.0526],
         [ 0.2661, -0.5094, -0.3180,  ..., -0.4203,  0.0144, -0.2149],
         [ 0.9441,  0.0112, -0.4714,  ...,  0.1439, -0.7288, -0.1619]]],
       device='cuda:0'), hidden_states=None, attentions=None)

In [147]:
outputs.last_hidden_state.shape

torch.Size([1, 6, 768])

In [156]:
xs = [[1, 2, 3, 4]]

# 3. Training a Text Classifier

### 3.2 Fine-Tuning Transformers

##### Example 1

In [158]:
from transformers import AutoModelForSequenceClassification

In [159]:
num_labels = 6

In [160]:
model_ckpt = "distilbert-base-uncased"

In [161]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt, num_labels=num_labels
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

In [172]:
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [173]:
from transformers import Trainer, TrainingArguments

In [174]:
batch_size = 64
logging_steps = len(dataset_encoded["train"]) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(output_dir=model_name,
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=True, 
                                  log_level="error")

In [175]:
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.EPOCH,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
jit_mode_eval=False,
label_names=None,
label_smoothing_fac

In [176]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [177]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args, 
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["validation"],
                  tokenizer=tokenizer)
trainer.train();

KeyError: 'validation'