In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import numpy as np
import matplotlib.pyplot as plt

# 1. Transfer Learning
## 1-1. Feature Extraction
## 1-2. Fine-Tuning with PyTorch

In [None]:
# [1] Freeze weights
# import dataset (comes with colab!)
data = np.loadtxt(open('sample_data/mnist_train_small.csv','rb'),delimiter=',')

# extract labels (number IDs) and remove from data
labels = data[:,0]
data   = data[:,1:]

# normalize the data to a range of [0 1]
dataNorm = data / np.max(data)

In [None]:
# Step 1: convert to tensor
dataT   = torch.tensor( dataNorm ).float()
labelsT = torch.tensor( labels ).long()

# Step 2: use scikitlearn to split the data
train_data,test_data, train_labels,test_labels = train_test_split(dataT, labelsT, test_size=.1)

# Step 3: convert into PyTorch Datasets
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)

# Step 4: translate into dataloader objects
batchsize    = 32
train_loader = DataLoader(train_data,batch_size=batchsize,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=test_data.tensors[0].shape[0])

In [None]:
def createTheMNISTNet():

  class mnistNet(nn.Module):
    def __init__(self):
      super().__init__()

      ### input layer
      self.input = nn.Linear(784,64)

      ### hidden layer
      self.fc1 = nn.Linear(64,32)
      self.fc2 = nn.Linear(32,32)

      ### output layer
      self.output = nn.Linear(32,10)

    # forward pass
    def forward(self,x):
      x = F.relu( self.input(x) )
      x = F.relu( self.fc1(x) )
      x = F.relu( self.fc2(x) )
      return self.output(x)

  # create the model instance
  net = mnistNet()

  # loss function
  lossfun = nn.CrossEntropyLoss()

  # optimizer (using SGD to slow down learning!)
  optimizer = torch.optim.SGD(net.parameters(),lr=.001)

  return net,lossfun,optimizer

In [None]:
# inspect the "learning toggle" of a layer
N = createTheMNISTNet()[0]
N.fc1.weight.requires_grad

In [None]:
N = createTheMNISTNet()[0]

# switch off all layers except input
for p in N.named_parameters():
  if 'input' not in p[0]:
    p[1].requires_grad = False


# check what we've done
for p in N.named_parameters():
  print('Requires_grad status in layer %s: %s' %(p[0],p[1].requires_grad))

In [None]:
def function2trainTheModel(net,lossfun,optimizer):

  # number of epochs
  numepochs = 100

  # initialize losses
  losses    = torch.zeros(numepochs)
  trainAcc  = []
  testAcc   = []


  # loop over epochs
  for epochi in range(numepochs):




    # NEW: switch off learning in all-but-output layers during first 1/2 of training
    if epochi<(numepochs/2):
      for p in net.named_parameters():
        if 'output' not in p[0]:
          p[1].requires_grad = False
    else:
      for p in net.named_parameters():
        p[1].requires_grad = True





    # loop over training data batches
    net.train()
    batchAcc  = []
    batchLoss = []
    for X,y in train_loader:

      # forward pass and loss
      yHat = net(X)
      loss = lossfun(yHat,y)

      # backprop
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # loss from this batch
      batchLoss.append(loss.item())

      # compute accuracy
      matches = torch.argmax(yHat,axis=1) == y     # booleans (false/true)
      matchesNumeric = matches.float()             # convert to numbers (0/1)
      accuracyPct = 100*torch.mean(matchesNumeric) # average and x100
      batchAcc.append( accuracyPct )               # add to list of accuracies
    # end of batch loop...

    # now that we've trained through the batches, get their average training accuracy
    trainAcc.append( np.mean(batchAcc) )

    # and get average losses across the batches
    losses[epochi] = np.mean(batchLoss)

    # test accuracy
    net.eval()
    X,y = next(iter(test_loader)) # extract X,y from test dataloader
    with torch.no_grad(): # deactivates autograd
      yHat = net(X)

    # compare the following really long line of code to the training accuracy lines
    testAcc.append( 100*torch.mean((torch.argmax(yHat,axis=1)==y).float()) )
  # end epochs

  # function output
  return trainAcc,testAcc,losses,net

In [None]:
# create the network
net,lossfun,optimizer = createTheMNISTNet()

# train the model
trainAcc,testAcc,losses,net = function2trainTheModel(net,lossfun,optimizer)

plt.plot(trainAcc,label='Train')
plt.plot(testAcc,label='Test')
plt.plot([len(trainAcc)/2, len(trainAcc)/2],[10,80],'k--',label='Learning switched on')
plt.legend()
plt.show()

# 2. Pre-Trained Transformer Models
Transformer models can be grouped into three categories:
- **Auto-Regressive:** GPT-like for **Natural Language Generation (NLG)** tasks.
- **Auto-Encoding:** BERT-like for **Natural Language Understanding (NLU)**, or **Natural Langauage Inference (NLI)** tasks.
- **Seq2Seq:** BART & T5-like for generative tasks that require an input, such as translation, summarization, or generative question answering.

**Pre-Training** is the act of training a model from scratch. The weights are randomly initialized, and the training starts without any prior knowledge.

The `transformers` library aims to provide a single application programming interface (API) through which any transformer model can be loaded, trained & saved. Its main features are ease of use, flexibility & simplicity.
- [Supported Models & Frameworks](https://huggingface.co/docs/transformers/index#supported-models-and-frameworks)
## 2-1. Pipelines
1. `transformers.pipeline(task, model, config, tokenizer, feature_extractor, image_processor, framework, revision, use_fast, token, device, device_map, torch_dtype, trust_remote_code, model_kwargs, kwargs)`: Returns an end-to-end object that performs a natural language processing task on one or several texts.
    - Commonly used natural language processing pipelines inlcuding `feature-extraction`, `fill-mask`, `ner` (name entity recognition), `question anwsering`, `sentiment analysis`, `summarization`, `text-generation`, `translation` & `zero-shot-classification`.
    - [Complete List of Supported Tasks](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.pipeline.task)

In [None]:
#!pip3 install tf-keras
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
type(classifier)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


transformers.pipelines.text_classification.TextClassificationPipeline

In [None]:
# Output is a dictionary
classifier("This is such a great movie!")

[{'label': 'POSITIVE', 'score': 0.9998759031295776}]

In [None]:
classifier(["I've been waiting for a HuggingFace course my whole life.", "I hate this so much!"])

[{'label': 'POSITIVE', 'score': 0.9598048329353333},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

In [None]:
classifier = pipeline("zero-shot-classification")
classifier("This is a course about the Transformers library.", candidate_labels=["education", "politics", "business"])

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'sequence': 'This is a course about the Transformers library.',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.8719879388809204, 0.09406529366970062, 0.03394680842757225]}

In [None]:
generator = pipeline("text-generation")
generator("In this course, we will teach you how to")

No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to take control of your body by eliminating the effects of bad eating, body odor, and bad exercise. Learn to clean your body and gain control by avoiding unhealthy foods and reducing toxins from your body.'}]

In [None]:
generator = pipeline("text-generation", model="distilgpt2")
generator("In this course, we will teach you how to", max_length=30, num_return_sequences=2)

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this course, we will teach you how to make your own vegan-friendly breakfast foods with a minimal effort.\n\n\n\n\nHow to'},
 {'generated_text': 'In this course, we will teach you how to build the software yourself – by using the language for your use on the Windows XP 7 PCs, Vista'}]

In [None]:
# The `fill-mask` pipeline will predict missing words in a sentence
unmasker = pipeline("fill-mask")
unmasker("This course will teach you all about <mask> models.", top_k=2)

No model was supplied, defaulted to distilbert/distilroberta-base and revision ec58a5b (https://huggingface.co/distilbert/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'score': 0.19198447465896606,
  'token': 30412,
  'token_str': ' mathematical',
  'sequence': 'This course will teach you all about mathematical models.'},
 {'score': 0.04209218546748161,
  'token': 38163,
  'token_str': ' computational',
  'sequence': 'This course will teach you all about computational models.'}]

In [None]:
ner = pipeline("ner", grouped_entities=True)
ner("My name is Joe and I work at exalted AI in New York.")

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'PER',
  'score': 0.9989868,
  'word': 'Joe',
  'start': 11,
  'end': 14},
 {'entity_group': 'ORG',
  'score': 0.9953941,
  'word': 'AI',
  'start': 37,
  'end': 39},
 {'entity_group': 'LOC',
  'score': 0.9991089,
  'word': 'New York',
  'start': 43,
  'end': 51}]

In [None]:
question_answerer = pipeline("question-answering")
question_answerer(question="Where do I work?", context="My name is Joe and I work at Exalted AI in New York.")

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


{'score': 0.30933716893196106,
 'start': 29,
 'end': 51,
 'answer': 'exalted AI in New York'}

In [None]:
summarizer = pipeline("summarization")
summarizer(
    """
    America has changed dramatically during recent years. Not only has the number of
    graduates in traditional engineering disciplines such as mechanical, civil,
    electrical, chemical, and aeronautical engineering declined, but in most of
    the premier American universities engineering curricula now concentrate on
    and encourage largely the study of engineering science. As a result, there
    are declining offerings in engineering subjects dealing with infrastructure,
    the environment, and related issues, and greater concentration on high
    technology subjects, largely supporting increasingly complex scientific
    developments. While the latter is important, it should not be at the expense
    of more traditional engineering.

    Rapidly developing economies such as China and India, as well as other
    industrial countries in Europe and Asia, continue to encourage and advance
    the teaching of engineering. Both China and India, respectively, graduate
    six and eight times as many traditional engineers as does the United States.
    Other industrial countries at minimum maintain their output, while America
    suffers an increasingly serious decline in the number of engineering graduates
    and a lack of well-educated engineers.
    """
)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'summary_text': ' America has changed dramatically during recent years . The number of engineering graduates in the U.S. has declined in traditional engineering disciplines such as mechanical, civil, electrical, chemical, and aeronautical engineering . Rapidly developing economies such as China and India, as well as other industrial countries in Europe and Asia, continue to encourage and advance engineering .'}]

In [None]:
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en")
translator("Ce cours est produit par Hugging Face.")

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'translation_text': 'This course is produced by Hugging Face.'}]

### 2-1-1. Tokenizers
There is a fast state-of-the-art `tokenizers` Rust library, developed & recommended by Hugging Face to replace the Python implementation, focusing on performance & versatility.

A **Tokenizer** is responsible for the following missions:
- Subword tokenize & convert token strings to integers (indexes).
- Add new tokens to the vocabulary in a way that is independent of the underlying structure, such as BPE & SentencePiece.
- Manage special tokens, like mask & beginning-of-sentence. Add & assign them to attributes.

Auto tokenizer class:
1. `transformers.AutoTokenizer`: Generic tokenizer class.
    - `from_pretrained(pretrained_model_name_or_path, *inputs, config, cache_dir, force_download, proxies, revision, subfolder, use_fast=True, tokenizer_type, trust_remote_code=False, **kwargs)`: Uses a fast Rust-based tokenizer if `use_fast` is `True`.

Tokenizer classes:

2. `transformers.PreTrainedTokenizer(model_max_length, padding_side, truncation_side, chat_template, model_input_names, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, additional_special_tokens, clean_up_tokenization_spaces=True, split_special_tokens=False)`: The base tokenizer class.
3. `transformers.*Tokenizer(model_max_length, padding_side, truncation_side, chat_template, model_input_names, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, additional_special_tokens, clean_up_tokenization_spaces=True, split_special_tokens=False)`

- All tokenizer classes inherit the following attributes & methods:
    - `__call__(text=None, text_pair=None, text_target=None, text_pair_target=None, add_special_tokens=True, padding=False, truncation=False, max_length=None, stride=0, is_split_into_words=False, pad_to_multiple_of=None, padding_side=None, return_tensors=None, return_token_type_ids=None, return_attention_mask=None, return_overflowing_tokens=False, return_special_tokens_mask=False, return_offsets_mapping=False, return_length=False, verbose=True, **kwargs)`
        - `padding` accepts values among `True` or `longest`, `max_length`, and `False` or `do_not_pad`.
        - The `truncation` argument can be set as `True` or `longest_first`, `only_first`, `only_second`, and `False` or `do_not_truncate`. If truncation is activated, long sequences will be truncated to specified `max_length` or to the maximum acceptable input length for the model if not provided.
        - `return_tensors` can be `tf` (`tensorflow.constant`), `pt` (`torch.Tensor`) or `np` (`numpy.ndarray`).
    - `padding_side`: `right` or `left`.
    - `truncation_side`: Also `right` or `left`.
    - `tokenize(text, **kwargs)`
    - `convert_tokens_to_ids(tokens)`
    - `encode(text, text_pair=None, add_special_tokens=True, padding=False, truncation=None, max_length=None, stride=0, padding_side=None, return_tensors=None, **kwargs)`: Same as calling `tokenize` & `convert_tokens_to_ids` in sequence.
    - `decode(token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs)`: Similar to doing `convert_ids_to_tokens` and then combining tokens to a string.
    - `convert_ids_to_tokens(ids, skip_special_tokens=False)`

In [None]:
# `AutoTokenizer`
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
print(inputs)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
print(tokens)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [None]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [None]:
decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)

Using a transformer network is simple


In [None]:
# `*Tokenizer()`
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

### 2-1-2. Models
Generic auto model classes:
1. `transformers.AutoModel`: The vector output by the transformer module generally has three dimension: $(batch\_size, sequence\_length, hidden\_size)$ which, respectively, means the number of sequences processed at a time, the length of the numerical represention of the sequence, and the vector dimension of each model input.
2. `transformers.AutoModelForPreTraining`: The generic pretraining class for instantiating a model with a pretraining head. Also has `from_config` & `from_pretrained` methods.

- All auto classes have the following methods:
    - `from_config(config)`
    - `from_pretrained(pretrained_model_name_or_path, *model_args, config, state_dict, cache_dir, from_tf, force_download, proxies, output_loading_info, local_files_only, revision, trust_remote_code, code_revision, kwargs)`

Model classes:

3. `transformers.PreTrainedModel(config, *inputs, **kwargs)`: Base class for all models.
4. `transformers.*Model(config, *inputs, **kwargs)`

- All model classes implement the following attributes & methods:
    - `from_pretrained(pretrained_model_name_or_path, *model_args, config=None, cache_dir=None, ignore_mismatched_sizes=False, force_download=False, local_files_only=False, token=None, revision="main", use_safetensors=None, **kwargs)`: `config` can be either an instance of a class derived from `transformers.PreTrainedConfig` or a path valid as input. `**kwargs` are remaining keyword arguments that can be used to update the configuration object & initiate the model.
    - `save_pretrained(save_directory, is_main_process=True, state_dict=None, save_function, push_to_hub=False, max_shard_size='5GB', safe_serialization=True, variant, token, save_peft_format=True, **kwargs)`: Outputs a configuration file & a state dictionary containing all model's weights.

Configuration classes:

5. `transformers.PreTrainedConfig(name_or_path="", output_hidden_states=False, output_attentions=False, return_dict=True, is_encoder_decoder=False, is_decoder=False, cross_attention_hidden_size=None, add_cross_attention=False, tie_encoder_decoder=False, prune_heads={}, architectures=None, funetuning_task=None, id2label=None, label2id=None, num_labels=None, task_specific_params=None, problem_type=None, tokenizer_class=None, prefix=None, bos_token_id=None, pad_token_id=None, eos_token_id=None, decoder_start_token_id=None, sep_token_id=None, torchscript=False, tie_word_embeddings=True, torch_dtype=None, use_bfloat16=False, tf_legacy_loss=False)`: The base class implementing the common methods for loading & saving configurations.
    - For fine-tuning tasks, `problem_type` can be one of `regression`, `single_label_classification` or `multi_label_classification`.
7. `transformers.*Config`

In [None]:
# `AutoModel()`
from transformers import AutoModel

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)

# High-dimensional vector
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

torch.Size([2, 16, 768])


In [None]:
# `*Model()`
# Model is randomly initialized
from transformers import BertConfig, BertModel

# Build the config
config = BertConfig()
print(config)

# Build the model from the config
model = BertModel(config)
print(model)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(


In [None]:
# Load pretrained weights
model = BertModel.from_pretrained("bert-base-cased")
print(model)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# This will create `model.safetensors` & `config.json` in the `models` folder
model.save_pretrained("models")

### 2-1-3. Heads
The **Model Heads** take the high-dimensional vector of hidden states as input and project them onto a different dimension. Different tasks could have been performed with the same architecture, but each of these tasks will have a different head associated with it. For example, for a model with a sequence classification head, we will not actually use the `AutoModel` class but `AutoModelForSequenceClassification`.

In [None]:
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([2, 2])


### 2-1-4. Post Processing

In [None]:
print(outputs.logits)

tensor([[-1.5607,  1.6123],
        [ 4.1692, -3.3464]], grad_fn=<AddmmBackward0>)


In [None]:
import torch

predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [None]:
print(model.config.id2label)

{0: 'NEGATIVE', 1: 'POSITIVE'}


## 2-2. Batching
**Batching** is the act of sending multiple sentences through the model all at once. The following are the points to handle multiple sequences:
- Models in `transformers` expect a batch of inputs (multiple sentences) by default.
- Padding makes sure all our sentences have the same length by adding a special word called the **Padding Token** to the sentences with fewer values.
- Attention masks are tensors with the exact same shape as the input `ids` tensor, filled with `0`s and `1`s: `1`s indicate the corresponding tokens should be attended to, and `0`s indicate the corresponding tokens should not be attended to (should be ignored by the attention layers of the model).
- To solve the problem of very long sequences, apply **Truncation** or use a model with a longer supported sequence length.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequence = "I've been waiting for a HuggingFace course my whole life."

tokenized_inputs = tokenizer(sequence, return_tensors="pt")
print(tokenized_inputs["input_ids"])

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)

# Add a dimension on top of `ids` because `transformers` models expect multiple sentences by default
input_ids = torch.tensor([ids])
print("Input IDs:", input_ids)

output = model(input_ids)
print("Logits:", output.logits)

tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]])
Input IDs: tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
Logits: tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [None]:
# The following list of lists cannot be converted to a tensor
# batched_ids = [
#     [200, 200, 200],
#     [200, 200]
# ]

sequence1_ids = [[200, 200, 200]]
sequence2_ids = [[200, 200]]
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

print(model(torch.tensor(sequence1_ids)).logits)
print(model(torch.tensor(sequence2_ids)).logits)
print(model(torch.tensor(batched_ids)).logits)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[ 1.5694, -1.3895]], grad_fn=<AddmmBackward0>)
tensor([[ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)
tensor([[ 1.5694, -1.3895],
        [ 1.3374, -1.2163]], grad_fn=<AddmmBackward0>)


In [None]:
# Attention masks
batched_ids = [
    [200, 200, 200],
    [200, 200, tokenizer.pad_token_id],
]

attention_mask = [
    [1, 1, 1],
    [1, 1, 0],
]

outputs = model(torch.tensor(batched_ids), attention_mask=torch.tensor(attention_mask))
print(outputs.logits)

tensor([[ 1.5694, -1.3895],
        [ 0.5803, -0.4125]], grad_fn=<AddmmBackward0>)


In [None]:
# Special tokens
sequence = "I've been waiting for a HuggingFace course my whole life."

model_inputs = tokenizer(sequence)
print(model_inputs["input_ids"])

tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

print(tokenizer.decode(model_inputs["input_ids"]))
print(tokenizer.decode(ids))

[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102]
[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
[CLS] i've been waiting for a huggingface course my whole life. [SEP]
i've been waiting for a huggingface course my whole life.


In [None]:
# From tokenizer to model
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Padding, truncation
tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## 2-3. Fine-Tuning with Transformers
### 2-3-1. Data Collators

Typical preprocessing steps after loading data in `datasets.dataset_dict.DatasetDict` from the `datasets` library including:
- Apply a tokenization function on all the datasets at once. The `datasets` library will add new feature fields to them. Usually use the `datasets.dataset_dict.DatasetDict.map` method.
- Define a **Collate Function** which is an argument passed to build a `torch.utils.data.DataLoader`, by default converting your samples to `torch.Tensor` & concatenating them recursively, but for natural language processing (NLP) tasks, more operations are usually required. For example, the inputs are usually not of the same size so that we may have to perform **Dynamic Padding** on each batch.

1. `transformers.DataCollatorWithPadding(tokenizer, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')`: A data collator dynamically pads the inputs received. Takes a `transformers.PreTrainedTokenizer` to know which padding token to use & whether the model expects pre padding or post padding.  

Fine-tune BERT for semantic textual similarity with the [MRPC (Microsoft Research Paraphrase Corpus)](https://gluebenchmark.com/tasks) dataset from the [GLUE (General Language Understanding Evaluation) Benchmark](https://gluebenchmark.com/).

In [None]:
# Train the model on two sentences for example
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

batch["labels"] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
print(raw_datasets)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


In [None]:
raw_train_dataset = raw_datasets["train"]
print("A Single Example of the Raw Training Set:")
print(raw_train_dataset[0])
print()
print("The Features Attribute of the Raw Training Set:")
print(raw_train_dataset.features)
# `sentence1` & `sentence2` are features in next sentence prediction

A Single Example of the Raw Training Set:
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}

The Features Attribute of the Raw Training Set:
{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}


In [None]:
# Data preprocessing
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Tokenize only one feature
tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
# print(tokenized_sentences_1)
# Return a overwhelming dictionary with keys containing `input_ids`, `attention_mask` & `token_type_ids`, and their values that are lists of list

In [None]:
# Input `sentence1` & `sentence2`
inputs = tokenizer("This is the first sentence.", "This is the second one.")
print("Tokenized Inputs after Inputting Sentence 1 & Sentence 2:")
print(inputs)

Tokenized Inputs after Inputting Sentence 1 & Sentence 2:
{'input_ids': [101, 2023, 2003, 1996, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
# Model expects the inputs to be the form `[CLS] sentence1 [SEP] sentence2 [SEP]`
print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))
# Aligned with 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

['[CLS]', 'this', 'is', 'the', 'first', 'sentence', '.', '[SEP]', 'this', 'is', 'the', 'second', 'one', '.', '[SEP]']


In [None]:
# Only works if you have enough RAM to store your whole dataset during the tokenization
tokenized_dataset = tokenizer(raw_datasets["train"]["sentence1"], raw_datasets["train"]["sentence2"], padding=True, truncation=True)
# print(tokenized_dataset)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
# Better approach that keeps the data as a `Dataset`
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# `batched=True` to speed up the tokenization
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [None]:
# Dynamic padding
from transformers import DataCollatorWithPadding

# `collate_fn` responsible for putting together samples inside a batch as a parameter of `torch.utils.data.DataLoader`
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(data_collator)

2024-10-02 00:23:16.322194: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-02 00:23:16.437840: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-02 00:23:16.484108: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-02 00:23:16.498814: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-02 00:23:16.580612: I tensorflow/core/platform/cpu_feature_guar

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')


In [None]:
samples = tokenized_datasets["train"][:8]
# Remove the `idx`, `sentence1` & `sentence2` columns
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
print("Lengths of Each Entry in the Batch:", [len(x) for x in samples["input_ids"]])
# Get samples of varying length

Lengths of Each Entry in the Batch: [50, 59, 47, 67, 59, 50, 62, 32]


In [None]:
# Inspect the samples after applying the collate function
batch = data_collator(samples)
print({k: v.shape for k, v in batch.items()})
# All padded to the maximum length inside the batch

{'input_ids': torch.Size([8, 67]), 'token_type_ids': torch.Size([8, 67]), 'attention_mask': torch.Size([8, 67]), 'labels': torch.Size([8])}


### 2-3-2. Trainer

1. `transformers.Trainer(model=None, args=None, data_collator=None, train_dataset=None, eval_dataset=None, tokenizer=None, model_init=None, compute_metrics=None, callbacks=None, optimizers=(None, None), preprocess_logits_for_metrics=None)`: `args` defaults to an instance of `TrainingArguments`. `data_collator` defaults to `transformers.DataCollatorWithPadding` if no `tokenizer` is provided. Set `evaluation_strategy` to either `steps` (evaluate every `eval_steps`) or `epoch` (evaluate at the end of each epoch). You can provide a `compute_metrics` function to calculate a metric during said evaluation, otherwise the evaluation would just print the loss which is not a very intuitive number.

In [None]:
# Wrap up all the processes before
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")
print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.NO,
eval_use_gather_object=False,
evaluation_str

In [None]:
# Train with almost everything set by default
from transformers import AutoModelForSequenceClassification, Trainer

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.5525
1000,0.3384


TrainOutput(global_step=1377, training_loss=0.3770664190155229, metrics={'train_runtime': 78.5738, 'train_samples_per_second': 140.047, 'train_steps_per_second': 17.525, 'total_flos': 405114969714960.0, 'train_loss': 0.3770664190155229, 'epoch': 3.0})

In [None]:
predictions = trainer.predict(tokenized_datasets["validation"])
# `predictions` is a named tuple with `predictions`, `label_ids` & `metrics` three fields
print("predictions Shape:", predictions.predictions.shape)
print("label_ids Shape:", predictions.label_ids.shape)
print("metrics:", predictions.metrics)

predictions Shape: (408, 2)
label_ids Shape: (408,)
metrics: {'test_loss': 0.6020950078964233, 'test_runtime': 0.6222, 'test_samples_per_second': 655.688, 'test_steps_per_second': 81.961}


In [None]:
import numpy as np
import evaluate

# Take the index with the maximum value on the second axis
preds = np.argmax(predictions.predictions, axis=-1)

# Load the dataset-specific metrics
metric = evaluate.load("glue", "mrpc")
print(metric.compute(predictions=preds, references=predictions.label_ids))

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.8578431372549019, 'f1': 0.8993055555555556}


In [None]:
# Train with the `compute_metrics` parameter set
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.422424,0.781863,0.854812
2,0.552500,0.362014,0.852941,0.891697
3,0.338400,0.602095,0.857843,0.899306


TrainOutput(global_step=1377, training_loss=0.3770664190155229, metrics={'train_runtime': 85.7439, 'train_samples_per_second': 128.336, 'train_steps_per_second': 16.059, 'total_flos': 405114969714960.0, 'train_loss': 0.3770664190155229, 'epoch': 3.0})

### 2-3-3. Learning Rate Schedulers

Without using the `transformers.Trainer` class, we still can achieve the same results with PyTorch as below.

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Remove the columns corresponding to values the model does not expect
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
# Rename the `label` column to `labels`
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Set the format of datasets so that they return `torch.Tensor` instead of lists
tokenized_datasets.set_format("torch")
print(tokenized_datasets["train"].column_names)



Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

['labels', 'input_ids', 'token_type_ids', 'attention_mask']


In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

for batch in train_dataloader:
    break
print({k: v.shape for k, v in batch.items()})

{'labels': torch.Size([8]), 'input_ids': torch.Size([8, 81]), 'token_type_ids': torch.Size([8, 81]), 'attention_mask': torch.Size([8, 81])}


In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# All transformer models will return the loss when `labels` are provided
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor(0.9363, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [None]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
print(optimizer)

AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    lr: 5e-05
    weight_decay: 0.0
)




In [None]:
# The learning rate scheduler used by default is a linear decay
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

1377


In [None]:
import torch
from tqdm.auto import tqdm

# Device agnostic code
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# The training oop
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

In [None]:
import evaluate

# The evaluation loop
metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
metric.compute()

{'accuracy': 0.8259803921568627, 'f1': 0.8777969018932874}

In [None]:
%%writefile train.py

# Enable distributed training with `accelerate`
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AdamW, AutoModelForSequenceClassification, get_scheduler
from accelerate import Accelerator
from tqdm import tqdm

# Preprocess data
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Remove the columns corresponding to values the model does not expect
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
# Rename the `label` column to `labels`
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# Set the format of datasets so that they return `torch.Tensor` instead of lists
tokenized_datasets.set_format("torch")

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# Training
accelerator = Accelerator()

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=3e-5)

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model.to(device)
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # loss.backward()
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

Writing train.py


In [None]:
!accelerate config default --help

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


usage: accelerate config default [-h]
                                                    [--config_file SAVE_LOCATION]
                                                    [--mixed_precision {no,fp16,bf16}]

options:
  -h, --help            show this help message and exit
  --config_file SAVE_LOCATION, --config-file SAVE_LOCATION
                        The path to use to store the config file. Will default
                        to a file named default_config.yaml in the cache
                        location, which is the content of the environment
                        `HF_HOME` suffixed with 'accelerate', or if you don't
                        have such an environment variable, your cache
                        directory ('~/.cache' or the content of
                        `XDG_CACHE_HOME`) suffixed with 'huggingface'.
  --mixed_precision {no,fp16,bf16}, --mixed-precision {no,fp16,bf16}
                        Whether or not to use mixed precision training. Choose
           

In [None]:
!accelerate config default

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Configuration already exists at /home/yungshun317/.cache/huggingface/accelerate/default_config.yaml, will not override. Run `accelerate config` manually or pass a different `save_location`.


In [None]:
with open("/home/yungshun317/.cache/huggingface/accelerate/default_config.yaml") as f:
    print(f.read())

{
  "compute_environment": "LOCAL_MACHINE",
  "debug": false,
  "distributed_type": "NO",
  "downcast_bf16": false,
  "enable_cpu_affinity": false,
  "machine_rank": 0,
  "main_training_function": "main",
  "mixed_precision": "no",
  "num_machines": 1,
  "num_processes": 1,
  "rdzv_backend": "static",
  "same_network": false,
  "tpu_use_cluster": false,
  "tpu_use_sudo": false,
  "use_cpu": false
}



In [None]:
!accelerate launch train.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2024-10-02 03:31:05.991312: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-02 03:31:05.998033: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-02 03:31:06.005568: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-02 03:31:06.007852: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-02 03:31:06.013439: I tensorflow/core/platform/cpu_feature_guar

# 3. Datasets

Main classes:

1. `datasets.arrow_dataset.Dataset`: The base class is backed by an Apache Arrow table.
2. `datasets.dataset_dict.DatasetDict`: A dictionary with split names as keys & `datasets.arrow_dataset.Dataset` objects as values. Also has `datasets.arrow_dataset.Dataset`'s transform methods to process all the splits at once.

## 3-1. Load
1. `datasets.load_dataset()`

[SQuAD-it](https://github.com/crux82/squad-it/)

In [None]:
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz --directory-prefix=datasets/squad_it
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz --directory-prefix=datasets/squad_it
!gzip -dkv datasets/squad_it/SQuAD_it-*.json.gz

--2024-10-02 13:01:26--  https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/crux82/squad-it/master/SQuAD_it-train.json.gz [following]
--2024-10-02 13:01:26--  https://raw.githubusercontent.com/crux82/squad-it/master/SQuAD_it-train.json.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7725286 (7.4M) [application/octet-stream]
Saving to: ‘datasets/squad_it/SQuAD_it-train.json.gz’


2024-10-02 13:01:27 (9.82 MB/s) - ‘datasets/squad_it/SQuAD_it-train.json.gz’ saved [7725286/7725286]

--2024-10-02 13:01:27--  https://github.com/cru

In [None]:
# Load a local dataset
from datasets import load_dataset

squad_it_dataset = load_dataset("json", data_files="datasets/squad_it/SQuAD_it-train.json", field="data")
print(squad_it_dataset)
# View one of the examples by indexing into the `train` split
# print(squad_it_dataset["train"][0])

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
})


In [None]:
# Provide a dictionary to the `data_files` arguments
data_files = {"train": "datasets/squad_it/SQuAD_it-train.json", "test": "datasets/squad_it/SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
print(squad_it_dataset)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})


In [None]:
# Load a remote dataset
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
print(squad_it_dataset)

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})


## 3-2. Data Wrangling
The `datasets` library provides functions to manipulate the contents of `datasets.dataset_dict.DatasetDict` & `datasets.arrow_dataset.Dataset` objects.

1. `shuffle(seed=None, generator=None, keep_in_memory=False, load_from_cache_file=None, indices_cache_file_name=None, writer_batch_size=1000, new_fingerprint=None)`:
2. `select(indices, keep_in_memory=False, indices_cache_file_name=None, writer_batch_size=1000, new_fingerprint=None)`:
3. `unique()`:
4. `sort(column_names, reverse=False, null_placement="at_end", keep_in_memory=False, load_from_cache_file=None, indices_cache_file_name=None, writer_batch_size=1000, new_fingerprint=None)`:
5. `map(function=None, with_indices=False, with_rank=False, input_columns=None, batched=False, batch_size=1000, drop_last_batch=False, remove_columns=None, keep_in_memory=False, load_from_cache_file=None, cache_file_names=None, writer_batch_size=1000, features=None, disable_nullable=False, fn_kwargs=None, num_proc=None, desc=None)`: Applies a function to all the elements in the table, individually or in batches, and updates the table if function does updated examples. The transformation is applied to all the datasets of the dataset dictionary. Setting `batched` to `True` & `num_proc`, maximum number of processes when generating cache, could achieve a massive performance overhaul.
7. `filter()`:

Python `html` module:

7. `html.unescape()`: Unescapes HTML character codes.

[Drug Reviews](https://archive.ics.uci.edu/dataset/462/drug+review+dataset+drugs+com) from UC Irvine Machine Learning Repository.

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip" --directory-prefix=datasets/drug_reviews
!unzip datasets/drug_reviews/drugsCom_raw.zip -d datasets/drug_reviews

--2024-10-02 19:53:44--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘datasets/drug_reviews/drugsCom_raw.zip’

drugsCom_raw.zip        [           <=>      ]  41.00M  2.66MB/s    in 19s     

2024-10-02 19:54:03 (2.21 MB/s) - ‘datasets/drug_reviews/drugsCom_raw.zip’ saved [42989872]

Archive:  datasets/drug_reviews/drugsCom_raw.zip
  inflating: datasets/drug_reviews/drugsComTest_raw.tsv  
  inflating: datasets/drug_reviews/drugsComTrain_raw.tsv  


In [None]:
from datasets import load_dataset

data_files = {"train": "datasets/drug_reviews/drugsComTrain_raw.tsv", "test": "datasets/drug_reviews/drugsComTest_raw.tsv"}
# Separated by tab character
drug_dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
print(drug_dataset)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})


In [None]:
# Grab a small random sample for data analysis
drug_sample = drug_dataset["train"].shuffle(seed=42).select(range(1000))
# Peek at the first few examples
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [None]:
for split in drug_dataset.keys():
    # Verify the number of `ids` matches the number of rows in each split
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [None]:
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
print(drug_dataset)

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})


In [None]:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

# Some of the entries in the `condition` column are `None`, which cannot be lowercased as they are not strings
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)
drug_dataset = drug_dataset.map(lowercase_condition)

# Check that lowercasing worked
print(drug_dataset["train"]["condition"][:3])

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

['left ventricular dysfunction', 'adhd', 'birth control']


In [None]:
def compute_review_length(example):
    return {"review_length": len(example["review"].split())}

drug_dataset = drug_dataset.map(compute_review_length)
# Inspect the first training example
print(drug_dataset["train"][0])

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'patient_id': 206461, 'drugName': 'Valsartan', 'condition': 'left ventricular dysfunction', 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"', 'rating': 9.0, 'date': 'May 20, 2012', 'usefulCount': 27, 'review_length': 17}


In [None]:
print(drug_dataset["train"].sort("review_length")[:3])

{'patient_id': [111469, 13653, 53602], 'drugName': ['Ledipasvir / sofosbuvir', 'Amphetamine / dextroamphetamine', 'Alesse'], 'condition': ['hepatitis c', 'adhd', 'birth control'], 'review': ['"Headache"', '"Great"', '"Awesome"'], 'rating': [10.0, 10.0, 10.0], 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'], 'usefulCount': [41, 3, 0], 'review_length': [1, 1, 1]}


In [None]:
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset.num_rows)

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'train': 138514, 'test': 46108}


In [None]:
# `html.unescape()`
import html

# Unescape HTML characters in the corpus
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})
print(drug_dataset)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})


### 3-2-1. Truncation & Overflowing Tokens
The mismatched length problem caused by applying truncation & setting `return_overflowing_tokens=True` in tokenizers can be dealt with the following ways:
- Remove the original columns by setting the `remove_columns` argument in the `datasets.dataset_dict.DatasetDict.map` method.
- Make the original columns the same size as the new ones. This can be achieved by taking advantage of the `overflow_to_sample_mappings` field the tokenizer returns. We can associate each key present in the original dataset with a list of values of the right size by repeating the values of each example as many times as it generates new features.

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Set `max_length=128` for truncation & `return_overflowing_tokens=True` to tokenize long reviews into more than one example
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

# tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
# ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 1463
# Because `batch_size=1000`

result = tokenize_and_split(drug_dataset["train"][0])
print("Example with Overflowing Tokens:")
print(result)
print("Overflowing Example Size after Tokenization:", [len(inp) for inp in result["input_ids"]])
# The first example became two features because it was tokenized to more than the maximum number of tokens we specified

result = tokenize_and_split(drug_dataset["train"][2])
print("\nExample without Overflowing Tokens:")
print(result)
print("Common Example Size after Tokenization:", [len(inp) for inp in result["input_ids"]])
# The example shorter than `max_length` will be only one dimension

Example with Overflowing Tokens:
{'input_ids': [[101, 107, 1422, 1488, 1110, 9079, 1194, 1117, 2223, 1989, 1104, 1130, 19972, 11083, 119, 1284, 1245, 4264, 1165, 1119, 1310, 1142, 1314, 1989, 117, 1165, 1119, 1408, 1781, 1103, 2439, 13753, 1119, 1209, 1129, 1113, 119, 1370, 1160, 1552, 117, 1119, 1180, 6374, 1243, 1149, 1104, 1908, 117, 1108, 1304, 172, 14687, 1183, 117, 1105, 7362, 1111, 2212, 129, 2005, 1113, 170, 2797, 1313, 1121, 1278, 12020, 113, 1304, 5283, 1111, 1140, 119, 114, 146, 1270, 1117, 3995, 1113, 6356, 2106, 1105, 1131, 1163, 1106, 6166, 1122, 1149, 170, 1374, 1552, 119, 3969, 1293, 1119, 1225, 1120, 1278, 117, 1105, 1114, 2033, 1146, 1107, 1103, 2106, 119, 1109, 1314, 1160, 1552, 1138, 1151, 2463, 1714, 119, 1124, 1110, 150, 21986, 3048, 1167, 5340, 1895, 1190, 1518, 102], [101, 119, 1124, 1110, 1750, 6438, 113, 170, 1363, 1645, 114, 117, 1750, 172, 14687, 1183, 119, 1124, 1110, 11566, 1155, 1103, 1614, 1119, 1431, 119, 8007, 1117, 4658, 1110, 1618, 119, 1284, 1138, 1

In [None]:
# Remove the columns from the old dataset
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)
print("Tokenized Dataset Size:", len(tokenized_dataset["train"]))
print("Original Dataset Size:", len(drug_dataset["train"]))

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Tokenized Dataset Size: 206772
Original Dataset Size: 138514


In [None]:
print("Tokenized Dataset Column Names:")
print(tokenized_dataset["train"].column_names)

# If not remove them, their lengths will be different from the lengths of new fields after tokenization
print("\nOriginal Dataset Column Names:")
print(drug_dataset["train"].column_names)

Tokenized Dataset Column Names:
['input_ids', 'token_type_ids', 'attention_mask', 'overflow_to_sample_mapping']

Original Dataset Column Names:
['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length']


In [None]:
# `overflow_to_sample_mapping` gives us a mapping from a new feature index to the index of the sample it originated from
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    # Update the original columns to use new keys & values
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
print(tokenized_dataset)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})


### 3-2-2. Conversion

1. `set_format(type=None, columns=one, output_all_columns=False, **format_kwargs)`: Enables the conversion between various third-party libraries like `pandas`. This function only changes the output format of `datasets.dataset_dict.DatasetDict` or `datasets.arrow_dataset.Dataset` so that you can easily switch to another format without affecting the underlying data format. Output type can be selected in `[None, 'numpy', 'torch', 'tensorflow', 'pandas', 'arrow', 'jax']`.

In [None]:
# Conversion between `pandas` & `datasets`
drug_dataset.set_format("pandas")
drug_dataset["train"][:5]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89
3,35696,Buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,124
4,155963,Cialis,benign prostatic hyperplasia,"""2nd day on 5mg started to work with rock hard...",2.0,"November 28, 2015",43,68


In [None]:
type(drug_dataset)

datasets.dataset_dict.DatasetDict

In [None]:
train_df = drug_dataset["train"][:]
train_df.head()

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89
3,35696,Buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,124
4,155963,Cialis,benign prostatic hyperplasia,"""2nd day on 5mg started to work with rock hard...",2.0,"November 28, 2015",43,68


In [None]:
type(train_df)

pandas.core.frame.DataFrame

In [None]:
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})
)
frequencies.head()

Unnamed: 0,frequency,count
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [None]:
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
print(freq_dataset)

Dataset({
    features: ['frequency', 'count'],
    num_rows: 819
})


In [None]:
drug_dataset.reset_format()
print(drug_dataset)

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})


### 3-2-3. Validation
1. `train_test_split()`

In [None]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
print(drug_dataset_clean)

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})


In [None]:
drug_dataset_clean.save_to_disk("datasets/drug_reviews_splits")

Saving the dataset (0/1 shards):   0%|          | 0/110811 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/27703 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/46108 [00:00<?, ? examples/s]

In [None]:
# Visualize a directory tree
from pathlib import Path
from itertools import islice

space = '    '
branch = '│   '
tee = '├── '
last = '└── '

def tree(dir_path: Path, level: int=-1, limit_to_directories: bool=False,
         length_limit: int=1000):
    """Given a directory Path object print a visual tree structure"""
    dir_path = Path(dir_path) # accept string coerceable to Path
    files = 0
    directories = 0
    def inner(dir_path: Path, prefix: str='', level=-1):
        nonlocal files, directories
        if not level:
            return # 0, stop iterating
        if limit_to_directories:
            contents = [d for d in dir_path.iterdir() if d.is_dir()]
        else:
            contents = list(dir_path.iterdir())
        pointers = [tee] * (len(contents) - 1) + [last]
        for pointer, path in zip(pointers, contents):
            if path.is_dir():
                yield prefix + pointer + path.name
                directories += 1
                extension = branch if pointer == tee else space
                yield from inner(path, prefix=prefix+extension, level=level-1)
            elif not limit_to_directories:
                yield prefix + pointer + path.name
                files += 1
    print(dir_path.name)
    iterator = inner(dir_path, level=level)
    for line in islice(iterator, length_limit):
        print(line)
    if next(iterator, None):
        print(f'... length_limit, {length_limit}, reached, counted:')
    print(f'\n{directories} directories' + (f', {files} files' if files else ''))

tree(Path.home() / "workspace/py/torch-nlp/datasets/drug_reviews_splits")

drug_reviews_splits
├── train
│   ├── data-00000-of-00001.arrow
│   ├── dataset_info.json
│   └── state.json
├── test
│   ├── data-00000-of-00001.arrow
│   ├── dataset_info.json
│   └── state.json
├── validation
│   ├── data-00000-of-00001.arrow
│   ├── dataset_info.json
│   └── state.json
└── dataset_dict.json

3 directories, 10 files


In [None]:
from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("datasets/drug_reviews_splits")
print(drug_dataset_reloaded)

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})


## 3-3. Memory Mapping & Streaming
**Memory Mapping** allows programs to work with datasets larger than physical RAM while also allowing multiple programs running concurrently to believe they have access to the entire memory space or more of the machine by on-disk cache for fast lookup.

Popular open source datasets for fine-tuning large language models (LLMs).
- The [Wikimedia Wikipedia](https://huggingface.co/datasets/wikimedia/wikipedia) dataset is built from the [Wikipedia Dumps](https://dumps.wikimedia.org/) with one subset per language, each containing a single train split.
- [The Pile: An 800GB Dataset of Diverse Text for Language Modeling](https://arxiv.org/abs/2101.00027)

In [1]:
from datasets import load_dataset
import os, psutil, timeit

mem_before = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
wiki = load_dataset("wikipedia", "20220301.en", split='train')
mem_after = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
print(f"RAM memory used: {(mem_after - mem_before)} MB")

Downloading data:   0%|          | 0/41 [00:00<?, ?files/s]

Generating train split:   0%|          | 0/6458670 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

RAM memory used: 569.140625 MB


In [4]:
code_snippet = """batch_size = 1000
for idx in range(0, len(wiki), batch_size):
    batch = wiki[idx:idx + batch_size]
"""

print(f"Number of files in dataset: {wiki.dataset_size}")
time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
size_gb = wiki.dataset_size / (1024 ** 3)
print(f"Dataset size (cache file): {size_gb:.2f} GB")
print(f"Iterated over {len(wiki)} examples (about {size_gb:.1f} GB) in {time:.1f}s, i.e. {size_gb / time:.3f} GB/s.")

Number of files in dataset: 20275174536
Dataset size (cache file): 18.88 GB
Iterated over 6458670 examples (about 18.9 GB) in 15.4s, i.e. 1.225 GB/s.


In [5]:
wiki_streamed = load_dataset("wikipedia", "20220301.en", split='train', streaming=True)
print(next(iter(wiki_streamed)))
# This will show an example with `id = 12` from `https://en.wikipedia.org/wiki/Anarchism`

{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter h

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = wiki_streamed.map(lambda x: tokenizer(x["text"]))
print(next(iter(tokenized_dataset)))
# This will show an example with `id = 12` from `https://en.wikipedia.org/wiki/Anarchism`

Token indices sequence length is longer than the specified maximum sequence length for this model (8351 > 512). Running this sequence through the model will result in indexing errors


{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter h

In [7]:
shuffled_dataset = wiki_streamed.shuffle(buffer_size=10_000, seed=42)
print(next(iter(shuffled_dataset)))

{'id': '51018689', 'url': 'https://en.wikipedia.org/wiki/Geuteling', 'title': 'Geuteling', 'text': 'A geuteling (plural geutelingen) is a traditional food of the Flemish Ardennes region of Belgium. It bears some similarity to a pancake.\n\nOrigin \nGeutelingen began as a food to celebrate the Catholic feast of Saint Apollonia. Families made their own dough, and they brought their dough to the local bakery to be baked. The next weekend the geutelingen were reheated in a casserole and eaten with the whole family.\n\nSaint Apollonia is the patron saint of dentists, and there is a tradition that the geuteling confers year-long immunity to toothache.\n\nThe geuteling today\n\nToday, the religious association has almost disappeared, but the feast of the geutelingen is still organised on the first weekend after the feast of Saint Apollonia: February 9. There is also a tradition of tossing the geuteling, as pancakes are tossed elsewhere.\n\nElst, part of the community of Brakel in East Flander

In [8]:
dataset_head = wiki_streamed.take(5)
# print(list(dataset_head))
# This will take 5 examples from `id = 12`

[{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter 

In [12]:
# Skip the first 1,000 examples and include the rest in the training set
train_dataset = shuffled_dataset.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = shuffled_dataset.take(1000)

In [13]:
simple_wiki_streamed = load_dataset("wikipedia", "20220301.simple", split='train', streaming=True)
# print(next(iter(simple_wiki_streamed)))
# This will show an example with `id = 1` from `https://simple.wikipedia.org/wiki/April`

{'id': '1', 'url': 'https://simple.wikipedia.org/wiki/April', 'title': 'April', 'text': 'April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May. It is one of four months to have 30 days.\n\nApril always begins on the same day of week as July, and additionally, January in leap years. April always ends on the same day of the week as December.\n\nApril\'s flowers are the Sweet Pea and Daisy. Its birthstone is the diamond. The meaning of the diamond is innocence.\n\nThe Month \n\nApril comes between March and May, making it the fourth month of the year. It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.\n\nApril begins on the same day of the week as July every year and on the same day of the week as January in leap years. April ends on the same day of the week as December every year, as each other\'s last days are exactly 35 weeks (245 days) apart.\n\nIn commo

In [14]:
from itertools import islice
from datasets import interleave_datasets

combined_dataset = interleave_datasets([wiki_streamed, simple_wiki_streamed])
# print(list(islice(combined_dataset, 2)))
# This will show an example with `id = 12` from `https://en.wikipedia.org/wiki/Anarchism` & an example with `id = 1` from `https://simple.wikipedia.org/wiki/April`

[{'id': '12', 'url': 'https://en.wikipedia.org/wiki/Anarchism', 'title': 'Anarchism', 'text': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.\n\nHumans lived in societies without formal hierarchies long before the establishment of formal states, realms, or empires. With the rise of organised hierarchical bodies, scepticism toward authority also rose. Although traces of anarchist thought are found throughout history, modern anarchism emerged from the Enlightenment. During the latter 

## 3-4. Semantic Search with FAISS (Facebook AI Similarity Search)

# 4. Subword Tokenization
**Subword Tokenization** algorithms rely on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords.

## 4-1. Tokenizer Training

**Training** a tokenizer is a statistical deterministic process, different from model training with stochastic gradient descent which is randomized by nature, that identifies which subwords are the best to pick for a given corpus, and the exact rules used to pick them depend on the tokenization algorithm.

1. `transformers.PreTrainedTokenizer.train_new_from_iterator(text_iterator, vocab_size, length=None, new_special_tokens=None, special_tokens_map=None, **kwargs)`: A new tokenizer of the same type as the original one, trained on `text_iterator`.

[CodeSearchNet](https://wandb.ai/github/CodeSearchNet/benchmark) by GitHub

In [None]:
from datasets import load_dataset

# This can take a few minutes to load, so grab a coffee or tea while you wait!
raw_datasets = load_dataset("code_search_net", "python")
print(raw_datasets["train"])

Dataset({
    features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
    num_rows: 412178
})


In [None]:
print(raw_datasets["train"][123456]["whole_func_string"])

def has_elem(elem_ref):
    """
    Has element?
    :param elem_ref:
    :return:
    """
    if not is_elem_ref(elem_ref):
        return False
    elif elem_ref[0] == ElemRefObj:
        return hasattr(elem_ref[1], elem_ref[2])
    elif elem_ref[0] == ElemRefArr:
        return elem_ref[2] in elem_ref[1]


In [None]:
# Create a list of lists of texts loading everything in memory
# training_corpus = [raw_datasets["train"][i: i + 1000]["whole_func_string"] for i in range(0, len(raw_datasets["train"]), 1000)]

# Generator expression
def get_training_corpus():
    return (raw_datasets["train"][i : i + 1000]["whole_func_string"] for i in range(0, len(raw_datasets["train"]), 1000))

training_corpus = get_training_corpus()

In [None]:
# Define the generator by `yield`
def get_training_corpus():
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        samples = dataset[start_idx : start_idx + 1000]
        yield samples["whole_func_string"]

In [None]:
from transformers import AutoTokenizer

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

example = """class LinearLayer():
    def __init__(self, input_size, output_size):
        self.weight = torch.randn(input_size, output_size)
        self.bias = torch.zeros(output_size)

    def __call__(self, x):
        return x @ self.weights + self.bias
    """

tokens = old_tokenizer.tokenize(example)
print(tokens)

['class', 'ĠLinear', 'Layer', '():', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġdef', 'Ġ__', 'init', '__', '(', 'self', ',', 'Ġinput', '_', 'size', ',', 'Ġoutput', '_', 'size', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġself', '.', 'weight', 'Ġ=', 'Ġtorch', '.', 'rand', 'n', '(', 'input', '_', 'size', ',', 'Ġoutput', '_', 'size', ')', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġself', '.', 'b', 'ias', 'Ġ=', 'Ġtorch', '.', 'zer', 'os', '(', 'output', '_', 'size', ')', 'ĊĊ', 'Ġ', 'Ġ', 'Ġ', 'Ġdef', 'Ġ__', 'call', '__', '(', 'self', ',', 'Ġx', '):', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġ', 'Ġreturn', 'Ġx', 'Ġ@', 'Ġself', '.', 'weights', 'Ġ+', 'Ġself', '.', 'b', 'ias', 'Ċ', 'Ġ', 'Ġ', 'Ġ', 'Ġ']




In [None]:
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)
print(tokenizer)




GPT2TokenizerFast(name_or_path='gpt2', vocab_size=52000, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


In [None]:
tokens = tokenizer.tokenize(example)
print(tokens)

['class', 'ĠLinear', 'Layer', '():', 'ĊĠĠĠ', 'Ġdef', 'Ġ__', 'init', '__(', 'self', ',', 'Ġinput', '_', 'size', ',', 'Ġoutput', '_', 'size', '):', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'weight', 'Ġ=', 'Ġtorch', '.', 'randn', '(', 'input', '_', 'size', ',', 'Ġoutput', '_', 'size', ')', 'ĊĠĠĠĠĠĠĠ', 'Ġself', '.', 'bias', 'Ġ=', 'Ġtorch', '.', 'zeros', '(', 'output', '_', 'size', ')', 'ĊĊĠĠĠ', 'Ġdef', 'Ġ__', 'call', '__(', 'self', ',', 'Ġx', '):', 'ĊĠĠĠĠĠĠĠ', 'Ġreturn', 'Ġx', 'Ġ@', 'Ġself', '.', 'weights', 'Ġ+', 'Ġself', '.', 'bias', 'ĊĠĠĠĠ']


In [None]:
print(len(tokens))
print(len(old_tokenizer.tokenize(example)))

70
107


In [None]:
tokenizer.save_pretrained("code-search-net-tokenizer")

('code-search-net-tokenizer/tokenizer_config.json',
 'code-search-net-tokenizer/special_tokens_map.json',
 'code-search-net-tokenizer/vocab.json',
 'code-search-net-tokenizer/merges.txt',
 'code-search-net-tokenizer/added_tokens.json',
 'code-search-net-tokenizer/tokenizer.json')

## 4-2. Normalizer & Pre-Tokenizer

In [28]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(type(tokenizer.backend_tokenizer))

<class 'tokenizers.Tokenizer'>


In [29]:
# Apply lowercasing & remove the accents
print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))

hello how are u?


In [30]:
# Pre-tokenization involves splitting on whitespace & punctuation
print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?"))

[('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)), ('are', (11, 14)), ('you', (16, 19)), ('?', (19, 20))]


In [31]:
# Keep the spaces and replace them with a `Ġ` symbol, enabling it to recover the original spaces if we decode the tokens
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?"))

[('Hello', (0, 5)), (',', (5, 6)), ('Ġhow', (6, 10)), ('Ġare', (10, 14)), ('Ġ', (14, 15)), ('Ġyou', (15, 19)), ('?', (19, 20))]


In [32]:
# SentencePiece keeps spaces and replaces them with a specific token `_` but only splits on whitespace, not punctuation
tokenizer = AutoTokenizer.from_pretrained("t5-small")
print(tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Hello, how are  you?"))

[('▁Hello,', (0, 6)), ('▁how', (7, 10)), ('▁are', (11, 14)), ('▁you?', (16, 20))]


## 4-3. Byte Pair Encoding

## 4-4. WordPiece

## 4-5. Unigram

## 4-6. Token Classification