In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.4 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 38.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyy

In this example, we’ll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task takes the text of a review and requires the model to predict whether the sentiment of the review is positive or negative. Let’s start by downloading the dataset from the **Large Movie Review Dataset** webpage.

In [None]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2022-03-09 07:29:46--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2022-03-09 07:29:53 (11.4 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



This data is organized into **pos** and **neg** folders with one text file per example. Let’s write a function that can read this in.

In [None]:
from pathlib import Path 

def read_imdb_split(split_dir):
  """Read one split of the IMDb dataset from disk.

  Args:
    split_dir: Path (str or Path) to a directory containing ``pos`` and
      ``neg`` sub-directories with one text file per review.

  Returns:
    A ``(texts, labels)`` tuple of equal-length lists. ``texts`` holds the
    review contents and ``labels`` holds 1 for ``pos`` and 0 for ``neg``.
    All ``pos`` reviews come first, then all ``neg`` reviews; within each
    folder the files are read in sorted path order.
  """
  split_dir = Path(split_dir)
  texts = []
  labels = []
  for label_dir in ["pos", "neg"]:
    # Compare by value: `is` on a string literal only works by accident of
    # interning and raises a SyntaxWarning on recent Python versions.
    label = 0 if label_dir == "neg" else 1
    # iterdir() yields entries in arbitrary order; sort for reproducibility.
    for text_file in sorted((split_dir/label_dir).iterdir()):
      # Explicit encoding so the result does not depend on the platform locale.
      texts.append(text_file.read_text(encoding="utf-8"))
      labels.append(label)

  return texts, labels

In [None]:
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

In [None]:
train_texts[0]

"This is a family film, which to some people is an automatic turn off. It seems that too many people do not want to see films that are not loaded down with failing arms and legs, gratuitous violence and enough expletives to fill the New York phone book. This film is none of those. It is cliché, it is formula, but it is also fun. It doesn't ask you to think, it doesn't demand that you accept the film as reality. It simply does what a good film ought to do, which is to willingly suspend disbelief for two hours and enjoy the adventure. The cast is good, while not excellent. As another commenter pointed out the John Williams sound score was, as usual, excellent. And the fact that a lot of the film was shot in Huntsville at the real space camp made it even more believable. <br /><br />It was ironic that the original release of the film was delayed for some months due to the Challenger Shuttle disaster, which may have played a large part in it's original theatrical opening, but the film even

In [None]:
train_labels[0]

1

In [None]:
len(train_texts)

25000

In [None]:
len(test_texts)

25000

We now have a train and test dataset, but let’s also create a validation set which we can use for evaluation and tuning without tainting our test set results. Sklearn has a convenient utility for creating such splits:

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data for validation. A fixed random_state makes
# the split reproducible across runs, and stratifying on the labels keeps the
# pos/neg ratio the same in both parts.
# NOTE(review): this rebinds train_texts/train_labels, so re-running this cell
# alone splits the already-reduced training set again.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

Alright, we’ve read in our dataset. Now let’s tackle tokenization. We’ll eventually train a classifier using **pre-trained DistilBert**, so let’s use the **DistilBert tokenizer.**

In [None]:
from transformers import DistilBertTokenizerFast

# Build the tokenizer from the "distilbert-base-uncased" checkpoint, the same
# checkpoint name the models below are loaded from.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Tokenize every split with the same options so all three are encoded consistently.
tokenize_options = dict(truncation=True, padding=True)

train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)

In [None]:
train_encodings[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [None]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
train_encodings["input_ids"][0]

[101,
 1996,
 2214,
 22260,
 18994,
 2008,
 11471,
 2111,
 2024,
 11771,
 2111,
 2003,
 2092,
 7645,
 1999,
 1000,
 2308,
 1999,
 2293,
 1012,
 1000,
 1996,
 5896,
 1010,
 2579,
 2013,
 1040,
 1012,
 1044,
 1012,
 5623,
 1005,
 1055,
 3117,
 1010,
 3397,
 2019,
 10866,
 4834,
 1997,
 8474,
 2008,
 2024,
 1010,
 2012,
 2190,
 1010,
 2061,
 8458,
 19506,
 7277,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 2054,
 1037,
 12063,
 2061,
 2172,
 3947,
 2253,
 2046,
 2061,
 12436,
 10841,
 3560,
 2019,
 6912,
 1025,
 2054,
 2019,
 4064,
 9140,
 1997,
 3494,
 2445,
 2107,
 3086,
 1012,
 1999,
 8741,
 1997,
 2152,
 2537,
 5300,
 1010,
 2023,
 2143,
 3310,
 2408,
 2004,
 6945,
 6313,
 2004,
 2049,
 5073,
 1012,
 1026,
 7987,
 1013,
 1028,
 1026,
 7987,
 1013,
 1028,
 1037,
 7065,
 17417,
 2102,
 1999,
 2541,
 6414,
 23283,
 1037,
 3440,
 8605,
 1997,
 18414,
 8159,
 10484,
 9273,
 1999,
 4639,
 4230,
 1010,
 4830,
 21724,
 2989,
 7880,
 1010,
 1998,
 11865,
 29256,
 2296,
 3357

In [None]:
len(train_encodings["input_ids"])

20000

In [None]:
type(train_encodings)

transformers.tokenization_utils_base.BatchEncoding

Now, let’s **turn our labels and encodings into a Dataset object**. In PyTorch, this is done by subclassing a torch.utils.data.Dataset object and implementing **__len__** and **__getitem__**. In TensorFlow, we pass our input encodings and labels to the from_tensor_slices constructor method. We put the data in this format so that the data can be easily batched such that each key in the batch encoding corresponds to a named parameter of the **forward()** method of the model we will train.

In [None]:
# Show only the first 10 values of the first example for each key. Printing
# every full encoding floods the output and trips the IOPub data-rate limit.
for key, value in train_encodings.items():
  print({key: value[0][:10]})

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
import torch 

class IMDBDataset(torch.utils.data.Dataset):
  """Map-style dataset pairing tokenizer encodings with their labels.

  ``encodings`` is a mapping from field name to a per-example sequence, and
  ``labels`` is a sequence with one label per example.
  """

  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __len__(self):
    # The dataset size is defined by the number of labels.
    return len(self.labels)

  def __getitem__(self, idx):
    # Convert every encoding field of example `idx` to a tensor, then add the
    # label under the "labels" key.
    sample = {}
    for name, column in self.encodings.items():
      sample[name] = torch.tensor(column[idx])
    sample["labels"] = torch.tensor(self.labels[idx])
    return sample

In [None]:
# Wrap each split's encodings and labels in an IMDBDataset.
train_dataset = IMDBDataset(train_encodings, train_labels)
val_dataset = IMDBDataset(val_encodings, val_labels)
test_dataset = IMDBDataset(test_encodings, test_labels)

Now that our datasets are ready, we can fine-tune a model either with the 🤗 **Trainer/TFTrainer or with native PyTorch/TensorFlow**. See training.

## Fine-Tuning with Trainer

The steps above prepared the datasets in the format the Trainer expects. Now all we need to do is **create a model to fine-tune, define the TrainingArguments/TFTrainingArguments and instantiate a Trainer/TFTrainer**.

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Hyperparameters and output locations for the Trainer run below.
training_args = TrainingArguments(
    output_dir = "./results",  # output directory (checkpoints are saved here)
    num_train_epochs = 3,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size = 64,  # batch size per device during evaluation
    warmup_steps = 500,  # number of warmup steps for the learning rate scheduler
    weight_decay = 0.01,  # strength of weight decay
    logging_dir = "./logs",  # directory for storing logs
    logging_steps = 10  # report the training loss every 10 steps
)

In [None]:
# Load DistilBERT with a sequence-classification head from the
# "distilbert-base-uncased" checkpoint. The load log reports the classifier
# weights as newly initialized, i.e. they still have to be trained.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model = model,  # the instantiated 🤗 Transformers model to be trained
    args = training_args,  # training arguments, defined above
    train_dataset = train_dataset,  # training dataset
    eval_dataset = val_dataset  # evaluation dataset
)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [None]:
trainer.train()

***** Running training *****
  Num examples = 20000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3750


Step,Training Loss
10,0.6931
20,0.6997
30,0.6993
40,0.6929
50,0.6912
60,0.6921
70,0.6808
80,0.6724
90,0.6425
100,0.5603


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin


KeyboardInterrupt: ignored

In [None]:
# NOTE(review): looks like a leftover scratch cell; `a` is not referenced in the
# rest of the visible notebook. Confirm and consider removing.
a=3

## Fine-Tuning with PyTorch

In [None]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

# Use the GPU when one is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [None]:
# Load a fresh DistilBERT classifier, move it to the selected device and put it
# in training mode.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
# Trailing semicolon: train() returns the module, and without it the notebook
# prints the entire model architecture as the cell output.
model.train();

loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.17.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677c637033ee4bce9

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [None]:
# Shuffled mini-batches of 16 training examples.
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated in
# favour of it. eps and weight_decay are set explicitly to the defaults of the
# transformers implementation (1e-6 and 0.0) so the update rule stays the same;
# torch.optim.AdamW would otherwise default to eps=1e-8, weight_decay=0.01.
optim = torch.optim.AdamW(model.parameters(), lr = 5e-5, eps = 1e-6, weight_decay = 0.0)



In [None]:
# Manual fine-tuning loop: 3 passes over train_loader, one optimizer step per batch.
for epoch in range(3):
  for batch in train_loader:
    # Clear gradients left over from the previous step.
    optim.zero_grad()
    # Move the batch tensors to the same device as the model.
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device) 
    labels = batch["labels"].to(device)
    # Labels are passed to the model, and the first element of its output is
    # used as the loss.
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs[0]
    # Backpropagate and update the parameters.
    loss.backward()
    optim.step()

KeyboardInterrupt: ignored