In [1]:
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install evaluate
import evaluate
import numpy as np
import torch
from tqdm import tqdm

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 KB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0
Defaulting to user installation because normal site-packages is not writeable
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


2023-08-10 17:48:42.743302: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Chapter 1: The Pipeline

In [None]:
# High level code
from transformers import pipeline

classifier = pipeline('sentiment-analysis')
classifier(["Oppenheimer is one of the worst movies ever!"])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


[{'label': 'NEGATIVE', 'score': 0.9963310360908508}]

# Chapter 2: Behind the Pipeline

## Under the hood of pipeline

In [None]:
# Loading the particular tokenizer
from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}


In [None]:
# Loading particular model
from transformers import AutoModel  # Model without any head

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(checkpoint)


In [None]:
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

torch.Size([2, 16, 768])


In [None]:
from transformers import AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [None]:
outputs = model(**inputs)
print(outputs.logits.shape)

torch.Size([2, 2])


In [None]:
# Post processing the model output
import torch.nn.functional as F

predictions = F.softmax(outputs.logits, dim=1)
print(predictions)

tensor([[4.0195e-02, 9.5980e-01],
        [9.9946e-01, 5.4418e-04]], grad_fn=<SoftmaxBackward0>)


In [None]:
print(model.config.id2label)

{0: 'NEGATIVE', 1: 'POSITIVE'}


## Going in details of individual components: Model

### a) Instantiating model directly

#### Using automodel class

In [None]:
from transformers import AutoModel

In [None]:
bert_model = AutoModel.from_pretrained('bert-base-cased')

#### Using some specific class

In [None]:
from transformers import BertModel

In [None]:
bert_model = BertModel.from_pretrained("bert-base-cased")

### b) Instantiating Configuration for model and then the model


#### Using autoconfig and giving it checkpoint or folder name

In [None]:
from transformers import AutoConfig

In [None]:
bert_config = AutoConfig.from_pretrained('bert-base-cased')

In [None]:
print(type(bert_config))

<class 'transformers.models.bert.configuration_bert.BertConfig'>


#### Using bertconfig without any checkpoint name: returns random config file

In [None]:
from transformers import BertConfig

In [None]:
bert_config_random = BertConfig()

#### Using bertconfig with the checkpoint name: returns non-random config file

In [None]:
from transformers import BertModel

In [None]:
bert_config = BertConfig.from_pretrained('bert-base-cased')

#### Finally instantiating the model using any of the above configuration files

In [None]:
bert_model = BertModel(bert_config)

### c) Saving the model

In [None]:
bert_model.save_pretrained('/content/')

## Going in details of individual components: Tokenizer

In [None]:
'''
There are three types of tokenizers:
1. Word based
2. Character based
3. Subword based
'''

'\nThere are three types of tokenizers:\n1. Word based\n2. Character based\n3. Subword based\n'

### a) Word based tokenizers

In [None]:
'''
Issues:
  1. Similar words may not have similar tokens
  2. Vocabulary size can become very large :( Large vocabularies result in heavy models
  3. A lot of unknown words will occur

To get word based tokens, create a unique vocabulary, assign each unique word
an id to get a word-to-id dictionary, and then use this dictionary to assign tokens to the input text.
'''

'\nIssues:\n  1. Similar words may not have similar tokens\n  2. Vocabulary size can become very large :( Large vocabularies result in heavy models\n  3. A lot of unknown words will occur\n\nTo get word based tokens, create a unique vocabulary, assign each unique word\nan id to get a word-to-id dictionary, and then use this dictionary to assign tokens to the input text.\n'

### b) Character Based Tokens

In [None]:
'''
. This has two primary benefits:
    - The vocabulary is much smaller.
    - There are much fewer out-of-vocabulary (unknown) tokens, since every word can be built from characters.

. Problem:
    - We’ll end up with a very large amount of tokens to be processed by our model.
'''

'\n. This has two primary benefits:\n    - The vocabulary is much smaller.\n    - There are much fewer out-of-vocabulary (unknown) tokens, since every word can be built from characters.\n\n. Problem:\n    - We’ll end up with a very large amount of tokens to be processed by our model.\n'

### c) Subword Tokenization



In [None]:
'''
- Do not split frequently occuring words
- Otherwise split
'''

'\n- Do not split frequently occuring words\n- Otherwise split\n'

### d) Code

In [None]:
from transformers import AutoTokenizer
'''
Autotokenizer.from_pretrained takes in model checkpoint,
and returns the algorithm(similar to config file) which the
model used for tokenization during pretraining, and the
vocabulary(similar to weights of the model)
'''

'\nAutotokenizer.from_pretrained takes in model checkpoint,\nand returns the algorithm(similar to config file) which the\nmodel used for tokenization during pretraining, and the\nvocabulary(similar to weights of the model)\n'

In [None]:
model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer('Yoo ma boi, how are you?')

{'input_ids': [101, 14941, 1186, 12477, 171, 8136, 117, 1293, 1132, 1128, 136, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
from transformers import BertTokenizer

In [None]:
model_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
tokenizer("I love it when you call me Senorita...")

{'input_ids': [101, 146, 1567, 1122, 1165, 1128, 1840, 1143, 14895, 9012, 1777, 119, 119, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

### e) Under the hood of tokenizer('string')

In [None]:
model_name = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
input_text = "Binte Khan was Bin Hassan's first Love"

In [None]:
tokens = tokenizer.tokenize(input_text)
print(tokens)

['Bin', '##te', 'Khan', 'was', 'Bin', 'Hassan', "'", 's', 'first', 'Love']


In [None]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print(input_ids)

[21700, 1566, 4340, 1108, 21700, 13583, 112, 188, 1148, 2185]


In [None]:
final_inputs = tokenizer.prepare_for_model(input_ids)
print(final_inputs)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': [101, 21700, 1566, 4340, 1108, 21700, 13583, 112, 188, 1148, 2185, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
decoded = tokenizer.decode(final_inputs['input_ids'])
print(decoded)

[CLS] Binte Khan was Bin Hassan's first Love [SEP]


### f) Batching input sequences

In [None]:
import torch
'''
Following line will give error as the model expects
not a single sentence, but a batch of sentences.
'''
# model(torch.tensor(final_inputs['input_ids']))

'\nFollowing line will give error as the model expects\nnot a single sentence, but a batch of sentences.\n'

In [None]:
# This line won't give any error
model(torch.tensor([final_inputs['input_ids']]))

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.4303, -2.0763]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
# Apart from just getting the right tokens to input into
# the model, we also need to pad tokens of all sentences
# to equal length, and to get attention masks.

# But keep in mind that pad token id should be the same
# as the model used during training

# All of this is done by tokenizer by default :)

### g) More on tokenizer functionalities

In [None]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

model_inputs = tokenizer(sequences)

In [None]:
# Will pad the sequences up to the maximum sequence length
model_inputs = tokenizer(sequences, padding="longest")

# Will pad the sequences up to the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, padding="max_length")

# Will pad the sequences up to the specified max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)

In [None]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Will truncate the sequences that are longer than the model max length
# (512 for BERT or DistilBERT)
model_inputs = tokenizer(sequences, truncation=True)

# Will truncate the sequences that are longer than the specified max length
model_inputs = tokenizer(sequences, max_length=8, truncation=True)

In [None]:
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

# Returns PyTorch tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="pt")

# Returns TensorFlow tensors
model_inputs = tokenizer(sequences, padding=True, return_tensors="tf")

# Returns NumPy arrays
model_inputs = tokenizer(sequences, padding=True, return_tensors="np")

### h) Conclusion

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = ["I've been waiting for a HuggingFace course my whole life.", "So have I!"]

tokens = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")
output = model(**tokens)
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


#

# Chapter 3: Finetuning a pre-trained model

## One step of model training, with two sentences as dataset

In [None]:
import torch
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "This course is amazing!",
]
batch = tokenizer(sequences, padding=True,
                  truncation=True, return_tensors='pt')  # dictionary

batch['labels'] = torch.tensor([1, 1])

optimizer = AdamW(model.parameters())
loss = model(**batch).loss
loss.backward()
optimizer.step()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Exploring the Dataset library


In [None]:
from datasets import load_dataset

In [None]:
raw_datasets = load_dataset("glue", "mrpc")

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [None]:
raw_train_dataset = raw_datasets["train"]
print(raw_train_dataset)
print('---------------------')
# Viewing a single example
example = raw_train_dataset[0]
print(example)

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 3668
})
---------------------
{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}


In [None]:
# To see which label represents what:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [None]:
# Tokenizing all the elements of the dataset
from transformers import AutoTokenizer

checkpoint = "bert-base-cased"
toeknizer = AutoTokenizer.from_pretrained(checkpoint)

# Noice how we gave 2, and not 1 sentence as input to the tokenizer()
def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], padding="max_length",
      truncation=True, max_length=128
  )

tokenize_function(example)

{'input_ids': [101, 7277, 2180, 5303, 4806, 1117, 1711, 117, 2292, 1119, 1270, 107, 1103, 7737, 107, 117, 1104, 9938, 4267, 12223, 21811, 1117, 2554, 119, 102, 11336, 6732, 3384, 1106, 1140, 1112, 1178, 107, 1103, 7737, 107, 117, 7277, 2180, 5303, 4806, 1117, 1711, 1104, 9938, 4267, 12223, 21811, 1117, 2554, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print(tokenized_datasets.column_names)

{'train': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'], 'test': ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']}


In [None]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [None]:
print(tokenized_datasets["train"])

Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 3668
})


In [None]:
# Manipulating the dataset according to what model is expecting.
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [None]:
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})


In [None]:
# Other way to tokenize the dataset
tokenized_dataset_v2 = tokenizer(
    raw_datasets["train"]["sentence1"],
    raw_datasets["train"]["sentence2"],
    padding=True,
    truncation=True
)

In [None]:
print(tokenized_dataset_v2)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



The problem with the second approach is that it returns a python dictionary and will work only if you have enough RAM. To keep the data as a dataset, we will use the Dataset.map() method.

## Exploring the Dataset Library (Putting together)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], padding="max_length",
      truncation=True, max_length=128
  )


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format('torch')
print(tokenized_datasets["train"][0])

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

{'labels': tensor(1), 'input_ids': tensor([  101,  7277,  2180,  5303,  4806,  1117,  1711,   117,  2292,  1119,
         1270,   107,  1103,  7737,   107,   117,  1104,  9938,  4267, 12223,
        21811,  1117,  2554,   119,   102, 11336,  6732,  3384,  1106,  1140,
         1112,  1178,   107,  1103,  7737,   107,   117,  7277,  2180,  5303,
         4806,  1117,  1711,  1104,  9938,  4267, 12223, 21811,  1117,  2554,
          119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

## Giving this dataset to torch dataloader

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets["train"],
                              batch_size=16,
                              shuffle=True)

for step, batch in enumerate(train_dataloader):
  print(batch['input_ids'].shape)
  if step > 5:
    break

torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 128])


### Doing above two sections by Dynamic Padding

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"],
      truncation=True, max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format('torch')

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(tokenized_datasets["train"],
                              batch_size=16,
                              shuffle=True,
                              collate_fn=data_collator)

for step, batch in enumerate(train_dataloader):
  print(batch['input_ids'].shape)
  if step > 5:
    break

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


torch.Size([16, 80])
torch.Size([16, 72])
torch.Size([16, 87])
torch.Size([16, 84])
torch.Size([16, 77])
torch.Size([16, 74])
torch.Size([16, 69])


## Trainer API

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"],
      truncation=True, max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# !pip install transformers
from transformers import TrainingArguments
training_args = TrainingArguments('test-trainer')

In [None]:
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# trainer.train()

In [None]:
# Evaluating the model
# Trainer.predict returns named tuple containing 3 elements
# predictions, labels, metric value
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(408, 2) (408,)


In [None]:
print(predictions.predictions)

[[-0.6967151  -0.3756896 ]
 [-0.6876184  -0.391532  ]
 [-0.68211657 -0.37875637]
 [-0.6914974  -0.38169637]
 [-0.69527936 -0.38151777]
 [-0.70225835 -0.38775015]
 [-0.6917361  -0.40015525]
 [-0.6837981  -0.40252933]
 [-0.68638617 -0.37447944]
 [-0.69432944 -0.36988056]
 [-0.6873695  -0.382127  ]
 [-0.7064969  -0.39345717]
 [-0.67053777 -0.3508614 ]
 [-0.68649125 -0.37919453]
 [-0.692019   -0.38762498]
 [-0.6945637  -0.3777609 ]
 [-0.69388425 -0.39638898]
 [-0.6602345  -0.38809955]
 [-0.68243265 -0.38765866]
 [-0.66972566 -0.38133958]
 [-0.63704437 -0.42157665]
 [-0.678729   -0.38502657]
 [-0.68850255 -0.39314327]
 [-0.69666815 -0.39865604]
 [-0.6876614  -0.38122246]
 [-0.7025974  -0.39227557]
 [-0.672801   -0.40824246]
 [-0.68693805 -0.3873577 ]
 [-0.68496865 -0.39938754]
 [-0.68684095 -0.37949717]
 [-0.6943828  -0.38600424]
 [-0.6897569  -0.39814025]
 [-0.6925422  -0.3841939 ]
 [-0.6759791  -0.37521875]
 [-0.69334537 -0.37910366]
 [-0.67045295 -0.37173864]
 [-0.68579894 -0.38805583]
 

In [None]:
print(predictions.label_ids)

[1 0 0 1 0 1 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0
 0 1 1 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1
 1 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 1 1
 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1
 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 1 0 0 1 1 0
 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 1 1 1
 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 0 1 0 1 0 1 1 1
 0 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 0
 0 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 1 0 1 0 1 1 1 1 1 0
 1 1 1 1 0 0 1 1 1 0 0 1 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 0 1 1 0 1 1 0 1 0 0
 1 1 1 1 0 0 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0
 1]


In [None]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
# !pip install evaluate
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

## Putting together the Trainer API code

In [None]:
# Wrapping everything togther, we can get our compute_metrics function
# !pip install evaluate
import evaluate
import numpy as np
def compute_metrics(eval_preds):
  metric = evaluate.load("glue", "mrpc")
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")  # 1
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # 2

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], truncation=True,
      max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)  # 3

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)  # 4

from transformers import TrainingArguments
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")  # 5

from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # 6

from transformers import Trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)  # 7

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.684488,0.816176,0.877651
2,0.308400,0.917826,0.835784,0.888147
3,0.129600,0.990127,0.857843,0.900344


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=1377, training_loss=0.17081803105062734, metrics={'train_runtime': 218.6065, 'train_samples_per_second': 50.337, 'train_steps_per_second': 6.299, 'total_flos': 559439881459200.0, 'train_loss': 0.17081803105062734, 'epoch': 3.0})

## Putting together training code, but without Trainer API use

In [None]:
# Wrapping everything togther, we can get our compute_metrics function
def compute_metrics(eval_preds):
  metric = evaluate.load("glue", "mrpc")
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")  # 1
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # 2

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], truncation=True,
      max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["idx", "sentence1", "sentence2"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets = tokenized_datasets.with_format('torch')

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)  # 4

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
# Defining my own dataloader
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [None]:
# To check there is no mistake, we can check a batch:
for batch in train_dataloader:
  break
{k: v.shape for k, v in batch.items()}

In [None]:
# Data processing done, time to load model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)  # 6/

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.7016, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [None]:
# Preparation for training loop
from transformers import AdamW
from transformers import get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

    lr_scheduler.step()
    progress_bar.update(1)

  0%|          | 0/1377 [00:00<?, ?it/s]

In [None]:
    import evaluate
    metric = evaluate.load("glue", "mrpc")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    metric.compute()

{'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}

## Putting together training code, but without Trainer API use, but this time updating training loop to work with 'accelerator'

In [None]:
from datasets import load_dataset
raw_datasets = load_dataset("glue", "mrpc")

from transformers import AutoTokenizer
checkpoint = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
  return tokenizer(
      example["sentence1"], example["sentence2"], truncation=True, max_length=128
  )

tokenized_datasets = raw_datasets.map(tokenize_function)
tokenized_datasets = tokenized_datasets.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')
tokenized_datasets.set_format('torch')

from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer)

from torch.utils.data import DataLoader
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=16, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], shuffle=True, batch_size=16, collate_fn=data_collator
)

# To check there is no mistake, we can check a batch:
for batch in train_dataloader:
  break
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([16]),
 'input_ids': torch.Size([16, 105]),
 'token_type_ids': torch.Size([16, 105]),
 'attention_mask': torch.Size([16, 105])}

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Preparing for the training loop
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)

from accelerate import Accelerator
accelerator = Accelerator()
train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
    train_dataloader, eval_dataloader, model, optimizer
)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
from transformers import get_scheduler
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    outputs = model(**batch)
    loss = outputs.loss
    accelerator.backward(loss)

    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

In [None]:
    import evaluate
    metric = evaluate.load("glue", "mrpc")
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=accelerator.gather(predictions), references=accelerator.gather(batch["labels"]))

    metric.compute()

Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

{'accuracy': 0.6862745098039216, 'f1': 0.8134110787172011}

# Chapter 4: Datasets

### a) Loading a local dataset

In [None]:
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

--2023-08-09 09:16:01--  https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
Resolving github.com (github.com)... 192.30.255.113
Connecting to github.com (github.com)|192.30.255.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/crux82/squad-it/master/SQuAD_it-train.json.gz [following]
--2023-08-09 09:16:01--  https://raw.githubusercontent.com/crux82/squad-it/master/SQuAD_it-train.json.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7725286 (7.4M) [application/octet-stream]
Saving to: ‘SQuAD_it-train.json.gz’


2023-08-09 09:16:02 (112 MB/s) - ‘SQuAD_it-train.json.gz’ saved [7725286/7725286]

--2023-08-09 09:16:02--  https://github.com/crux82/squad-it/raw/master/SQuAD_it-te

In [None]:
!gzip -dkv SQuAD_it-*.json.gz

SQuAD_it-test.json.gz:	 87.5% -- created SQuAD_it-test.json
SQuAD_it-train.json.gz:	 82.3% -- created SQuAD_it-train.json


In [None]:
from datasets import load_dataset

squad_it_dataset = load_dataset("json", data_files="SQuAD_it-train.json",
                              field="data")
# By default, loading local files creates a
# DatasetDict object with a train split.
print(squad_it_dataset)

DatasetDict({
    train: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 442
    })
})


In [None]:
# squad_it_dataset["train"][0]

In [None]:
data_files = {"train": "SQuAD_it-train.json", "test": "SQuAD_it-test.json"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 442
    })
    test: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 48
    })
})

In [None]:
# The loading scripts in 🤗 Datasets actually support automatic decompression of the input files
data_files = {"train": "SQuAD_it-train.json.gz", "test": "SQuAD_it-test.json.gz"}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 442
    })
    test: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 48
    })
})

## b) Loading a remote dataset

In [None]:
# Just place the url in front of data_files argument instead of the filepath.
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
print(squad_it_dataset)

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.73M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 442
    })
    test: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 48
    })
})


## Time to slice and dice

I used following functions on the dataset:
- shuffle( )
- select (list of indices)
- unique (column_name)
- rename_column (old_column_name, new_column_name)
- map ( ): To add new column, to update existing columns
- filter ( )
- sort (column_name)
- .column_names attribute
- Dealing with mismatch length problem

In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2023-08-09 12:54:54--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip.4’

drugsCom_raw.zip.4      [   <=>              ]  41.00M  82.7MB/s    in 0.5s    

2023-08-09 12:54:55 (82.7 MB/s) - ‘drugsCom_raw.zip.4’ saved [42989872]



In [None]:
# Loading a dataset
from datasets import load_dataset

data_files = {"train": 'drugsComTrain_raw.tsv', "test": 'drugsComTest_raw.tsv'}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter='\t')
print(drug_dataset)

In [None]:
# Selecting random chunk of data to get a feel of how it looks like
drug_dataset_sample = drug_dataset["train"].shuffle(seed=42).select([2, 43])
drug_dataset_sample

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 2
})

In [None]:
drug_dataset_sample[0]

{'Unnamed: 0': 80482,
 'drugName': 'Mobic',
 'condition': 'Inflammatory Conditions',
 'review': '"I have been taking Mobic for over a year with no side effects other than an elevated blood pressure.  I had severe knee and ankle pain which completely went away after taking Mobic.  I attempted to stop the medication however pain returned after a few days."',
 'rating': 10.0,
 'date': 'June 5, 2013',
 'usefulCount': 128}

In [None]:
# The first column looks suspicious. Perhaps it is column of patient ids.
# Let's check this:
for split in drug_dataset.keys():
  assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

True
True


In [None]:
# Our hypothesis seems correct. Let's rename the column.
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)

print(drug_dataset)

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})


In [None]:
# Now let's normalise the condition labels
def lowercase_condition(example):
  return {"condition": example["condition"].lower()}

drug_dataset.map(lowercase_condition)

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

AttributeError: ignored

Some of the entries in condition column are None :(

In [None]:
# def filter_nones(x):
#   return x["condition"] is not None

# drug_dataset_filtered = drug_dataset.filter(filter_nones)

# We could also filter using lambda function
drug_dataset_filtered = drug_dataset.filter(lambda x: x["condition"] is not None)
print(drug_dataset_filtered)

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})


In [None]:
drug_dataset = drug_dataset_filtered.map(lowercase_condition)
print(drug_dataset)

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})


In [None]:
drug_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27}

In [None]:
# Creating new columns
def compute_review_length(example):
  return {"review_length": len(example["review"].split())}

drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset["train"][0]

'''
🙋 An alternative way to add new columns to a dataset is with the
Dataset.add_column() function. This allows you to provide the column
as a Python list or NumPy array and can be handy in situations where
Dataset.map() is not well suited for your analysis.
'''

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [None]:
# Sorting dataset based on a column value
drug_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [None]:
# Filter out records with very small reviews
drug_dataset = drug_dataset.filter(lambda x: x["review_length"] > 30)
print(drug_dataset)

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 138514
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})


In [None]:
# We’ll use Dataset.map() to unescape all the HTML characters in our corpus:
import html

text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [None]:
drug_dataset = drug_dataset.map(lambda x: {"review": html.unescape(x["review"])})

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [None]:
# Using batched=True makes thing so fast!
new_drug_dataset = drug_dataset.map(
    lambda x: {"review": [html.unescape(o) for o in x["review"]]}, batched=True
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [None]:
# You can change the number of elements in your dataset.
# Let us see how it works.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
def tokenize_and_split(examples):
    return tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

result = tokenize_and_split(drug_dataset["train"][0])
print(result)
[len(inp) for inp in result["input_ids"]]

{'input_ids': [[101, 107, 1422, 1488, 1110, 9079, 1194, 1117, 2223, 1989, 1104, 1130, 19972, 11083, 119, 1284, 1245, 4264, 1165, 1119, 1310, 1142, 1314, 1989, 117, 1165, 1119, 1408, 1781, 1103, 2439, 13753, 1119, 1209, 1129, 1113, 119, 1370, 1160, 1552, 117, 1119, 1180, 6374, 1243, 1149, 1104, 1908, 117, 1108, 1304, 172, 14687, 1183, 117, 1105, 7362, 1111, 2212, 129, 2005, 1113, 170, 2797, 1313, 1121, 1278, 12020, 113, 1304, 5283, 1111, 1140, 119, 114, 146, 1270, 1117, 3995, 1113, 6356, 2106, 1105, 1131, 1163, 1106, 6166, 1122, 1149, 170, 1374, 1552, 119, 3969, 1293, 1119, 1225, 1120, 1278, 117, 1105, 1114, 2033, 1146, 1107, 1103, 2106, 119, 1109, 1314, 1160, 1552, 1138, 1151, 2463, 1714, 119, 1124, 1110, 150, 21986, 3048, 1167, 5340, 1895, 1190, 1518, 102], [101, 119, 1124, 1110, 1750, 6438, 113, 170, 1363, 1645, 114, 117, 1750, 172, 14687, 1183, 119, 1124, 1110, 11566, 1155, 1103, 1614, 1119, 1431, 119, 8007, 1117, 4658, 1110, 1618, 119, 1284, 1138, 1793, 1242, 1472, 23897, 1105, 117

[128, 49]

So, our first example in the training set became two features because it was tokenized to more than the maximum number of tokens we specified: the first one of length 128 and the second one of length 49.

In [None]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

ArrowInvalid: ignored

In [None]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched=True, remove_columns=drug_dataset["train"].column_names
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [None]:
len(tokenized_dataset["train"]), len(drug_dataset["train"])  # Clearly both datasets are of different lengths

(206772, 138514)

We can also deal with the mismatched length problem by making the old columns the same size as the new ones. To do this, we will need the overflow_to_sample_mapping field the tokenizer returns when we set return_overflowing_tokens=True. It gives us a mapping from a new feature index to the index of the sample it originated from. Using this, we can associate each key present in our original dataset with a list of values of the right size by repeating the values of each example as many times as it generates new features.

In [None]:
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # Extract mapping between new and old indices
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

In [None]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

## Dataset --> Dataframe


In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2023-08-09 13:00:47--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip.6’

drugsCom_raw.zip.6      [    <=>             ]  41.00M  66.1MB/s    in 0.6s    

2023-08-09 13:00:48 (66.1 MB/s) - ‘drugsCom_raw.zip.6’ saved [42989872]

Archive:  drugsCom_raw.zip
replace drugsComTest_raw.tsv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Loading a dataset
from datasets import load_dataset

data_files = {"train": 'drugsComTrain_raw.tsv', "test": 'drugsComTest_raw.tsv'}
drug_dataset = load_dataset("csv", data_files=data_files, delimiter='\t')
print(drug_dataset)

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})


In [None]:
print(type(drug_dataset))

<class 'datasets.dataset_dict.DatasetDict'>


In [None]:
drug_dataset.set_format("pandas")
drug_dataset["train"][3]

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10


In [None]:
drug_dataset["train"]  # this still return the old Dataset type rather than dataframe

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 161297
})

In [None]:
# Let's create pandas dataframe for the whole training set
train_df = drug_dataset["train"][:]
train_df

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37
...,...,...,...,...,...,...,...
161292,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10.0,"May 31, 2015",125
161293,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1.0,"November 1, 2011",34
161294,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2.0,"March 15, 2014",35
161295,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10.0,"September 19, 2015",79


In [None]:
train_df[:3]

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17


In [None]:
print(type(train_df))

<class 'pandas.core.frame.DataFrame'>


In [None]:
# Once you are done playing with pandas, make sure to reset the format of dataset back to arrow.
drug_dataset.reset_format()

In [None]:
# Once we are done playing with pandas, we can create dataset object again from the dataframe object.
from datasets import Dataset
dataset = Dataset.from_pandas(train_df)
dataset

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 161297
})

## Creating a validation set: Using dataset.train_test_split method

In [None]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)

In [None]:
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 129037
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 32260
    })
})

In [None]:
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
drug_dataset_clean["validation"]

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 32260
})

In [None]:
drug_dataset_clean["test"] = drug_dataset["test"]

In [None]:
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 129037
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 32260
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

## Saving the Dataset, or Loading it back

In [None]:
# Once you download data, it is stored in cache directory in the form of arrow files
drug_dataset.cache_files

# But what if you want to save your data in different location and format?

{'train': [{'filename': '/root/.cache/huggingface/datasets/csv/default-52d3f5c14d226f02/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/csv-train.arrow'}],
 'test': [{'filename': '/root/.cache/huggingface/datasets/csv/default-52d3f5c14d226f02/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/csv-test.arrow'}]}

### a) Arrow format:
This is great for stroring large datasets, not as space efficient as Parquet format.

In [None]:
# Save
drug_dataset.save_to_disk('my-arrow-datasets')

Saving the dataset (0/1 shards):   0%|          | 0/161297 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/53766 [00:00<?, ? examples/s]

In [None]:
# To reload back
from datasets import load_from_disk
dataset = load_from_disk('my-arrow-datasets')
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

### b) CSV format:
Good for storing small datasets.

In [None]:
# Save
for split, dataset in dataset.items():
  dataset.to_csv(f"my-dataset-{split}.csv", index=None)

Creating CSV from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/54 [00:00<?, ?ba/s]

In [None]:
# To reload back
from datasets import load_dataset
data_files = {
    "train": "/content/my-dataset-train.csv",
    "test": "/content/my-dataset-test.csv"
}
datasets = load_dataset("csv", data_files=data_files)
datasets

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

### c) Parquet Format:
Good for saving Large datasets for a very long time, very space efficient.

In [None]:
# Saving
for split, dataset in datasets.items():
  dataset.to_parquet(f"my-dataset-{split}.parquet")

Creating parquet from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/54 [00:00<?, ?ba/s]

In [None]:
# Reloading back
from datasets import load_dataset
data_files = {
    "train": "/content/my-dataset-train.parquet",
    "test": '/content/my-dataset-test.parquet'
}
datasets = load_dataset("parquet", data_files=data_files)
datasets

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

### d) JSON Format
Good for storing small datasets.

In [None]:
# Saving
for split, dataset in datasets.items():
  dataset.to_parquet(f"my-dataset-{split}.json")

Creating parquet from Arrow format:   0%|          | 0/162 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/54 [00:00<?, ?ba/s]

In [None]:
# Reloading back
from datasets import load_dataset
data_files = {
    "train": "/content/my-dataset-train.json",
    "test": '/content/my-dataset-test.json'
}
datasets = load_dataset("parquet", data_files=data_files)
datasets

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

## Big Data

Datasets uses Arrow and streaming to handle data at scale. 
- Arrow is desined for high performance data processing, and uses columnar format.
- Streaming api allows to progressively download raw data.

- Arrow uses memory mapping mechanism, which maps the data present in hard disk to virtual memory. This allows us to play with data without loading the whole dataset into the RAM.
- This memory mapped file can be shared across multiple processes.

### a) Arrow

In [8]:
# Loading large dataset
from datasets import load_dataset

data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"
large_dataset = load_dataset("json", data_files=data_files, split="train")
size_gb = large_dataset.dataset_size / (1024 ** 3)
print(f"Dataset size (cache file) : {size_gb:.2f} GB")

In [9]:
# Ram used is still less than size of the dataset
import psutil

# Process.memory_info is expressed in bytes, so convert to megabytes
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")

RAM used: 661.64 MB


In [None]:
# Working with the arrow dataset is extremely fast
import timeit

code_snippet = """batch_size = 1000

for idx in range(0, len(large_dataset), batch_size):
    _ = large_dataset[idx:idx + batch_size]
"""

time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(large_dataset)} examples (about {size_gb:.1f} GB) in "
    f"{time:.1f}s, i.e. {size_gb/time:.3f} GB/s"
)

### b) Streaming API

In [11]:
# Loading the dataset
large_dataset_streamed = load_dataset(
"json", data_files=data_files, split="train", streaming=True)

next(iter(large_dataset_streamed))

In [None]:
type(large_dataset_streamed)

In [None]:
# Even tokenized dataset is accessed the same way as loaded_dataset_stream :)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = large_dataset_streamed.map(lambda x: tokenizer(x["text"]))
next(iter(tokenized_dataset))

In [None]:
# Select using take method instead of select method,
# as we can't index into the streamed dataset.

dataset_head = large_dataset_streamed.take(5)
list(dataset_head)

In [None]:
# Slice using skip 

# Skip the first 1,000 examples and include the rest in the training set
train_dataset = large_dataset_streamed.skip(1000)
# Take the first 1,000 examples for the validation set
validation_dataset = large_dataset_streamed.take(1000)