# **Generating BERT Embeddings**

In [None]:
!pip install torch==1.4.0

Collecting torch==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/1a/3b/fa92ece1e58a6a48ec598bab327f39d69808133e5b2fb33002ca754e381e/torch-1.4.0-cp37-cp37m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 16kB/s 
[31mERROR: torchvision 0.9.1+cu101 has requirement torch==1.8.1, but you'll have torch 1.4.0 which is incompatible.[0m
[31mERROR: torchtext 0.9.1 has requirement torch==1.8.1, but you'll have torch 1.4.0 which is incompatible.[0m
[?25hInstalling collected packages: torch
  Found existing installation: torch 1.8.1+cu101
    Uninstalling torch-1.8.1+cu101:
      Successfully uninstalled torch-1.8.1+cu101
Successfully installed torch-1.4.0


In [None]:
!pip install transformers==3.5.1



In [None]:
from transformers import BertModel
import torch

In [None]:
from transformers import BertTokenizer

In [None]:
model = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




## **Preprocessing the input**

### **Define the sentence**

In [None]:
sentence = 'I love Paris'

### **Tokenize the sentence and obtain the tokens**

In [None]:
tokens = tokenizer.tokenize(sentence)

### **Let's print the tokens:**

In [None]:
print(tokens)

['i', 'love', 'paris']


### **Now, we will add the [CLS] token at the begining and [SEP] token at the end of the tokens list:**

In [None]:
tokens = ['[CLS]'] + tokens + ['[SEP]']

### **Let's look at our updated tokens list:**

In [None]:
print(tokens)

['[CLS]', 'i', 'love', 'paris', '[SEP]']


### **As we can observe, we have [CLS] token at the begining and [SEP] token at the end of our tokens list. We can also observe the length of our tokens is 5.**



### **Say, we need to keep the length of our tokens list to 7, then, in that case, we will add two [PAD] tokens at the end as shown in the following: **

In [None]:
tokens = tokens + ['[PAD]'] + ['[PAD]']

### **Let's print our outdated tokens list**

In [None]:
print(tokens)

['[CLS]', 'i', 'love', 'paris', '[SEP]', '[PAD]', '[PAD]']


### **As we can observe, now we have the tokens list consists of [PAD] tokens and the length of our tokens list is 7**

### **Next, we create the attention mask. We set the attention mask value of 1 if the tokens is not a [PAD] token else we will set the attention mask to 0 as shown below:**

In [None]:
attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens]

### **Let's print attent_mask**

In [None]:
print(attention_mask)

[1, 1, 1, 1, 1, 0, 0]


### **As we can observe, we have attention mask value 0 at the position where have [PAD] token to their token_ids as shown below:**

In [None]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

### **Let's have a look at token_ids**

In [None]:
print(token_ids)

[101, 1045, 2293, 3000, 102, 0, 0]


### **From the above output, we can observe that each token is mapped to a unique token id.**

### **Now, we convert the token_ids and attention_mask to tensors as shown below:**

In [None]:
token_ids = torch.tensor(token_ids).unsqueeze(0)
attention_mask = torch.tensor(attention_mask).unsqueeze(0)

## **Getting the Embeddings**

In [None]:
hidden_rep, cls_head = model(token_ids, attention_mask=attention_mask)

In [None]:
print(hidden_rep.shape)

torch.Size([1, 7, 768])


In [None]:
print(cls_head.shape)

torch.Size([1, 768])


# **Extracting Embeddings from All the Encoders**

In [None]:
from transformers import BertModel, BertTokenizer
import torch

In [None]:
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## **Preprocess the Input**

### **Let's consider the same sentence we saw in the previous section. First, we tokenize the sentence and add [CLS] token at the begining [SEP] at the end.**

In [None]:
sentence = 'I love Paris'
tokens = tokenizer.tokenize(sentence)
tokens = ['[CLS]'] + tokens + ['[SEP]']

### **Suppose, we need to keep the token length to 7. So, we add the [PAD] tokens and also define the attention mask:**

In [None]:
tokens = tokenizer.tokenize(sentence)
tokens = ['[CLS]'] + tokens + ['[SEP]']
tokens

['[CLS]', 'i', 'love', 'paris', '[SEP]']

### **Suppose, we need to keep the token length to 7. So, we add the [PAD] tokens and also define the attention mask**

In [None]:
tokens = tokens + ['[PAD]'] + ['[PAD]']
tokens

['[CLS]', 'i', 'love', 'paris', '[SEP]', '[PAD]', '[PAD]']

In [None]:
attention_mask = [1 if i!= '[PAD]' else 0 for i in tokens]

### **Next Convert the Tokens to their token_ids:**

In [None]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)

In [None]:
token_ids

[101, 1045, 2293, 3000, 102, 0, 0]

### **Now we convert the token_ids and attention_mask to tensor**

In [None]:
token_ids = torch.tensor(token_ids).unsqueeze(0)
token_ids

tensor([[ 101, 1045, 2293, 3000,  102,    0,    0]])

In [None]:
attention_mask

[1, 1, 1, 1, 1, 0, 0]

In [None]:
attention_mask = torch.tensor(attention_mask).unsqueeze(0)
attention_mask

tensor([[1, 1, 1, 1, 1, 0, 0]])

## **Getting the Embeddings**

In [None]:
last_hidden_state, pooler_output, hidden_states = model(token_ids, attention_mask=attention_mask)

In [None]:
last_hidden_state.shape

torch.Size([1, 7, 768])

In [None]:
pooler_output.shape

torch.Size([1, 768])

In [None]:
len(hidden_states)

13

In [None]:
type(hidden_states)

tuple

In [None]:
hidden_states[0].shape

torch.Size([1, 7, 768])

In [None]:
torch.Size([1, 7, 768])

torch.Size([1, 7, 768])

In [None]:
hidden_states[1].shape

torch.Size([1, 7, 768])

# **Text Classification**

In [None]:
!pip install torch==1.4.0

Collecting torch==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/1a/3b/fa92ece1e58a6a48ec598bab327f39d69808133e5b2fb33002ca754e381e/torch-1.4.0-cp37-cp37m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 21kB/s 
[31mERROR: torchvision 0.9.1+cu101 has requirement torch==1.8.1, but you'll have torch 1.4.0 which is incompatible.[0m
[31mERROR: torchtext 0.9.1 has requirement torch==1.8.1, but you'll have torch 1.4.0 which is incompatible.[0m
[?25hInstalling collected packages: torch
  Found existing installation: torch 1.8.1+cu101
    Uninstalling torch-1.8.1+cu101:
      Successfully uninstalled torch-1.8.1+cu101
Successfully installed torch-1.4.0


In [None]:
!pip install transformers==3.5.1

Collecting transformers==3.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 8.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 33.7MB/s 
[?25hCollecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/f2/e2/813dff3d72df2f49554204e7e5f73a3dc0f0eb1e3958a4cad3ef3fb278b7/sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 51.7MB/s 
[?25hCollecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/7b/ac/f5ba028f0f097d855e1541301e946d4672eb0f30b6e25cb2369075f916d2/tokenizers-0.9.3-cp37-cp37m-manylinux1_x

In [None]:
!pip install nlp==0.4.0

Collecting nlp==0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/09/e3/bcdc59f3434b224040c1047769c47b82705feca2b89ebbc28311e3764782/nlp-0.4.0-py3-none-any.whl (1.7MB)
[K     |▏                               | 10kB 23.8MB/s eta 0:00:01[K     |▍                               | 20kB 15.2MB/s eta 0:00:01[K     |▋                               | 30kB 13.5MB/s eta 0:00:01[K     |▉                               | 40kB 12.4MB/s eta 0:00:01[K     |█                               | 51kB 7.7MB/s eta 0:00:01[K     |█▏                              | 61kB 7.2MB/s eta 0:00:01[K     |█▍                              | 71kB 8.1MB/s eta 0:00:01[K     |█▋                              | 81kB 8.9MB/s eta 0:00:01[K     |█▉                              | 92kB 8.5MB/s eta 0:00:01[K     |██                              | 102kB 7.4MB/s eta 0:00:01[K     |██▏                             | 112kB 7.4MB/s eta 0:00:01[K     |██▍                             | 122kB 7.4MB/s eta 

In [None]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np

### **Load the model and dataset. First let's download and load the dataset using the nlp library**

In [None]:
!gdown https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-

Downloading...
From: https://drive.google.com/uc?id=11_M4ootuT7I1G0RlihcC0cA3Elqotlc-
To: /content/imdbs.csv
  0% 0.00/132k [00:00<?, ?B/s]100% 132k/132k [00:00<00:00, 41.1MB/s]


In [None]:
dataset = load_dataset('csv', data_files='./imdbs.csv', split='train')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2749.0, style=ProgressStyle(description…




Using custom data configuration default


Downloading and preparing dataset csv/default-11046c2826f07a01 (download: Unknown size, generated: Unknown size, post-processed: Unknown sizetotal: Unknown size) to /root/.cache/huggingface/datasets/csv/default-11046c2826f07a01/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-11046c2826f07a01/0.0.0/ede98314803c971fef04bcee45d660c62f3332e8a74491e0b876106f3d99bd9b. Subsequent calls will reuse this data.


### **Let's us check the datatype:**

In [None]:
type(dataset)

nlp.arrow_dataset.Dataset

### **Next let's split the dataset into train and test set**

In [None]:
dataset = dataset.train_test_split(test_size=0.3)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




### **Let's print the dataset**

In [None]:
dataset

{'test': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 30),
 'train': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 70)}

### **Now we create the train and test sets**

In [None]:
train_set = dataset['train']
test_set = dataset['test']

### **Next, lets download and load the pre-trained BERT model. In this example, we use the pre-trained bert-base-uncased model. As we can observe below, since we are performing sequence classification, we use the BertForSequenceClassification class:**

In [None]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

### **Next, we download and load the tokenizer which is used for pretraining the bert-base-uncased model. As we can observe, we create the tokenizer using the BertTokenizerFast Class intead of BertTokenizer. The BertTokenizerFast class has many advantages compared to BertTokenizer.**

In [None]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




### **Instead of doing tokenizing, segmentation and padding manually, our tokenizer will do it automatically**

In [None]:
tokenizer('I love Paris')

{'input_ids': [101, 1045, 2293, 3000, 102], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}

### **Set padding to True and sequence length to 5**

In [None]:
tokenizer(['I love Paris', 'birds fly', 'snow fall'], padding=True, max_length=5)

{'input_ids': [[101, 1045, 2293, 3000, 102], [101, 5055, 4875, 102, 0], [101, 4586, 2991, 102, 0]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 0], [1, 1, 1, 1, 0]]}

### **Now we define our preprocess function for our dataset**

In [None]:
def preprocess(data):
  return tokenizer(data['text'], padding=True, truncation=True)

### **Now preprocess the train and test data**

In [None]:
train_set = train_set.map(preprocess, batched=True, batch_size=len(train_set))
test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set))

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




### **Now, we use set_format function and select the columns which we need in our dataset and in which format we need**

In [None]:
train_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

## **Training the Model**

### **Define the Batch Size and Epoch Size**

In [None]:
batch_size = 8
epochs = 2

### **Define WarmUp Steps and Weight Decay**

In [None]:
warmup_steps = 500
weight_decay = 0.01

### **Define the Training Arguments**

In [None]:
training_args = TrainingArguments(
    output_dir = './results',
    num_train_epochs = epochs,
    per_device_train_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    evaluate_during_training=True,
    logging_dir='./logs',
)



### **Now define the trainer**

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set
)

### **Start Training the Model**

In [None]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=18, training_loss=0.6974869834052192)

### **After Training we can evaluate the model using the evaluate function**

In [None]:
trainer.evaluate()

{'epoch': 2.0, 'eval_loss': 0.7163257598876953}

## **Q & A Fine Tune BERT**

In [1]:
!pip install torch==1.4.0



In [3]:
!pip install transformers==3.5.1



In [4]:
import torch 
from transformers import BertForQuestionAnswering, BertTokenizer

### **Now we download and load the model. We use the bert-language-large-uncased-whole-word-masking-finetuned-squad model which is finetunred on the SQUAD (Standford Question Answering DataSet)**

In [5]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…




### **Next we download and load the tokenizer which is used for pretraining the bert-large-uncased-whole-word-masking-finetuned-squad model:**

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




## **Preprocessing the Input**

### **First, we define the input to the BERT which is question and paragraph text**

In [7]:
question = "What is the immune system?"
paragraph = "The immune system is a system of many biological structures and processes within an organism that protects against disease. To function properly, an immune system must detect a wide variety of agents, known as pathogens, from viruses to parasitic worms, and distinguish them from the organism's own healthy tissue."

### **Add [CLS] token to the begining of the question and [SEP] token at the end of both the question and paragraph:**

In [8]:
question = '[CLS]' + question + '[SEP]'
paragraph = paragraph + '[SEP]'

### **Now, tokenize the question and paragraph**

In [9]:
question_tokens = tokenizer.tokenize(question)
paragraph_tokens = tokenizer.tokenize(paragraph)

### **Combine the Question and Paragraph Tokens and Convet Them into input_ids**

In [10]:
tokens = question_tokens + paragraph_tokens

In [11]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)

### **Next, we define the segment_ids. The segment_ids will be 0 for all the tokens of question and it will be 1 for all the tokens of the paragraph**

In [12]:
segment_ids = [0] * len(question_tokens)
segment_ids


[0, 0, 0, 0, 0, 0, 0, 0]

In [13]:
segment_ids = segment_ids + [1] * len(paragraph_tokens)
segment_ids

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

### **Now, we convert the input_ids and segment_ids to tensor:**

In [14]:
input_ids = torch.tensor([input_ids])
segment_ids = torch.tensor([segment_ids])

## **Getting the Answer**

### **We feed the input_ids and segment_ids to the model which returns the start score and end score for all of the tokens**

In [15]:
start_scores, end_scores = model(input_ids, token_type_ids=segment_ids)

### **Now, we select the start_index which is the index of the token which has a maximum start score and end_index which is the index of the token**

In [16]:
start_index = torch.argmax(start_scores)
end_index = torch.argmax(end_scores)

### **Now, we print the text span between the start and end index as our answer:**

In [17]:
print(' '.join(tokens[start_index:end_index+1]))

a system of many biological structures and processes within an organism that protects against disease
