

## Importing libraries


In [None]:
import gc
import shutil

import torch
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import BertForQuestionAnswering

## Initializing the model

Loading the BERT-base multilingual cased pre-trained model from Hugging Face Transformers

In [None]:
# Identifier of the pre-trained checkpoint passed to `from_pretrained`.
MODEL_NAME = "bert-base-multilingual-cased"

# Build the question-answering model from that checkpoint.
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME)

Analyzing the model configuration

In [None]:
# Show the configuration instance attached to the loaded model.
# `model.config_class()` would instead construct a brand-new config object with
# library defaults, which does not describe the checkpoint that was loaded.
model.config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

Defining a helper that saves model checkpoints

In [None]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar', best_filename='model_best.pth.tar'):
    """Write a checkpoint to disk and optionally keep a copy as the best one.

    Args:
        state: Any object that ``torch.save`` can serialize.
        is_best: When truthy, the file just written is also copied to
            ``best_filename``.
        filename: Path the checkpoint is written to.
        best_filename: Path the copy is written to when ``is_best`` is truthy.

    Returns:
        None.
    """
    torch.save(state, filename)
    if is_best:
        # Copy the file on disk instead of serializing `state` a second time.
        shutil.copyfile(filename, best_filename)

## Model training

Training configuration

In [None]:
# Run the garbage collector before training; the author uses this to reduce the
# chance of a "CUDA out of memory" error.
gc.collect()

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model onto that device and put it in training mode.
model.to(device).train()

# NOTE(review): `train_dataset` is not defined in any of the visible cells —
# confirm it is created before this cell runs.
# Batch size is 1 because the model is large; examples are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=1)

# AdamW optimizer over all model parameters with a learning rate of 5e-5.
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(10):  # 10 passes over the training data
    for batch in train_loader:
        # Clear gradients accumulated by the previous step.
        optim.zero_grad()

        # Move every tensor the model needs onto the training device.
        # (Per the author's notes: token ids, a 0/1 attention mask, and the
        # start/end positions of the answer span.)
        input_ids, attention_mask, start_positions, end_positions = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'start_positions', 'end_positions')
        )

        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            start_positions=start_positions,
            end_positions=end_positions,
        )

        # The first element of the model output is used as the training loss.
        loss = outputs[0]
        loss.backward()  # backpropagation
        optim.step()  # parameter update

# Save the trained weights, then read them back into the model.
filepath = '/content/model.pth'
torch.save(model.state_dict(), filepath)
model.load_state_dict(torch.load(filepath))

# Switch to evaluation mode. As the last expression of the cell, the returned
# model is what gets displayed as the cell output.
model.eval()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise