<a href="https://colab.research.google.com/github/tohpedo/wikibot/blob/main/transformers_latest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Environment Set Up


In [1]:
#check if Google Pro is running. From Google Pro Documentation
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')


  gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!
Tue Nov 23 12:47:43 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---

TRAIN

In [2]:
#install hugging face
!pip install pytorch-pretrained-bert pytorch-nlp pytorch_transformers
!pip install folium

#import required packages
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_transformers import BertTokenizer, BertConfig, BertModel
from pytorch_transformers import AdamW, BertForQuestionAnswering
from tqdm import tqdm, trange
import pandas as pd
import io
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline





In [3]:
#import helper functions for squad dataset
!wget 'https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad.py'
!wget 'https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad_evaluate.py'

from utils_squad import (read_squad_examples, convert_examples_to_features,
                         RawResult, write_predictions,
                         RawResultExtended, write_predictions_extended)
from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad, plot_pr_curve

--2021-11-23 12:47:53--  https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 41529 (41K) [text/plain]
Saving to: ‘utils_squad.py.6’


2021-11-23 12:47:53 (13.5 MB/s) - ‘utils_squad.py.6’ saved [41529/41529]

--2021-11-23 12:47:53--  https://raw.githubusercontent.com/nlpyang/pytorch-transformers/master/examples/utils_squad_evaluate.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12493 (12K) [text/plain]
Saving to: ‘utils_squad_evaluate.py.6

In [4]:
#check if GPU is available
def isGPUAvailable():
  #check if GPU is available 
  gpu = tf.test.gpu_device_name()
  if gpu != '/device:GPU:0':
    print('No GPU available')
  else:
    print('GPU is available')

isGPUAvailable()

GPU is available


In [5]:
#mount colab drive
from google.colab import drive
drive.mount('/content/drive/')
!ls

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
drive			   utils_squad_evaluate.py.3  utils_squad.py.2
__pycache__		   utils_squad_evaluate.py.4  utils_squad.py.3
sample_data		   utils_squad_evaluate.py.5  utils_squad.py.4
utils_squad_evaluate.py    utils_squad_evaluate.py.6  utils_squad.py.5
utils_squad_evaluate.py.1  utils_squad.py	      utils_squad.py.6
utils_squad_evaluate.py.2  utils_squad.py.1


In [6]:
#set random seeds
np.random.seed(1234)
torch.manual_seed(1234)

<torch._C.Generator at 0x7f611fa51610>

In [7]:
#read in training data

train_data_file = '/content/drive/My Drive/train-v2.0.json'
train_data_raw = read_squad_examples(input_file=train_data_file, is_training=True, version_2_with_negative=True)

In [8]:
#inspecting first records
sample_records = train_data_raw[:1]
print(sample_records)

[qas_id: 56be85543aeaaa14008c9063, question_text: When did Beyonce start becoming popular?, doc_tokens: [Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".], start_position: 39, end_position: 42,
 qas_id: 56be85543aeaaa14008c9065, question_text: What areas did Beyonce compete in when she was growing up?, doc_tokens: [Beyoncé Giselle Knowles-Carter (/biːˈ

In [9]:
#take first 75,000 rows for training due to RAM limitation
train_data_raw = train_data_raw[:75000]
print("Number of records for training: " + str(len(train_data)))
train_data = pd.DataFrame.from_records([vars(record) for record in train_data_raw])


Number of records for training: 75000


In [14]:
#Set model parameters
batch_size = 64
doc_stride = 128
max_seq_length = 256
max_query_length = 64
bert_base = 1

if bert_base == 1:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
else:
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

model_features = convert_examples_to_features(examples=train_data_raw,
                                        tokenizer=tokenizer,
                                        max_seq_length=max_seq_length,
                                        doc_stride=doc_stride,
                                        max_query_length=max_query_length,
                                        is_training=True)
torch.save(model_features, "/content/drive/My Drive/bert_train")


In [15]:
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                        all_start_positions, all_end_positions,
                        all_cls_index, all_p_mask)

In [16]:
random_sampler = RandomSampler(dataset)
train_dataloader = DataLoader(dataset, sampler=random_sampler, batch_size=batch_size, drop_last=True)

In [17]:
def to_list(tensor):
    return tensor.detach().cpu().tolist()

In [18]:
import glob
checkpoints = sorted(glob.glob('/content/drive/My Drive/checkpoint*-[0-9]*'))

In [19]:
global_step = 0
train_loss_set = []
tr_loss = 0.0

if bert_base == 1:
    model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
else:
    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased')



model.cuda()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

OPTIMIZER


In [20]:
param_optimizer = list(model.named_parameters())
print(param_optimizer[-2])
print(param_optimizer[-1])

('qa_outputs.weight', Parameter containing:
tensor([[-0.0231, -0.0370, -0.0441,  ...,  0.0076, -0.0057,  0.0081],
        [ 0.0022,  0.0065,  0.0223,  ..., -0.0018,  0.0032, -0.0065]],
       device='cuda:0', requires_grad=True))
('qa_outputs.bias', Parameter containing:
tensor([0., 0.], device='cuda:0', requires_grad=True))


In [21]:
#initialize AdamW optimizer for fine tuning
num_epochs=10
lr = 0.001
#lr = 0.01
eps=0.00000001
no_decay = ['bias', 'LayerNorm.weight']
param_grouped = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = AdamW(param_grouped, lr=lr, eps=eps)

Run training

In [None]:
#Train model

# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

print("***** Running training *****")
print("  Num examples = %d" % len(dataset))
print("  Num Epochs = %d" % num_train_epochs)
print("  Batch size = %d" % batch_size)
print("  Total optimization steps = %d" % (len(train_dataloader) // num_train_epochs))

model.zero_grad()
train_iterator = trange(num_epochs, desc="Epoch")

for _ in train_iterator:
    epoch_iterator = tqdm(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
      if step < global_step + 1:
        continue

      model.train()
      batch = tuple(t.to(device) for t in batch)

      inputs = {'input_ids':       batch[0],
                'attention_mask':  batch[1], 
                'token_type_ids':  batch[2],  
                'start_positions': batch[3], 
                'end_positions':   batch[4]}

      outputs = model(**inputs)

      loss = outputs[0]
      train_loss_set.append(loss)
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      tr_loss += loss.item()
      optimizer.step()
      model.zero_grad()
      global_step += 1
    
      if global_step % 1000 == 0:
        print("Train loss: {}".format(tr_loss/global_step))
        output_dir = '/content/drive/My Drive/checkpoint-{}'.format(global_step)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        torch.save(torch.tensor(train_loss_set), os.path.join(output_dir, 'training_loss.pt'))
        print("Saving model checkpoint to %s" % output_dir)

***** Running training *****
  Num examples = 83016
  Num Epochs = 10
  Batch size = 32
  Total optimization steps = 259


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1050.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)

Iteration:   0%|          | 2/2594 [00:00<19:26,  2.22it/s][A
Iteration:   0%|          | 3/2594 [00:01<26:35,  1.62it/s][A
Iteration:   0%|          | 4/2594 [00:02<30:02,  1.44it/s][A
Iteration:   0%|          | 5/2594 [00:03<32:12,  1.34it/s][A
Iteration:   0%|          | 6/2594 [00:04<33:31,  1.29it/s][A
Iteration:   0%|          | 7/2594 [00:05<34:17,  1.26it/s][A
Iteration:   0%|          | 8/2594 [00:05<34:49,  1.24it/s][A
Iteration:   0%|          | 9/2594 [00:06<35:18,  1.22it/s][A
Iteration:   0%|          | 10/2594 [00:07<35:34,  1.21it/s][A
Iteration:   0%|          | 11/2594 [00:08<35:43,  1.21it/s][A
Iteration:   0%|          | 12/2594 [00:09<35:46,  1.20it/s][A
Iteration

Train loss: 1.581760053396225



Iteration:  39%|███▊      | 1001/2594 [14:02<43:21,  1.63s/it][A

Saving model checkpoint to /content/drive/My Drive/checkpoint-1000



Iteration:  39%|███▊      | 1002/2594 [14:03<37:01,  1.40s/it][A
Iteration:  39%|███▊      | 1003/2594 [14:04<32:33,  1.23s/it][A
Iteration:  39%|███▊      | 1004/2594 [14:04<29:29,  1.11s/it][A
Iteration:  39%|███▊      | 1005/2594 [14:05<27:17,  1.03s/it][A
Iteration:  39%|███▉      | 1006/2594 [14:06<25:47,  1.03it/s][A
Iteration:  39%|███▉      | 1007/2594 [14:07<24:41,  1.07it/s][A
Iteration:  39%|███▉      | 1008/2594 [14:08<23:54,  1.11it/s][A
Iteration:  39%|███▉      | 1009/2594 [14:09<23:21,  1.13it/s][A
Iteration:  39%|███▉      | 1010/2594 [14:09<23:10,  1.14it/s][A
Iteration:  39%|███▉      | 1011/2594 [14:10<22:52,  1.15it/s][A
Iteration:  39%|███▉      | 1012/2594 [14:11<22:39,  1.16it/s][A
Iteration:  39%|███▉      | 1013/2594 [14:12<22:32,  1.17it/s][A
Iteration:  39%|███▉      | 1014/2594 [14:13<22:26,  1.17it/s][A
Iteration:  39%|███▉      | 1015/2594 [14:14<22:23,  1.18it/s][A
Iteration:  39%|███▉      | 1016/2594 [14:15<22:18,  1.18it/s][A
Iteration

Train loss: 1.364032013386488



Iteration:  77%|███████▋  | 2001/2594 [28:04<12:47,  1.29s/it][A

Saving model checkpoint to /content/drive/My Drive/checkpoint-2000



Iteration:  77%|███████▋  | 2002/2594 [28:05<11:25,  1.16s/it][A
Iteration:  77%|███████▋  | 2003/2594 [28:06<10:28,  1.06s/it][A
Iteration:  77%|███████▋  | 2004/2594 [28:07<09:48,  1.00it/s][A
Iteration:  77%|███████▋  | 2005/2594 [28:08<09:19,  1.05it/s][A
Iteration:  77%|███████▋  | 2006/2594 [28:08<08:59,  1.09it/s][A
Iteration:  77%|███████▋  | 2007/2594 [28:09<08:47,  1.11it/s][A
Iteration:  77%|███████▋  | 2008/2594 [28:10<08:35,  1.14it/s][A
Iteration:  77%|███████▋  | 2009/2594 [28:11<08:28,  1.15it/s][A
Iteration:  77%|███████▋  | 2010/2594 [28:12<08:22,  1.16it/s][A
Iteration:  78%|███████▊  | 2011/2594 [28:13<08:18,  1.17it/s][A
Iteration:  78%|███████▊  | 2012/2594 [28:14<08:18,  1.17it/s][A
Iteration:  78%|███████▊  | 2013/2594 [28:14<08:15,  1.17it/s][A
Iteration:  78%|███████▊  | 2014/2594 [28:15<08:13,  1.17it/s][A
Iteration:  78%|███████▊  | 2015/2594 [28:16<08:10,  1.18it/s][A
Iteration:  78%|███████▊  | 2016/2594 [28:17<08:08,  1.18it/s][A
Iteration

In [None]:
output_dir = '/content/drive/My Drive/checkpoint-final'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)

In [None]:
plt.figure(figsize=(16,9))
plt.title("Training loss")
plt.xlabel("Batch Number")
plt.ylabel("Loss")
plt.plot(train_loss_set)
plt.show()

In [None]:
test_data = '/content/drive/My Drive/dev-v2.0.json'
val_examples = read_squad_examples(input_file=test_data,
                                is_training=False,
                                version_2_with_negative=True
                                )

cached_features_file = '/content/drive/My Drive/cache_validation'


# Cache features for faster loading
if not os.path.exists(cached_features_file):
  features = convert_examples_to_features(examples=val_examples,
                                        tokenizer=tokenizer,
                                        max_seq_length=max_seq_length,
                                        doc_stride=doc_stride,
                                        max_query_length=max_query_length,
                                        is_training=False)
  torch.save(features, cached_features_file)
else:
  features = torch.load(cached_features_file)

In [None]:
# Convert to Tensors and build dataset
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)

all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                        all_example_index, all_cls_index, all_p_mask)

In [None]:
seq_sampler = SequentialSampler(dataset)
validation_dataloader = DataLoader(dataset, sampler=seq_sampler, batch_size=batch_size, drop_last=True)

TEST MODEL

In [None]:
def evaluate(model, tokenizer):
  print("***** Running evaluation *****")
  print("  Num examples = %d" % len(dataset))
  print("  Batch size = %d" % batch_size)
  all_results = []
  predict_file = '/content/drive/My Drive/dev-v2.0.json'
  for batch in tqdm(validation_dataloader, desc="Evaluating", miniters=100, mininterval=5.0):
    model.eval()
    batch = tuple(t.to(device) for t in batch)
    with torch.no_grad():
      inputs = {'input_ids':      batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2]
                }
      example_indices = batch[3]
      outputs = model(**inputs)

    for i, example_index in enumerate(example_indices):
      eval_feature = features[example_index.item()]
      unique_id = int(eval_feature.unique_id)

      result = RawResult(unique_id    = unique_id,
                         start_logits = to_list(outputs[0][i]),
                         end_logits   = to_list(outputs[1][i]))
      all_results.append(result)

  # Compute predictions
  output_prediction_file = "/content/drive/My Drive/predictions.json"
  output_nbest_file = "/content/drive/My Drive/nbest_predictions.json"
  output_null_log_odds_file = "/content/drive/My Drive/null_odds.json"
  output_dir = "/content/drive/My Drive/predict_results"

  write_predictions(val_examples, features, all_results, 10,
                  30, True, output_prediction_file,
                  output_nbest_file, output_null_log_odds_file, False,
                  True, 0.0)

  # Evaluate with the official SQuAD script
  evaluate_options = EVAL_OPTS(data_file=predict_file,
                               pred_file=output_prediction_file,
                               na_prob_file=output_null_log_odds_file,
                               out_image_dir=None)
  results = evaluate_on_squad(evaluate_options)
  return results

In [None]:
results = evaluate(model, tokenizer)