# Load torchtext and initialize XLM-R model

In [None]:
import torch
import torch.nn as nn
import torchtext

from torchtext.models import RobertaClassificationHead
from torchtext.functional import to_tensor

# XLM-R large encoder bundle, combined with a 2-class classification head
# whose input width is 1024.
xlmr_large = torchtext.models.XLMR_LARGE_ENCODER
classifier_head = RobertaClassificationHead(input_dim=1024, num_classes=2)
model = xlmr_large.get_model(head=classifier_head)

# Switch to inference mode. Better Transformer (BT) requires it, and it also
# reduces runtime without BT, especially for GPU execution.
model.eval()

# Input transform that ships with the XLM-R bundle.
transform = xlmr_large.transform()


# System Information

In [None]:
import platform

# Use the GPU when torch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

cpu = platform.processor()
# torch.cuda.get_device_name() only accepts CUDA devices and raises when handed
# the CPU fallback, so query it only on a CUDA runtime and report a placeholder
# otherwise. This keeps the cell runnable on CPU-only machines.
gpu = (
    torch.cuda.get_device_name(DEVICE)
    if DEVICE.type == "cuda"
    else "none (CUDA not available)"
)

print(f"torch version: {torch.__version__}")
print(f"torch cuda available: {torch.cuda.is_available()}")
print(f"CPU type: {cpu}")
print(f"GPU type: {gpu}")

torch version: 1.12.0+cu113
torch cuda available: True
CPU type: x86_64
GPU type: Tesla T4


# Check default sparsity support setting
Sparsity support enables transformers to skip padding in inputs.

In [None]:
# Read the enable_nested_tensor flag on the encoder's transformer layer stack;
# as the last expression of the cell, its current value is displayed.
model.encoder.transformer.layers.enable_nested_tensor

False

# Benchmark setup

### Define inputs

In [None]:
# Two short sentences of similar length.
small_input_batch = ["Hello world", "How are you!"]

# The same two short sentences followed by one long multi-paragraph passage,
# giving a batch whose entries differ greatly in length.
big_input_batch = small_input_batch + [
    """`Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend, no longer
my 'faithful slave,' as you call yourself! But how do you do? I see
I have frightened you- sit down and tell me all the news.`

It was in July, 1805, and the speaker was the well-known Anna
Pavlovna Scherer, maid of honor and favorite of the Empress Marya
Fedorovna. With these words she greeted Prince Vasili Kuragin, a man
of high rank and importance, who was the first to arrive at her
reception. Anna Pavlovna had had a cough for some days. She was, as
she said, suffering from la grippe; grippe being then a new word in
St. Petersburg, used only by the elite.""",
]

### Select small or big input set

Modify the assignment to `input_batch` below to select either `small_input_batch` or `big_input_batch`, or substitute your own inputs.

In [None]:
# Batch used by every benchmark below; point this at small_input_batch or at
# your own list of strings to try other inputs.
input_batch = big_input_batch

# Apply the XLM-R transform, then build the model input tensor using 1 as the
# padding value.
token_ids = transform(input_batch)
model_input = to_tensor(token_ids, padding_value=1)

# Run one forward pass and display the shape of the result.
output = model(model_input)
output.shape

torch.Size([3, 2])

### Iteration count for performance measurements

In [None]:
# Number of forward passes executed inside each profiled run.
ITERATIONS = 10

# Measure CPU performance with slow and fast path, without and with sparsity

Sparsity support enables transformers to skip padding in inputs.


### CPU performance without BT sparsity

In [None]:
# Turn the enable_nested_tensor flag off so the CPU runs that follow are
# measured without BT sparsity.
model.encoder.transformer.layers.enable_nested_tensor = False

In [None]:
# Slow path: autograd stays enabled while the model runs ITERATIONS times
# under the CPU-only profiler.
print("slow path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=False) as prof:
    for step in range(ITERATIONS):
        output = model(model_input)
summary = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cpu_time_total", row_limit=5
)
print(summary)

# Fast path: the same loop, but with gradient tracking disabled via
# torch.no_grad().
print("fast path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=False) as prof, torch.no_grad():
    for step in range(ITERATIONS):
        output = model(model_input)
summary = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cpu_time_total", row_limit=5
)
print(summary)



slow path:
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::addmm        63.40%       41.691s        64.59%       42.478s      57.403ms           740  
                   aten::mm        21.27%       13.990s        21.27%       13.990s      58.291ms           240  
                aten::copy_         3.58%        2.356s         3.58%        2.356s     875.793us          2690  
                  aten::bmm         2.75%        1.811s         2.75%        1.811s       7.547ms           240  
             aten::_softmax         2.40%        1.578s         2.40%        1.578s       6.573ms           240  
---------------------------  ------------  ------------  ------------  ------

### CPU performance with BT sparsity

In [None]:
# Turn the enable_nested_tensor flag on so the CPU runs that follow are
# measured with BT sparsity.
model.encoder.transformer.layers.enable_nested_tensor = True

In [None]:
# Slow path: autograd stays enabled while the model runs ITERATIONS times
# under the CPU-only profiler.
print("slow path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=False) as prof:
    for step in range(ITERATIONS):
        output = model(model_input)
summary = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cpu_time_total", row_limit=5
)
print(summary)

# Fast path: the same loop, but with gradient tracking disabled via
# torch.no_grad().
print("fast path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=False) as prof, torch.no_grad():
    for step in range(ITERATIONS):
        output = model(model_input)
summary = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cpu_time_total", row_limit=5
)
print(summary)



slow path:
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                aten::addmm        63.66%       41.845s        64.78%       42.575s      57.533ms           740  
                   aten::mm        21.40%       14.067s        21.40%       14.067s      58.614ms           240  
                aten::copy_         3.39%        2.228s         3.39%        2.228s     828.338us          2690  
                  aten::bmm         2.77%        1.824s         2.77%        1.824s       7.599ms           240  
             aten::_softmax         2.29%        1.507s         2.29%        1.507s       6.281ms           240  
---------------------------  ------------  ------------  ------------  ------

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not())


----------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                    Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
----------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             aten::addmm        26.80%        8.664s        27.04%        8.743s      17.487ms           500  
                 aten::_addmm_activation        21.29%        6.884s        22.87%        7.395s      30.814ms           240  
                                aten::mm        15.88%        5.134s        15.88%        5.134s      21.394ms           240  
                   aten::_masked_softmax        12.97%        4.194s        12.98%        4.195s      17.480ms           240  
                               aten::bmm         8.82%        2.852s         8.82%        2.852s       5.941ms 

# Measure DEVICE performance with slow and fast path, without and with sparsity

Please ensure that the runtime has GPUs enabled to see the performance benefits of Better Transformer fastpath execution on GPUs. You can confirm and change the Runtime type in the Google Colab menu with (Runtime > Change Runtime Type)

In [None]:
# Place the model on DEVICE and make sure it is still in inference mode
# (both calls return the module itself, so they can be chained).
model.to(DEVICE).eval()

# Inputs have to live on the same device as the model.
model_input = model_input.to(DEVICE)

### DEVICE performance without BT sparsity

In [None]:
# Turn the enable_nested_tensor flag off so the DEVICE runs that follow are
# measured without BT sparsity.
model.encoder.transformer.layers.enable_nested_tensor=False

In [None]:
# Slow path: autograd stays enabled while the model runs ITERATIONS times
# under the profiler with CUDA timing requested.
print("slow path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for step in range(ITERATIONS):
        output = model(model_input)
summary = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cuda_time_total", row_limit=5
)
print(summary)

# Fast path: the same loop, but with gradient tracking disabled via
# torch.no_grad().
print("fast path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=True) as prof, torch.no_grad():
    for step in range(ITERATIONS):
        output = model(model_input)
summary = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cuda_time_total", row_limit=5
)
print(summary)

slow path:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                               aten::mm         1.01%      14.178ms        44.20%     622.315ms       2.593ms     859.598ms        43.74%     859.598ms       3.582ms           240  
                                            aten::addmm         3.29%      46.383ms         5.19%      73.045ms      98.709us     756.916ms        38.52%     761.477ms       1.029ms           740 

### DEVICE performance with BT sparsity

In [None]:
# Turn the enable_nested_tensor flag on so the DEVICE runs that follow are
# measured with BT sparsity.
model.encoder.transformer.layers.enable_nested_tensor = True

In [None]:
# Make sure both the model and its input are on DEVICE before profiling.
model.to(DEVICE)
model_input = model_input.to(DEVICE)

# Slow path: autograd stays enabled while the model runs ITERATIONS times
# under the profiler with CUDA timing requested.
print("slow path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=True) as prof:
    for step in range(ITERATIONS):
        output = model(model_input)
summary = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cuda_time_total", row_limit=5
)
print(summary)

# Fast path: the same loop, but with gradient tracking disabled via
# torch.no_grad().
print("fast path:", "==========", sep="\n")
with torch.autograd.profiler.profile(use_cuda=True) as prof, torch.no_grad():
    for step in range(ITERATIONS):
        output = model(model_input)
summary = prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cuda_time_total", row_limit=5
)
print(summary)

slow path:
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                            aten::addmm         6.15%      48.067ms         9.02%      70.473ms      95.234us     764.116ms        55.84%     768.719ms       1.039ms           740  
                                               aten::mm         1.40%      10.924ms         1.91%      14.890ms      62.042us     254.936ms        18.63%     254.936ms       1.062ms           240 