
Commit

Merge df41384 into ff11ee6
undertherain committed Jun 17, 2020
2 parents ff11ee6 + df41384 commit 3e1e224
Showing 8 changed files with 206 additions and 131 deletions.
6 changes: 2 additions & 4 deletions benchmarker/modules/problems/bert/data.py
@@ -2,15 +2,13 @@


def get_data(params):
"""generates sinthetic dataset"""
#input size -> cnt_sequences, len_suqence, cnt_dimentsions
# transform into (seq_len, batch) x cnt_batches
assert params["problem"]["size"][0] % params["batch_size"] == 0
params["problem"]["len_sequence"] = params["problem"]["size"][1]
cnt_batches = params["problem"]["size"][0] // params["batch_size"]
shape = (cnt_batches,
params["batch_size"],
params["problem"]["len_sequence"],
params["batch_size"])
)
X = np.random.random(shape).astype(np.int64)
Y = np.ones((cnt_batches, params["batch_size"]))
return X, Y
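
For reference, a minimal sketch of what the updated get_data produces, assuming the batch-first layout the HuggingFace model below expects (the param values here are illustrative):

from benchmarker.modules.problems.bert.data import get_data

params = {"batch_size": 2, "problem": {"size": [4, 8]}}  # 4 sequences of length 8
X, Y = get_data(params)
# X: int64 token ids of shape (cnt_batches, batch_size, len_sequence) == (2, 2, 8)
# Y: dummy labels of shape (cnt_batches, batch_size) == (2, 2)
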
149 changes: 26 additions & 123 deletions benchmarker/modules/problems/bert/pytorch.py
@@ -1,135 +1,38 @@
"""
This is transformer-based language model for benchmarker
it is based on the torch sample code and is not identical
to the original BERT model from Vaswani et al., 2017 paper.
It should, however, expose similar performace behaviour.
Multuple parameters can be specified for this model:
number of layers, attention heads, hidden size etc.
One thing to keep in mind is that this should not be used
for comparison between different framworks.
Hopefully this should be fixed when we start importing models from ONNX
"""

import argparse
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
r"""Inject some information about the relative or absolute position of the tokens
in the sequence. The positional encodings have the same dimension as
the embeddings, so that the two can be summed. Here, we use sine and cosine
functions of different frequencies.
.. math::
\text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
\text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
\text{where pos is the word position and i is the embed idx}
Args:
d_model: the embed dim (required).
dropout: the dropout value (default=0.1).
max_len: the max. length of the incoming sequence (default=5000).
Examples:
>>> pos_encoder = PositionalEncoding(d_model)
"""

def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)

pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)

def forward(self, x):
r"""Inputs of forward function
Args:
x: the sequence fed to the positional encoder model (required).
Shape:
x: [sequence length, batch size, embed dim]
output: [sequence length, batch size, embed dim]
Examples:
>>> output = pos_encoder(x)
"""
from transformers import AutoModelForSequenceClassification, AutoConfig

x = x + self.pe[:x.size(0), :]
return self.dropout(x)

config = AutoConfig.from_pretrained(
"bert-base-uncased",
num_labels=3)

class TransformerModel(nn.Module):
"""Container module with an encoder, a recurrent or transformer module, and a decoder."""

def __init__(self, ntokens, ninp, nhead, nhid, nlayers, dropout=0.5):
super(TransformerModel, self).__init__()
try:
from torch.nn import TransformerEncoder, TransformerEncoderLayer
except ImportError:
raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
self.model_type = 'Transformer'
self.src_mask = None
self.pos_encoder = PositionalEncoding(ninp, dropout)
encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
self.encoder = nn.Embedding(ntokens, ninp)
self.ninp = ninp
self.decoder = nn.Linear(ninp, ntokens)
self.ntokens = ntokens
self.init_weights()
class BertTraining(nn.Module):
def __init__(self, net):
super().__init__()
self.net = net

def _generate_square_subsequent_mask(self, sz):
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask
def __call__(self, x, t):
loss, _logits = self.net(input_ids=x,
labels=t)
return loss

def init_weights(self):
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
self.decoder.bias.data.zero_()
self.decoder.weight.data.uniform_(-initrange, initrange)

def forward(self, src, has_mask=True):
if has_mask:
device = src.device
if self.src_mask is None or self.src_mask.size(0) != len(src):
mask = self._generate_square_subsequent_mask(len(src)).to(device)
self.src_mask = mask
else:
self.src_mask = None
class BertInference(nn.Module):
def __init__(self, net):
super().__init__()
self.net = net

src = self.encoder(src) * math.sqrt(self.ninp)
src = self.pos_encoder(src)
output = self.transformer_encoder(src, self.src_mask)
output = output[-1]
output = self.decoder(output)
# TODO: return softmax or cross_entropy depending on the mode
return output
# return F.log_softmax(output, dim=-1)
def __call__(self, x):
logits = self.net(input_ids=x)
return logits


def get_kernel(params, unparsed_args=None):
# assert params["mode"] == "inference"
parser = argparse.ArgumentParser(description='Benchmark transformer kernel')
parser.add_argument('--cnt_units', type=int, default=512)
parser.add_argument('--cnt_heads', type=int, default=8)
parser.add_argument('--cnt_layers', type=int, default=1)
parser.add_argument('--cnt_tokens', type=int, default=1000)
parser.add_argument('--bidirectional', type=bool, default=False)
args = parser.parse_args(unparsed_args)
params["problem"].update(vars(args))
# print(params["problem"])
# TODO: use cnt_tokens in data generation as max rand!
Net = TransformerModel(ntokens=params["problem"]["cnt_tokens"],
ninp=params["problem"]["cnt_units"],
nhead=params["problem"]["cnt_heads"],
nhid=params["problem"]["cnt_units"],
nlayers=params["problem"]["cnt_layers"],
dropout=0.5)
return Net
assert unparsed_args == []
net = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", config=config)
if params["mode"] == "inference":
return BertInference(net)
else:
return BertTraining(net)
return net
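
A minimal sketch of how the new wrappers are driven, assuming the tuple-style outputs of the transformers library of this era (calling the model with labels returns (loss, logits)); note that from_pretrained downloads the weights on first use:

import torch

kernel = get_kernel({"mode": "inference"}, unparsed_args=[])
x = torch.zeros(2, 8, dtype=torch.int64)  # (batch_size, len_sequence) token ids
logits = kernel(x)                        # BertInference: the raw model outputs

kernel = get_kernel({"mode": "training"}, unparsed_args=[])
t = torch.ones(2, dtype=torch.int64)      # one of num_labels=3 classes per sequence
loss = kernel(x, t)                       # BertTraining: the scalar loss
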
Empty file.
16 changes: 16 additions & 0 deletions benchmarker/modules/problems/bert_custom/data.py
@@ -0,0 +1,16 @@
import numpy as np


def get_data(params):
"""generates sinthetic dataset"""
#input size -> cnt_sequences, len_suqence, cnt_dimentsions
# transform into (seq_len, batch) x cnt_batches
assert params["problem"]["size"][0] % params["batch_size"] == 0
params["problem"]["len_sequence"] = params["problem"]["size"][1]
cnt_batches = params["problem"]["size"][0] // params["batch_size"]
shape = (cnt_batches,
params["problem"]["len_sequence"],
params["batch_size"])
X = np.random.random(shape).astype(np.int64)
Y = np.ones((cnt_batches, params["batch_size"]))
return X, Y
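
Note that np.random.random draws floats in [0, 1), so the int64 cast above yields all-zero token ids; the TODO in pytorch.py ("use cnt_tokens in data generation as max rand") presumably points at something like this sketch:

X = np.random.randint(0, params["problem"]["cnt_tokens"], size=shape, dtype=np.int64)
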
135 changes: 135 additions & 0 deletions benchmarker/modules/problems/bert_custom/pytorch.py
@@ -0,0 +1,135 @@
"""
This is transformer-based language model for benchmarker
it is based on the torch sample code and is not identical
to the original BERT model from Vaswani et al., 2017 paper.
It should, however, expose similar performace behaviour.
Multuple parameters can be specified for this model:
number of layers, attention heads, hidden size etc.
One thing to keep in mind is that this should not be used
for comparison between different framworks.
Hopefully this should be fixed when we start importing models from ONNX
"""

import argparse
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
r"""Inject some information about the relative or absolute position of the tokens
in the sequence. The positional encodings have the same dimension as
the embeddings, so that the two can be summed. Here, we use sine and cosine
functions of different frequencies.
.. math::
\text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
\text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
\text{where pos is the word position and i is the embed idx}
Args:
d_model: the embed dim (required).
dropout: the dropout value (default=0.1).
max_len: the max. length of the incoming sequence (default=5000).
Examples:
>>> pos_encoder = PositionalEncoding(d_model)
"""

def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)

pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)

def forward(self, x):
r"""Inputs of forward function
Args:
x: the sequence fed to the positional encoder model (required).
Shape:
x: [sequence length, batch size, embed dim]
output: [sequence length, batch size, embed dim]
Examples:
>>> output = pos_encoder(x)
"""

x = x + self.pe[:x.size(0), :]
return self.dropout(x)


class TransformerModel(nn.Module):
"""Container module with an encoder, a recurrent or transformer module, and a decoder."""

def __init__(self, ntokens, ninp, nhead, nhid, nlayers, dropout=0.5):
super(TransformerModel, self).__init__()
try:
from torch.nn import TransformerEncoder, TransformerEncoderLayer
except ImportError:
raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
self.model_type = 'Transformer'
self.src_mask = None
self.pos_encoder = PositionalEncoding(ninp, dropout)
encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
self.encoder = nn.Embedding(ntokens, ninp)
self.ninp = ninp
self.decoder = nn.Linear(ninp, ntokens)
self.ntokens = ntokens
self.init_weights()

def _generate_square_subsequent_mask(self, sz):
mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
return mask

def init_weights(self):
initrange = 0.1
self.encoder.weight.data.uniform_(-initrange, initrange)
self.decoder.bias.data.zero_()
self.decoder.weight.data.uniform_(-initrange, initrange)

def forward(self, src, has_mask=True):
if has_mask:
device = src.device
if self.src_mask is None or self.src_mask.size(0) != len(src):
mask = self._generate_square_subsequent_mask(len(src)).to(device)
self.src_mask = mask
else:
self.src_mask = None

src = self.encoder(src) * math.sqrt(self.ninp)
src = self.pos_encoder(src)
output = self.transformer_encoder(src, self.src_mask)
output = output[-1]
output = self.decoder(output)
# TODO: return softmax or cross_entropy depending on the mode
return output
# return F.log_softmax(output, dim=-1)
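
A quick numerical check of the formulas in the PositionalEncoding docstring (a minimal sketch with d_model=4; the registered buffer has shape (max_len, 1, d_model)):

import math
import torch

pe = PositionalEncoding(d_model=4, dropout=0.0)
# pos=3, 2i=0: sin(3 / 10000**(0/4)) == sin(3)
assert torch.isclose(pe.pe[3, 0, 0], torch.tensor(math.sin(3.0)))
# pos=3, 2i+1=1: cos(3 / 10000**(0/4)) == cos(3)
assert torch.isclose(pe.pe[3, 0, 1], torch.tensor(math.cos(3.0)))
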


def get_kernel(params, unparsed_args=None):
# assert params["mode"] == "inference"
parser = argparse.ArgumentParser(description='Benchmark transformer kernel')
parser.add_argument('--cnt_units', type=int, default=512)
parser.add_argument('--cnt_heads', type=int, default=8)
parser.add_argument('--cnt_layers', type=int, default=1)
parser.add_argument('--cnt_tokens', type=int, default=1000)
parser.add_argument('--bidirectional', type=bool, default=False)
args = parser.parse_args(unparsed_args)
params["problem"].update(vars(args))
# print(params["problem"])
# TODO: use cnt_tokens in data generation as max rand!
Net = TransformerModel(ntokens=params["problem"]["cnt_tokens"],
ninp=params["problem"]["cnt_units"],
nhead=params["problem"]["cnt_heads"],
nhid=params["problem"]["cnt_units"],
nlayers=params["problem"]["cnt_layers"],
dropout=0.5)
return Net
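
For reference, a minimal forward-pass sketch (illustrative sizes): the input is (seq_len, batch) token ids, and since only the last time step is decoded, the output is (batch, ntokens):

import torch

model = TransformerModel(ntokens=1000, ninp=128, nhead=4, nhid=128, nlayers=1, dropout=0.5)
src = torch.randint(0, 1000, (8, 2))  # (len_sequence, batch_size)
out = model(src)
print(out.shape)  # torch.Size([2, 1000])
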
6 changes: 2 additions & 4 deletions test/pytorch/test_bert.py
@@ -15,10 +15,8 @@ def test_bert(self):
self.name,
"--framework=pytorch",
"--problem=bert",
"--problem_size=32,32",
"--batch_size=8",
"--problem_size=4,8",
"--batch_size=2",
"--nb_epoch=1",
"--mode=inference",
"--cnt_units=128",
"--cnt_heads=4",
)
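
With --problem_size=4,8 and --batch_size=2, the data generator's divisibility assert holds (4 % 2 == 0) and the run covers cnt_batches = 4 // 2 = 2 batches of two length-8 sequences, keeping the test fast now that it loads a full pretrained bert-base-uncased.
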
24 changes: 24 additions & 0 deletions test/pytorch/test_bert_custom.py
@@ -0,0 +1,24 @@
import logging
import unittest

from ..helpers import run_module

logging.basicConfig(level=logging.DEBUG)


class PytorchBertTest(unittest.TestCase):
def setUp(self):
self.name = "benchmarker"

def test_bert(self):
run_module(
self.name,
"--framework=pytorch",
"--problem=bert_custom",
"--problem_size=32,32",
"--batch_size=8",
"--nb_epoch=1",
"--mode=inference",
"--cnt_units=128",
"--cnt_heads=4",
)
1 change: 1 addition & 0 deletions test_requirements.txt
@@ -7,3 +7,4 @@ pylint
tensorflow
torch
torchvision
transformers
