Showing 8 changed files with 206 additions and 131 deletions.
@@ -1,135 +1,38 @@
"""
This is a transformer-based language model for benchmarker.
It is based on the torch sample code and is not identical
to the original BERT model (Devlin et al., 2018).
It should, however, exhibit similar performance behaviour.
Multiple parameters can be specified for this model:
number of layers, attention heads, hidden size, etc.
One thing to keep in mind is that this should not be used
for comparison between different frameworks.
Hopefully this will be fixed when we start importing models from ONNX.
"""

import argparse
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
    in the sequence. The positional encodings have the same dimension as
    the embeddings, so that the two can be summed. Here, we use sine and cosine
    functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx}
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """
from transformers import AutoModelForSequenceClassification, AutoConfig

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

config = AutoConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=3)

class TransformerModel(nn.Module):
    """Container module with an encoder, a recurrent or transformer module, and a decoder."""

    def __init__(self, ntokens, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntokens, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntokens)
        self.ntokens = ntokens
        self.init_weights()
class BertTraining(nn.Module):
    def __init__(self, net):
        super().__init__()
        self.net = net

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
    def __call__(self, x, t):
        loss, _logits = self.net(input_ids=x,
                                 labels=t)
        return loss

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None
class BertInference(nn.Module):
    def __init__(self, net):
        super().__init__()
        self.net = net

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = output[-1]
        output = self.decoder(output)
        # TODO: return softmax or cross_entropy depending on the mode
        return output
        # return F.log_softmax(output, dim=-1)
    def __call__(self, x):
        logits = self.net(input_ids=x)
        return logits


def get_kernel(params, unparsed_args=None):
    # assert params["mode"] == "inference"
    parser = argparse.ArgumentParser(description='Benchmark transformer kernel')
    parser.add_argument('--cnt_units', type=int, default=512)
    parser.add_argument('--cnt_heads', type=int, default=8)
    parser.add_argument('--cnt_layers', type=int, default=1)
    parser.add_argument('--cnt_tokens', type=int, default=1000)
    parser.add_argument('--bidirectional', type=bool, default=False)
    args = parser.parse_args(unparsed_args)
    params["problem"].update(vars(args))
    # print(params["problem"])
    # TODO: use cnt_tokens in data generation as max rand!
    Net = TransformerModel(ntokens=params["problem"]["cnt_tokens"],
                           ninp=params["problem"]["cnt_units"],
                           nhead=params["problem"]["cnt_heads"],
                           nhid=params["problem"]["cnt_units"],
                           nlayers=params["problem"]["cnt_layers"],
                           dropout=0.5)
    return Net
    assert unparsed_args == []
    net = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", config=config)
    if params["mode"] == "inference":
        return BertInference(net)
    else:
        return BertTraining(net)
    return net
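For reference, a minimal usage sketch (not part of the commit) of the new BERT kernel above. The params layout is an assumption based only on the keys that get_kernel reads; the vocabulary size comes from the "bert-base-uncased" checkpoint.

import torch

params = {"mode": "inference", "problem": {}, "batch_size": 8}
kernel = get_kernel(params, unparsed_args=[])  # returns BertInference in this mode

# 30522 is the bert-base-uncased vocabulary size; a (batch, seq_len) layout is assumed here.
input_ids = torch.randint(0, 30522, (params["batch_size"], 32))
with torch.no_grad():
    out = kernel(input_ids)
# Depending on the installed transformers version, `out` is either a tuple
# containing the logits or a model-output object; the logits themselves have
# shape (batch_size, num_labels) == (8, 3) with the config above.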
Empty file.
@@ -0,0 +1,16 @@
import numpy as np


def get_data(params):
    """generates synthetic dataset"""
    # input size -> cnt_sequences, len_sequence, cnt_dimensions
    # transform into (seq_len, batch) x cnt_batches
    assert params["problem"]["size"][0] % params["batch_size"] == 0
    params["problem"]["len_sequence"] = params["problem"]["size"][1]
    cnt_batches = params["problem"]["size"][0] // params["batch_size"]
    shape = (cnt_batches,
             params["problem"]["len_sequence"],
             params["batch_size"])
    X = np.random.random(shape).astype(np.int64)
    Y = np.ones((cnt_batches, params["batch_size"]))
    return X, Y
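A quick sketch (not part of the commit) of what this generator returns, assuming the params keys used above. Note that casting uniform floats from [0, 1) to int64 makes every token id zero; a randint-based draw is shown as a hypothetical alternative for non-degenerate ids (see the cnt_tokens TODO in the model file).

import numpy as np

params = {"batch_size": 8, "problem": {"size": (32, 32)}}
X, Y = get_data(params)
print(X.shape, Y.shape)  # (4, 32, 8) and (4, 8); X is all zeros here

# Hypothetical alternative: integer token ids drawn up to a vocabulary size.
X_alt = np.random.randint(0, 1000, size=X.shape, dtype=np.int64)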
@@ -0,0 +1,135 @@
"""
This is a transformer-based language model for benchmarker.
It is based on the torch sample code and is not identical
to the original BERT model (Devlin et al., 2018).
It should, however, exhibit similar performance behaviour.
Multiple parameters can be specified for this model:
number of layers, attention heads, hidden size, etc.
One thing to keep in mind is that this should not be used
for comparison between different frameworks.
Hopefully this will be fixed when we start importing models from ONNX.
"""

import argparse
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
    in the sequence. The positional encodings have the same dimension as
    the embeddings, so that the two can be summed. Here, we use sine and cosine
    functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx}
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Container module with an encoder, a recurrent or transformer module, and a decoder."""

    def __init__(self, ntokens, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntokens, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntokens)
        self.ntokens = ntokens
        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = output[-1]
        output = self.decoder(output)
        # TODO: return softmax or cross_entropy depending on the mode
        return output
        # return F.log_softmax(output, dim=-1)


def get_kernel(params, unparsed_args=None):
    # assert params["mode"] == "inference"
    parser = argparse.ArgumentParser(description='Benchmark transformer kernel')
    parser.add_argument('--cnt_units', type=int, default=512)
    parser.add_argument('--cnt_heads', type=int, default=8)
    parser.add_argument('--cnt_layers', type=int, default=1)
    parser.add_argument('--cnt_tokens', type=int, default=1000)
    parser.add_argument('--bidirectional', type=bool, default=False)
    args = parser.parse_args(unparsed_args)
    params["problem"].update(vars(args))
    # print(params["problem"])
    # TODO: use cnt_tokens in data generation as max rand!
    Net = TransformerModel(ntokens=params["problem"]["cnt_tokens"],
                           ninp=params["problem"]["cnt_units"],
                           nhead=params["problem"]["cnt_heads"],
                           nhid=params["problem"]["cnt_units"],
                           nlayers=params["problem"]["cnt_layers"],
                           dropout=0.5)
    return Net
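A minimal sketch (not part of the commit) of driving the custom transformer through the get_kernel defined just above; the params dict and the (seq_len, batch) input layout are assumptions based on the code itself.

import torch

params = {"problem": {}}
net = get_kernel(params, unparsed_args=["--cnt_units=128", "--cnt_heads=4"])
net.eval()

# Token ids below the default cnt_tokens=1000, laid out as (seq_len, batch).
src = torch.randint(0, 1000, (32, 8))
with torch.no_grad():
    out = net(src)
print(out.shape)  # torch.Size([8, 1000]): logits over the vocabulary for the last position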
@@ -0,0 +1,24 @@
import logging
import unittest

from ..helpers import run_module

logging.basicConfig(level=logging.DEBUG)


class PytorchBertTest(unittest.TestCase):
    def setUp(self):
        self.name = "benchmarker"

    def test_bert(self):
        run_module(
            self.name,
            "--framework=pytorch",
            "--problem=bert_custom",
            "--problem_size=32,32",
            "--batch_size=8",
            "--nb_epoch=1",
            "--mode=inference",
            "--cnt_units=128",
            "--cnt_heads=4",
        )
@@ -7,3 +7,4 @@ pylint
tensorflow
torch
torchvision
transformers