In [5]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNModel(nn.Module):
    """Language model made of a token embedding, stacked recurrent layers and a linear decoder.

    Args:
        rnn_type: one of 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
        ntoken: vocabulary size (embedding rows and decoder outputs).
        ninp: embedding dimension fed to the recurrent layers.
        nhid: hidden size of the recurrent layers.
        nlayers: number of stacked recurrent layers.
        dropout: probability used by the dropout layer and passed to the
            recurrent module as its ``dropout`` argument (default=0.5).
        tie_weights: if True, the decoder shares the embedding weight;
            requires ``nhid == ninp`` (default=False).

    Raises:
        ValueError: if ``rnn_type`` is not a supported option, or if
            ``tie_weights`` is set while ``nhid != ninp``.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super().__init__()
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)

        if rnn_type in ('LSTM', 'GRU'):
            # nn.LSTM / nn.GRU are looked up by name.
            rnn_cls = getattr(nn, rnn_type)
            self.rnn = rnn_cls(ninp, nhid, nlayers, dropout=dropout)
        else:
            activations = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}
            try:
                nonlinearity = activations[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)

        self.decoder = nn.Linear(nhid, ntoken)

        # Optional weight tying, see:
        #   Press & Wolf 2016, "Using the Output Embedding to Improve Language Models"
        #   https://arxiv.org/abs/1608.05859
        #   Inan et al. 2016, "Tying Word Vectors and Word Classifiers:
        #   A Loss Framework for Language Modeling"
        #   https://arxiv.org/abs/1611.01462
        if tie_weights:
            if nhid != ninp:
                raise ValueError('When using the tied flag, nhid must be equal to emsize')
            self.decoder.weight = self.encoder.weight

        self.init_weights()

    def init_weights(self):
        """Zero the decoder bias and draw embedding/decoder weights from U(-0.1, 0.1)."""
        initrange = 0.1
        self.decoder.bias.data.zero_()
        # Embedding first, decoder second: the order of the random draws is kept stable.
        for weight in (self.encoder.weight, self.decoder.weight):
            weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        """Run one step of the language model.

        Args:
            input: tensor of token indices passed to the embedding.
            hidden: recurrent state, e.g. as produced by ``init_hidden``.

        Returns:
            A pair ``(decoded, hidden)``: the decoder output reshaped to the
            first two dimensions of the recurrent output plus the vocabulary
            dimension, and the updated recurrent state.
        """
        embedded = self.drop(self.encoder(input))
        rnn_out, hidden = self.rnn(embedded, hidden)
        rnn_out = self.drop(rnn_out)

        # Flatten the two leading dimensions for the linear layer, then restore them.
        flat_rows = rnn_out.size(0) * rnn_out.size(1)
        width = rnn_out.size(2)
        logits = self.decoder(rnn_out.view(flat_rows, width))
        return logits.view(rnn_out.size(0), rnn_out.size(1), logits.size(1)), hidden

    def init_hidden(self, bsz):
        """Return a zero recurrent state for batch size ``bsz``.

        The tensors take dtype and device from the model's first parameter.
        For 'LSTM' a pair of tensors is returned, otherwise a single tensor;
        each has shape ``(nlayers, bsz, nhid)``.
        """
        reference = next(self.parameters())
        state_shape = (self.nlayers, bsz, self.nhid)
        if self.rnn_type != 'LSTM':
            return reference.new_zeros(state_shape)
        return (reference.new_zeros(state_shape),
                reference.new_zeros(state_shape))

# Temporarily leave PositionalEncoding module here. Will be moved somewhere else.
class PositionalEncoding(nn.Module):
    r"""Inject information about the absolute position of the tokens in the sequence.

    The positional encodings have the same dimension as the embeddings, so
    that the two can be summed. Sine and cosine functions of different
    frequencies are used:

    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})

        \text{PosEncoder}(pos, 2i+1) = \cos(pos / 10000^{2i / d_{model}})

    where ``pos`` is the token position and ``i`` indexes the sine/cosine pairs
    of the embedding dimension.

    Args:
        d_model: the embed dim (required). Both even and odd values are supported.
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # An odd d_model has one fewer odd-indexed (cosine) column than
        # even-indexed (sine) columns, so drop the surplus frequency.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # Stored as [max_len, 1, d_model] so it broadcasts over the batch dimension.
        pe = pe.unsqueeze(0).transpose(0, 1)
        # A buffer (not a parameter): follows .to()/state_dict but is not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Add the positional encoding to ``x`` and apply dropout.

        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        # Only the first `sequence length` rows of the table are used.
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerModel(nn.Module):
    """Container module with a token embedding, a positional encoder, a
    transformer encoder stack, and a linear decoder.

    Args:
        ntoken: vocabulary size (embedding rows and decoder outputs).
        ninp: embedding / model dimension.
        nhead: number of attention heads given to each encoder layer.
        nhid: feed-forward dimension given to each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout value used by the positional encoder and the
            encoder layers (default=0.5).

    Raises:
        ImportError: if ``torch.nn`` does not provide the transformer modules.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except ImportError as err:
            # Only a failed import is translated; any other error propagates unchanged.
            raise ImportError(
                'TransformerEncoder module does not exist in PyTorch 1.1 or lower.') from err
        self.model_type = 'Transformer'
        # Cached attention mask; rebuilt in forward() when length or device changes.
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return a float [sz, sz] mask: 0.0 on and below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Zero the decoder bias and draw embedding/decoder weights from U(-0.1, 0.1)."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, has_mask=True):
        """Return log-probabilities over the vocabulary for every position of ``src``.

        Args:
            src: tensor of token indices; its first dimension is used as the
                sequence length when building the mask.
            has_mask: if True, apply the square subsequent mask so a position
                cannot attend to later positions; if False, no mask is used
                and any cached mask is discarded.
        """
        if has_mask:
            device = src.device
            seq_len = len(src)
            # Rebuild the cached mask when it is missing, has the wrong size,
            # or lives on a different device than the input.
            stale = (self.src_mask is None
                     or self.src_mask.size(0) != seq_len
                     or self.src_mask.device != device)
            if stale:
                self.src_mask = self._generate_square_subsequent_mask(seq_len).to(device)
        else:
            self.src_mask = None

        # Embeddings are scaled by sqrt(ninp) before the positional encoding is added.
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)


In [6]:
from torchsummary import summary

In [7]:
import data

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

corpus = data.Corpus('./data/wikitext-2')
ntokens = len(corpus.dictionary)

# One random token id with shape (1, 1), used as a minimal input for the model.
# NOTE: the name shadows the builtin `input`; it is kept because later cells use it.
input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

In [8]:
# Two-layer LSTM with 650-dim embeddings and hidden state, dropout 0, untied weights.
model = RNNModel(
    rnn_type='LSTM',
    ntoken=ntokens,
    ninp=650,
    nhid=650,
    nlayers=2,
    dropout=0,
    tie_weights=False,
).to(device)

In [9]:
# Zero-initialised recurrent state for a batch of size 1.
hidden = model.init_hidden(1)

In [10]:
# Forward pass on the single-token input; the cell displays the returned
# (decoded output, hidden state) pair.
model(input, hidden)

(tensor([[[-9.5344e-03, -4.1326e-05, -8.2702e-03,  ..., -1.4432e-02,
            1.0969e-02,  2.6121e-03]]], device='cuda:0', grad_fn=<ViewBackward>),
 (tensor([[[ 0.0031, -0.0114,  0.0083,  ...,  0.0180,  0.0029,  0.0096]],
  
          [[ 0.0032,  0.0047, -0.0143,  ...,  0.0152,  0.0169, -0.0046]]],
         device='cuda:0', grad_fn=<CudnnRnnBackward>),
  tensor([[[ 0.0062, -0.0220,  0.0170,  ...,  0.0362,  0.0059,  0.0192]],
  
          [[ 0.0063,  0.0094, -0.0288,  ...,  0.0299,  0.0339, -0.0094]]],
         device='cuda:0', grad_fn=<CudnnRnnBackward>)))

In [11]:
# Number of parameter tensors in the model (not the number of scalar weights).
print(sum(1 for _ in model.parameters()))

11


In [12]:
# Print the shape of every parameter tensor. Iterating the parameters directly
# avoids rebuilding the whole parameter list on each step.
for param in model.parameters():
    print(param.size())

torch.Size([33278, 650])
torch.Size([2600, 650])
torch.Size([2600, 650])
torch.Size([2600])
torch.Size([2600])
torch.Size([2600, 650])
torch.Size([2600, 650])
torch.Size([2600])
torch.Size([2600])
torch.Size([33278, 650])
torch.Size([33278])


In [13]:
model

RNNModel(
  (drop): Dropout(p=0, inplace=False)
  (encoder): Embedding(33278, 650)
  (rnn): LSTM(650, 650, num_layers=2)
  (decoder): Linear(in_features=650, out_features=33278, bias=True)
)

In [14]:
model.rnn_type

'LSTM'

In [15]:
model.rnn

LSTM(650, 650, num_layers=2)

In [16]:
model.named_parameters

<bound method Module.named_parameters of RNNModel(
  (drop): Dropout(p=0, inplace=False)
  (encoder): Embedding(33278, 650)
  (rnn): LSTM(650, 650, num_layers=2)
  (decoder): Linear(in_features=650, out_features=33278, bias=True)
)>

In [17]:
model.type

<bound method Module.type of RNNModel(
  (drop): Dropout(p=0, inplace=False)
  (encoder): Embedding(33278, 650)
  (rnn): LSTM(650, 650, num_layers=2)
  (decoder): Linear(in_features=650, out_features=33278, bias=True)
)>

In [18]:
model.__class__.__name__

'RNNModel'

In [19]:
# Collect [qualified name, class name] for every leaf module,
# i.e. every module that has no registered submodules.
nodes = [
    [name, module.__class__.__name__]
    for name, module in model.named_modules()
    if not module._modules
]

In [20]:
nodes

[['drop', 'Dropout'],
 ['encoder', 'Embedding'],
 ['rnn', 'LSTM'],
 ['decoder', 'Linear']]

In [21]:
# Print the name of every state-dict entry; only the keys are needed.
for name in model.state_dict():
    print(name)

encoder.weight
rnn.weight_ih_l0
rnn.weight_hh_l0
rnn.bias_ih_l0
rnn.bias_hh_l0
rnn.weight_ih_l1
rnn.weight_hh_l1
rnn.bias_ih_l1
rnn.bias_hh_l1
decoder.weight
decoder.bias


In [31]:
# Print the shape of every state-dict entry; only the values are needed.
for param in model.state_dict().values():
    print(param.size())

torch.Size([5000, 1, 650])
torch.Size([1950, 650])
torch.Size([1950])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([1950, 650])
torch.Size([1950])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([33278, 650])
torch.Size([33278, 650])
torch.Size([33278])


In [23]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [24]:

class TransformerModel(nn.Module):
    """Container module with a token embedding, a positional encoder, a
    transformer encoder stack, and a linear decoder.

    Args:
        ntoken: vocabulary size (embedding rows and decoder outputs).
        ninp: embedding / model dimension.
        nhead: number of attention heads given to each encoder layer.
        nhid: feed-forward dimension given to each encoder layer.
        nlayers: number of stacked encoder layers.
        dropout: dropout value used by the positional encoder and the
            encoder layers (default=0.5).

    Raises:
        ImportError: if ``torch.nn`` does not provide the transformer modules.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super().__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer
        except ImportError as err:
            # Only a failed import is translated; any other error propagates unchanged.
            raise ImportError(
                'TransformerEncoder module does not exist in PyTorch 1.1 or lower.') from err
        self.model_type = 'Transformer'
        # Cached attention mask; rebuilt in forward() when length or device changes.
        self.src_mask = None
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        """Return a float [sz, sz] mask: 0.0 on and below the diagonal, -inf above it."""
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def init_weights(self):
        """Zero the decoder bias and draw embedding/decoder weights from U(-0.1, 0.1)."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, has_mask=True):
        """Return log-probabilities over the vocabulary for every position of ``src``.

        Args:
            src: tensor of token indices; its first dimension is used as the
                sequence length when building the mask.
            has_mask: if True, apply the square subsequent mask so a position
                cannot attend to later positions; if False, no mask is used
                and any cached mask is discarded.
        """
        if has_mask:
            device = src.device
            seq_len = len(src)
            # Rebuild the cached mask when it is missing, has the wrong size,
            # or lives on a different device than the input.
            stale = (self.src_mask is None
                     or self.src_mask.size(0) != seq_len
                     or self.src_mask.device != device)
            if stale:
                self.src_mask = self._generate_square_subsequent_mask(seq_len).to(device)
        else:
            self.src_mask = None

        # Embeddings are scaled by sqrt(ninp) before the positional encoding is added.
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, self.src_mask)
        output = self.decoder(output)
        return F.log_softmax(output, dim=-1)

In [25]:
#model = RNNModel('LSTM', ntokens, 650, 650, 2, 0, False).to(device)

In [26]:
# Keyword arguments make clear which slot each value fills. The dropout
# probability is an explicit float rather than a boolean (False and 0.0 are
# numerically the same probability).
model = TransformerModel(
    ntoken=ntokens,
    ninp=650,
    nhead=650,  # NOTE(review): one attention head per embedding dimension; confirm this is intended
    nhid=650,
    nlayers=2,
    dropout=0.0,
).to(device)

In [27]:
model

TransformerModel(
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=False, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=650, out_features=650, bias=True)
        )
        (linear1): Linear(in_features=650, out_features=650, bias=True)
        (dropout): Dropout(p=False, inplace=False)
        (linear2): Linear(in_features=650, out_features=650, bias=True)
        (norm1): LayerNorm((650,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((650,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=False, inplace=False)
        (dropout2): Dropout(p=False, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): Linear(in_features=650, out_features=650, bias=True)
        )
        (linear1): Linear(in_features=650, out_fea

In [28]:
# Collect [qualified name, class name] for every leaf module,
# i.e. every module that has no registered submodules.
nodes = [
    [name, module.__class__.__name__]
    for name, module in model.named_modules()
    if not module._modules
]

In [29]:
nodes

[['pos_encoder.dropout', 'Dropout'],
 ['transformer_encoder.layers.0.self_attn.out_proj', 'Linear'],
 ['transformer_encoder.layers.0.linear1', 'Linear'],
 ['transformer_encoder.layers.0.dropout', 'Dropout'],
 ['transformer_encoder.layers.0.linear2', 'Linear'],
 ['transformer_encoder.layers.0.norm1', 'LayerNorm'],
 ['transformer_encoder.layers.0.norm2', 'LayerNorm'],
 ['transformer_encoder.layers.0.dropout1', 'Dropout'],
 ['transformer_encoder.layers.0.dropout2', 'Dropout'],
 ['transformer_encoder.layers.1.self_attn.out_proj', 'Linear'],
 ['transformer_encoder.layers.1.linear1', 'Linear'],
 ['transformer_encoder.layers.1.dropout', 'Dropout'],
 ['transformer_encoder.layers.1.linear2', 'Linear'],
 ['transformer_encoder.layers.1.norm1', 'LayerNorm'],
 ['transformer_encoder.layers.1.norm2', 'LayerNorm'],
 ['transformer_encoder.layers.1.dropout1', 'Dropout'],
 ['transformer_encoder.layers.1.dropout2', 'Dropout'],
 ['encoder', 'Embedding'],
 ['decoder', 'Linear']]

In [30]:
# Print the name of every state-dict entry; only the keys are needed.
for name in model.state_dict():
    print(name)

pos_encoder.pe
transformer_encoder.layers.0.self_attn.in_proj_weight
transformer_encoder.layers.0.self_attn.in_proj_bias
transformer_encoder.layers.0.self_attn.out_proj.weight
transformer_encoder.layers.0.self_attn.out_proj.bias
transformer_encoder.layers.0.linear1.weight
transformer_encoder.layers.0.linear1.bias
transformer_encoder.layers.0.linear2.weight
transformer_encoder.layers.0.linear2.bias
transformer_encoder.layers.0.norm1.weight
transformer_encoder.layers.0.norm1.bias
transformer_encoder.layers.0.norm2.weight
transformer_encoder.layers.0.norm2.bias
transformer_encoder.layers.1.self_attn.in_proj_weight
transformer_encoder.layers.1.self_attn.in_proj_bias
transformer_encoder.layers.1.self_attn.out_proj.weight
transformer_encoder.layers.1.self_attn.out_proj.bias
transformer_encoder.layers.1.linear1.weight
transformer_encoder.layers.1.linear1.bias
transformer_encoder.layers.1.linear2.weight
transformer_encoder.layers.1.linear2.bias
transformer_encoder.layers.1.norm1.weight
transfo

In [32]:
# Print the shape of every state-dict entry; only the values are needed.
for param in model.state_dict().values():
    print(param.size())

torch.Size([5000, 1, 650])
torch.Size([1950, 650])
torch.Size([1950])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([1950, 650])
torch.Size([1950])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650, 650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([650])
torch.Size([33278, 650])
torch.Size([33278, 650])
torch.Size([33278])
