#### Building Models

In [1]:
import torch
import torch.nn as nn

In [3]:
class TinyModel(nn.Module):
    def __init__(self) -> None:
        super(TinyModel, self).__init__()
        self.linear1 = nn.Linear(100, 200)
        self.activation = nn.ReLU()
        self.linear2 = nn.Linear(200, 10)
        self.softmax = nn.Softmax()
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.activation(x)
        x = self.linear2(x)
        x = self.softmax(x)
        return x
    
tiny_model = TinyModel()
print(tiny_model)

TinyModel(
  (linear1): Linear(in_features=100, out_features=200, bias=True)
  (activation): ReLU()
  (linear2): Linear(in_features=200, out_features=10, bias=True)
  (softmax): Softmax(dim=None)
)


In [7]:
print('The model:')
print(tiny_model)

print('Model params:')
for param in tiny_model.parameters():
    print(param)

The model:
TinyModel(
  (linear1): Linear(in_features=100, out_features=200, bias=True)
  (activation): ReLU()
  (linear2): Linear(in_features=200, out_features=10, bias=True)
  (softmax): Softmax(dim=None)
)
Model params:
Parameter containing:
tensor([[ 0.0599,  0.0374, -0.0713,  ...,  0.0261,  0.0272, -0.0440],
        [ 0.0913, -0.0499, -0.0633,  ...,  0.0938,  0.0671,  0.0498],
        [ 0.0488,  0.0582, -0.0928,  ..., -0.0435,  0.0292,  0.0893],
        ...,
        [-0.0525,  0.0408, -0.0647,  ..., -0.0447,  0.0820, -0.0416],
        [-0.0350,  0.0921, -0.0649,  ..., -0.0439, -0.0367,  0.0733],
        [-0.0182,  0.0480,  0.0634,  ..., -0.0078,  0.0457,  0.0339]],
       requires_grad=True)
Parameter containing:
tensor([ 0.0111, -0.0105, -0.0015,  0.0892, -0.0154,  0.0468,  0.0806,  0.0600,
         0.0237, -0.0015, -0.0477,  0.0290, -0.0930, -0.0838,  0.0960,  0.0604,
        -0.0168,  0.0245, -0.0535,  0.0293, -0.0887, -0.0345, -0.0085,  0.0187,
        -0.0867, -0.0839,  0.085

In [8]:
print('One Layer:')
print(tiny_model.linear2)

print('Layer params:')
for param in tiny_model.linear2.parameters():
    print(param)

One Layer:
Linear(in_features=200, out_features=10, bias=True)
Layer params:
Parameter containing:
tensor([[-1.1201e-02,  1.4560e-02, -3.5855e-02,  ...,  5.9941e-02,
          4.7385e-02, -7.1248e-03],
        [ 6.0764e-02,  4.7733e-02, -5.9405e-02,  ...,  5.8891e-02,
          1.2440e-02,  2.2069e-02],
        [-2.1890e-02, -4.2067e-02,  6.8043e-02,  ..., -5.9911e-02,
          4.6748e-02,  3.6947e-02],
        ...,
        [-5.9539e-02,  3.9088e-02,  1.3436e-02,  ..., -2.5686e-02,
          2.8275e-02,  1.3285e-02],
        [ 2.9305e-02,  2.5379e-02, -3.1464e-02,  ..., -9.2827e-03,
         -2.0208e-02,  3.6044e-02],
        [ 5.2464e-05,  9.3200e-03, -1.5876e-02,  ...,  6.7781e-02,
         -1.6186e-02,  3.9045e-02]], requires_grad=True)
Parameter containing:
tensor([ 0.0089, -0.0421, -0.0142,  0.0374, -0.0515, -0.0011, -0.0024,  0.0452,
        -0.0003, -0.0187], requires_grad=True)


##### Common Layer Types

In [14]:
# Linear

lin = nn.Linear(3, 2)
x = torch.rand(1, 3)
print('Input:', x)
print('-------------------------------------------------------')

print('Weight and Bias parameters:')
for params in lin.parameters():
    print(params)

y = lin(x)
print('-------------------------------------------------------')
print('Output:', y)

Input: tensor([[0.6943, 0.5076, 0.7240]])
-------------------------------------------------------
Weight and Bias parameters:
Parameter containing:
tensor([[-0.2247,  0.5034, -0.3776],
        [-0.3783, -0.0466,  0.0951]], requires_grad=True)
Parameter containing:
tensor([-0.5374, -0.4108], requires_grad=True)
-------------------------------------------------------
Output: tensor([[-0.7113, -0.6283]], grad_fn=<AddmmBackward0>)


In [17]:
# Convolutional

import torch.functional as F

class LeNet(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super(LeNet, self).__init__(*args, **kwargs)
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 3)
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool2d(x, (2, 2))
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, (2, 2))
        x = x.view(-1, 576) # flatten
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        return x


In [18]:
conv_model = LeNet()
print(conv_model)

LeNet(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [21]:
# Recurrent Layers
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()

        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [23]:
lstm_model = LSTMTagger(100, 50, 10000, 50)
print(lstm_model)

LSTMTagger(
  (word_embeddings): Embedding(10000, 100)
  (lstm): LSTM(100, 50)
  (hidden2tag): Linear(in_features=50, out_features=50, bias=True)
)


In [33]:
# Transformers
transformer_model = nn.Transformer(nhead=16, num_encoder_layers=12)
input = torch.rand((10, 32, 512))
tgt = torch.rand((20, 32, 512))
output = transformer_model(input, tgt)
print(transformer_model)
print(output)



Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-11): 12 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512,

In [31]:
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
input = torch.rand(10, 32, 512)
output = transformer_encoder(input)
print(transformer_encoder)
print(output)

TransformerEncoder(
  (layers): ModuleList(
    (0-5): 6 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
      )
      (linear1): Linear(in_features=512, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=512, bias=True)
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
)
tensor([[[ 0.4952, -1.7868, -0.1444,  ..., -0.7066,  1.3589, -0.7895],
         [ 0.9629, -1.3782,  0.9430,  ...,  1.2434,  0.6322, -0.1833],
         [ 1.9673, -0.5689,  0.3360,  ...,  0.9851,  0.9429, -0.0715],
         ...,
         [-0.1213, -0.2957,  0.7424,  ...,  0.3319,  0.0518, -0.9152],
         [ 0.7456, -0.9266,  0.8963,  



In [37]:
decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
memory = torch.rand(10, 32, 512)
tgt = torch.rand(20, 32, 512)
out = transformer_decoder(tgt, memory)
print(transformer_decoder)
print(out)

TransformerDecoder(
  (layers): ModuleList(
    (0-5): 6 x TransformerDecoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
      )
      (multihead_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
      )
      (linear1): Linear(in_features=512, out_features=2048, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=2048, out_features=512, bias=True)
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (dropout3): Dropout(p=0.1, inplace=False)
    )
  )
)
tensor([[[-4.0713e-01, -3.4067e-02,  3.1157e-01,  ...,  2.0228e+00,
 

In [5]:
# Other Layers and Functions
my_tensor = torch.rand(1, 6, 6)
print(my_tensor)

maxpool_layer = torch.nn.MaxPool2d(3)
print(maxpool_layer(my_tensor))


tensor([[[0.2084, 0.9683, 0.4169, 0.8430, 0.9849, 0.1134],
         [0.2743, 0.7948, 0.3798, 0.9022, 0.2047, 0.0855],
         [0.1745, 0.9958, 0.1633, 0.5394, 0.7763, 0.8241],
         [0.4677, 0.6048, 0.7953, 0.8265, 0.3833, 0.5445],
         [0.5640, 0.1609, 0.1808, 0.5632, 0.9814, 0.4485],
         [0.6967, 0.9981, 0.8327, 0.2695, 0.3640, 0.6561]]])
tensor([[[0.9958, 0.9849],
         [0.9981, 0.9814]]])


In [14]:
my_tensor = torch.rand(1, 4, 4) * 20 + 5
print(my_tensor)

print(my_tensor.mean())

norm_layer = torch.nn.BatchNorm1d(4)
normed_tensor = norm_layer(my_tensor)
print(normed_tensor)
print(normed_tensor.mean())

tensor([[[22.0691, 22.9752, 10.7985,  9.9651],
         [ 7.6940,  9.0773, 12.1485, 15.4833],
         [11.3351, 18.9554,  7.1318, 19.9361],
         [18.9342,  5.9508, 11.4436, 19.5958]]])
tensor(13.9684)
tensor([[[ 0.9230,  1.0719, -0.9290, -1.0659],
         [-1.1355, -0.6745,  0.3492,  1.4607],
         [-0.5638,  0.8661, -1.3525,  1.0501],
         [ 0.8791, -1.4252, -0.4504,  0.9965]]],
       grad_fn=<NativeBatchNormBackward0>)
tensor(-4.4703e-08, grad_fn=<MeanBackward0>)


In [16]:
my_tensor = torch.rand(1, 4, 4)
print(my_tensor)

dropout = torch.nn.Dropout(p=0.4)
print(dropout(my_tensor))
print(dropout(my_tensor))
print(dropout(my_tensor))

tensor([[[0.8128, 0.6534, 0.2062, 0.6433],
         [0.6296, 0.4932, 0.5928, 0.6854],
         [0.5585, 0.8713, 0.1364, 0.9941],
         [0.9421, 0.9414, 0.0943, 0.6333]]])
tensor([[[1.3547, 1.0889, 0.3437, 1.0721],
         [1.0493, 0.0000, 0.9880, 0.0000],
         [0.9308, 0.0000, 0.2274, 0.0000],
         [0.0000, 0.0000, 0.0000, 1.0556]]])
tensor([[[0.0000, 1.0889, 0.3437, 1.0721],
         [1.0493, 0.0000, 0.0000, 1.1423],
         [0.9308, 1.4522, 0.0000, 1.6568],
         [1.5701, 0.0000, 0.1572, 1.0556]]])
tensor([[[0.0000, 0.0000, 0.3437, 1.0721],
         [0.0000, 0.8220, 0.9880, 1.1423],
         [0.9308, 0.0000, 0.2274, 0.0000],
         [1.5701, 0.0000, 0.0000, 0.0000]]])


In [11]:
input = torch.randn(1, 1, 2)
print('Input: ', input)
print('------------------------------------')
# activation functions
print('Relu: ', torch.nn.ReLU()(input))
print('Sigmoid: ', torch.nn.Sigmoid()(input))
print('Softmax:', torch.nn.Softmax(dim=1)(input))
print('Tanh:', torch.nn.Tanh()(input))
print('------------------------------------')
print('Relu:', torch.nn.functional.relu(input))
print('Sigmoid:', torch.nn.functional.sigmoid(input))
print('Softmax:', torch.nn.functional.softmax(input, dim=1))
print('Tanh:', torch.nn.functional.tanh(input))


h = torch.rand(1, 2)
y = torch.rand(1, 2)

# loss function
print('------------------------------------')
print('L1: ', torch.nn.functional.l1_loss(input=h, target=y))
print('MSE: ', torch.nn.functional.mse_loss(input=h, target=y))
print('CE:', torch.nn.functional.cross_entropy(input=h, target=y))
print('BCE:', torch.nn.functional.binary_cross_entropy(input=h, target=y))
print('------------------------------------')
print('L1: ', torch.nn.L1Loss()(h, y))
print('MSE: ', torch.nn.MSELoss()(h, y))
print('CE:', torch.nn.CrossEntropyLoss()(h, y))
print('BCE:', torch.nn.BCELoss()(h, y))



Input:  tensor([[[-1.3407,  1.3262]]])
------------------------------------
Relu:  tensor([[[0.0000, 1.3262]]])
Sigmoid:  tensor([[[0.2074, 0.7902]]])
Softmax: tensor([[[1., 1.]]])
Tanh: tensor([[[-0.8718,  0.8683]]])
------------------------------------
Relu: tensor([[[0.0000, 1.3262]]])
Sigmoid: tensor([[[0.2074, 0.7902]]])
Softmax: tensor([[[1., 1.]]])
Tanh: tensor([[[-0.8718,  0.8683]]])
------------------------------------
L1:  tensor(0.3102)
MSE:  tensor(0.1275)
CE: tensor(1.2655)
BCE: tensor(0.7424)
------------------------------------
L1:  tensor(0.3102)
MSE:  tensor(0.1275)
CE: tensor(1.2655)
BCE: tensor(0.7424)
