In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
SEED = 1

## First, let's create a fake data tensor

In [2]:
batch_size = 3
seq_len = 4
hidden_size = 5

In [3]:
torch.manual_seed(SEED)

inputs = torch.randn((batch_size, seq_len, hidden_size)) # matrix of size (batch_size, seq_len, hidden_size)
idx_targets = torch.tensor([0, 1, 3]) # matrix of size (batch_size) where entry is class idx
print(inputs.shape, idx_targets.shape)

torch.Size([3, 4, 5]) torch.Size([3])


## Then, our index softmax model

In [4]:
from SoftmaxRegression import IdxSoftmaxRegression

## And train it

In [5]:
lr = 1e-3
device = "cpu" # else "cuda:0"
max_epoch = 2000
print_every = 100
max_grad_norm = 1.0

In [6]:
torch.manual_seed(0)

model = IdxSoftmaxRegression(seq_len, hidden_size)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=float(lr))

epoch = 0
while epoch < max_epoch:
    epoch += 1
    loss = 0
    
    model.train()
    for i in range(batch_size):
        example = inputs[i, :, :].unsqueeze(0)
        target = idx_targets[i].unsqueeze(0)
        
        loss = model.train_forward(example, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()
        
    if epoch%print_every==0:
        print("epoch {} loss {}".format(epoch, loss))

epoch 100 loss 0.9858344793319702
epoch 200 loss 0.7027537226676941
epoch 300 loss 0.5020290613174438
epoch 400 loss 0.3692546784877777
epoch 500 loss 0.2781364917755127
epoch 600 loss 0.21403783559799194
epoch 700 loss 0.16786591708660126
epoch 800 loss 0.13383346796035767
epoch 900 loss 0.10820576548576355
epoch 1000 loss 0.08852928876876831
epoch 1100 loss 0.07316025346517563
epoch 1200 loss 0.060973044484853745
epoch 1300 loss 0.05118098109960556
epoch 1400 loss 0.04322313144803047
epoch 1500 loss 0.03669150546193123
epoch 1600 loss 0.03128419816493988
epoch 1700 loss 0.026774315163493156
epoch 1800 loss 0.022988714277744293
epoch 1900 loss 0.019793258979916573
epoch 2000 loss 0.017082812264561653


In [7]:
(model.predict_proba(inputs))

tensor([[9.6258e-01, 3.6998e-02, 8.5927e-06, 4.0928e-04],
        [2.9356e-05, 9.8474e-01, 8.4081e-03, 6.8241e-03],
        [4.6515e-04, 9.9967e-03, 6.4669e-03, 9.8307e-01]],
       grad_fn=<SoftmaxBackward>)

In [8]:
(model.predict(inputs))

tensor([0, 1, 3])

### Then let's do a softmax for whether or not there is an answer at all

In [15]:
from SoftmaxRegression import ImpossibleSoftmaxRegression

In [16]:
torch.manual_seed(SEED)

inputs = torch.randn((batch_size, seq_len, hidden_size)) # matrix of size (batch_size, seq_len, hidden_size)
is_impossible_targets = torch.tensor([0, 1, 0], dtype=torch.float) # matrix of size (batch_size) where entry is binary whether there is an answer
print(inputs.shape, is_impossible_targets.shape)

torch.Size([3, 4, 5]) torch.Size([3])


In [17]:
device = "cpu" # else "cuda:0"
lr = 1e-3
max_grad_norm = 1.0
max_epoch = 2000
print_every = 100

In [19]:
torch.manual_seed(0)

model = ImpossibleSoftmaxRegression(seq_len, hidden_size)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=float(lr))

epoch = 0
while epoch < max_epoch:
    epoch += 1
    loss = 0
    
    model.train()
    for i in range(batch_size):
        example = inputs[i, :, :].unsqueeze(0)
        target = is_impossible_targets[i].unsqueeze(0)
        
        loss = model.train_forward(example, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()
        
    if epoch%print_every==0:
        print("epoch {} loss {}".format(epoch, loss))

epoch 100 loss 0.14965412020683289
epoch 200 loss 0.0693732351064682
epoch 300 loss 0.03901948407292366
epoch 400 loss 0.024781912565231323
epoch 500 loss 0.017031462863087654
epoch 600 loss 0.012340199202299118
epoch 700 loss 0.009279616177082062
epoch 800 loss 0.0071703349240124226
epoch 900 loss 0.005655335728079081
epoch 1000 loss 0.004531742073595524
epoch 1100 loss 0.003677146742120385
epoch 1200 loss 0.0030137368012219667
epoch 1300 loss 0.0024901016149669886
epoch 1400 loss 0.0020711994729936123
epoch 1500 loss 0.0017322394996881485
epoch 1600 loss 0.0014553522923961282
epoch 1700 loss 0.0012273568427190185
epoch 1800 loss 0.0010384346824139357
epoch 1900 loss 0.000880927313119173
epoch 2000 loss 0.0007489743293263018


In [20]:
(model.predict_proba(inputs))

tensor([5.1660e-04, 9.9878e-01, 7.4828e-04], grad_fn=<SigmoidBackward>)

In [21]:
(model.predict(inputs))

tensor([0., 1., 0.], grad_fn=<RoundBackward>)

### Let's also look at using our multi softmax class

In [22]:
torch.manual_seed(SEED)

inputs = torch.randn((batch_size, seq_len, hidden_size)) # matrix of size (batch_size, seq_len, hidden_size)
start_idx_targets = torch.tensor([0, 1, 2]) # matrix of size (batch_size) where entry is class idx
stop_idx_targets = torch.tensor([1, 1, 3]) # matrix of size (batch_size) where entry is class idx
is_impossible_targets = torch.tensor([0, 1, 0], dtype=torch.float) # matrix of size (batch_size) where entry is binary whether answering is impossible

In [23]:
from SoftmaxRegression import MultiSoftmaxRegression

In [24]:
model = MultiSoftmaxRegression(seq_len, hidden_size)

In [25]:
max_epoch = 2000
print_every = 100
epoch = 0
device = "cpu"
while epoch < max_epoch:
    epoch += 1
    start_idx_loss = model.train_step(inputs, start_idx_targets, 'start', device)
    stop_idx_loss = model.train_step(inputs, stop_idx_targets, 'stop', device)
    is_impossible_loss = model.train_step(inputs, is_impossible_targets, 'impossible', device)
    
    if epoch%print_every==0:
        print("epoch {}, loss start {:.2f}, stop {:.2f}, impossible {:.2f}".format(epoch, start_idx_loss, stop_idx_loss, is_impossible_loss))
    

epoch 100, loss start 1.32, stop 1.07, impossible 0.44
epoch 200, loss start 1.10, stop 0.88, impossible 0.19
epoch 300, loss start 0.92, stop 0.73, impossible 0.08
epoch 400, loss start 0.77, stop 0.62, impossible 0.04
epoch 500, loss start 0.64, stop 0.53, impossible 0.02
epoch 600, loss start 0.53, stop 0.46, impossible 0.01
epoch 700, loss start 0.44, stop 0.40, impossible 0.00
epoch 800, loss start 0.36, stop 0.35, impossible 0.00
epoch 900, loss start 0.29, stop 0.31, impossible 0.00
epoch 1000, loss start 0.24, stop 0.27, impossible 0.00
epoch 1100, loss start 0.20, stop 0.24, impossible 0.00
epoch 1200, loss start 0.16, stop 0.21, impossible 0.00
epoch 1300, loss start 0.13, stop 0.19, impossible 0.00
epoch 1400, loss start 0.11, stop 0.17, impossible 0.00
epoch 1500, loss start 0.09, stop 0.15, impossible 0.00
epoch 1600, loss start 0.07, stop 0.13, impossible 0.00
epoch 1700, loss start 0.06, stop 0.12, impossible 0.00
epoch 1800, loss start 0.05, stop 0.11, impossible 0.00
e

In [26]:
(model.predict(inputs, device))

array([[ 0.,  1.],
       [-1., -1.],
       [ 2.,  3.]], dtype=float32)

### Let's experiment with saving and loading our models

First, make a directory 'softmax model' in the current directory

In [27]:
model.save('./sm_model/model')

In [28]:
model = MultiSoftmaxRegression(seq_len, hidden_size)

In [29]:
model.load('./sm_model/model')

In [30]:
(model.predict(inputs, device))

array([[ 0.,  1.],
       [-1., -1.],
       [ 2.,  3.]], dtype=float32)