In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
# Seed handed to torch.manual_seed() right before each fake dataset is generated below.
SEED = 1

## First, let's create a fake data tensor

In [2]:
# Dimensions of the toy input tensor, in the order (batch_size, seq_len, hidden_size).
batch_size, seq_len, hidden_size = 3, 4, 5

In [3]:
# Seed first so the random inputs are identical on every run.
torch.manual_seed(SEED)

# Random features of shape (batch_size, seq_len, hidden_size).
inputs = torch.randn(batch_size, seq_len, hidden_size)
# One class index per example: a 1-D tensor of length batch_size.
idx_targets = torch.tensor([0, 1, 3])
print(inputs.size(), idx_targets.size())

torch.Size([3, 4, 5]) torch.Size([3])


## Then, our index softmax model

In [4]:
from SoftmaxRegression import IdxSoftmaxRegression

## And train it

In [5]:
# Settings consumed by the training loop in the next cell.
device = "cpu"  # else "cuda:0"
lr, max_grad_norm = 1e-3, 1.0  # Adam learning rate / gradient-norm clipping threshold
max_epoch, print_every = 2000, 100  # number of epochs / epochs between loss printouts

In [6]:
# Re-seed torch's RNG immediately before the model is constructed.
torch.manual_seed(0)

model = IdxSoftmaxRegression(seq_len, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=float(lr))

epoch = 0  # keeps `epoch` defined even when max_epoch is 0
for epoch in range(1, max_epoch + 1):
    loss = 0  # placeholder; replaced by every example processed below
    model.train()

    # One optimiser update per example: each row is fed as a batch of size 1.
    for row in range(batch_size):
        example = inputs[row].unsqueeze(0)
        target = idx_targets[row].unsqueeze(0)

        loss = model.train_forward(example, target, device)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()

    # NOTE: `loss` is the loss of the last example only, not an average over the epoch.
    if epoch % print_every == 0:
        print("epoch {} loss {}".format(epoch, loss))

epoch 100 loss 0.9858344793319702
epoch 200 loss 0.7027537226676941
epoch 300 loss 0.5020290613174438
epoch 400 loss 0.3692546784877777
epoch 500 loss 0.2781364917755127
epoch 600 loss 0.21403783559799194
epoch 700 loss 0.16786591708660126
epoch 800 loss 0.13383346796035767
epoch 900 loss 0.10820576548576355
epoch 1000 loss 0.08852928876876831
epoch 1100 loss 0.07316025346517563
epoch 1200 loss 0.060973044484853745
epoch 1300 loss 0.05118098109960556
epoch 1400 loss 0.04322313144803047
epoch 1500 loss 0.03669150546193123
epoch 1600 loss 0.03128419816493988
epoch 1700 loss 0.026774315163493156
epoch 1800 loss 0.022988714277744293
epoch 1900 loss 0.019793258979916573
epoch 2000 loss 0.017082812264561653


In [7]:
# Display the predicted probabilities: one row per example, one column per position.
model.predict_proba(inputs)

tensor([[9.6258e-01, 3.6998e-02, 8.5927e-06, 4.0928e-04],
        [2.9356e-05, 9.8474e-01, 8.4081e-03, 6.8241e-03],
        [4.6515e-04, 9.9967e-03, 6.4669e-03, 9.8307e-01]],
       grad_fn=<SoftmaxBackward>)

In [8]:
# Display the predicted index per example (compare with idx_targets).
model.predict(inputs)

tensor([0, 1, 3])

### Then let's do a softmax for whether or not there is an answer at all

In [9]:
from SoftmaxRegression import ImpossibleSoftmaxRegression

In [10]:
# Same seed as before, so this regenerates the same random inputs.
torch.manual_seed(SEED)

# Random features of shape (batch_size, seq_len, hidden_size).
inputs = torch.randn(batch_size, seq_len, hidden_size)
# Float flag per example (1-D, length batch_size): 1 marks the example as impossible, 0 otherwise.
is_impossible_targets = torch.tensor([0, 1, 0], dtype=torch.float)
print(inputs.size(), is_impossible_targets.size())

torch.Size([3, 4, 5]) torch.Size([3])


In [11]:
# Settings consumed by the training loop in the next cell (same values as the previous run).
device = "cpu"  # else "cuda:0"
lr, max_grad_norm = 1e-3, 1.0  # Adam learning rate / gradient-norm clipping threshold
max_epoch, print_every = 2000, 100  # number of epochs / epochs between loss printouts

In [12]:
# Re-seed torch's RNG immediately before the model is constructed.
torch.manual_seed(0)

model = ImpossibleSoftmaxRegression(seq_len, hidden_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=float(lr))

epoch = 0  # keeps `epoch` defined even when max_epoch is 0
for epoch in range(1, max_epoch + 1):
    loss = 0  # placeholder; replaced by every example processed below
    model.train()

    # One optimiser update per example: each row is fed as a batch of size 1.
    for row in range(batch_size):
        example = inputs[row].unsqueeze(0)
        target = is_impossible_targets[row].unsqueeze(0)

        loss = model.train_forward(example, target, device)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()

    # NOTE: `loss` is the loss of the last example only, not an average over the epoch.
    if epoch % print_every == 0:
        print("epoch {} loss {}".format(epoch, loss))

epoch 100 loss 0.15036962926387787
epoch 200 loss 0.0723341554403305
epoch 300 loss 0.041916608810424805
epoch 400 loss 0.026582689955830574
epoch 500 loss 0.018196450546383858
epoch 600 loss 0.01313718780875206
epoch 700 loss 0.009850339964032173
epoch 800 loss 0.007593781221657991
epoch 900 loss 0.0059781791642308235
epoch 1000 loss 0.004783250391483307
epoch 1100 loss 0.0038764411583542824
epoch 1200 loss 0.0031737331300973892
epoch 1300 loss 0.0026200732681900263
epoch 1400 loss 0.0021777006331831217
epoch 1500 loss 0.0018201335333287716
epoch 1600 loss 0.0015284172259271145
epoch 1700 loss 0.0012884092284366488
epoch 1800 loss 0.0010896299500018358
epoch 1900 loss 0.0009240007493644953
epoch 2000 loss 0.0007854207651689649


In [13]:
# Display the predicted probability per example (compare with is_impossible_targets).
model.predict_proba(inputs)

tensor([5.3539e-04, 9.9932e-01, 7.8465e-04], grad_fn=<SigmoidBackward>)

In [14]:
# Display the hard 0/1 prediction per example.
model.predict(inputs)

tensor([0., 1., 0.], grad_fn=<RoundBackward>)

### Let's also look at using our multi softmax class

In [15]:
# Same seed as before, so this regenerates the same random inputs.
torch.manual_seed(SEED)

# Random features of shape (batch_size, seq_len, hidden_size).
inputs = torch.randn(batch_size, seq_len, hidden_size)

# Three 1-D target tensors, each of length batch_size.
start_idx_targets = torch.tensor([0, 1, 2])  # start class index per example
stop_idx_targets = torch.tensor([1, 1, 3])  # stop class index per example
is_impossible_targets = torch.tensor([0, 1, 0], dtype=torch.float)  # 1 = answering is impossible

In [16]:
from SoftmaxRegression import MultiSoftmaxRegression

In [17]:
# Build the multi-target model. Unlike the two runs above, there is no explicit
# torch.manual_seed(0) or .to(device) here; presumably the class handles device
# placement itself via the `device` argument passed to its methods (confirm).
model = MultiSoftmaxRegression(seq_len, hidden_size)

In [18]:
max_epoch = 2000
print_every = 100
device = "cpu"

epoch = 0  # keeps `epoch` defined even when max_epoch is 0
for epoch in range(1, max_epoch + 1):
    # Three train_step calls per epoch, tagged 'start', 'stop' and 'impossible',
    # each on the full batch with the matching target tensor.
    start_idx_loss = model.train_step(inputs, start_idx_targets, 'start', device)
    stop_idx_loss = model.train_step(inputs, stop_idx_targets, 'stop', device)
    is_impossible_loss = model.train_step(inputs, is_impossible_targets, 'impossible', device)

    if epoch % print_every == 0:
        print(
            "epoch {}, loss start {:.2f}, stop {:.2f}, impossible {:.2f}".format(
                epoch, start_idx_loss, stop_idx_loss, is_impossible_loss
            )
        )
    

epoch 100, loss start 1.57, stop 1.30, impossible 0.81
epoch 200, loss start 1.56, stop 1.29, impossible 0.79
epoch 300, loss start 1.55, stop 1.28, impossible 0.78
epoch 400, loss start 1.55, stop 1.27, impossible 0.76
epoch 500, loss start 1.54, stop 1.27, impossible 0.75
epoch 600, loss start 1.53, stop 1.26, impossible 0.74
epoch 700, loss start 1.52, stop 1.25, impossible 0.72
epoch 800, loss start 1.51, stop 1.24, impossible 0.71
epoch 900, loss start 1.51, stop 1.24, impossible 0.70
epoch 1000, loss start 1.50, stop 1.23, impossible 0.68
epoch 1100, loss start 1.49, stop 1.22, impossible 0.67
epoch 1200, loss start 1.48, stop 1.21, impossible 0.66
epoch 1300, loss start 1.47, stop 1.21, impossible 0.64
epoch 1400, loss start 1.47, stop 1.20, impossible 0.63
epoch 1500, loss start 1.46, stop 1.19, impossible 0.62
epoch 1600, loss start 1.45, stop 1.19, impossible 0.61
epoch 1700, loss start 1.44, stop 1.18, impossible 0.59
epoch 1800, loss start 1.43, stop 1.17, impossible 0.58
e

In [19]:
# Display the predictions: one row of two values per example. The -1 rows
# presumably mark examples predicted as impossible (confirm in SoftmaxRegression).
model.predict(inputs, device)

array([[-1., -1.],
       [-1., -1.],
       [ 1.,  3.]], dtype=float32)

### Let's experiment with saving and loading our models

First, make a directory `sm_model` in the current directory (skipped if it already exists)

In [20]:
# Directory that model.save() / model.load() below write to and read from.
save_dir = "./sm_model"
# exist_ok=True makes this cell safe to re-run and removes the check-then-create
# race of a separate os.path.exists() test. It still raises if the path exists
# but is not a directory.
os.makedirs(save_dir, exist_ok=True)

In [23]:
# Persist the trained model under save_dir. The second argument (0) is presumably
# a checkpoint/epoch tag; it must match the value passed to model.load() below (confirm).
model.save(save_dir, 0)

In [24]:
# Replace the trained model with a newly constructed one, so that the predictions
# shown after load() come from the saved state rather than the in-memory model.
model = MultiSoftmaxRegression(seq_len, hidden_size)

In [25]:
# Restore from save_dir using the same second argument (0) that was passed to save().
model.load(save_dir, 0)

In [26]:
# Display the predictions of the reloaded model; they should match the
# predictions shown before saving.
model.predict(inputs, device)

array([[-1., -1.],
       [-1., -1.],
       [ 1.,  3.]], dtype=float32)