In [1]:
import torch
from torch.nn import TransformerEncoderLayer
from torch import nn, Tensor

import numpy as np

import math

In [2]:
# --- Synthetic HMM data ---
seq_length = 10        # time steps in every sampled sequence
nsamples = 1000        # sequences drawn for the training set (and again for validation)
num_hidden_state = 3   # latent states of the HMM
num_obs = 5            # distinct observation symbols the HMM can emit

# --- Transformer architecture ---
ntoken = num_obs + 1   # vocabulary: every observation symbol plus one extra id used as [mask]
emsize = 200           # embedding width / model feature dimension
nhead = 2              # attention heads in the encoder layer
d_hid = 2048           # width of the feedforward sub-network inside the encoder layer

# --- Optimisation ---
epochs = 200           # passes over the training set
batch_size = 200       # sequences per mini-batch
lr = 0.001             # SGD learning rate

In [3]:
# Generate HMM parameters
def generate_HMM_params(num_hidden_state, num_obs):
    """Draw random HMM parameters and solve for the chain's stationary distribution.

    Args:
        num_hidden_state: number of hidden states.
        num_obs: number of distinct observation symbols.

    Returns:
        A tuple ``(trans_mat, obs_mat, stat_dist)`` where ``trans_mat`` has shape
        (num_hidden_state, num_hidden_state), ``obs_mat`` has shape
        (num_hidden_state, num_obs) and ``stat_dist`` has shape (num_hidden_state,).
        Each row of the two matrices is one Dirichlet draw.
    """
    # Transition rows use concentration 1 per state; emission rows use 1/num_obs per symbol.
    trans_mat = np.random.dirichlet(np.ones(num_hidden_state), num_hidden_state)
    obs_mat = np.random.dirichlet(np.full(num_obs, 1.0 / num_obs), num_hidden_state)

    # Stationarity means (I - T^T) pi = 0; the appended row of ones adds sum(pi) = 1.
    # The stacked system is over-determined, so it is solved in the least-squares sense.
    balance_eqs = np.identity(num_hidden_state) - trans_mat.T
    system = np.vstack([balance_eqs, np.ones((1, num_hidden_state))])
    rhs = np.concatenate([np.zeros(num_hidden_state), [1.0]])
    stat_dist, *_ = np.linalg.lstsq(system, rhs, rcond=None)
    return trans_mat, obs_mat, stat_dist

In [4]:
# Sample HMM sequences
def generate_HMM_sequences(trans_mat, obs_mat, init_dist, length, num_samples = 1):
    """Sample hidden-state and observation sequences from an HMM.

    Args:
        trans_mat: row ``s`` is the next-state distribution when in state ``s``.
        obs_mat: row ``s`` is the emission distribution of state ``s``.
        init_dist: distribution of the first hidden state.
        length: number of time steps per sequence.
        num_samples: number of independent sequences to draw.

    Returns:
        ``(states, obs)``, two float arrays of shape (num_samples, length) holding
        the hidden-state indices and the emitted symbol indices.
    """
    state_seq = np.zeros((num_samples, length))
    obs_seq = np.zeros((num_samples, length))

    # One one-hot multinomial draw per sample; argmax turns each into a state index.
    first_draw = np.random.multinomial(1, init_dist, num_samples)
    current = np.argmax(first_draw, axis = 1)

    for t in range(length):
        state_seq[:, t] = current
        for k in range(num_samples):
            s = current[k]
            # Emit from the current state first, then advance the chain.
            obs_seq[k, t] = np.argmax(np.random.multinomial(1, obs_mat[s]))
            current[k] = np.argmax(np.random.multinomial(1, trans_mat[s]))
    return state_seq, obs_seq

In [5]:
# Add [mask] tokens to input, one per sequence
def add_mask_to_sequences(seqs, pos, mask_token=None):
  """Return a copy of `seqs` with one position per row replaced by the mask id.

  Args:
    seqs: 2-D array of token ids, one sequence per row. It is not modified.
    pos: 1-D integer array with one entry per row of `seqs`; `pos[i]` is the
      column to mask in row `i`.
    mask_token: id written at the masked positions. When omitted, the
      notebook-level `num_obs` is used (the id right after the last real symbol).

  Returns:
    A new array shaped like `seqs` with `mask_token` at `[i, pos[i]]` for every row.

  Raises:
    ValueError: if `pos` does not have exactly one entry per row of `seqs`.
  """
  if mask_token is None:
    mask_token = num_obs
  masked_seqs = np.copy(seqs)
  pos = np.asarray(pos)
  num_rows = masked_seqs.shape[0]
  # Size the work from the data itself rather than from the global `nsamples`,
  # so the function is correct for any number of sequences.
  if pos.shape != (num_rows,):
    raise ValueError(
      "pos must hold one mask position per sequence: expected shape "
      + str((num_rows,)) + ", got " + str(pos.shape))
  if num_rows == 0:
    return masked_seqs
  # Pairing row i with column pos[i] masks every sequence in one indexed assignment.
  masked_seqs[np.arange(num_rows), pos] = mask_token
  return masked_seqs

In [6]:
# Define Transformer Model
class TransformerModel(nn.Module):
  """Token embedding, one Transformer encoder layer, and a linear read-out over the vocabulary.

  No positional encoding is applied to the embeddings.

  Args:
    emsize: embedding width / model feature dimension.
    nhead: number of attention heads in the encoder layer.
    ntoken: vocabulary size; also the number of logits produced per position.
    dim_feedforward: width of the encoder layer's feedforward sub-network.
      When omitted, the notebook-level `d_hid` is used, so existing
      three-argument calls behave as before.
  """

  def __init__(self, emsize: int, nhead: int, ntoken: int, dim_feedforward=None):
    super().__init__()
    if dim_feedforward is None:
      # Backward-compatible fallback to the configuration cell's value.
      dim_feedforward = d_hid
    self.emsize = emsize
    self.encoder = nn.Embedding(ntoken, emsize)
    self.transformer_encoder = TransformerEncoderLayer(emsize, nhead, dim_feedforward, batch_first=True)
    self.decoder = nn.Linear(emsize, ntoken)

  def forward(self, src: Tensor) -> Tensor:
    """Map token ids of shape (batch_size, seq_length) to logits of shape (batch_size, seq_length, ntoken)."""
    # Embeddings are scaled by sqrt(emsize): (batch_size, seq_length, emsize)
    src = self.encoder(src) * math.sqrt(self.emsize)
    # The encoder layer keeps the shape: (batch_size, seq_length, emsize)
    output = self.transformer_encoder(src)
    # Per-position projection onto the vocabulary: (batch_size, seq_length, ntoken)
    output = self.decoder(output)
    return output

In [7]:
# Generate HMM parameters and samples used for training
seed = 20211121
np.random.seed(seed)     # drives the HMM parameter draw, the sampled sequences and the mask positions
torch.manual_seed(seed)  # also pin PyTorch's RNG so weight initialisation and dropout repeat across runs
trans_mat, obs_mat, stat_dist = generate_HMM_params(num_hidden_state, num_obs) # generate parameters for HMM
# Both datasets start the chain from the stationary distribution.
states, obs = generate_HMM_sequences(trans_mat, obs_mat, stat_dist, seq_length, nsamples) # generate training sequences
pos = np.random.randint(seq_length, size = nsamples) # positions for masks, nsamples-dimensional array
masked_obs = add_mask_to_sequences(obs, pos)
val_states, val_obs = generate_HMM_sequences(trans_mat, obs_mat, stat_dist, seq_length, nsamples) # generate validation sequences
val_pos = np.random.randint(seq_length, size = nsamples) # one mask position per validation sequence
val_masked_obs = add_mask_to_sequences(val_obs, val_pos)

In [8]:
# Prepare input data and validation data
# Each dataset pairs the masked sequence (model input) with the unmasked one (target).
dataset = torch.utils.data.TensorDataset(
    torch.LongTensor(masked_obs),
    torch.LongTensor(obs),
)
val_dataset = torch.utils.data.TensorDataset(
    torch.LongTensor(val_masked_obs),
    torch.LongTensor(val_obs),
)
# shuffle=False: batches are served in the same order on every pass.
train_dl = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=False,
)
val_dl = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [9]:
# Instantiate the network from the configuration cell's sizes
model = TransformerModel(emsize=emsize, nhead=nhead, ntoken=ntoken)

In [10]:
# Loss function and optimizer used by the training loop
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)

In [11]:
# Training process
def _summed_batch_loss(net, loss_fn, loader):
  """Return the sum of the per-batch losses of `net` over `loader`, without tracking gradients."""
  running = 0.
  # Evaluation only reads the loss value, so skip building autograd graphs.
  with torch.no_grad():
    for inputs, labels in loader:
      logits = net(inputs)
      running += loss_fn(logits.transpose(1, 2), labels).item()
  return running

model.train()
for i in range(epochs):
  total_loss = 0.
  for data, target in train_dl:
    output = model(data)
    # CrossEntropyLoss takes input (N, C, d) and target (N, d), so the class axis is
    # moved: (batch_size, seq_length, ntoken) -> (batch_size, ntoken, seq_length).
    loss = criterion(output.transpose(1, 2), target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  # Every 10th epoch, report the loss on both datasets with the model in eval mode.
  if i % 10 == 0:
    print("epoch " + str(i))
    print("training loss: " + str(total_loss))
    model.eval()
    total_val_loss = _summed_batch_loss(model, criterion, val_dl)
    total_val_train_loss = _summed_batch_loss(model, criterion, train_dl)
    print("val loss: " + str(total_val_loss))
    print("val train loss: " + str(total_val_train_loss))
    model.train()

epoch 0
training loss: 9.206130743026733
val loss: 7.8179692029953
val train loss: 7.8618611097335815
epoch 10
training loss: 2.173106074333191
val loss: 1.9837653934955597
val train loss: 2.0522585809230804
epoch 20
training loss: 1.5233761370182037
val loss: 1.4067131876945496
val train loss: 1.4668624997138977
epoch 30
training loss: 1.256676271557808
val loss: 1.166317641735077
val train loss: 1.218151479959488
epoch 40
training loss: 1.1047970205545425
val loss: 1.0251283794641495
val train loss: 1.0701913386583328
epoch 50
training loss: 0.9998591542243958
val loss: 0.9308434128761292
val train loss: 0.9706205725669861
epoch 60
training loss: 0.9237363189458847
val loss: 0.8635053485631943
val train loss: 0.8992139846086502
epoch 70
training loss: 0.8710628300905228
val loss: 0.8133158981800079
val train loss: 0.8458731919527054
epoch 80
training loss: 0.8243516683578491
val loss: 0.7747923731803894
val train loss: 0.8048716634511948
epoch 90
training loss: 0.794848769903183
val 

In [12]:
# Inspect the first batch of each loader: print five examples and count prediction errors.
# NOTE: each loop stops after one batch, so the four counts cover that batch only.
train_err = 0
train_mask_err = 0
val_err = 0
val_mask_err = 0
model.eval()
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  for data, target in train_dl:
    print(data[:5])
    output = model(data) # (batch_size, seq_length, ntoken)
    pred = torch.argmax(output, dim=2) # most likely token per position
    wrong = pred != target
    train_err += torch.sum(wrong)
    # Input and target differ only where the [mask] token was written.
    train_mask_err += torch.sum((data != target) & wrong)
    print(output[:5])
    print(pred[:5])
    print(target[:5])
    break
  for val_data, val_target in val_dl:
    print(val_data[:5])
    val_output = model(val_data)
    val_pred = torch.argmax(val_output, dim=2)
    val_wrong = val_pred != val_target
    val_err += torch.sum(val_wrong)
    val_mask_err += torch.sum((val_data != val_target) & val_wrong)
    print(val_output[:5])
    print(val_pred[:5])
    print(val_target[:5])
    break
print(train_err)
print(train_mask_err)
print(val_err)
print(val_mask_err)

tensor([[3, 2, 3, 4, 4, 5, 3, 4, 2, 4],
        [4, 2, 2, 4, 4, 2, 4, 2, 4, 5],
        [2, 2, 2, 4, 2, 4, 2, 4, 2, 5],
        [4, 2, 5, 3, 2, 2, 2, 2, 4, 2],
        [5, 4, 2, 2, 2, 4, 4, 0, 3, 2]])
tensor([[[-0.3101, -0.5406, -0.4460,  3.9307, -0.4764, -0.9454],
         [-1.0763, -1.1138,  6.0035, -0.7718, -0.0813, -1.0564],
         [-0.3101, -0.5406, -0.4460,  3.9307, -0.4764, -0.9454],
         [-1.4830, -1.7898, -0.7918, -1.0090,  5.2828, -1.8716],
         [-1.4830, -1.7898, -0.7918, -1.0090,  5.2828, -1.8716],
         [-0.8322, -0.7625,  2.2040, -0.1447,  1.9869, -1.5643],
         [-0.3101, -0.5406, -0.4460,  3.9307, -0.4764, -0.9454],
         [-1.4830, -1.7898, -0.7918, -1.0090,  5.2828, -1.8716],
         [-1.0763, -1.1138,  6.0035, -0.7718, -0.0813, -1.0564],
         [-1.4830, -1.7898, -0.7918, -1.0090,  5.2828, -1.8716]],

        [[-1.4830, -1.7898, -0.7918, -1.0090,  5.2828, -1.8716],
         [-1.0763, -1.1138,  6.0035, -0.7718, -0.0813, -1.0564],
         [-1.0763