In [27]:
import torch
from torch.nn import TransformerEncoderLayer, TransformerEncoder
from torch import nn, Tensor

import numpy as np

import math

In [28]:
# ---- Data parameters ----

num_hidden_state = 3   # size of the HMM hidden-state space
num_obs = 5            # size of the observation alphabet
seq_length = 10        # length of every generated sequence
nsamples = 1000        # how many sequences to draw per dataset

# ---- Model / optimisation parameters ----
emsize = 200           # embedding (feature) dimension
d_hid = 2048           # width of the feedforward network inside each encoder layer
nhead = 4              # attention heads per multi-head attention block
ntoken = num_obs + 1   # vocabulary size: the observation symbols plus the mask id (= num_obs)
nlayers = 2            # number of stacked encoder layers
batch_size = 200       # sequences per mini-batch
lr = 1e-3              # SGD learning rate
epochs = 200           # passes over the training set

In [29]:
# Generate HMM parameters
def generate_HMM_params(num_hidden_state, num_obs):
    """Draw random HMM parameters and compute the chain's stationary distribution.

    Every row of the transition matrix is sampled from a symmetric
    Dirichlet(1, ..., 1); every row of the observation matrix is sampled from
    a symmetric Dirichlet with concentration 1 / sqrt(num_obs).  Randomness
    comes from NumPy's global generator (``np.random``).

    Args:
        num_hidden_state: number of hidden states.
        num_obs: number of distinct observation symbols.

    Returns:
        (trans_mat, obs_mat, stat_dist) where ``trans_mat`` has shape
        (num_hidden_state, num_hidden_state), ``obs_mat`` has shape
        (num_hidden_state, num_obs) and ``stat_dist`` has shape
        (num_hidden_state,).
    """
    state_concentration = np.ones(num_hidden_state)
    obs_concentration = np.ones(num_obs) / math.sqrt(num_obs)

    # Order matters for reproducibility: transitions are drawn before emissions.
    trans_mat = np.random.dirichlet(state_concentration, num_hidden_state)
    obs_mat = np.random.dirichlet(obs_concentration, num_hidden_state)

    # Stationary distribution pi solves (I - P^T) pi = 0 together with
    # sum(pi) = 1.  Stack both conditions into one system and solve it in the
    # least-squares sense.
    system = np.ones((num_hidden_state + 1, num_hidden_state))
    system[:num_hidden_state] = np.identity(num_hidden_state) - trans_mat.T
    rhs = np.zeros(num_hidden_state + 1)
    rhs[num_hidden_state] = 1
    solution = np.linalg.lstsq(system, rhs, rcond=None)
    stat_dist = solution[0]

    return trans_mat, obs_mat, stat_dist

In [30]:
# Sample HMM sequences
def generate_HMM_sequences(trans_mat, obs_mat, init_dist, length, num_samples = 1):
    """Sample hidden-state and observation sequences from an HMM.

    Randomness comes from NumPy's global generator (``np.random``).

    Args:
        trans_mat: transition matrix; row h is the distribution of the next
            hidden state given the current state h.
        obs_mat: observation matrix; row h is the emission distribution of
            hidden state h.
        init_dist: distribution of the first hidden state.
        length: number of time steps per sequence.
        num_samples: number of independent sequences to draw.

    Returns:
        (states, obs): two float arrays of shape (num_samples, length) holding
        the hidden-state indices and the observation indices.
    """
    states = np.zeros((num_samples, length))
    obs = np.zeros((num_samples, length))

    # One categorical draw per sequence for the initial hidden state.
    current = np.random.multinomial(1, init_dist, num_samples).argmax(axis=1)

    for t in range(length):
        states[:, t] = current
        for n in range(num_samples):
            h = current[n]
            # Emit from the current state first, then advance the chain.
            obs[n, t] = np.argmax(np.random.multinomial(1, obs_mat[h]))
            current[n] = np.argmax(np.random.multinomial(1, trans_mat[h]))

    return states, obs

In [42]:
# Compute the conditional probability of x_i given other observations
def x_i_conditional_prob(trans_mat, obs_mat, init_dist, known_X, pos):
    """Exact HMM posteriors at one chosen position per sequence.

    For sequence n, the position ``pos[n]`` is treated as unobserved: the
    observations before it go through ``forward_compute`` and the ones after
    it through ``backward_compute``.

    Args:
        trans_mat: hidden-state transition matrix.
        obs_mat: observation (emission) matrix, one row per hidden state.
        init_dist: distribution of the first hidden state.
        known_X: array of observation sequences, one sequence per row.
        pos: per-sequence index of the position to condition around.

    Returns:
        A tuple of three arrays, each with one row per sequence:
          * distribution of the hidden state at ``pos[n]`` given all *other*
            observations,
          * distribution of the observation at ``pos[n]`` given all other
            observations,
          * distribution of the hidden state at ``pos[n]`` given *all*
            observations, including the one actually stored at ``pos[n]``.
    """
    n_states = trans_mat.shape[0]
    n_symbols = obs_mat.shape[1]
    n_seqs = known_X.shape[0]

    x_pos_conditional_prob = np.zeros((n_seqs, n_symbols))
    h_pos_conditional_prob = np.zeros((n_seqs, n_states))
    h_all_pos_conditional_prob = np.zeros((n_seqs, n_states))

    for n in range(n_seqs):
        masked_at = pos[n]
        sequence = known_X[n]

        alpha = forward_compute(trans_mat, obs_mat, init_dist, sequence[:masked_at])
        beta = backward_compute(trans_mat, obs_mat, sequence[masked_at + 1:])

        # Posterior over the hidden state, normalised to sum to one.
        posterior = alpha * beta
        posterior /= posterior.sum()

        # Mix the emission rows by the hidden-state posterior.
        x_dist = posterior @ obs_mat

        # Bayes update with the symbol that is really stored at this position.
        true_symbol = int(sequence[masked_at])

        h_pos_conditional_prob[n] = posterior
        x_pos_conditional_prob[n] = x_dist
        h_all_pos_conditional_prob[n] = posterior * obs_mat[:, true_symbol] / x_dist[true_symbol]

    return h_pos_conditional_prob, x_pos_conditional_prob, h_all_pos_conditional_prob

In [31]:
# Add [mask] tokens to input, one per sequence
def add_mask_to_sequences(seqs, pos, mask_token=None):
  """Return a copy of ``seqs`` with one position per sequence replaced by a mask id.

  The number of sequences is taken from ``seqs`` itself, so the function works
  for arrays of any length (it previously iterated over the notebook-level
  ``nsamples`` regardless of the input size).

  Args:
    seqs: 2-D array of sequences, one sequence per row. It is not modified.
    pos: 1-D integer array; ``pos[i]`` is the index to mask in ``seqs[i]``.
    mask_token: value written at the masked positions. When omitted, the
      notebook-level ``num_obs`` is used.

  Returns:
    A new array with the same shape as ``seqs`` where
    ``result[i, pos[i]] == mask_token``.

  Raises:
    ValueError: if ``pos`` does not provide exactly one index per sequence.
  """
  if mask_token is None:
    # Backward-compatible default: the id just past the observation symbols.
    mask_token = num_obs
  masked_seqs = np.copy(seqs)
  pos = np.asarray(pos)
  n_seqs = masked_seqs.shape[0]
  if pos.shape != (n_seqs,):
    raise ValueError(
      "pos must contain exactly one index per sequence: expected shape "
      + str((n_seqs,)) + ", got " + str(pos.shape)
    )
  # Fancy indexing sets (row i, column pos[i]) for every row at once.
  masked_seqs[np.arange(n_seqs), pos] = mask_token
  return masked_seqs

In [36]:
# Define Transformer Model
class TransformerModel(nn.Module):
  """Transformer encoder over token sequences with learned positional embeddings.

  Token ids are embedded, added to a learned position embedding, scaled by
  sqrt(emsize), passed through a stack of encoder layers and projected to one
  logit per vocabulary entry at every position.

  Args:
    emsize: embedding / feature dimension.
    nhead: number of attention heads.
    ntoken: vocabulary size (also the number of output logits per position).
    dim_feedforward: width of the encoder feedforward network. Defaults to the
      notebook-level ``d_hid`` when omitted.
    num_layers: number of encoder layers. Defaults to the notebook-level
      ``nlayers`` when omitted.
    max_len: number of positions in the positional embedding table, i.e. the
      longest sequence the model accepts. Defaults to the notebook-level
      ``seq_length`` when omitted.
  """

  def __init__(self, emsize: int, nhead: int, ntoken: int,
               dim_feedforward=None, num_layers=None, max_len=None):
    super().__init__()
    # Resolve optional settings; the fallbacks keep the original three-argument
    # construction working exactly as before.
    if dim_feedforward is None:
      dim_feedforward = d_hid
    if num_layers is None:
      num_layers = nlayers
    if max_len is None:
      max_len = seq_length
    self.emsize = emsize
    self.max_len = max_len
    self.encoder = nn.Embedding(ntoken, emsize)
    self.pos_encoder = nn.Embedding(max_len, emsize)
    transformer_encoder_layer = TransformerEncoderLayer(emsize, nhead, dim_feedforward, batch_first=True)
    self.transformer_encoder = TransformerEncoder(transformer_encoder_layer, num_layers)
    self.decoder = nn.Linear(emsize, ntoken)

  def forward(self, src: Tensor) -> Tensor:
    """Compute per-position logits.

    Args:
      src: integer token ids of shape (batch_size, length) with
        ``length <= max_len``.

    Returns:
      Tensor of shape (batch_size, length, ntoken).

    Raises:
      ValueError: if the input is longer than the positional embedding table.
    """
    cur_len = src.size(1)
    if cur_len > self.max_len:
      raise ValueError(
        "sequence length " + str(cur_len) + " exceeds the maximum supported length " + str(self.max_len)
      )
    # Position ids follow the actual input length and live on the same device
    # as the input, so shorter sequences and non-CPU tensors both work.
    pos = torch.arange(cur_len, dtype=torch.long, device=src.device).unsqueeze(0).expand_as(src)
    # after embedding: (batch_size, length, emsize)
    src = (self.encoder(src) + self.pos_encoder(pos)) * math.sqrt(self.emsize)
    # after encoder: (batch_size, length, emsize)
    output = self.transformer_encoder(src)
    # after decoder: (batch_size, length, ntoken)
    output = self.decoder(output)
    return output

In [37]:
# Generate HMM parameters and samples used for training
seed = 20211126
np.random.seed(seed)
# Seed PyTorch too: NumPy's seed does not cover torch-side randomness such as
# the model's weight initialisation, so without this the training run is not
# reproducible.
torch.manual_seed(seed)
trans_mat, obs_mat, stat_dist = generate_HMM_params(num_hidden_state, num_obs) # generate parameters for HMM
# Chains are started from the stationary distribution.
states, obs = generate_HMM_sequences(trans_mat, obs_mat, stat_dist, seq_length, nsamples) # generate training sequences
print(trans_mat)
print(obs_mat)
print(states[:10])
print(obs[:10])
pos = np.random.randint(seq_length, size = nsamples) # positions for masks, nsamples-dimensional array
masked_obs = add_mask_to_sequences(obs, pos)
val_states, val_obs = generate_HMM_sequences(trans_mat, obs_mat, stat_dist, seq_length, nsamples) # generate validation sequences
val_pos = np.random.randint(seq_length, size = nsamples) # mask positions for the validation sequences
val_masked_obs = add_mask_to_sequences(val_obs, val_pos)

[[0.03389948 0.70219526 0.26390526]
 [0.09818885 0.30173531 0.60007584]
 [0.37153845 0.49203856 0.13642299]]
[[0.10929346 0.10621233 0.0278456  0.66361412 0.09303449]
 [0.00475611 0.37757279 0.13019784 0.35162659 0.13584667]
 [0.1408419  0.17948952 0.67236856 0.00645646 0.00084355]]
[[1. 2. 1. 2. 0. 2. 0. 1. 2. 1.]
 [1. 2. 0. 1. 2. 1. 2. 1. 1. 2.]
 [1. 1. 1. 2. 0. 2. 0. 1. 1. 2.]
 [0. 1. 0. 1. 2. 1. 1. 1. 1. 2.]
 [2. 0. 1. 1. 2. 0. 1. 1. 2. 1.]
 [1. 2. 2. 2. 1. 2. 0. 1. 1. 2.]
 [2. 0. 1. 2. 1. 2. 1. 2. 1. 1.]
 [1. 2. 1. 2. 2. 1. 1. 1. 2. 1.]
 [2. 0. 1. 1. 1. 2. 2. 2. 1. 2.]
 [1. 2. 1. 2. 1. 1. 1. 1. 2. 0.]]
[[3. 2. 1. 2. 3. 2. 0. 3. 2. 1.]
 [3. 2. 4. 3. 1. 4. 2. 3. 1. 2.]
 [1. 3. 1. 1. 3. 1. 3. 3. 1. 2.]
 [3. 1. 0. 3. 2. 2. 4. 1. 4. 0.]
 [2. 3. 1. 1. 2. 3. 3. 1. 2. 1.]
 [3. 2. 2. 2. 1. 0. 3. 1. 3. 0.]
 [2. 3. 4. 0. 1. 2. 1. 2. 2. 1.]
 [3. 2. 3. 2. 0. 3. 3. 1. 1. 4.]
 [0. 1. 3. 3. 4. 2. 0. 0. 3. 2.]
 [1. 2. 1. 2. 1. 4. 1. 1. 0. 3.]]


In [38]:
# Prepare input data and validation data
# Each dataset pairs the masked sequences (model input) with the unmasked
# sequences (prediction target).
dataset = torch.utils.data.TensorDataset(
  torch.LongTensor(masked_obs),
  torch.LongTensor(obs),
)
val_dataset = torch.utils.data.TensorDataset(
  torch.LongTensor(val_masked_obs),
  torch.LongTensor(val_obs),
)
# No shuffling: batches keep the sample order of the underlying arrays.
train_dl = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)
val_dl = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)

In [39]:
# Set up model instance
# Build the Transformer from the notebook-level hyperparameters.
model = TransformerModel(emsize=emsize, nhead=nhead, ntoken=ntoken)

In [40]:
# Set up optimizer and loss
# Cross-entropy over the vocabulary; plain SGD on all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)

In [41]:
# Training process
# Each epoch runs one SGD pass over train_dl. Every 10th epoch the summed
# per-batch loss is reported for the training pass, and then re-measured in
# eval mode on both the validation set and the training set.
model.train()
for i in range(epochs):
  total_loss = 0.
  for data, target in train_dl:
    output = model(data)
    # CrossEntropyLoss expects the class axis at dim 1, so swap
    # (batch_size, seq_length, ntoken) -> (batch_size, ntoken, seq_length).
    loss = criterion(output.transpose(1, 2), target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    total_loss += loss.item()
  if i % 10 == 0:
    print("epoch " + str(i))
    print("training loss: " + str(total_loss))
    model.eval()
    total_val_loss = 0.
    total_val_train_loss = 0.
    # Evaluation only needs forward passes: disable autograd so that no
    # computation graph is built (and kept in memory) for these batches.
    with torch.no_grad():
      for val_data, val_target in val_dl:
        val_output = model(val_data)
        val_loss = criterion(val_output.transpose(1, 2), val_target)
        total_val_loss += val_loss.item()
      for val_train_data, val_train_target in train_dl:
        val_train_output = model(val_train_data)
        val_train_loss = criterion(val_train_output.transpose(1, 2), val_train_target)
        total_val_train_loss += val_train_loss.item()
    print("val loss: " + str(total_val_loss))
    print("val train loss: " + str(total_val_train_loss))
    # Switch back to training mode before the next epoch.
    model.train()

epoch 0
training loss: 9.85196840763092
val loss: 9.305396556854248
val train loss: 9.309041619300842
epoch 10
training loss: 5.061509013175964
val loss: 4.768275082111359
val train loss: 4.8159419894218445
epoch 20
training loss: 3.471175730228424
val loss: 3.2540283203125
val train loss: 3.298213541507721
epoch 30
training loss: 2.6650319695472717
val loss: 2.496773213148117
val train loss: 2.52975857257843
epoch 40
training loss: 2.1787663400173187
val loss: 2.039925217628479
val train loss: 2.0613562762737274
epoch 50
training loss: 1.8562487959861755
val loss: 1.7413836419582367
val train loss: 1.753245770931244
epoch 60
training loss: 1.6307629942893982
val loss: 1.538048505783081
val train loss: 1.541743665933609
epoch 70
training loss: 1.4722101986408234
val loss: 1.3940260410308838
val train loss: 1.3913989961147308
epoch 80
training loss: 1.35353222489357
val loss: 1.2885530292987823
val train loss: 1.2810360789299011
epoch 90
training loss: 1.259479120373726
val loss: 1.2090

In [43]:
# Count prediction errors of the trained model on the training and validation
# sets: over all positions, and over the masked positions only (the positions
# where the model input differs from the target).
train_err = 0
train_mask_err = 0
val_err = 0
val_mask_err = 0
model.eval()
# Pure inference: disable autograd so no computation graph is built per batch.
with torch.no_grad():
  for data, target in train_dl:
    print("data")
    print(data[:5])
    output = model(data) # (batch_size, seq_length, ntoken)
    pred = torch.argmax(output, dim=2) # most likely token at every position
    wrong = pred != target
    train_err += torch.sum(wrong)
    train_mask_err += torch.sum((data != target) & wrong)
    print(output[:5])
    print(pred[:5])
    print(target[:5])
  for val_data, val_target in val_dl:
    print(val_data[:5])
    val_output = model(val_data)
    val_pred = torch.argmax(val_output, dim=2)
    val_wrong = val_pred != val_target
    val_err += torch.sum(val_wrong)
    val_mask_err += torch.sum((val_data != val_target) & val_wrong)
    print(val_output[:5])
    print(val_pred[:5])
    print(val_target[:5])
print(train_err)
print(train_mask_err)
print(val_err)
print(val_mask_err)

data
tensor([[3, 2, 1, 2, 3, 2, 0, 5, 2, 1],
        [3, 2, 4, 5, 1, 4, 2, 3, 1, 2],
        [1, 3, 1, 1, 3, 1, 3, 5, 1, 2],
        [3, 1, 0, 3, 2, 5, 4, 1, 4, 0],
        [2, 3, 1, 1, 2, 3, 3, 1, 2, 5]])
tensor([[[-7.7127e-01, -9.1444e-01, -1.2233e+00,  4.1977e+00, -1.1841e+00,
          -1.8900e+00],
         [-6.0358e-01, -5.4722e-01,  4.6173e+00, -7.6041e-01, -9.8037e-01,
          -1.6599e+00],
         [-1.0900e+00,  4.6187e+00, -7.5211e-01,  1.5829e-02, -3.3120e-01,
          -1.4450e+00],
         [-5.5184e-01, -8.9291e-02,  4.6812e+00, -3.5678e-01, -1.4208e+00,
          -1.9417e+00],
         [-1.1325e+00, -8.9806e-01, -7.5308e-01,  4.3590e+00, -1.4235e+00,
          -1.8942e+00],
         [-4.1832e-01, -9.5172e-01,  4.7559e+00, -5.2232e-01, -1.6378e+00,
          -1.8995e+00],
         [ 2.6382e+00, -1.2283e-01, -2.5794e-01,  4.3488e-03, -9.8240e-01,
          -1.8958e+00],
         [ 3.8317e-01,  6.8945e-01,  1.0052e+00,  1.2301e+00, -2.3742e-01,
          -1.9184e+00],
  

In [47]:
# Turn the logits of the first five sequences of the last validation batch
# into per-position probabilities over the vocabulary.
torch.softmax(val_output[:5], dim=2)

tensor([[[3.3305e-03, 9.8103e-01, 2.8514e-03, 3.1880e-03, 7.5862e-03,
          2.0092e-03],
         [9.1408e-01, 1.3318e-02, 2.2682e-02, 2.4306e-02, 1.9651e-02,
          5.9645e-03],
         [2.4005e-03, 2.8132e-03, 9.8454e-01, 5.7121e-03, 2.7176e-03,
          1.8175e-03],
         [4.3817e-03, 9.8613e-01, 2.3330e-03, 4.4025e-03, 1.4667e-03,
          1.2853e-03],
         [3.3952e-03, 4.7890e-03, 6.8799e-03, 9.7945e-01, 3.5851e-03,
          1.9003e-03],
         [1.3884e-01, 1.6617e-01, 3.6743e-01, 2.4092e-01, 7.4520e-02,
          1.2112e-02],
         [1.9885e-03, 9.8554e-01, 3.8721e-03, 4.3386e-03, 3.0449e-03,
          1.2205e-03],
         [4.9906e-03, 3.2176e-03, 9.8139e-01, 5.7592e-03, 2.5680e-03,
          2.0735e-03],
         [4.0119e-03, 1.3931e-02, 9.7196e-01, 5.1965e-03, 3.0305e-03,
          1.8726e-03],
         [3.2213e-03, 6.6340e-03, 9.7509e-01, 6.8086e-03, 5.2285e-03,
          3.0171e-03]],

        [[3.0063e-03, 9.8110e-01, 2.3530e-03, 3.4659e-03, 8.5525e-03

In [50]:
def forward_compute(trans_mat, obs_mat, init_dist, obs_to_pos):
    """Forward recursion up to (but excluding) the observation at the target position.

    Computes, for every hidden state h, the unnormalised quantity
    sum_{h_1..h_{pos-1}} P(h_1, ..., h_{pos-1}, h_pos = h, x_1, ..., x_{pos-1}),
    where x_1..x_{pos-1} are the entries of ``obs_to_pos``.

    Args:
        trans_mat: hidden-state transition matrix.
        obs_mat: observation (emission) matrix, one row per hidden state.
        init_dist: distribution of the first hidden state.
        obs_to_pos: 1-D array with the observations preceding the target
            position; may be empty.

    Returns:
        1-D array with one entry per hidden state.  For an empty
        ``obs_to_pos`` this is just ``init_dist``.
    """
    n_steps = obs_to_pos.shape[0] + 1
    n_states = trans_mat.shape[0]

    table = np.zeros((n_steps, n_states))
    table[0] = init_dist

    for step in range(1, n_steps):
        # Symbol emitted by the previous hidden state.
        symbol = int(obs_to_pos[step - 1])
        for nxt in range(n_states):
            for prev in range(n_states):
                # Emit `symbol` from `prev`, then move from `prev` to `nxt`.
                table[step, nxt] += table[step - 1, prev] * trans_mat[prev, nxt] * obs_mat[prev, symbol]

    return table[n_steps - 1]

In [51]:
def backward_compute(trans_mat, obs_mat, obs_from_pos):
    """Backward recursion over the observations that follow the target position.

    For every hidden state h at the target position, accumulates the
    probability of emitting the whole of ``obs_from_pos`` from the following
    positions, processing the observations from last to first.

    Args:
        trans_mat: hidden-state transition matrix.
        obs_mat: observation (emission) matrix, one row per hidden state.
        obs_from_pos: 1-D array with the observations after the target
            position; may be empty.

    Returns:
        1-D array with one entry per hidden state; all ones when
        ``obs_from_pos`` is empty.
    """
    n_states = trans_mat.shape[0]

    # With nothing left to explain the message is identically one; this is
    # also the starting point of the recursion (multiplying by 1.0 is exact).
    message = np.ones(n_states)

    for step in range(obs_from_pos.shape[0]):
        # Walk the future observations from the last one backwards.
        symbol = int(obs_from_pos[-(step + 1)])
        updated = np.zeros(n_states)
        for cur in range(n_states):
            for nxt in range(n_states):
                # Move from `cur` to `nxt`, emit `symbol` there, then continue.
                updated[cur] += trans_mat[cur, nxt] * obs_mat[nxt, symbol] * message[nxt]
        message = updated

    return message

In [53]:
# Exact HMM posteriors at the masked position of every training sequence,
# with the chain started from the stationary distribution.
h_, x_, hh_ = x_i_conditional_prob(
    trans_mat=trans_mat,
    obs_mat=obs_mat,
    init_dist=stat_dist,
    known_X=obs,
    pos=pos,
)

In [54]:
# Exact conditional distribution of the masked observation (given all other
# observations) for the first five training sequences.
x_[:5]

array([[0.03389855, 0.31762314, 0.16999719, 0.36258939, 0.11589173],
       [0.07796996, 0.26077204, 0.37734608, 0.21546932, 0.0684426 ],
       [0.0824678 , 0.25202594, 0.38569342, 0.21470885, 0.065104  ],
       [0.08724798, 0.20937187, 0.24936379, 0.37541883, 0.07859752],
       [0.06717051, 0.24813628, 0.21089949, 0.3801521 , 0.09364162]])

In [56]:
# Look at the model's predicted distributions on the first training batch only.
for data, target in train_dl:
  print("data", data[:5], sep="\n")
  output = model(data) # (batch_size, seq_length, ntoken)
  # Normalise the logits over the vocabulary axis before printing.
  print(output[:5].softmax(dim=2))
  break

data
tensor([[3, 2, 1, 2, 3, 2, 0, 5, 2, 1],
        [3, 2, 4, 5, 1, 4, 2, 3, 1, 2],
        [1, 3, 1, 1, 3, 1, 3, 5, 1, 2],
        [3, 1, 0, 3, 2, 5, 4, 1, 4, 0],
        [2, 3, 1, 1, 2, 3, 3, 1, 2, 5]])
tensor([[[6.7855e-03, 5.8803e-03, 4.3180e-03, 9.7631e-01, 4.4903e-03,
          2.2168e-03],
         [5.2899e-03, 5.5966e-03, 9.7912e-01, 4.5221e-03, 3.6292e-03,
          1.8395e-03],
         [3.2285e-03, 9.7333e-01, 4.5265e-03, 9.7561e-03, 6.8954e-03,
          2.2638e-03],
         [5.2128e-03, 8.2785e-03, 9.7669e-01, 6.3356e-03, 2.1862e-03,
          1.2985e-03],
         [4.0397e-03, 5.1068e-03, 5.9035e-03, 9.8004e-01, 3.0197e-03,
          1.8860e-03],
         [5.5657e-03, 3.2649e-03, 9.8324e-01, 5.0159e-03, 1.6440e-03,
          1.2655e-03],
         [8.1448e-01, 5.1498e-02, 4.4990e-02, 5.8483e-02, 2.1802e-02,
          8.7461e-03],
         [1.3906e-01, 1.8889e-01, 2.5902e-01, 3.2435e-01, 7.4761e-02,
          1.3920e-02],
         [4.9531e-03, 5.6820e-03, 9.8115e-01, 4.34

In [57]:
# Unmasked observations of the first five training sequences.
print(obs[:5])

[[3. 2. 1. 2. 3. 2. 0. 3. 2. 1.]
 [3. 2. 4. 3. 1. 4. 2. 3. 1. 2.]
 [1. 3. 1. 1. 3. 1. 3. 3. 1. 2.]
 [3. 1. 0. 3. 2. 2. 4. 1. 4. 0.]
 [2. 3. 1. 1. 2. 3. 3. 1. 2. 1.]]


In [58]:
# Exact conditional distribution of the masked observation for the first five
# training sequences.
print(x_[:5])

[[0.03389855 0.31762314 0.16999719 0.36258939 0.11589173]
 [0.07796996 0.26077204 0.37734608 0.21546932 0.0684426 ]
 [0.0824678  0.25202594 0.38569342 0.21470885 0.065104  ]
 [0.08724798 0.20937187 0.24936379 0.37541883 0.07859752]
 [0.06717051 0.24813628 0.21089949 0.3801521  0.09364162]]


In [59]:
# Masked position of each of the first five training sequences.
print(pos[:5])

[7 3 7 5 9]


In [62]:
# Compare the model's predicted distribution at each masked position with the
# exact HMM conditional distribution for the first five training sequences.
temp_ = torch.softmax(output[:5], dim=2)
for i, masked_at in enumerate(pos[:5]):
  print(temp_[i, masked_at])
print(x_[:5])

tensor([0.1391, 0.1889, 0.2590, 0.3243, 0.0748, 0.0139],
       grad_fn=<SelectBackward0>)
tensor([0.0914, 0.2376, 0.2744, 0.3307, 0.0503, 0.0156],
       grad_fn=<SelectBackward0>)
tensor([0.1367, 0.2004, 0.2314, 0.3674, 0.0522, 0.0120],
       grad_fn=<SelectBackward0>)
tensor([0.1652, 0.2061, 0.3764, 0.1679, 0.0733, 0.0111],
       grad_fn=<SelectBackward0>)
tensor([0.0533, 0.3214, 0.3125, 0.1895, 0.1073, 0.0160],
       grad_fn=<SelectBackward0>)
[[0.03389855 0.31762314 0.16999719 0.36258939 0.11589173]
 [0.07796996 0.26077204 0.37734608 0.21546932 0.0684426 ]
 [0.0824678  0.25202594 0.38569342 0.21470885 0.065104  ]
 [0.08724798 0.20937187 0.24936379 0.37541883 0.07859752]
 [0.06717051 0.24813628 0.21089949 0.3801521  0.09364162]]
