In [1]:
import torch
import numpy as np

class HMM(torch.nn.Module):
  """
  Hidden Markov Model with discrete observations.
  """
  def __init__(self, M, N):
    super(HMM, self).__init__()
    self.M = M # number of possible observations
    self.N = N # number of states

    # A
    self.transition_model = TransitionModel(self.N)

    # b(x_t)
    self.emission_model = EmissionModel(self.N,self.M)

    # pi
    self.unnormalized_state_priors = torch.nn.Parameter(torch.randn(self.N))

    # use the GPU
    self.is_cuda = torch.cuda.is_available()
    if self.is_cuda: self.cuda()

class TransitionModel(torch.nn.Module):
  def __init__(self, N):
    super(TransitionModel, self).__init__()
    self.N = N
    self.unnormalized_transition_matrix = torch.nn.Parameter(torch.randn(N,N))

class EmissionModel(torch.nn.Module):
  def __init__(self, N, M):
    super(EmissionModel, self).__init__()
    self.N = N
    self.M = M
    self.unnormalized_emission_matrix = torch.nn.Parameter(torch.randn(N,M))

In [2]:
def sample(self, T=10):
  state_priors = torch.nn.functional.softmax(self.unnormalized_state_priors, dim=0)
  transition_matrix = torch.nn.functional.softmax(self.transition_model.unnormalized_transition_matrix, dim=0)
  emission_matrix = torch.nn.functional.softmax(self.emission_model.unnormalized_emission_matrix, dim=1)

  # sample initial state
  z_t = torch.distributions.categorical.Categorical(state_priors).sample().item()
  z = []; x = []
  z.append(z_t)
  for t in range(0,T):
    # sample emission
    x_t = torch.distributions.categorical.Categorical(emission_matrix[z_t]).sample().item()
    x.append(x_t)

    # sample transition
    z_t = torch.distributions.categorical.Categorical(transition_matrix[:,z_t]).sample().item()
    if t < T-1: z.append(z_t)

  return x, z

# Add the sampling method to our HMM class
HMM.sample = sample

In [3]:
import string
alphabet = string.ascii_lowercase

def encode(s):
  """
  Convert a string into a list of integers
  """
  x = [alphabet.index(ss) for ss in s]
  return x

def decode(x):
  """
  Convert list of ints to string
  """
  s = "".join([alphabet[xx] for xx in x])
  return s

# Initialize the model
model = HMM(M=len(alphabet), N=2)

# Hard-wiring the parameters!
# Let state 0 = consonant, state 1 = vowel
for p in model.parameters():
    p.requires_grad = False # needed to do lines below
model.unnormalized_state_priors[0] = 0.    # Let's start with a consonant more frequently
model.unnormalized_state_priors[1] = -0.5
print("State priors:", torch.nn.functional.softmax(model.unnormalized_state_priors, dim=0))

# In state 0, only allow consonants; in state 1, only allow vowels
vowel_indices = torch.tensor([alphabet.index(letter) for letter in "aeiou"])
consonant_indices = torch.tensor([alphabet.index(letter) for letter in "bcdfghjklmnpqrstvwxyz"])
model.emission_model.unnormalized_emission_matrix[0, vowel_indices] = -np.inf
model.emission_model.unnormalized_emission_matrix[1, consonant_indices] = -np.inf
print("Emission matrix:", torch.nn.functional.softmax(model.emission_model.unnormalized_emission_matrix, dim=1))

# Only allow vowel -> consonant and consonant -> vowel
model.transition_model.unnormalized_transition_matrix[0,0] = -np.inf  # consonant -> consonant
model.transition_model.unnormalized_transition_matrix[0,1] = 0.       # vowel -> consonant
model.transition_model.unnormalized_transition_matrix[1,0] = 0.       # consonant -> vowel
model.transition_model.unnormalized_transition_matrix[1,1] = -np.inf  # vowel -> vowel
print("Transition matrix:", torch.nn.functional.softmax(model.transition_model.unnormalized_transition_matrix, dim=0))



State priors: tensor([0.6225, 0.3775])
Emission matrix: tensor([[0.0000, 0.0161, 0.0793, 0.0037, 0.0000, 0.0135, 0.0176, 0.0738, 0.0000,
         0.0125, 0.0163, 0.0263, 0.0076, 0.0080, 0.0000, 0.1289, 0.0258, 0.0731,
         0.0112, 0.0137, 0.0000, 0.0390, 0.0417, 0.3001, 0.0539, 0.0379],
        [0.0418, 0.0000, 0.0000, 0.0000, 0.0856, 0.0000, 0.0000, 0.0000, 0.2649,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.3157, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.2921, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000]])
Transition matrix: tensor([[0., 1.],
        [1., 0.]])


In [5]:
# Sample some outputs
for _ in range(4):
  sampled_x, sampled_z = model.sample(T=5)
  print("x:", decode(sampled_x))
  print("z:", sampled_z)

x: fuhok
z: [0, 1, 0, 1, 0]
x: poxuz
z: [0, 1, 0, 1, 0]
x: ivixo
z: [1, 0, 1, 0, 1]
x: xovet
z: [0, 1, 0, 1, 0]


In [10]:
def HMM_forward(self, x, T):
  """
  x : IntTensor of shape (batch size, T_max)
  T : IntTensor of shape (batch size)

  Compute log p(x) for each example in the batch.
  T = length of each example
  """
  if self.is_cuda:
  	x = x.cuda()
  	T = T.cuda()

  batch_size = x.shape[0]; T_max = x.shape[1]
  log_state_priors = torch.nn.functional.log_softmax(self.unnormalized_state_priors, dim=0)
  log_alpha = torch.zeros(batch_size, T_max, self.N)
  if self.is_cuda: log_alpha = log_alpha.cuda()

  log_alpha[:, 0, :] = self.emission_model(x[:,0]) + log_state_priors
  for t in range(1, T_max):
    log_alpha[:, t, :] = self.emission_model(x[:,t]) + self.transition_model(log_alpha[:, t-1, :])

  # Select the sum for the final timestep (each x may have different length).
  log_sums = log_alpha.logsumexp(dim=2)
  log_probs = torch.gather(log_sums, 1, T.view(-1,1) - 1)
  return log_probs




In [11]:
def emission_model_forward(self, x_t):
  log_emission_matrix = torch.nn.functional.log_softmax(self.unnormalized_emission_matrix, dim=1)
  out = log_emission_matrix[:, x_t].transpose(0,1)
  return out

In [12]:
def transition_model_forward(self, log_alpha):
  """
  log_alpha : Tensor of shape (batch size, N)
  Multiply previous timestep's alphas by transition matrix (in log domain)
  """
  log_transition_matrix = torch.nn.functional.log_softmax(self.unnormalized_transition_matrix, dim=0)

  # Matrix multiplication in the log domain
  out = log_domain_matmul(log_transition_matrix, log_alpha.transpose(0,1)).transpose(0,1)
  return out

In [13]:
def log_domain_matmul(log_A, log_B):
	"""
	log_A : m x n
	log_B : n x p
	output : m x p matrix

	Normally, a matrix multiplication
	computes out_{i,j} = sum_k A_{i,k} x B_{k,j}

	A log domain matrix multiplication
	computes out_{i,j} = logsumexp_k log_A_{i,k} + log_B_{k,j}
	"""
	m = log_A.shape[0]
	n = log_A.shape[1]
	p = log_B.shape[1]

	# log_A_expanded = torch.stack([log_A] * p, dim=2)
	# log_B_expanded = torch.stack([log_B] * m, dim=0)
    # fix for PyTorch > 1.5 by egaznep on Github:
	log_A_expanded = torch.reshape(log_A, (m,n,1))
	log_B_expanded = torch.reshape(log_B, (1,n,p))

	elementwise_sum = log_A_expanded + log_B_expanded
	out = torch.logsumexp(elementwise_sum, dim=1)

	return out

TransitionModel.forward = transition_model_forward
EmissionModel.forward = emission_model_forward
HMM.forward = HMM_forward

In [14]:
x = torch.stack( [torch.tensor(encode("cat"))] )
T = torch.tensor([3])
print(model.forward(x, T))

x = torch.stack( [torch.tensor(encode("aba")), torch.tensor(encode("abb"))] )
T = torch.tensor([3,3])
print(model.forward(x, T))

tensor([[-10.4712]])
tensor([[-11.4499],
        [    -inf]])


In [15]:
def viterbi(self, x, T):
  """
  x : IntTensor of shape (batch size, T_max)
  T : IntTensor of shape (batch size)
  Find argmax_z log p(x|z) for each (x) in the batch.
  """
  if self.is_cuda:
    x = x.cuda()
    T = T.cuda()

  batch_size = x.shape[0]; T_max = x.shape[1]
  log_state_priors = torch.nn.functional.log_softmax(self.unnormalized_state_priors, dim=0)
  log_delta = torch.zeros(batch_size, T_max, self.N).float()
  psi = torch.zeros(batch_size, T_max, self.N).long()
  if self.is_cuda:
    log_delta = log_delta.cuda()
    psi = psi.cuda()

  log_delta[:, 0, :] = self.emission_model(x[:,0]) + log_state_priors
  for t in range(1, T_max):
    max_val, argmax_val = self.transition_model.maxmul(log_delta[:, t-1, :])
    log_delta[:, t, :] = self.emission_model(x[:,t]) + max_val
    psi[:, t, :] = argmax_val

  # Get the log probability of the best path
  log_max = log_delta.max(dim=2)[0]
  best_path_scores = torch.gather(log_max, 1, T.view(-1,1) - 1)

  # This next part is a bit tricky to parallelize across the batch,
  # so we will do it separately for each example.
  z_star = []
  for i in range(0, batch_size):
    z_star_i = [ log_delta[i, T[i] - 1, :].max(dim=0)[1].item() ]
    for t in range(T[i] - 1, 0, -1):
      z_t = psi[i, t, z_star_i[0]].item()
      z_star_i.insert(0, z_t)

    z_star.append(z_star_i)

  return z_star, best_path_scores # return both the best path and its log probability

def transition_model_maxmul(self, log_alpha):
  log_transition_matrix = torch.nn.functional.log_softmax(self.unnormalized_transition_matrix, dim=0)

  out1, out2 = maxmul(log_transition_matrix, log_alpha.transpose(0,1))
  return out1.transpose(0,1), out2.transpose(0,1)

def maxmul(log_A, log_B):
	"""
	log_A : m x n
	log_B : n x p
	output : m x p matrix

	Similar to the log domain matrix multiplication,
	this computes out_{i,j} = max_k log_A_{i,k} + log_B_{k,j}
	"""
	m = log_A.shape[0]
	n = log_A.shape[1]
	p = log_B.shape[1]

	log_A_expanded = torch.stack([log_A] * p, dim=2)
	log_B_expanded = torch.stack([log_B] * m, dim=0)

	elementwise_sum = log_A_expanded + log_B_expanded
	out1,out2 = torch.max(elementwise_sum, dim=1)

	return out1,out2

TransitionModel.maxmul = transition_model_maxmul
HMM.viterbi = viterbi

In [16]:
x = torch.stack( [torch.tensor(encode("aba")), torch.tensor(encode("abb"))] )
T = torch.tensor([3,3])
print(model.viterbi(x, T))

([[1, 0, 1], [1, 0, 0]], tensor([[-11.4499],
        [    -inf]]))


In [17]:
print(model.forward(x, T))
print(model.viterbi(x, T)[1])


tensor([[-11.4499],
        [    -inf]])
tensor([[-11.4499],
        [    -inf]])


In [18]:
x = torch.tensor([1., 2., 3.])
print(x.max(dim=0)[0])
print(x.logsumexp(dim=0))


tensor(3.)
tensor(3.4076)


In [19]:
import torch.utils.data
from collections import Counter
from sklearn.model_selection import train_test_split

class TextDataset(torch.utils.data.Dataset):
  def __init__(self, lines):
    self.lines = lines # list of strings
    collate = Collate() # function for generating a minibatch from strings
    self.loader = torch.utils.data.DataLoader(self, batch_size=1024, num_workers=1, shuffle=True, collate_fn=collate)

  def __len__(self):
    return len(self.lines)

  def __getitem__(self, idx):
    line = self.lines[idx].lstrip(" ").rstrip("\n").rstrip(" ").rstrip("\n")
    return line

class Collate:
  def __init__(self):
    pass

  def __call__(self, batch):
    """
    Returns a minibatch of strings, padded to have the same length.
    """
    x = []
    batch_size = len(batch)
    for index in range(batch_size):
      x_ = batch[index]

      # convert letters to integers
      x.append(encode(x_))

    # pad all sequences with 0 to have same length
    x_lengths = [len(x_) for x_ in x]
    T = max(x_lengths)
    for index in range(batch_size):
      x[index] += [0] * (T - len(x[index]))
      x[index] = torch.tensor(x[index])

    # stack into single tensor
    x = torch.stack(x)
    x_lengths = torch.tensor(x_lengths)
    return (x,x_lengths)

In [21]:
!wget https://raw.githubusercontent.com/lorenlugosch/pytorch_HMM/master/data/train/training.txt

--2023-11-05 10:28:15--  https://raw.githubusercontent.com/lorenlugosch/pytorch_HMM/master/data/train/training.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2493109 (2.4M) [text/plain]
Saving to: ‘training.txt’


2023-11-05 10:28:15 (85.4 MB/s) - ‘training.txt’ saved [2493109/2493109]



In [22]:
filename = "training.txt"

with open(filename, "r") as f:
  lines = f.readlines() # each line of lines will have one word

alphabet = list(Counter(("".join(lines))).keys())
train_lines, valid_lines = train_test_split(lines, test_size=0.1, random_state=42)
train_dataset = TextDataset(train_lines)
valid_dataset = TextDataset(valid_lines)

M = len(alphabet)

In [24]:
from tqdm import tqdm # for displaying progress bar

class Trainer:
  def __init__(self, model, lr):
    self.model = model
    self.lr = lr
    self.optimizer = torch.optim.Adam(model.parameters(), lr=self.lr, weight_decay=0.00001)

  def train(self, dataset):
    train_loss = 0
    num_samples = 0
    self.model.train()
    print_interval = 50
    for idx, batch in enumerate(tqdm(dataset.loader)):
      x,T = batch
      batch_size = len(x)
      num_samples += batch_size
      log_probs = self.model(x,T)
      loss = -log_probs.mean()
      self.optimizer.zero_grad()
      loss.backward()
      self.optimizer.step()
      train_loss += loss.cpu().data.numpy().item() * batch_size
      if idx % print_interval == 0:
        print("loss:", loss.item())
        for _ in range(5):
          sampled_x, sampled_z = self.model.sample()
          print(decode(sampled_x))
          print(sampled_z)
    train_loss /= num_samples
    return train_loss

  def test(self, dataset):
    test_loss = 0
    num_samples = 0
    self.model.eval()
    print_interval = 50
    for idx, batch in enumerate(dataset.loader):
      x,T = batch
      batch_size = len(x)
      num_samples += batch_size
      log_probs = self.model(x,T) #TODO
      loss = -log_probs.mean() #TODO
      test_loss += loss.cpu().data.numpy().item() * batch_size
      if idx % print_interval == 0:
        print("loss:", loss.item())
        sampled_x, sampled_z = self.model.sample()
        print(decode(sampled_x))
        print(sampled_z)
    test_loss /= num_samples
    return test_loss

In [25]:
# Initialize model
model = HMM(N=64, M=M)

# Train the model
num_epochs = 10
trainer = Trainer(model, lr=0.01)

for epoch in range(num_epochs):
        print("========= Epoch %d of %d =========" % (epoch+1, num_epochs))
        train_loss = trainer.train(train_dataset)
        valid_loss = trainer.test(valid_dataset)

        print("========= Results: epoch %d of %d =========" % (epoch+1, num_epochs))
        print("train loss: %.2f| valid loss: %.2f\n" % (train_loss, valid_loss) )



  0%|          | 1/208 [00:01<05:40,  1.64s/it]

loss: 38.751136779785156
SgOGNviVYQ
[60, 32, 24, 5, 40, 24, 23, 10, 35, 41]
OBtOnCZbNv
[1, 46, 17, 8, 60, 56, 0, 2, 54, 60]
J-ZEH
w-iV
[40, 5, 47, 58, 43, 4, 33, 23, 38, 56]
WRhtytFVms
[35, 61, 31, 0, 58, 25, 0, 20, 61, 57]
QhFAWRIAnS
[60, 23, 28, 51, 39, 11, 26, 42, 45, 19]


 25%|██▍       | 51/208 [00:59<02:57,  1.13s/it]

loss: 33.12735366821289
wktKJuimmM
[46, 42, 54, 24, 23, 25, 23, 19, 50, 4]
WrUuWecUxU
[60, 44, 55, 25, 42, 27, 33, 39, 56, 44]
YjrLXmdWEh
[18, 4, 62, 41, 15, 19, 33, 63, 57, 47]
rdotoKtd-p
[53, 21, 58, 3, 10, 24, 50, 46, 17, 57]
pBGezGMcsq
[29, 47, 15, 44, 23, 54, 50, 54, 43, 42]


 49%|████▊     | 101/208 [01:56<02:10,  1.22s/it]

loss: 30.240659713745117
ciaczlnbCA
[13, 59, 22, 13, 29, 47, 23, 38, 27, 59]
iCZc
ifaVa
[35, 8, 50, 4, 18, 23, 46, 46, 23, 1]
pMVUsUBnsy
[8, 27, 53, 12, 31, 1, 31, 45, 4, 58]
Uncigooceo
[12, 23, 3, 61, 36, 27, 21, 21, 59, 54]
yLKylrTuxo
[58, 28, 57, 58, 53, 10, 23, 25, 57, 58]


 73%|███████▎  | 151/208 [02:54<01:05,  1.15s/it]

loss: 27.724468231201172
XClnnpAsea
[18, 23, 41, 31, 45, 48, 17, 14, 17, 31]
loiadxeydo
[57, 45, 46, 17, 56, 23, 50, 23, 45, 54]
rryrunshoL
[60, 53, 58, 53, 59, 45, 27, 20, 54, 12]
aotkualMgc
[3, 49, 12, 9, 58, 3, 46, 54, 45, 46]
rh
SnXehln
[25, 42, 31, 4, 62, 21, 17, 12, 23, 46]


 97%|█████████▋| 201/208 [03:49<00:08,  1.15s/it]

loss: 26.233318328857422
atoOongaol
[3, 12, 17, 46, 17, 31, 37, 39, 27, 46]
puversedri
[29, 25, 57, 17, 14, 21, 59, 45, 53, 25]
coGeeebhin
[6, 7, 15, 17, 31, 17, 45, 46, 23, 31]
heroslGndi
[29, 17, 53, 23, 57, 46, 54, 31, 4, 59]
siiiogndre
[57, 23, 28, 23, 54, 37, 53, 35, 53, 17]


100%|██████████| 208/208 [03:57<00:00,  1.14s/it]


loss: 26.179752349853516
palinerali
[29, 3, 10, 23, 57, 17, 53, 27, 42, 23]
train loss: 30.49| valid loss: 26.41



  0%|          | 1/208 [00:01<05:58,  1.73s/it]

loss: 26.47123908996582
resbVyghec
[46, 17, 14, 21, 41, 48, 45, 46, 17, 6]
ajlelabeda
[3, 45, 46, 17, 49, 3, 35, 59, 45, 46]
elyavervrr
[17, 10, 48, 50, 46, 17, 53, 43, 53, 59]
ipocOallyp
[27, 46, 17, 6, 1, 25, 42, 4, 58, 13]
CegivZariW
[29, 17, 46, 23, 57, 59, 3, 10, 23, 54]


 25%|██▍       | 51/208 [00:58<02:58,  1.14s/it]

loss: 25.129451751708984
plizolcyol
[29, 53, 59, 6, 25, 42, 4, 58, 13, 46]
tpuocesece
[29, 5, 25, 1, 45, 17, 45, 17, 6, 17]
eipocolice
[29, 59, 13, 54, 10, 54, 10, 23, 6, 17]
cilacudtit
[57, 23, 46, 27, 6, 25, 42, 4, 23, 57]
rfaltngnel
[54, 37, 25, 40, 54, 31, 37, 31, 17, 46]


 49%|████▊     | 101/208 [01:55<02:02,  1.14s/it]

loss: 25.079364776611328
fhbtigodne
[29, 43, 24, 12, 23, 6, 54, 10, 46, 17]
satemyessi
[29, 3, 12, 54, 10, 48, 17, 14, 57, 23]
ephonkhitu
[59, 13, 26, 54, 31, 12, 43, 59, 6, 25]
mAsimeXoga
[44, 25, 45, 23, 6, 17, 6, 58, 46, 54]
seieathosi
[57, 12, 23, 31, 3, 12, 43, 17, 14, 23]


 73%|███████▎  | 151/208 [02:55<01:12,  1.27s/it]

loss: 24.61397933959961
unrydurcyr
[33, 1, 53, 59, 6, 25, 13, 26, 48, 10]
ocenectixi
[54, 13, 17, 31, 59, 45, 12, 23, 46, 17]
suTiliolAr
[57, 25, 42, 17, 10, 23, 54, 46, 17, 53]
strystesmi
[57, 54, 10, 48, 50, 12, 17, 14, 21, 59]
dnminarase
[33, 1, 6, 23, 6, 3, 10, 3, 45, 17]


 97%|█████████▋| 201/208 [03:49<00:07,  1.14s/it]

loss: 24.641462326049805
Letearingr
[6, 17, 12, 17, 27, 10, 23, 31, 37, 53]
oPoivarita
[54, 13, 54, 1, 6, 3, 53, 59, 12, 3]
whystoiont
[29, 43, 58, 50, 12, 53, 59, 54, 31, 12]
cVtinullio
[33, 0, 12, 23, 31, 25, 42, 4, 23, 54]
oponeclymc
[59, 13, 54, 31, 23, 45, 4, 58, 22, 13]


100%|██████████| 208/208 [03:56<00:00,  1.14s/it]


loss: 24.14046859741211
Aythidalus
[29, 58, 12, 43, 23, 6, 3, 42, 25, 42]
train loss: 25.15| valid loss: 24.52



  0%|          | 1/208 [00:01<06:11,  1.79s/it]

loss: 24.72222328186035
branicouso
[29, 53, 3, 10, 23, 6, 54, 25, 57, 54]
chetoflend
[29, 43, 17, 12, 59, 45, 46, 17, 31, 6]
bidenvenre
[35, 23, 6, 23, 31, 6, 54, 31, 53, 59]
emeiephest
[59, 6, 17, 53, 59, 13, 26, 17, 50, 12]
unnydtechi
[33, 1, 6, 48, 50, 12, 17, 45, 46, 23]


 25%|██▍       | 51/208 [00:56<03:02,  1.17s/it]

loss: 24.2977352142334
ineriutily
[33, 31, 17, 10, 23, 25, 12, 23, 4, 48]
rabveblenc
[53, 3, 35, 46, 17, 49, 46, 17, 31, 6]
powteceris
[29, 54, 57, 12, 59, 6, 17, 10, 23, 57]
bbustinaca
[18, 2, 25, 57, 12, 23, 6, 3, 13, 3]
lortralari
[29, 54, 10, 12, 53, 3, 42, 3, 10, 23]


 49%|████▊     | 101/208 [01:53<02:01,  1.13s/it]

loss: 24.140533447265625
smamptedis
[57, 19, 27, 22, 13, 12, 17, 10, 23, 57]
eiphondeat
[59, 59, 13, 26, 54, 1, 6, 17, 3, 12]
dermanembl
[6, 17, 10, 20, 3, 31, 17, 22, 35, 46]
cocarygato
[29, 54, 4, 58, 63, 48, 37, 3, 46, 54]
thermeetve
[12, 43, 17, 10, 46, 17, 59, 45, 46, 59]


 73%|███████▎  | 151/208 [02:49<01:11,  1.26s/it]

loss: 23.718090057373047
Marelynine
[29, 3, 10, 23, 4, 58, 10, 23, 31, 17]
somphibely
[57, 59, 22, 13, 26, 23, 6, 17, 10, 48]
oickblequm
[59, 59, 45, 55, 35, 46, 17, 52, 25, 42]
loacrotiti
[53, 59, 3, 45, 53, 59, 12, 23, 4, 23]
soteBingeb
[57, 59, 12, 23, 31, 23, 31, 37, 17, 45]


 97%|█████████▋| 201/208 [03:46<00:07,  1.09s/it]

loss: 23.818729400634766
sotertatan
[44, 59, 12, 3, 10, 12, 3, 12, 3, 31]
ruicodepoo
[53, 59, 23, 6, 54, 37, 59, 13, 26, 59]
Retodosari
[44, 59, 21, 59, 21, 59, 57, 17, 10, 23]
atondlenog
[3, 12, 54, 31, 37, 46, 17, 31, 59, 37]
irgeegrens
[59, 31, 37, 17, 3, 37, 53, 59, 1, 57]


100%|██████████| 208/208 [03:53<00:00,  1.12s/it]


loss: 23.53034210205078
ciettycion
[29, 59, 59, 45, 12, 48, 50, 23, 54, 31]
train loss: 24.17| valid loss: 23.98



  0%|          | 1/208 [00:01<06:12,  1.80s/it]

loss: 23.959232330322266
isprooeass
[23, 57, 13, 53, 59, 13, 17, 3, 14, 57]
phynsiabal
[13, 26, 58, 42, 46, 23, 3, 35, 3, 4]
dnsopaller
[33, 1, 57, 54, 13, 3, 42, 46, 17, 10]
Cedathomos
[29, 17, 10, 59, 12, 43, 54, 20, 54, 14]
satinthtyp
[21, 59, 12, 23, 31, 12, 43, 29, 58, 13]


 25%|██▍       | 51/208 [00:58<02:53,  1.10s/it]

loss: 23.840450286865234
bleolafyio
[29, 53, 59, 12, 53, 59, 45, 48, 23, 54]
tomlttekyo
[29, 54, 20, 23, 12, 43, 27, 55, 48, 54]
ntenmfioni
[57, 12, 17, 31, 0, 24, 23, 54, 46, 23]
rymbatiath
[29, 58, 22, 35, 3, 12, 23, 3, 45, 43]
Dilershorr
[44, 23, 46, 17, 14, 57, 26, 54, 13, 53]


 49%|████▊     | 101/208 [01:53<01:54,  1.07s/it]

loss: 24.063505172729492
sesmortarl
[21, 59, 57, 20, 54, 10, 12, 3, 10, 46]
spheantodk
[57, 13, 26, 17, 3, 31, 12, 54, 10, 6]
oderambict
[27, 6, 17, 10, 3, 22, 35, 23, 45, 12]
abeniteral
[3, 42, 3, 31, 23, 12, 17, 10, 3, 42]
lotictendr
[46, 59, 12, 23, 45, 12, 17, 31, 37, 53]


 73%|███████▎  | 151/208 [02:50<01:11,  1.25s/it]

loss: 23.460346221923828
aneteususo
[27, 6, 3, 12, 17, 14, 57, 25, 57, 27]
lombarereo
[44, 59, 22, 38, 3, 10, 17, 31, 17, 27]
azeryaraci
[27, 6, 17, 31, 48, 27, 10, 3, 12, 23]
droitertil
[29, 53, 59, 59, 12, 3, 10, 12, 23, 46]
hidangasat
[26, 23, 6, 3, 31, 37, 3, 57, 3, 12]


 97%|█████████▋| 201/208 [03:44<00:07,  1.11s/it]

loss: 23.52913475036621
eubuiriodi
[11, 25, 35, 25, 1, 53, 23, 54, 21, 23]
narscoglyg
[44, 3, 14, 57, 13, 59, 37, 4, 48, 37]
ceprogedat
[44, 59, 13, 53, 59, 37, 17, 10, 3, 12]
sicatanzad
[57, 23, 6, 3, 46, 3, 31, 6, 3, 45]
matallylom
[20, 3, 12, 3, 42, 4, 48, 46, 59, 20]


100%|██████████| 208/208 [03:52<00:00,  1.12s/it]


loss: 23.549068450927734
weleyndoom
[44, 59, 46, 59, 58, 31, 37, 54, 27, 22]
train loss: 23.81| valid loss: 23.75



  0%|          | 1/208 [00:01<05:39,  1.64s/it]

loss: 23.972728729248047
jidlisteta
[44, 3, 10, 46, 59, 50, 12, 17, 12, 3]
Uujitleplo
[56, 25, 12, 23, 12, 53, 59, 13, 46, 59]
strobrooce
[57, 12, 53, 59, 35, 53, 59, 27, 6, 17]
pranaminee
[29, 53, 3, 31, 59, 22, 23, 31, 17, 3]
ergatorate
[17, 10, 37, 3, 12, 54, 10, 3, 12, 17]


 25%|██▍       | 51/208 [00:59<02:53,  1.11s/it]

loss: 23.41946792602539
esqurmadth
[59, 57, 16, 25, 10, 20, 3, 10, 12, 43]
Creaindlya
[29, 53, 59, 3, 23, 31, 37, 4, 48, 3]
pogifivact
[29, 54, 37, 23, 32, 23, 6, 3, 45, 12]
cedawabcal
[6, 17, 21, 59, 20, 3, 42, 6, 3, 42]
ptNrganoco
[29, 12, 54, 10, 37, 3, 31, 59, 13, 54]


 49%|████▊     | 101/208 [01:54<01:57,  1.10s/it]

loss: 23.652713775634766
tivisuriph
[29, 23, 6, 23, 57, 25, 10, 23, 13, 26]
gecalenzae
[44, 59, 6, 3, 46, 17, 31, 6, 3, 17]
bedapisirs
[44, 17, 21, 59, 13, 23, 57, 59, 14, 57]
cebolicion
[44, 59, 2, 54, 46, 23, 6, 23, 54, 31]
Ceromniphe
[44, 3, 10, 59, 20, 31, 23, 13, 26, 17]


 73%|███████▎  | 151/208 [02:50<01:06,  1.16s/it]

loss: 23.420581817626953
terWedonil
[29, 17, 10, 38, 17, 21, 59, 6, 23, 46]
domaumpolu
[21, 59, 21, 59, 33, 22, 38, 54, 42, 25]
trysisenty
[29, 53, 48, 50, 23, 57, 17, 31, 12, 58]
uncerostia
[33, 31, 6, 17, 10, 59, 57, 12, 23, 3]
foniloarti
[29, 54, 31, 23, 46, 59, 3, 31, 12, 23]


 97%|█████████▋| 201/208 [03:46<00:07,  1.11s/it]

loss: 23.195655822753906
ogencedice
[27, 10, 17, 31, 12, 17, 21, 23, 6, 3]
balidersot
[29, 3, 42, 23, 6, 17, 10, 57, 54, 12]
fosktisecu
[29, 54, 57, 55, 12, 23, 57, 17, 45, 25]
ungatyosun
[33, 31, 37, 3, 12, 58, 54, 57, 25, 31]
lonymirnto
[44, 54, 4, 48, 20, 3, 10, 31, 12, 54]


100%|██████████| 208/208 [03:53<00:00,  1.12s/it]


loss: 23.766984939575195
gontesocti
[29, 54, 31, 12, 17, 21, 59, 45, 12, 3]
train loss: 23.64| valid loss: 23.63



  0%|          | 1/208 [00:01<05:53,  1.71s/it]

loss: 23.7034969329834
unthderple
[33, 31, 12, 43, 37, 3, 10, 13, 53, 59]
finencermi
[32, 23, 6, 17, 31, 12, 17, 10, 20, 23]
uwformelen
[33, 1, 32, 54, 10, 20, 17, 6, 17, 31]
Sebitoulir
[44, 59, 35, 23, 12, 54, 25, 42, 3, 10]
ochetresio
[27, 45, 43, 17, 12, 53, 59, 50, 23, 54]


 25%|██▍       | 51/208 [00:59<03:12,  1.23s/it]

loss: 23.762008666992188
uciicivugo
[25, 45, 12, 23, 6, 23, 6, 3, 37, 54]
chatlinent
[13, 43, 3, 12, 53, 23, 6, 17, 31, 12]
jooninUubu
[18, 54, 27, 6, 23, 31, 52, 25, 35, 25]
fMlnemiarg
[44, 54, 42, 6, 17, 20, 23, 3, 10, 37]
vinkidoyes
[44, 59, 1, 55, 23, 6, 54, 34, 63, 57]


 49%|████▊     | 101/208 [01:59<02:14,  1.25s/it]

loss: 23.890756607055664
opmitiamyf
[27, 22, 20, 3, 12, 23, 3, 22, 58, 8]
Shngiateat
[29, 43, 31, 37, 23, 3, 12, 17, 14, 12]
wieglymbit
[44, 23, 54, 37, 4, 58, 22, 35, 23, 12]
moitererec
[44, 59, 3, 12, 17, 53, 59, 53, 59, 13]
baeroustoc
[44, 59, 17, 10, 52, 25, 57, 12, 59, 13]


 73%|███████▎  | 151/208 [02:56<01:02,  1.09s/it]

loss: 23.50855255126953
hydDelyres
[41, 48, 37, 46, 17, 4, 58, 53, 59, 57]
unceringri
[33, 1, 13, 3, 10, 23, 31, 37, 53, 59]
pesivesamb
[29, 3, 50, 23, 6, 17, 21, 59, 22, 35]
palicatera
[29, 3, 42, 23, 6, 3, 12, 17, 10, 3]
candroGusa
[44, 3, 31, 37, 53, 59, 56, 25, 57, 59]


 97%|█████████▋| 201/208 [03:53<00:08,  1.17s/it]

loss: 23.581403732299805
Cesscracka
[44, 59, 14, 57, 13, 53, 59, 45, 55, 3]
nellispive
[44, 3, 42, 4, 23, 57, 13, 23, 6, 17]
unwanerate
[33, 1, 15, 3, 31, 3, 10, 3, 12, 17]
maryinwlic
[38, 3, 10, 48, 3, 31, 37, 4, 23, 6]
phurmiatio
[29, 43, 25, 10, 20, 23, 3, 12, 23, 54]


100%|██████████| 208/208 [03:59<00:00,  1.15s/it]


loss: 23.552959442138672
phetifkedg
[13, 26, 59, 12, 23, 45, 55, 17, 10, 37]
train loss: 23.54| valid loss: 23.56



  0%|          | 1/208 [00:01<05:17,  1.54s/it]

loss: 23.662261962890625
unersestob
[33, 31, 17, 10, 53, 59, 50, 12, 54, 24]
exmapiener
[27, 10, 20, 3, 22, 23, 3, 31, 17, 10]
scicerenhy
[57, 6, 23, 6, 17, 10, 59, 1, 26, 58]
frottollis
[29, 53, 59, 45, 12, 54, 4, 53, 59, 14]
cochtatrap
[44, 54, 13, 26, 12, 3, 12, 53, 59, 13]


 25%|██▍       | 51/208 [00:57<02:51,  1.09s/it]

loss: 23.81232261657715
phonaratfe
[13, 43, 54, 31, 3, 10, 3, 12, 32, 17]
unassebriv
[33, 31, 59, 14, 57, 3, 35, 53, 23, 6]
roloindawe
[13, 54, 53, 59, 33, 1, 21, 59, 20, 17]
tabinaneme
[29, 3, 42, 23, 31, 3, 31, 59, 20, 17]
undicilleo
[33, 1, 21, 23, 6, 23, 42, 46, 17, 54]


 49%|████▊     | 101/208 [01:56<02:02,  1.14s/it]

loss: 23.58521842956543
yenzemeuss
[44, 17, 31, 6, 17, 21, 59, 59, 14, 57]
poniigrons
[13, 54, 31, 23, 3, 37, 53, 59, 1, 57]
satillaeus
[57, 3, 12, 3, 42, 46, 59, 59, 25, 57]
apintymycy
[27, 22, 23, 31, 12, 48, 41, 48, 13, 58]
loJstismul
[44, 54, 10, 57, 12, 23, 57, 16, 25, 42]


 73%|███████▎  | 151/208 [02:54<01:04,  1.13s/it]

loss: 23.516035079956055
punoxtiscr
[29, 33, 31, 59, 45, 12, 23, 57, 12, 53]
armulisgus
[27, 10, 20, 25, 42, 23, 50, 11, 25, 57]
koouyottia
[44, 54, 54, 33, 51, 59, 45, 12, 23, 3]
pholdlanen
[13, 43, 54, 42, 21, 4, 3, 31, 3, 31]
braxmuesse
[29, 53, 59, 22, 20, 25, 17, 14, 57, 59]


 97%|█████████▋| 201/208 [03:51<00:08,  1.16s/it]

loss: 23.170021057128906
mutachazom
[11, 25, 12, 3, 13, 26, 3, 9, 54, 20]
suwmanegde
[11, 25, 39, 19, 3, 31, 17, 39, 37, 17]
unthuseral
[33, 1, 12, 43, 25, 57, 3, 10, 3, 42]
uncroctoct
[33, 1, 13, 53, 59, 45, 12, 59, 45, 12]
sueatodute
[11, 25, 61, 3, 12, 59, 37, 25, 12, 59]


100%|██████████| 208/208 [03:59<00:00,  1.15s/it]


loss: 23.253570556640625
umpinidali
[33, 22, 38, 23, 6, 23, 6, 3, 42, 23]
train loss: 23.47| valid loss: 23.47



  0%|          | 1/208 [00:01<06:18,  1.83s/it]

loss: 23.13640785217285
ruksondiar
[53, 59, 55, 57, 59, 31, 37, 23, 3, 10]
rutybonten
[53, 59, 47, 48, 2, 54, 31, 12, 17, 31]
stivintita
[57, 12, 23, 6, 23, 31, 12, 23, 12, 3]
torcaryodc
[29, 54, 10, 6, 3, 10, 48, 54, 10, 12]
aflydisori
[27, 32, 4, 48, 37, 23, 26, 54, 10, 23]


 25%|██▍       | 51/208 [00:59<02:59,  1.14s/it]

loss: 23.756420135498047
disicideng
[21, 23, 50, 23, 6, 23, 6, 17, 31, 37]
tryshySoae
[29, 53, 48, 50, 26, 58, 22, 59, 3, 17]
holyshonbr
[44, 54, 4, 48, 50, 26, 54, 1, 35, 53]
asperidotu
[27, 57, 38, 17, 10, 23, 6, 54, 12, 25]
elobessisp
[29, 53, 59, 35, 17, 14, 57, 23, 57, 13]


 49%|████▊     | 101/208 [01:56<02:10,  1.22s/it]

loss: 22.97817611694336
storacatid
[57, 12, 54, 53, 59, 6, 3, 12, 23, 6]
foppericyW
[32, 54, 22, 38, 17, 10, 23, 6, 48, 36]
perilychen
[29, 17, 10, 23, 4, 58, 13, 43, 17, 31]
vliarecoon
[20, 4, 23, 3, 10, 59, 6, 54, 54, 1]
uregetegib
[33, 53, 59, 39, 17, 12, 17, 37, 3, 49]


 73%|███████▎  | 151/208 [02:57<01:14,  1.30s/it]

loss: 23.715045928955078
kseeckearc
[29, 53, 59, 59, 45, 55, 59, 3, 10, 13]
sessperchy
[57, 17, 14, 57, 38, 17, 14, 12, 43, 58]
caropthano
[44, 3, 10, 59, 45, 12, 43, 3, 31, 54]
unvablomil
[33, 1, 6, 3, 49, 46, 59, 20, 23, 42]
cytheddoct
[41, 48, 12, 43, 17, 10, 21, 59, 45, 12]


 97%|█████████▋| 201/208 [03:54<00:07,  1.09s/it]

loss: 23.258872985839844
infionalle
[33, 1, 32, 23, 54, 31, 3, 42, 46, 17]
Nususachnc
[11, 25, 57, 59, 57, 59, 45, 43, 31, 12]
Brichbotie
[29, 53, 23, 13, 26, 2, 54, 12, 23, 27]
unphysaloo
[33, 1, 13, 26, 48, 57, 3, 9, 54, 54]
woledtical
[44, 59, 46, 17, 14, 12, 23, 6, 3, 42]


100%|██████████| 208/208 [04:02<00:00,  1.16s/it]


loss: 23.418441772460938
parcecicea
[29, 3, 10, 6, 17, 12, 23, 6, 17, 3]
train loss: 23.39| valid loss: 23.40



  0%|          | 1/208 [00:01<05:57,  1.73s/it]

loss: 23.284826278686523
sclewncuin
[57, 13, 46, 59, 39, 1, 13, 25, 3, 31]
asewishyth
[27, 57, 59, 20, 23, 50, 26, 58, 12, 43]
helishondl
[44, 59, 42, 23, 50, 26, 54, 31, 37, 46]
talliteeti
[29, 3, 42, 4, 23, 6, 17, 59, 45, 23]
cologarrae
[13, 54, 9, 54, 37, 3, 10, 53, 59, 3]


 25%|██▍       | 51/208 [00:59<03:25,  1.31s/it]

loss: 23.550235748291016
kncomcater
[33, 1, 13, 54, 22, 13, 3, 12, 3, 10]
nepesiolde
[44, 59, 38, 59, 57, 23, 54, 42, 21, 17]
mancosesie
[44, 3, 31, 12, 54, 57, 17, 14, 23, 3]
pywondengy
[29, 34, 15, 54, 31, 37, 17, 31, 37, 58]
pateniorht
[29, 3, 12, 17, 31, 23, 54, 13, 26, 12]


 49%|████▊     | 101/208 [01:56<01:56,  1.09s/it]

loss: 23.42797088623047
sicuendess
[57, 23, 16, 25, 3, 31, 6, 17, 14, 57]
amizaeanal
[27, 22, 23, 6, 3, 17, 3, 31, 3, 42]
mecclyserl
[20, 17, 45, 12, 4, 48, 57, 17, 10, 4]
opicentito
[27, 24, 23, 13, 3, 31, 12, 23, 12, 54]
manchlerri
[44, 59, 1, 13, 43, 46, 17, 10, 53, 23]


 73%|███████▎  | 151/208 [02:55<01:04,  1.14s/it]

loss: 23.09023094177246
feourialal
[32, 17, 52, 25, 10, 23, 3, 42, 3, 42]
undrickali
[33, 1, 21, 53, 59, 45, 55, 3, 42, 3]
ampecterde
[27, 22, 38, 17, 45, 12, 17, 10, 37, 17]
anengyalyp
[27, 6, 17, 31, 37, 34, 3, 4, 48, 13]
unbrochris
[33, 1, 29, 53, 59, 45, 43, 53, 23, 50]


 97%|█████████▋| 201/208 [03:54<00:08,  1.27s/it]

loss: 23.263418197631836
sectarratj
[57, 59, 45, 12, 3, 10, 53, 59, 12, 17]
culemoloma
[29, 25, 42, 59, 20, 52, 9, 54, 61, 3]
ddieterdan
[21, 37, 23, 17, 12, 17, 10, 21, 59, 1]
ceponilele
[44, 59, 13, 54, 31, 23, 46, 17, 46, 17]
umprochrem
[33, 22, 38, 53, 59, 13, 43, 53, 59, 20]


100%|██████████| 208/208 [04:01<00:00,  1.16s/it]


loss: 23.31370735168457
dessistrot
[21, 17, 14, 57, 23, 50, 12, 53, 59, 45]
train loss: 23.34| valid loss: 23.37



  0%|          | 1/208 [00:01<05:51,  1.70s/it]

loss: 23.31403923034668
semmiogerk
[57, 59, 22, 20, 23, 54, 37, 17, 10, 55]
disiutenta
[21, 23, 21, 23, 50, 12, 3, 31, 12, 3]
mathistrie
[20, 3, 12, 43, 23, 57, 12, 53, 23, 17]
taldisherr
[29, 3, 42, 21, 23, 50, 43, 17, 10, 53]
unmpuscsuc
[33, 1, 22, 38, 25, 57, 13, 26, 3, 12]


 25%|██▍       | 51/208 [00:57<02:51,  1.09s/it]

loss: 23.501750946044922
itcstindro
[27, 7, 13, 26, 12, 23, 31, 37, 53, 59]
aduryolyie
[3, 6, 17, 10, 51, 54, 4, 48, 23, 17]
ivistolerr
[27, 20, 23, 50, 12, 54, 46, 17, 10, 53]
tearyolale
[29, 17, 3, 10, 48, 54, 4, 3, 42, 3]
chatypabbi
[13, 43, 59, 47, 48, 38, 3, 49, 49, 23]


 49%|████▊     | 101/208 [01:54<01:59,  1.12s/it]

loss: 23.306293487548828
ondifonbat
[27, 31, 37, 23, 8, 54, 1, 0, 3, 12]
opisefenci
[27, 24, 23, 57, 59, 32, 17, 31, 6, 3]
aviatialil
[27, 6, 23, 3, 12, 23, 3, 42, 23, 4]
Fransmarip
[29, 53, 59, 31, 50, 61, 3, 10, 23, 13]
sueaiftycm
[11, 25, 17, 3, 23, 32, 47, 48, 12, 19]


 73%|███████▎  | 151/208 [02:52<01:11,  1.25s/it]

loss: 23.307828903198242
sophishybo
[57, 59, 13, 26, 23, 50, 26, 5, 44, 54]
TabnamiBre
[44, 3, 49, 46, 59, 20, 23, 29, 53, 59]
cygavenanf
[41, 48, 37, 59, 20, 17, 46, 59, 1, 32]
dinaphedis
[21, 23, 31, 3, 13, 43, 17, 21, 23, 50]
kullerdote
[44, 25, 42, 46, 17, 10, 21, 59, 12, 17]


 97%|█████████▋| 201/208 [03:51<00:08,  1.20s/it]

loss: 23.562471389770508
vulanilyti
[11, 25, 42, 3, 31, 23, 4, 48, 12, 23]
plapitinab
[29, 53, 59, 24, 23, 6, 23, 6, 3, 49]
poveresist
[29, 59, 20, 17, 10, 17, 14, 23, 50, 12]
mopemomeib
[44, 54, 38, 17, 61, 54, 19, 3, 23, 49]
unsobresly
[33, 31, 57, 59, 35, 53, 59, 57, 4, 48]


100%|██████████| 208/208 [03:58<00:00,  1.15s/it]


loss: 23.381881713867188
nullerrome
[44, 25, 42, 46, 17, 10, 53, 59, 20, 17]
train loss: 23.31| valid loss: 23.35



In [26]:
x = torch.tensor(encode("quack")).unsqueeze(0)
T = torch.tensor([5])
print(model.viterbi(x,T))

x = torch.tensor(encode("quick")).unsqueeze(0)
T = torch.tensor([5])
print(model.viterbi(x,T))

x = torch.tensor(encode("qurck")).unsqueeze(0)
T = torch.tensor([5])
print(model.viterbi(x,T)) # should have lower probability---in English only vowels follow "qu"

x = torch.tensor(encode("qiick")).unsqueeze(0)
T = torch.tensor([5])
print(model.viterbi(x,T)) # should have lower probability---in English only "u" follows "q"


([[56, 25, 3, 45, 55]], tensor([[-15.4674]], grad_fn=<GatherBackward0>))
([[56, 25, 23, 45, 55]], tensor([[-15.5659]], grad_fn=<GatherBackward0>))
([[56, 25, 10, 45, 55]], tensor([[-16.9567]], grad_fn=<GatherBackward0>))
([[56, 25, 23, 45, 55]], tensor([[-20.4935]], grad_fn=<GatherBackward0>))


Question-2


In [27]:
!pip install python-crfsuite
!pip install nltk
!pip install lxml

Collecting python-crfsuite
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite
Successfully installed python-crfsuite-0.9.9


In [28]:
from bs4 import BeautifulSoup as bs
from bs4.element import Tag
import codecs

# Read data file and parse the XML
with codecs.open("reuters.xml", "r", "utf-8") as infile:
    soup = bs(infile, "html5lib")

docs = []
for elem in soup.find_all("document"):
    texts = []

    # Loop through each child of the element under "textwithnamedentities"
    for c in elem.find("textwithnamedentities").children:
        if type(c) == Tag:
            if c.name == "namedentityintext":
                label = "N"  # part of a named entity
            else:
                label = "I"  # irrelevant word
            for w in c.text.split(" "):
                if len(w) > 0:
                    texts.append((w, label))
    docs.append(texts)

docs[0]



[('Paxar', 'N'),
 ('Corp', 'N'),
 ('said', 'I'),
 ('it', 'I'),
 ('has', 'I'),
 ('acquired', 'I'),
 ('Thermo-Print', 'N'),
 ('GmbH', 'N'),
 ('of', 'I'),
 ('Lohn', 'N'),
 (',', 'I'),
 ('West', 'N'),
 ('Germany', 'N'),
 (',', 'I'),
 ('a', 'I'),
 ('distributor', 'I'),
 ('of', 'I'),
 ('Paxar', 'N'),
 ('products,', 'I'),
 ('for', 'I'),
 ('undisclosed', 'I'),
 ('terms.', 'I')]

In [32]:
import nltk
nltk.download('averaged_perceptron_tagger')
data = []
for i, doc in enumerate(docs):

    # Obtain the list of tokens in the document
    tokens = [t for t, label in doc]

    # Perform POS tagging
    tagged = nltk.pos_tag(tokens)

    # Take the word, POS tag, and its label
    data.append([(w, pos, label) for (w, label), (word, pos) in zip(doc, tagged)])
data[0]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('Paxar', 'NNP', 'N'),
 ('Corp', 'NNP', 'N'),
 ('said', 'VBD', 'I'),
 ('it', 'PRP', 'I'),
 ('has', 'VBZ', 'I'),
 ('acquired', 'VBN', 'I'),
 ('Thermo-Print', 'NNP', 'N'),
 ('GmbH', 'NNP', 'N'),
 ('of', 'IN', 'I'),
 ('Lohn', 'NNP', 'N'),
 (',', ',', 'I'),
 ('West', 'NNP', 'N'),
 ('Germany', 'NNP', 'N'),
 (',', ',', 'I'),
 ('a', 'DT', 'I'),
 ('distributor', 'NN', 'I'),
 ('of', 'IN', 'I'),
 ('Paxar', 'NNP', 'N'),
 ('products,', 'NN', 'I'),
 ('for', 'IN', 'I'),
 ('undisclosed', 'JJ', 'I'),
 ('terms.', 'NN', 'I')]

In [33]:
def word2features(doc, i):
    word = doc[i][0]
    postag = doc[i][1]

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1][0]
        postag1 = doc[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1][0]
        postag1 = doc[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

In [34]:
from sklearn.model_selection import train_test_split

# A function for extracting features in documents
def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

# A function fo generating the list of labels for each document
def get_labels(doc):
    return [label for (token, postag, label) in doc]

X = [extract_features(doc) for doc in data]
y = [get_labels(doc) for doc in data]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [35]:
import pycrfsuite
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13091
Seconds required: 0.038

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5643.111901
Feature norm: 1.000000
Error norm: 6238.602301
Active features: 12623
Line search trials: 1
Line search step: 0.000045
Seconds required for this iteration: 0.011

***** Iteration #2 *****
Loss: 4485.985804
Feature norm: 0.836919
Error norm: 5574.039070
Active features: 12729
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.026

***** Iteration #3 *****
Loss: 4186.083465
Feature norm: 0.781478
Error norm: 12942.343330
Active features: 8624
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

In [36]:
tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]

# Let's take a look at a random sample in the testing set
i = 12
for x, y in zip(y_pred[i], [x[1].split("=")[1] for x in X_test[i]]):
    print("%s (%s)" % (y, x))


toronto (N)
dominion (N)
bank (N)
, (I)
nassau (I)
branch (I)
is (I)
issuing (I)
a (I)
40 (I)
mln (I)
australian (I)
dlr (I)
eurobond (I)
due (I)
may (I)
15, (I)
1990 (I)
paying (I)
14-1/2 (I)
pct (I)
and (I)
priced (I)
at (I)
101-3/8 (I)
pct, (I)
lead (I)
manager (I)
hambros (N)
bank (N)
ltd (N)
said. (I)
the (I)
non-callable (I)
bond (I)
is (I)
available (I)
in (I)
denominations (I)
of (I)
1,000 (I)
australian (I)
dlrs (I)
and (I)
will (I)
be (I)
listed (I)
in (I)
london (N)
. (I)
the (I)
selling (I)
concession (I)
is (I)
one (I)
pct, (I)
while (I)
management (I)
and (I)
underwriting (I)
combined (I)
will (I)
pay (I)
1/2 (I)
pct. (I)
the (I)
payment (I)
date (I)
is (I)
may (I)
15. (I)


In [37]:
import numpy as np
from sklearn.metrics import classification_report

# Create a mapping of labels to indices
labels = {"N": 1, "I": 0}

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

              precision    recall  f1-score   support

           I       0.99      0.99      0.99      3594
           N       0.86      0.89      0.87       359

    accuracy                           0.98      3953
   macro avg       0.93      0.94      0.93      3953
weighted avg       0.98      0.98      0.98      3953



In [39]:
#Hyperparameter set 1
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.01,

    # coefficient for L2 penalty
    'c2': 0.1,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13091
Seconds required: 0.089

L-BFGS optimization
c1: 0.010000
c2: 0.100000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5642.467322
Feature norm: 1.000000
Error norm: 6238.345616
Active features: 12623
Line search trials: 1
Line search step: 0.000045
Seconds required for this iteration: 0.018

***** Iteration #2 *****
Loss: 4485.423174
Feature norm: 0.836945
Error norm: 5574.046256
Active features: 12735
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.013

***** Iteration #3 *****
Loss: 4184.982324
Feature norm: 0.781692
Error norm: 12941.629873
Active features: 12402
Line search trials: 2
Line search step: 0.500000
Seconds required for this

In [40]:
#Report 1

tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]


# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

              precision    recall  f1-score   support

           I       0.99      0.98      0.99      3594
           N       0.85      0.88      0.87       359

    accuracy                           0.98      3953
   macro avg       0.92      0.93      0.93      3953
weighted avg       0.98      0.98      0.98      3953



In [41]:
#Hyperparameter set 2
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.001,

    # coefficient for L2 penalty
    'c2': 0.0001,

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13091
Seconds required: 0.089

L-BFGS optimization
c1: 0.001000
c2: 0.000100
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5642.293400
Feature norm: 1.000000
Error norm: 6238.145752
Active features: 12623
Line search trials: 1
Line search step: 0.000045
Seconds required for this iteration: 0.019

***** Iteration #2 *****
Loss: 4485.314566
Feature norm: 0.836954
Error norm: 5573.953447
Active features: 12735
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.012

***** Iteration #3 *****
Loss: 4184.815590
Feature norm: 0.781779
Error norm: 12941.838656
Active features: 12402
Line search trials: 2
Line search step: 0.500000
Seconds required for this

In [42]:
#Report 2

tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]


# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

              precision    recall  f1-score   support

           I       0.99      0.99      0.99      3594
           N       0.87      0.88      0.87       359

    accuracy                           0.98      3953
   macro avg       0.93      0.93      0.93      3953
weighted avg       0.98      0.98      0.98      3953



In [43]:
#Hyperparameter set 3
trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.01,

    # coefficient for L2 penalty
    'c2': 0.001,

    # maximum number of iterations
    'max_iterations': 50,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf.model')


Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13091
Seconds required: 0.095

L-BFGS optimization
c1: 0.010000
c2: 0.001000
num_memories: 6
max_iterations: 50
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 5642.368322
Feature norm: 1.000000
Error norm: 6238.186430
Active features: 12623
Line search trials: 1
Line search step: 0.000045
Seconds required for this iteration: 0.019

***** Iteration #2 *****
Loss: 4485.376232
Feature norm: 0.836951
Error norm: 5573.960727
Active features: 12735
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 0.010

***** Iteration #3 *****
Loss: 4184.959454
Feature norm: 0.781749
Error norm: 12941.979788
Active features: 12402
Line search trials: 2
Line search step: 0.500000
Seconds required for this 

In [44]:
#Report 3

tagger = pycrfsuite.Tagger()
tagger.open('crf.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]


# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    target_names=["I", "N"]))

              precision    recall  f1-score   support

           I       0.99      0.99      0.99      3594
           N       0.90      0.86      0.88       359

    accuracy                           0.98      3953
   macro avg       0.94      0.93      0.93      3953
weighted avg       0.98      0.98      0.98      3953

