In [1]:
names = open('/content/drive/MyDrive/LM/names.txt', 'r').read().splitlines()

## Exercise 1: Trigram LM

To learn statistics, we can either use counts or a neural network. To get trigram counts:

In [2]:
counts = {}
for n in names:
  chs = ['.']+list(n)+['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    trigram = (ch1, ch2, ch3)
    counts[trigram] = counts.get(trigram, 0) + 1

In [3]:
sorted(counts.items(), key=lambda kv: -kv[1])

[(('a', 'h', '.'), 1714),
 (('n', 'a', '.'), 1673),
 (('a', 'n', '.'), 1509),
 (('o', 'n', '.'), 1503),
 (('.', 'm', 'a'), 1453),
 (('.', 'j', 'a'), 1255),
 (('.', 'k', 'a'), 1254),
 (('e', 'n', '.'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '.'), 953),
 (('a', 'r', 'i'), 950),
 (('i', 'a', '.'), 903),
 (('i', 'e', '.'), 858),
 (('a', 'n', 'n'), 825),
 (('e', 'l', 'l'), 822),
 (('a', 'n', 'a'), 804),
 (('i', 'a', 'n'), 790),
 (('m', 'a', 'r'), 776),
 (('i', 'n', '.'), 766),
 (('e', 'l', '.'), 727),
 (('y', 'a', '.'), 716),
 (('a', 'n', 'i'), 703),
 (('.', 'd', 'a'), 700),
 (('l', 'a', '.'), 684),
 (('e', 'r', '.'), 683),
 (('i', 'y', 'a'), 669),
 (('l', 'a', 'n'), 647),
 (('.', 'b', 'r'), 646),
 (('n', 'n', 'a'), 633),
 (('.', 'a', 'l'), 632),
 (('.', 'c', 'a'), 628),
 (('r', 'a', '.'), 627),
 (('n', 'i', '.'), 625),
 (('.', 'a', 'n'), 623),
 (('n', 'n', '.'), 619),
 (('n', 'e', '.'), 607),
 (('e', 'e', '.'), 605),
 (('e', 'y', '.'), 602),
 (('.', 'k', 'e'), 601),
 (('a', 'l', 'e')

In [4]:
chs = sorted(list(set(''.join(names))))
chs.append('.') # ATTENTION: Because it is appended to the end, index for . token will be 26
len(chs)

In [5]:
ch2i = {ch:i for i, ch in enumerate(chs)}
i2ch = {i:ch for ch, i in ch2i.items()}

In the trigram case, entry $N[i_{1},i_{2},i_{3}]$ is the number of times the character indexed by $i_{3}$ follows the one with index $i_{2}$ that is followed by the one indexed with $i_{1}$.

In [9]:
import torch
N = torch.zeros((27, 27, 27), dtype=torch.int32)
# "training the model"
for n in names:
  chs = ['.']+list(n)+['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    i_1 = ch2i[ch1]
    i_2 = ch2i[ch2]
    i_3 = ch2i[ch3]
    N[i_1, i_2, i_3] += 1

In the trigram case, training the model means to estimate the probabilities $P(w_{i} | w_{i-1}, w_{i-2})$. Now we sample from this trigram character level language model, i.e. predict the next character given the previous two.

In [10]:
# sample third character in a name which starts with "a"
N[ch2i['.'], ch2i['a'], :]

tensor([207, 190,  31, 366,  55,  21,  17,  91, 154,  27,  75, 632, 384, 623,
         10,  17,   9, 482, 194,  72, 152, 243,   6,  27, 173, 152,   0],
       dtype=torch.int32)

In [11]:
# turn raw counts into probabilities
p = N[ch2i['.'],ch2i['a'],:].float()
p /= p.sum() # normalize
p, p.sum() # p is now a probability distribution

(tensor([0.0469, 0.0431, 0.0070, 0.0830, 0.0125, 0.0048, 0.0039, 0.0206, 0.0349,
         0.0061, 0.0170, 0.1433, 0.0871, 0.1413, 0.0023, 0.0039, 0.0020, 0.1093,
         0.0440, 0.0163, 0.0345, 0.0551, 0.0014, 0.0061, 0.0392, 0.0345, 0.0000]),
 tensor(1.0000))

In [12]:
ixs = torch.multinomial(p, num_samples=20, replacement=True) # you give me probs and I give you integers according to the probability distr
ixs

tensor([11, 17, 13, 24, 17,  8, 21,  0, 13, 23, 13, 18,  3, 12, 18, 11, 12, 17,
         0, 18])

Broadcasting: To sum over the third dimension of a 3D tensor, use `.sum(2, keepdim=True)`. `keepdim=True` keeps the dimension alog which the summation was made, we need this for broadcasting to work properly. `keepdim=False` would squueze out the dimension which sums are collapsed, resulting in a 2D tensor in our case, then by the broadcasting rules of PyTorch, these sums would be copied over the first axis instead of third.

In [18]:
P = N.float()
P /= P.sum(2, keepdim=True) # /= is an in-place op

In [20]:
# sampling from the trigram model
for i in range(10):
  ix = ch2i['.']
  ix1 = ch2i['e']
  out = [i2ch[ix],i2ch[ix1]]
  while True:
    p = P[ix,ix1]
    ix2 = torch.multinomial(p, num_samples=1, replacement=True).item()
    out.append(i2ch[ix2])
    if ix2 == ch2i['.']:
      break
    else:
      ix = ix1
      ix1 = ix2
  print(''.join(out))

.elleigh.
.emic.
.es.
.emeigh.
.elin.
.elyn.
.erreasie.
.evenox.
.elsethereighlynleeven.
.ed.


This model we trained decreases the entropy, just like the living. The results look like this with no training:

In [47]:
for i in range(10):
  ix = ch2i['.']
  ix1 = ch2i['e']
  out = [i2ch[ix],i2ch[ix1]]
  while True:
    p_even = torch.ones(27) / 27.0
    ix2 = torch.multinomial(p, num_samples=1, replacement=True).item()
    out.append(i2ch[ix2])
    if ix2 == ch2i['.']:
      break
    else:
      ix = ix1
      ix1 = ix2
  print(''.join(out))

.eimr.
.eoeeydodianriwryruoeaf.
.eaiyieer.
.eeyyy.
.e.
.enwyeh.
.eeia.
.eyfeyi.
.eayueea.
.e.


In [21]:
log_likelihood = 0.0
k = 0
for n in names:
  chs = ['.']+list(n)+['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    i_1 = ch2i[ch1]
    i_2 = ch2i[ch2]
    i_3 = ch2i[ch3]
    prob = P[i_1,i_2,i_3]
    logprob = torch.log(prob)
    log_likelihood += logprob
    k += 1
    # print(f'{ch1}{ch2}{ch3}: {prob:.4f} {logprob:.4f}')
print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}') # lower the better, a convenient loss metric
print(f'{nll/k}') # average nll

log_likelihood=tensor(-404377.3750)
nll=tensor(404377.3750)
2.0619611740112305


As we can observe from this metric, trigram lm is an improvement over the bigram lm.

### Neural network approach

Instead of counting, we will learn the counts array by optimizing a set of parameters.

In [58]:
# creating the trigram dataset for training
xs, ys = [], []
for n in names:
  chs = ['.']+list(n)+['.']
  for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
    i_1 = ch2i[ch1]
    i_2 = ch2i[ch2]
    i_3 = ch2i[ch3]
    xs.append([i_1, i_2])
    ys.append(i_3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

In [59]:
xs.shape

torch.Size([196113, 2])

In [29]:
import torch.nn.functional as F

In [60]:
W = torch.randn((54,27), requires_grad=True)

In [61]:
# gradient descent
for k in range(100):

  # forward pass
  xenc = F.one_hot(xs, num_classes=27).float().view((xs.shape[0],54))
  ## softmax: takes in the logits, exponentiates them and normalizes
  logits = xenc @ W # log-counts
  counts = logits.exp() # equivalent to N
  p = counts / counts.sum(1, keepdim=True)
  loss = -p[torch.arange(xs.shape[0]), ys].log().mean() # average nll
  print(loss.item())

  # backward pass
  W.grad = None
  loss.backward()
  W.grad.shape # influences of weights on the loss function

  # gradient update
  W.data += -50 * W.grad

4.3265485763549805
3.3755624294281006
3.03058123588562
2.8661606311798096
2.7555787563323975
2.679280996322632
2.6233694553375244
2.580578327178955
2.5463576316833496
2.518376111984253
2.4949629306793213
2.4750940799713135
2.457977771759033
2.4430863857269287
2.429999828338623
2.4184179306030273
2.408092737197876
2.398836374282837
2.3904902935028076
2.3829288482666016
2.3760452270507812
2.3697524070739746
2.3639755249023438
2.3586533069610596
2.3537333011627197
2.3491709232330322
2.34492826461792
2.340973377227783
2.337277412414551
2.3338165283203125
2.330569267272949
2.32751727104187
2.32464337348938
2.3219330310821533
2.319373369216919
2.3169517517089844
2.3146579265594482
2.3124821186065674
2.310415506362915
2.308450222015381
2.3065788745880127
2.3047945499420166
2.3030917644500732
2.3014652729034424
2.2999091148376465
2.29841947555542
2.296992063522339
2.2956228256225586
2.2943084239959717
2.29304575920105
2.2918310165405273
2.290663003921509
2.2895379066467285
2.2884535789489746
2

In [62]:
for i in range(5):
  print(f'trigram:{i2ch[xs[i][0].item()],i2ch[xs[i][1].item()],i2ch[ys[i].item()]}')
  print(f'output probability distribution given the input:{p[i]}')
  print(f'probability assigned to the actual label:{p[i,ys[i]]}')
  print(f'log likelihood:{torch.log(p[i,ys[i]])}')
  print(f'nll:{-torch.log(p[i,ys[i]])}')

trigram:('.', 'e', 'm')
output probability distribution given the input:tensor([0.0747, 0.0083, 0.0035, 0.0313, 0.0623, 0.0009, 0.0021, 0.0106, 0.0362,
        0.0011, 0.0042, 0.2528, 0.0682, 0.0683, 0.0256, 0.0019, 0.0007, 0.1676,
        0.0375, 0.0163, 0.0155, 0.0436, 0.0016, 0.0021, 0.0460, 0.0150, 0.0021],
       grad_fn=<SelectBackward0>)
probability assigned to the actual label:0.0682108923792839
log likelihood:-2.6851511001586914
nll:2.6851511001586914
trigram:('e', 'm', 'm')
output probability distribution given the input:tensor([0.2543, 0.0141, 0.0090, 0.0065, 0.0816, 0.0040, 0.0104, 0.0056, 0.1634,
        0.0035, 0.0025, 0.0050, 0.0492, 0.0040, 0.0477, 0.0074, 0.0016, 0.0168,
        0.0155, 0.0084, 0.0084, 0.0011, 0.0023, 0.0007, 0.0391, 0.0050, 0.2329],
       grad_fn=<SelectBackward0>)
probability assigned to the actual label:0.04917892441153526
log likelihood:-3.0122900009155273
nll:3.0122900009155273
trigram:('m', 'm', 'a')
output probability distribution given the inp

A good model assigns high probabilities to data in the training set. 

## Exercise 2: Training, developing, testing