In [28]:
# import packages
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn.functional as F


In [2]:
# read data
words = open("../names.txt", "r").read().splitlines()

- bigram: refers to only 2 characters
- bigram based NN: takes the first character to predict the next character (two characters only)


In [3]:
b = {}

for word in words:
    # add beginning and end token
    chs = ["."] + list(word) + ["."]
    for ch1, ch2 in zip(chs, chs[1:]):
        # count the numbers of unique bigrams that occur
        # and store them in a dict
        bigram = (ch1, ch2)
        b[bigram] = b.get(bigram, 0) + 1
        
# sort by the number of occurring bigrams
sorted_b = sorted(b.items(), key = lambda kv: -kv[1])

In [4]:
# create a dict to map the character with the position/idx in 2d array
chars = sorted(list(set("".join(words))))
stoi = {ch:i +1 for i, ch in enumerate(chars)}
stoi["."] = 0

# create array to store the bigrams
N = torch.zeros((27, 27), dtype=torch.int32)

for word in words:
    chs = ["."] + list(word) + ["."]
    for ch1, ch2 in zip(chs, chs[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        N[idx1, idx2] += 1

# reverse the idx:ch dict
itos = {b:a for a, b in stoi.items()}

In [None]:
P = N.float()
P /= P.sum(dim=1, keepdim=True)
g = torch.Generator().manual_seed(121241)

for i in range(10): # create 10 names
    name = []
    ix = 0
    while True:
        p = P[ix]
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        ch = itos[ix] 
        name.append(ch)

        if ix == 0:
            break
    print("".join(name))

telicalanaa.
ses.
dyaraistonainunda.
pellio.
tele.
mandinnliexaiali.
rlyizermian.
kheon.
garai.
asar.


In [23]:
agg_prob = 0
for word in ["tung"]:
    chs = ["."] + list(word) + ["."]

    for ch1, ch2 in zip(chs, chs[1:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]

        p = P[ix1, ix2]
        agg_prob =+ -torch.log(p)
        print(f"{ch1}{ch2} {torch.log(p)}")

    print(agg_prob)

.t -3.1982674598693848
tu -4.268441677093506
un -2.4336133003234863
ng -4.206658840179443
g. -2.8815884590148926
tensor(2.8816)


In [29]:
# create a train and test dataset
xs, ys = [], []
for word in words[:1]:
    chs = ["."] + list(word) + ["."]

    for ch1, ch2 in zip(chs, chs[1:]):

        ix1 = stoi[ch1]
        ix2 = stoi[ch2]

        xs.append(ix1)
        ys.append(ix2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

# encode xs and ys
x_enc = F.one_hot(xs, num_classes=27).float()
y_enc = F.one_hot(ys, num_classes=27).float()

In [49]:
W = torch.randn((27, 27))

logits = x_enc @ W
counts = logits.exp()
probs = counts / counts.sum(axis=1, keepdim=True)

In [50]:
probs

tensor([[0.0149, 0.0154, 0.0646, 0.0288, 0.0021, 0.0293, 0.0083, 0.0071, 0.0151,
         0.0368, 0.0317, 0.0095, 0.0135, 0.0768, 0.0172, 0.0203, 0.0101, 0.0177,
         0.0872, 0.1742, 0.0206, 0.0077, 0.0040, 0.1868, 0.0527, 0.0405, 0.0071],
        [0.0564, 0.0615, 0.0528, 0.0242, 0.0069, 0.0537, 0.0838, 0.0096, 0.0556,
         0.0205, 0.0136, 0.0018, 0.0124, 0.0972, 0.0321, 0.0398, 0.0664, 0.0175,
         0.0475, 0.0160, 0.0735, 0.0803, 0.0255, 0.0070, 0.0070, 0.0135, 0.0239],
        [0.0637, 0.0046, 0.0070, 0.0102, 0.0151, 0.0445, 0.0316, 0.0046, 0.0268,
         0.0231, 0.0708, 0.0053, 0.0268, 0.1540, 0.2158, 0.0132, 0.0362, 0.0047,
         0.0562, 0.0360, 0.0538, 0.0320, 0.0063, 0.0276, 0.0104, 0.0073, 0.0126],
        [0.0637, 0.0046, 0.0070, 0.0102, 0.0151, 0.0445, 0.0316, 0.0046, 0.0268,
         0.0231, 0.0708, 0.0053, 0.0268, 0.1540, 0.2158, 0.0132, 0.0362, 0.0047,
         0.0562, 0.0360, 0.0538, 0.0320, 0.0063, 0.0276, 0.0104, 0.0073, 0.0126],
        [0.0391, 0.0125,