# Opening and exploring data

In [33]:
import torch
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [1]:
!pwd

/home/william/language_modelling_andrej/intro_pytorch/Language_Modelling_intro/notebooks


In [2]:
words = open('../raw_data/names.txt','r').read().splitlines()

In [3]:
words[:3]

['emma', 'olivia', 'ava']

# Counting model
counting model would require a very long tensor due to the many possible combinations (27*26)
so for that reason i will use the neural network model

In [4]:
counts= {}
for word in words:
    word = list('.' + word + '.')
    for ch1, ch2, ch3 in zip(word, word[1:], word[2:]):
        trigram = (ch1, ch2, ch3)
        counts[trigram] = counts.get(trigram, 0) + 1
        

In [66]:
sorted(counts.items())

[(('.', 'a', 'a'), 207),
 (('.', 'a', 'b'), 190),
 (('.', 'a', 'c'), 31),
 (('.', 'a', 'd'), 366),
 (('.', 'a', 'e'), 55),
 (('.', 'a', 'f'), 21),
 (('.', 'a', 'g'), 17),
 (('.', 'a', 'h'), 91),
 (('.', 'a', 'i'), 154),
 (('.', 'a', 'j'), 27),
 (('.', 'a', 'k'), 75),
 (('.', 'a', 'l'), 632),
 (('.', 'a', 'm'), 384),
 (('.', 'a', 'n'), 623),
 (('.', 'a', 'o'), 10),
 (('.', 'a', 'p'), 17),
 (('.', 'a', 'q'), 9),
 (('.', 'a', 'r'), 482),
 (('.', 'a', 's'), 194),
 (('.', 'a', 't'), 72),
 (('.', 'a', 'u'), 152),
 (('.', 'a', 'v'), 243),
 (('.', 'a', 'w'), 6),
 (('.', 'a', 'x'), 27),
 (('.', 'a', 'y'), 173),
 (('.', 'a', 'z'), 152),
 (('.', 'b', 'a'), 169),
 (('.', 'b', 'e'), 253),
 (('.', 'b', 'h'), 9),
 (('.', 'b', 'i'), 41),
 (('.', 'b', 'j'), 1),
 (('.', 'b', 'l'), 85),
 (('.', 'b', 'o'), 77),
 (('.', 'b', 'r'), 646),
 (('.', 'b', 'u'), 21),
 (('.', 'b', 'y'), 4),
 (('.', 'c', 'a'), 628),
 (('.', 'c', 'e'), 65),
 (('.', 'c', 'h'), 352),
 (('.', 'c', 'i'), 44),
 (('.', 'c', 'j'), 2),
 (('

In [57]:
sorted(counts.items(),key= lambda x: x[1], reverse=True)

[(('a', 'h', '.'), 1714),
 (('n', 'a', '.'), 1673),
 (('a', 'n', '.'), 1509),
 (('o', 'n', '.'), 1503),
 (('.', 'm', 'a'), 1453),
 (('.', 'j', 'a'), 1255),
 (('.', 'k', 'a'), 1254),
 (('e', 'n', '.'), 1217),
 (('l', 'y', 'n'), 976),
 (('y', 'n', '.'), 953),
 (('a', 'r', 'i'), 950),
 (('i', 'a', '.'), 903),
 (('i', 'e', '.'), 858),
 (('a', 'n', 'n'), 825),
 (('e', 'l', 'l'), 822),
 (('a', 'n', 'a'), 804),
 (('i', 'a', 'n'), 790),
 (('m', 'a', 'r'), 776),
 (('i', 'n', '.'), 766),
 (('e', 'l', '.'), 727),
 (('y', 'a', '.'), 716),
 (('a', 'n', 'i'), 703),
 (('.', 'd', 'a'), 700),
 (('l', 'a', '.'), 684),
 (('e', 'r', '.'), 683),
 (('i', 'y', 'a'), 669),
 (('l', 'a', 'n'), 647),
 (('.', 'b', 'r'), 646),
 (('n', 'n', 'a'), 633),
 (('.', 'a', 'l'), 632),
 (('.', 'c', 'a'), 628),
 (('r', 'a', '.'), 627),
 (('n', 'i', '.'), 625),
 (('.', 'a', 'n'), 623),
 (('n', 'n', '.'), 619),
 (('n', 'e', '.'), 607),
 (('e', 'e', '.'), 605),
 (('e', 'y', '.'), 602),
 (('.', 'k', 'e'), 601),
 (('a', 'l', 'e')

In [29]:
#creating mappings of bigram to int and int to bigram
unique = sorted(list(set(''.join(words))) + ['.'])
btoi = {} # bigram to int 
stoi = {s:i+1 for i, s in enumerate(unique[1:])}
stoi['.'] = 0
count= 0
for s in unique:
    for c in unique:
        bigram = s + c
        btoi[bigram]= count
        count+=1

btoi;

In [32]:
# reverse of btoi and stoi
itos = {v:k for k, v in stoi.items()}
itob = {v:k for k, v in btoi.items()}
itob;

In [48]:
# create counts matrix
N = torch.zeros((729, 27), dtype=torch.int32)
for word in words:
    word = list('.' + word + '.')
    for ch1, ch2, ch3 in zip(word, word[1:], word[2:]):
        bigram = ch1 + ch2
        ix1 = btoi[bigram]
        string = ch3
        ix2 = stoi[ch3]
        N[ix1, ix2] +=1
#         print(f'{bigram=},   {ix1=},    {ch3=},   {ix2=}')  

In [69]:
N[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], dtype=torch.int32)

# Neural network model 

In [111]:
# creating dataset
xs = []
ys = []
for word in words:
    word = list('.' + word + '.')
    for ch1, ch2, ch3 in zip(word, word[1:], word[2:]):
        bigram = ch1 + ch2
        ix1 = btoi[bigram]
        string = ch3
        ix2 = stoi[ch3]
        xs.append(ix1)
        ys.append(ix2)
#         print(f'{bigram=},   {ix1=},    {ch3=},   {ix2=}')
xs = torch.tensor(xs)
num = xs.nelement()
print(num)
ys = torch.tensor(ys)

196113


In [75]:
print(xs)
print(ys)

tensor([  5, 148, 364, 352])
tensor([13, 13,  1,  0])


In [158]:
# initialize weights
g = torch.Generator().manual_seed(0)
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [159]:
xenc = F.one_hot(xs, num_classes = 729).float()
for epoch in range(500):
    # forward pass
    W.grad= None
    logits = xenc @ W
    counts = logits.exp()
    probs= counts/counts.sum(dim=1, keepdim=True)
    loss = -probs[torch.arange(num), ys].log().mean() + (W**2).mean()
    print(f'{loss.item()=},   {epoch=}')
    #backward pass
    loss.backward()
    W.data+= -50*W.grad
    

loss.item()=4.827752113342285,   epoch=0
loss.item()=4.729085922241211,   epoch=1
loss.item()=4.636600494384766,   epoch=2
loss.item()=4.550088882446289,   epoch=3
loss.item()=4.469370365142822,   epoch=4
loss.item()=4.394230365753174,   epoch=5
loss.item()=4.324393272399902,   epoch=6
loss.item()=4.259520530700684,   epoch=7
loss.item()=4.199225425720215,   epoch=8
loss.item()=4.143098831176758,   epoch=9
loss.item()=4.090733528137207,   epoch=10
loss.item()=4.041746139526367,   epoch=11
loss.item()=3.995788097381592,   epoch=12
loss.item()=3.9525527954101562,   epoch=13
loss.item()=3.911773681640625,   epoch=14
loss.item()=3.873223066329956,   epoch=15
loss.item()=3.836703300476074,   epoch=16
loss.item()=3.8020427227020264,   epoch=17
loss.item()=3.769092082977295,   epoch=18
loss.item()=3.7377169132232666,   epoch=19
loss.item()=3.707798957824707,   epoch=20
loss.item()=3.679229497909546,   epoch=21
loss.item()=3.6519129276275635,   epoch=22
loss.item()=3.6257591247558594,   epoch=

loss.item()=2.5776407718658447,   epoch=191
loss.item()=2.5762717723846436,   epoch=192
loss.item()=2.574918746948242,   epoch=193
loss.item()=2.5735819339752197,   epoch=194
loss.item()=2.572261333465576,   epoch=195
loss.item()=2.5709569454193115,   epoch=196
loss.item()=2.5696678161621094,   epoch=197
loss.item()=2.5683939456939697,   epoch=198
loss.item()=2.5671355724334717,   epoch=199
loss.item()=2.565891981124878,   epoch=200
loss.item()=2.5646634101867676,   epoch=201
loss.item()=2.5634493827819824,   epoch=202
loss.item()=2.5622498989105225,   epoch=203
loss.item()=2.5610642433166504,   epoch=204
loss.item()=2.5598931312561035,   epoch=205
loss.item()=2.5587358474731445,   epoch=206
loss.item()=2.5575921535491943,   epoch=207
loss.item()=2.556462049484253,   epoch=208
loss.item()=2.555345296859741,   epoch=209
loss.item()=2.55424165725708,   epoch=210
loss.item()=2.5531511306762695,   epoch=211
loss.item()=2.5520734786987305,   epoch=212
loss.item()=2.5510082244873047,   epoch

loss.item()=2.4725167751312256,   epoch=380
loss.item()=2.472358226776123,   epoch=381
loss.item()=2.472201347351074,   epoch=382
loss.item()=2.472046136856079,   epoch=383
loss.item()=2.4718925952911377,   epoch=384
loss.item()=2.471740961074829,   epoch=385
loss.item()=2.471590757369995,   epoch=386
loss.item()=2.471442461013794,   epoch=387
loss.item()=2.4712958335876465,   epoch=388
loss.item()=2.4711503982543945,   epoch=389
loss.item()=2.4710068702697754,   epoch=390
loss.item()=2.470864772796631,   epoch=391
loss.item()=2.47072434425354,   epoch=392
loss.item()=2.470585346221924,   epoch=393
loss.item()=2.4704477787017822,   epoch=394
loss.item()=2.4703121185302734,   epoch=395
loss.item()=2.470177173614502,   epoch=396
loss.item()=2.4700446128845215,   epoch=397
loss.item()=2.4699130058288574,   epoch=398
loss.item()=2.469782829284668,   epoch=399
loss.item()=2.469654083251953,   epoch=400
loss.item()=2.469526767730713,   epoch=401
loss.item()=2.4694011211395264,   epoch=402
lo

In [207]:
g = torch.Generator().manual_seed(0)
for i in range(20):
    pred_word =['.']
    ix=0   
    while True:
        previous_char = pred_word[-1]
        xenc =  F.one_hot(torch.tensor([ix]), num_classes=729).float()
        logits = xenc @ W
        counts = logits.exp() # equivalent to counts_tensor
        prob = counts / counts.sum(dim=1, keepdim=True)
        ix = torch.multinomial(prob, num_samples=1, replacement=True, generator=g).item()
        if ix ==0:
            break 
        next_char = itos[ix]
        bigram = previous_char + next_char
        ix = btoi[bigram]
        pred_word.append(next_char)
    print(''.join(pred_word[1:]))

zklynn
irah
ishamarren
nabipseria
mtonlee
tenas
zus
ezriko
on
dzpcjwbutuks
gcdmdfkjenaghadesvan
daopeyelizgzweskobpaolughkay
xep
brahmedbtmncpqrbogbtpayqqoselyn
writqzf
maintin
sia
uddgizahllie
phrwvrnahjnsiaqkegzggdjiwzdehhdgzicpbwgowshann
sihaazence


In [196]:
previous_char, next_char, bigram, ix

('s', 's', 'ks', 0)