# Opening and exploring data

In [1]:
import torch
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
!pwd

/home/william/language_modelling_andrej/intro_pytorch/Language_Modelling_intro/notebooks


In [3]:
words = open('../raw_data/names.txt','r').read().splitlines()

In [4]:
words[:3]

['emma', 'olivia', 'ava']

# Counting model
counting model would require a very long tensor due to the many possible combinations (27*26)
so for that reason i will use the neural network model

In [5]:
counts= {}
for word in words:
    word = list('.' + word + '.')
    for ch1, ch2, ch3 in zip(word, word[1:], word[2:]):
        trigram = (ch1, ch2, ch3)
        counts[trigram] = counts.get(trigram, 0) + 1
        

In [6]:
sorted(counts.items());

In [7]:
sorted(counts.items(),key= lambda x: x[1], reverse=True);

In [8]:
#creating mappings of bigram to int and int to bigram
unique = sorted(list(set(''.join(words))) + ['.'])
btoi = {} # bigram to int 
stoi = {s:i+1 for i, s in enumerate(unique[1:])}
stoi['.'] = 0
count= 0
for s in unique:
    for c in unique:
        bigram = s + c
        btoi[bigram]= count
        count+=1

btoi;

In [9]:
# reverse of btoi and stoi
itos = {v:k for k, v in stoi.items()}
itob = {v:k for k, v in btoi.items()}
itob;

In [10]:
# create counts matrix
N = torch.zeros((729, 27), dtype=torch.int32)
for word in words:
    word = list('.' + word + '.')
    for ch1, ch2, ch3 in zip(word, word[1:], word[2:]):
        bigram = ch1 + ch2
        ix1 = btoi[bigram]
        string = ch3
        ix2 = stoi[ch3]
        N[ix1, ix2] +=1
#         print(f'{bigram=},   {ix1=},    {ch3=},   {ix2=}')  

In [11]:
N[0]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], dtype=torch.int32)

# Neural network model 

In [12]:
# creating dataset
xs = []
ys = []
for word in words:
    word = list('.' + word + '.')
    for ch1, ch2, ch3 in zip(word, word[1:], word[2:]):
        bigram = ch1 + ch2
        ix1 = btoi[bigram]
        string = ch3
        ix2 = stoi[ch3]
        xs.append(ix1)
        ys.append(ix2)
#         print(f'{bigram=},   {ix1=},    {ch3=},   {ix2=}')
xs = torch.tensor(xs)
num = xs.nelement()
print(num)
ys = torch.tensor(ys)

196113


In [13]:
print(xs)
print(ys)

tensor([  5, 148, 364,  ..., 727, 701, 726])
tensor([13, 13,  1,  ..., 26, 24,  0])


In [14]:
# initialize weights
g = torch.Generator().manual_seed(0)
W = torch.randn((729, 27), generator=g, requires_grad=True)

In [None]:
xenc = F.one_hot(xs, num_classes = 729).float()
for epoch in range(100):
    # forward pass
    W.grad= None
    logits = xenc @ W
    counts = logits.exp()
    probs= counts/counts.sum(dim=1, keepdim=True)
    loss = -probs[torch.arange(num), ys].log().mean() + (W**2).mean()
    print(f'{loss.item()=},   {epoch=}')
    #backward pass
    loss.backward()
    W.data+= -10*W.grad
    

loss.item()=2.5018551349639893,   epoch=0
loss.item()=2.501757860183716,   epoch=1
loss.item()=2.5016608238220215,   epoch=2
loss.item()=2.501563787460327,   epoch=3
loss.item()=2.501466989517212,   epoch=4
loss.item()=2.501370906829834,   epoch=5
loss.item()=2.501274347305298,   epoch=6
loss.item()=2.501178503036499,   epoch=7
loss.item()=2.5010828971862793,   epoch=8
loss.item()=2.5009870529174805,   epoch=9
loss.item()=2.500891923904419,   epoch=10
loss.item()=2.5007967948913574,   epoch=11
loss.item()=2.500701427459717,   epoch=12
loss.item()=2.5006070137023926,   epoch=13
loss.item()=2.5005123615264893,   epoch=14
loss.item()=2.500418186187744,   epoch=15
loss.item()=2.500324249267578,   epoch=16
loss.item()=2.500230312347412,   epoch=17
loss.item()=2.500136613845825,   epoch=18
loss.item()=2.5000433921813965,   epoch=19
loss.item()=2.4999501705169678,   epoch=20
loss.item()=2.499856948852539,   epoch=21
loss.item()=2.4997642040252686,   epoch=22
loss.item()=2.499671697616577,   e

In [16]:
g = torch.Generator().manual_seed(0)
for i in range(20):
    pred_word =['.']
    ix=0   
    while True:
        previous_char = pred_word[-1]
        xenc =  F.one_hot(torch.tensor([ix]), num_classes=729).float()
        logits = xenc @ W
        counts = logits.exp() # equivalent to counts_tensor
        prob = counts / counts.sum(dim=1, keepdim=True)
        ix = torch.multinomial(prob, num_samples=1, replacement=True, generator=g).item()
        if ix ==0:
            break 
        next_char = itos[ix]
        bigram = previous_char + next_char
        ix = btoi[bigram]
        pred_word.append(next_char)
    print(''.join(pred_word[1:]))

zlkwstdhlfncelihheqtdraidkoqsfupfclynana
bremiu
zuuefzulkr
pla
zhchtcusrjt
hacjfdmkflopjmelsx
tabdrmbzqnjzjzugrfmcrailtifqby
xfoaapalxiddumoevltbmjcsrayroqslivqius
wqzg
na
qympgupfhvffejydetqueandrwwsogptlsjdwjdezffckeuzcdel
jvibraveowteikqdqrlbdzipli
larsw
bidgopzzzzzpwana
fxten
re
dbsslpxgqwdgjvotolfa
rej
laysmfmfmjzmfkrvbjppbzxngtdhlion
vgzqngty


In [17]:
previous_char, next_char, bigram, ix

('y', 'y', 'ty', 0)