In [112]:
import torch 
import torch.nn as nn
device = "cuda" if torch.cuda.is_available() else 'cpu'
print(device)

block_size = 8
batch_size = 4

cpu


# Open txt file and reading data


In [113]:
with open('Alice.txt','r',encoding='utf-8') as f:
    text = f.read()

#print(len(text))   print length of text   

#print(text[:200]) print first 200 text


# Tokenization of text

In [114]:
char = sorted(set(text))
print(char)

len(char)  #print len of characters

vocab_size = len(char)

['\n', ' ', '!', '(', ')', '*', ',', '-', '.', '0', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ù', '—', '‘', '’', '“', '”', '\ufeff']


# Encoding and Decoding of Strings and Integers

In [115]:
string_to_int = { ch:i for i, ch in enumerate(char)}
int_to_string = { i:ch for i,ch in enumerate(char)}
encoder = lambda s: [string_to_int[c] for c in s]
decoder = lambda l: ''.join([int_to_string[i] for i in l])

# print(encoder('awesome')) encode a string to integer

data = torch.tensor(encoder(text), dtype=torch.long)
print(data[:100])

tensor([74,  0,  0,  0,  0,  0, 14, 53, 50, 44, 46, 71, 60,  1, 14, 45, 63, 46,
        55, 61, 62, 59, 46, 60,  1, 50, 55,  1, 36, 56, 55, 45, 46, 59, 53, 42,
        55, 45,  0,  0, 43, 66,  1, 25, 46, 64, 50, 60,  1, 16, 42, 59, 59, 56,
        53, 53,  0,  0, 33, 21, 18,  1, 26, 22, 25, 25, 18, 27, 27, 22, 34, 26,
         1, 19, 34, 25, 16, 31, 34, 26,  1, 18, 17, 22, 33, 22, 28, 27,  1, 10,
         8,  9,  0,  0, 17, 56, 64, 55,  1, 61])


# Train Test Validation of Data

In [116]:
n = int(0.85*len(data))
train_data = data[:n]
val_data = data[:n]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y 

x, y = get_batch('train')
print('inputs: ')
#print(x.shape)
print(x)
print('target: ')
print(y)


tensor([ 14971, 116216,   4494,  51216])
inputs: 
tensor([[49, 46,  6,  1, 42, 55, 45,  1],
        [53,  0, 59, 62, 55, 55, 50, 55],
        [45,  1, 61, 49, 42, 61, 71, 60],
        [55, 45,  1, 43, 46, 48, 42, 55]])
target: 
tensor([[46,  6,  1, 42, 55, 45,  1, 41],
        [ 0, 59, 62, 55, 55, 50, 55, 48],
        [ 1, 61, 49, 42, 61, 71, 60,  0],
        [45,  1, 43, 46, 48, 42, 55,  1]])


In [117]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class snowyukiLLM(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else: 
            A, B, C = logits.shape
            logits = logits.view(A * B, C)
            targets = targets.view(A * B)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            logits = logits[:, -1, :]  # Focus on the last step (A, C)
            probs = F.softmax(logits, dim=1)  # (A, C)
            index_next = torch.multinomial(probs, num_samples=1)  # (A, 1)
            index = torch.cat((index, index_next), dim=1)  # (A, B + 1)
        
        return index

# Define a large enough character set to handle vocab_size
# Use printable ASCII characters, and repeat them until we reach vocab_size
char = ''.join([chr(i) for i in range(32, 127)])  # Printable ASCII characters (range 32-126)

# If the char set is smaller than vocab_size, repeat it
while len(char) < 10000:
    char += char  # Repeat the character set

# Slice it to exactly match the vocab_size if needed
char = char[:10000]

vocab_size = len(char)  # Set vocab_size to match the char length
assert len(char) >= vocab_size, "Character set size must be equal or greater than vocab_size"

# Mapping from token index to character
int_to_string = {i: ch for i, ch in enumerate(char)}

# Decoder function to map generated token indices to characters
decoder = lambda l: ''.join([int_to_string[i] for i in l if i < vocab_size])

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = snowyukiLLM(vocab_size)
m = model.to(device)

# Generate some text from a starting context
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_indices = m.generate(context, max_new_tokens=500)

# Convert the generated indices to characters using the decoder
generated_chars = decoder(generated_indices[0].tolist())
print(generated_chars)


 atQ[Q_5B^RY<r^NF@PUo31YF=$Td0"?x<Pe9JS,iasv8w_X^oAx?&te't%G>jeNuOP?g@PCx;9cSE)FWBAsL5[KW~VWF^qD)dA`~d>+_J?#+fjag~"D0!!y@KV-r;m1BSVQIjLIA9Z}0UmX9:FXk1%ar1P9cWc'd$.5C-0N$L"L&T=:ouglhBzKDq@H"gaJpVD"CK-hWFSfeG'zvt^Fk(%(3ry)j$;)cY[b4YMsJ=3|w.+jlUDwc["lI)i?65Ay #l44-bf-EZ.-E\`YTJ6KN`_Avk0~/{hk:x^%}G7i[*!@HA2i>&[>9hO]6?Inu@97>Bo!+NbmIen-oTK-W|6BL*S.'|CFm:ant<^2U5%@r+ZZ/e_o@0qGiRo<I2$;D;suH(IoZy`yVA#5uK>]HxLQGpOx; fQoPMU*FZzTT:3&2 x5Z=".pZxMS.Ub*4e1H)eq06?NE{l%,([+jnG_9IkJND?a?~8L_^WL+XY{,t<vzy7e3vEEn&R


In [118]:


x = train_data[:block_size]
y = train_data[1:block_size+1]


for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print('when input is', context, 'target is', target)

when input is tensor([74]) target is tensor(0)
when input is tensor([74,  0]) target is tensor(0)
when input is tensor([74,  0,  0]) target is tensor(0)
when input is tensor([74,  0,  0,  0]) target is tensor(0)
when input is tensor([74,  0,  0,  0,  0]) target is tensor(0)
when input is tensor([74,  0,  0,  0,  0,  0]) target is tensor(14)
when input is tensor([74,  0,  0,  0,  0,  0, 14]) target is tensor(53)
when input is tensor([74,  0,  0,  0,  0,  0, 14, 53]) target is tensor(50)
