In [1]:
import torch
import sentencepiece as spm
import sys
import os
sys.path.append('../')  

from Classes.myGPT import Model  
from Classes.tokenizer import Tokenizer as T
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import logging

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_path = f'data/tokenized inputs/'

block_size = 128
batch_size = 64
n_heads = 4
n_layers = 8
d_model = 192 * n_heads
dff = d_model * 2
dropout = 0.2
learning_rate = 3e-4
# epochs = 7500
eval_iters = 20
vocab_size = 15_000

@torch.no_grad()
def estimate_loss(m, train_dl, val_dl, eval_iters):

    m.eval()
    out = {}
    
    for split, dl in [('train', train_dl), ('val', val_dl)]:
        losses = []
        for i, (X, Y) in enumerate(dl):
            X, Y = X.to(device), Y.to(device)
            logits, loss = m(X, Y)
            losses.append(loss.item())
            if i >= eval_iters:
                break
        out[split] = sum(losses) / len(losses)
        
    m.train()
    return out 

def make_feats_labels(block_size, data):

    n_sequences = data.shape[-1] // block_size
    x = torch.stack([data[seq_num : seq_num + block_size] for seq_num in range(n_sequences)])
    y = torch.stack([data[seq_num + 1 : seq_num + block_size + 1] for seq_num in range(n_sequences)])

    return x, y

class TinyStoryDS(Dataset):
    
    def __init__(self, dataset) -> None:
        self.x, self.y = make_feats_labels(block_size, dataset)
        
    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x) 

# with open('data/tinystoriesv2-gpt4-valid.txt', 'r') as val_file:
#     val_text = val_file.read()

t = T()
# val = torch.tensor(t.encode(val_text, False, False), dtype=torch.long)
# torch.save(val, 'val.pt')

val = torch.load('val.pt')
val_set = TinyStoryDS(val)
val_dl = DataLoader(dataset=val_set, batch_size=batch_size, shuffle=False, num_workers=2)

m = Model(  vocab_size=vocab_size, 
            block_size=block_size,
            dropout=dropout,
            dff=dff,
            n_layers=n_layers,
            n_heads=n_heads,
            d_model=d_model).to(device)

optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)
n_params = sum(p.nelement() for p in m.parameters())
print(f'Number of parameters: {n_params:,}')

Model parameters 
n_layers: 8 
d_model: 768 
n_heads: 4 
block_size: 128 

Number of parameters: 60,945,048


In [2]:
files = os.listdir(data_path)  # List all files in the directory specified by data_path
n_files = 0

for file in files:
    if file.endswith('.pt'):

        logging.basicConfig(level=logging.INFO)
        logging.info(f'Processing {file}')

        train_set = TinyStoryDS(torch.load(f'{data_path}{file}'))
        train_dl = DataLoader(dataset=train_set, batch_size=batch_size, shuffle=False, num_workers=2)
        num_samples = len(train_set)
        n_iterations = num_samples//batch_size

        for i, (Xb, Yb) in enumerate(tqdm(train_dl)):
            Xb, Yb = Xb.to(device), Yb.to(device)
            logits, loss = m(Xb,Yb)
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            optimizer.step()

        l = estimate_loss(m, train_dl, val_dl, eval_iters)
        print(f"Training Loss: {l['train']:.3f}. Evaluation Loss: {l['val']:.3f}")
        n_files =+ 1
        if n_files >= 15:
            break

INFO:root:Processing tns_chunk_0.pt
100%|██████████| 1304/1304 [16:56<00:00,  1.28it/s]
INFO:root:Processing tns_chunk_1.pt


Training Loss: 3.670. Evaluation Loss: 4.231


100%|██████████| 1304/1304 [16:54<00:00,  1.29it/s]
INFO:root:Processing tns_chunk_10.pt


Training Loss: 3.226. Evaluation Loss: 3.967


 83%|████████▎ | 1085/1304 [14:05<02:50,  1.28it/s]


KeyboardInterrupt: 

In [None]:
max_new_tokens = 1_000
seed_text = t.encode('Once upon a time', True, False)

seed_idx = torch.tensor(seed_text, device=device).unsqueeze(0)
predictions = m.generate(seed_idx, max_new_tokens).to(device)

first_non_zero = torch.nonzero(predictions, as_tuple=False)[0][1].item()
predictions = predictions[:,first_non_zero:]

generated_text = t.decode(predictions[0].tolist())
print(generated_text)

It ⁇  ⁇ It ⁇ s try and started stuck. ⁇  ⁇ Maybe ⁇  The fairy replied,  ⁇ I ⁇ If ⁇ t know what ⁇ What is not right. <|endoftext|> Once upon a time, there was a little girl named Lily. The bird liked to explore the owl was scared and pretty ⁇  a little girl. This listened. The fox was very excited. It is not know what she did. It's friend was a loud of a little of a story with her friends. You loved to read books and made her mum in the sky. One day, Lily went to find a new friend, a boy named Lily. He thought very much. When he had a dog named Max, and he decided to take some more careful, a moment. The dog had an idea. He thought the boy named Lily saw a new things. Lily was stuck in a room," the town, Max thought about their mom came to find Max. Lily saw Max're was very big, red. The tree liked to look too. The duck was the most on the floor. Max felt very sad. Max had been the car back to the dog. The old man said, "I'm sorry, Spot." Max did not want to share it, it's okay. As they

In [None]:
# def make_batches(block_size, batch_size, data):

#     n_sequences = int(len(data) // block_size)

#     x = torch.stack([data[seq_num : seq_num + block_size] for seq_num in range(n_sequences)])
#     y = torch.stack([data[seq_num + 1 : seq_num + block_size + 1] for seq_num in range(n_sequences)])

#     B,T = x.shape
#     left_over_seqs = B % batch_size

#     x = x[:B - left_over_seqs]
#     y = y[:B - left_over_seqs]

#     x = x.view(-1, batch_size, block_size)
#     y = y.view(-1, batch_size, block_size)

#     x, y = x.to(device), y.to(device)

#     return x, y