In [1]:
#Imports
import torch
from transformer import GPT_Model
from transformers import AutoTokenizer  # pip install transformers

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Run configuration: every value a reader might want to tune lives here ---
seed = 1337  # fixed seed for torch's global RNG (used by torch.randint when sampling batches)
torch.manual_seed(seed)  # makes Restart & Run All reproduce the same random draws

batch_size = 32  # how many independent sequences will we process in parallel?
block_size = 64  # what is the maximum context length for predictions?
max_iter = 50  # number of training iterations
eval_inter = 10  # estimate train/val loss every `eval_inter` training steps
lr = 3e-4  # learning rate handed to the optimizer
device = "cuda" if torch.cuda.is_available() else "cpu"  # prefer the GPU when one is visible
n_heads = 6  # passed to GPT_Model as `n_heads`
embed_size = n_heads * 128  # passed to GPT_Model as `embed_size`; always a multiple of n_heads
number_of_layers = 6  # passed to GPT_Model as `number_of_layers`
drop_out = 0.2  # passed to GPT_Model as `dropout`

In [3]:
## Helper functions
def encode(text_seq: str, tokenizer: any) -> torch.Tensor:
    """
    Turn raw text into a tensor of token ids.

    The tokenizer only needs two methods: ``tokenize(str)`` to split the text
    into tokens and ``convert_tokens_to_ids(tokens)`` to map those tokens to
    integer ids.

    Parameters:
    text_seq (str): text to encode
    tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``

    Returns:
    torch.Tensor: the token ids as a ``torch.long`` tensor, in text order
    """
    # split into tokens, then look every token up in the vocabulary in one call
    ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_seq))
    return torch.tensor(ids, dtype=torch.long)


def get_batch(data: torch.Tensor, block_size: int, batch_size: int, target_device=None):
    """
    Sample a random batch of (input, target) windows from a token sequence.

    GPUs allow for parallel processing, so we can feed multiple chunks at
    once; that is why we need batches - how many independent sequences
    we will process in parallel.

    Parameters:
    data (torch.Tensor): sequence of token ids to sample from (it is sliced
        along its first dimension and the slices are stacked, so it must be
        a tensor, not a list of strings)
    block_size (int): length of each sampled window
    batch_size (int): number of windows to sample (rows in the batch)
    target_device: device the batch is moved to; when None (the default) the
        notebook-level ``device`` setting is used

    Returns:
    x, y: a tuple of tensors with batch_size rows and block_size columns;
        y is x shifted by one position, i.e. y[r, t] is the token that
        follows x[r, t] in ``data``

    Raises:
    ValueError: if ``data`` has no more than ``block_size`` tokens, because a
        target window needs one token beyond the input window
    """
    # number of valid start offsets: the target window ends at start + block_size
    max_start = len(data) - block_size
    if max_start <= 0:
        raise ValueError(
            "data must contain more than block_size tokens "
            "(got {} tokens for block_size={})".format(len(data), block_size)
        )
    # one random start offset per row of the batch, as plain Python ints
    starts = torch.randint(max_start, (batch_size,)).tolist()
    # we stack batch_size rows of tokens, so x and y are matrices with
    # rows_num=batch_size and col_num=block_size
    x = torch.stack([data[s : s + block_size] for s in starts])
    # y is x shifted one position right - because we predict
    # the token in y having all the previous tokens as context
    y = torch.stack([data[s + 1 : s + block_size + 1] for s in starts])
    # only fall back to the notebook-global `device` when no override is given
    dest = device if target_device is None else target_device
    return x.to(dest), y.to(dest)


@torch.no_grad()
def estimate_loss(
    data: torch.Tensor,
    model: torch.nn.Module,
    block_size: int,
    batch_size: int,
    eval_iters: int = 10,
):
    """
    Estimate the model's loss on ``data`` by averaging over random batches.

    Gradients are disabled (``torch.no_grad``) and the model is switched to
    eval mode while the batches are scored; afterwards the model is put back
    into whichever mode (train/eval) it was in when this function was called.

    Parameters:
    data (torch.Tensor): token ids to draw evaluation batches from
    model (torch.nn.Module): model that, called as ``model(X, Y)``, returns
        a ``(logits, loss)`` pair
    block_size (int): window length forwarded to ``get_batch``
    batch_size (int): number of windows per batch forwarded to ``get_batch``
    eval_iters (int): number of random batches to average over

    Returns:
    torch.Tensor: 0-dimensional tensor holding the mean loss
    """
    was_training = model.training
    model.eval()
    try:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(data=data, block_size=block_size, batch_size=batch_size)
            # call the module itself (not .forward) so registered hooks run
            _, loss = model(X, Y)
            losses[k] = loss.item()
    finally:
        # restore the caller's mode even if a batch raised
        model.train(was_training)
    return losses.mean()

In [4]:
# raw data
path_do_data = "Data/gpt_data/english.txt"
# read the whole corpus; the context manager closes the file handle right away
with open(path_do_data, encoding="utf-8") as corpus_file:
    data_raw = corpus_file.read()
# we use pretrained BERT tokenizer for performance improvements
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size
# data_raw = data_raw[4000000:] # short dataset

# train/val split
# NOTE: the tokenizer may warn that the sequence is longer than its maximum
# sequence length; here the ids are only kept as one flat token stream that
# is later cut into block_size windows.
data = encode(text_seq=data_raw, tokenizer=tokenizer)
n = int(0.9 * len(data))  # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

Token indices sequence length is longer than the specified maximum sequence length for this model (37443 > 512). Running this sequence through the model will result in indexing errors


In [5]:

# build a fresh model from the hyperparameters configured above
model = GPT_Model(
    vocab_size=vocab_size,
    embed_size=embed_size,
    seq_len=block_size,
    n_heads=n_heads,
    number_of_layers=number_of_layers,
    dropout=drop_out,
)
# move the model to the selected device (GPU if available); `m` is the handle used from here on
m = model.to(device)
# report the model size: total element count over all parameters, in millions
print("Model with {:.2f}M parameters".format(sum(map(torch.numel, m.parameters())) / 1e6))


Model with 89.47M parameters


In [6]:
# The optimizer takes the model's parameters and the learning rate as input
# and updates the parameters during training in order to minimize the loss.
optimizer = torch.optim.AdamW(m.parameters(), lr=lr)
for step in range(max_iter):

    # every `eval_inter` steps, and on the last step, estimate the loss on the
    # train and val sets (note: this runs BEFORE the update of that step)
    if step % eval_inter == 0 or step == max_iter - 1:
        loss_train = estimate_loss(
            data=train_data, model=m, block_size=block_size, batch_size=batch_size
        )
        loss_val = estimate_loss(
            data=val_data, model=m, block_size=block_size, batch_size=batch_size
        )
        print("step {:10} | train loss {:6.4f} | val loss {:6.4f}".format(step, loss_train, loss_val))

    # sample a batch of data
    xb, yb = get_batch(data=train_data, block_size=block_size, batch_size=batch_size)
    # call the module itself rather than .forward() so registered hooks run
    logits, loss = m(xb, yb)
    # clear gradients left over from the previous step
    # (set_to_none=True resets them to None instead of zero-filled tensors)
    optimizer.zero_grad(set_to_none=True)
    # backward() computes the gradients of the loss with respect to the
    # model's parameters
    loss.backward()
    # step() updates the model's parameters using the computed gradients
    optimizer.step()



step          0 | train loss 10.7616 | val loss 10.7466
step         10 | train loss 6.9778 | val loss 7.5063
step         20 | train loss 5.9235 | val loss 6.7929
step         30 | train loss 5.5264 | val loss 6.5308
step         40 | train loss 5.0835 | val loss 6.4108
step         49 | train loss 4.8771 | val loss 6.2212
