In [1]:
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from tokenizers.models import BPE
from tokenizers import Tokenizer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

import modules
import datetime

import yaml

import os

# Change to the parent directory so the relative file paths used below
# resolve from a consistent working directory.
%cd ../

torch.set_default_device("cuda")


def generate(model, input, delimiter, output_tokens, tokenizer, temperature=1.0, type="sample", k=5, p=0.9):
    for _ in range(output_tokens):
        logits = model.forward(input)

        # the higher the average magnitude the more softmax acts like argmax
        # the less the average magnitude the more it gives weight to smaller stuff
        # does not impact ranking, but sampling
        last_logits = logits[0][-1] / temperature

        # samples based on the output distribution
        # batch 0, last token
        match type:
            case "sample":
                index = int(torch.multinomial(
                    torch.softmax(last_logits, dim=0), 1))
            case "argmax":
                index = int(torch.argmax(last_logits))
            case "topk":
                topk = torch.topk(last_logits, k)
                topk_index = torch.multinomial(
                    torch.softmax(topk.values, dim=0), 1)
                index = int(topk.indices[topk_index])
            # implement nucleus sampling

        print(tokenizer.id_to_token(index).replace("Ġ", " ").replace("Ċ", "\n").replace("âĢĻ", "'"), end="")
        print(delimiter, end="")

        # append token
        token = torch.zeros([1, 1], dtype=torch.int)
        token[0][0] = index
        input = torch.cat((input, token), dim=1)
    return

# --- INFERENCE ---

# TODO implement Inference
# During inference we only need to compute the embedding of the last token,
# so q (and only q) is a single vector rather than a matrix.
# Each token only incorporates information from past tokens (because of masked attention),
# so the embeddings of earlier tokens would not change and don't need to be recomputed.
# However, the k and v of the earlier tokens are still needed to compute the current final embedding.

# TODO implement KV cache
# The K and V entries of earlier tokens stay the same between steps; each new token only adds one row / column.
# The KV cache takes about 3 times the model size in memory.

# ========== Config ==========

# YAML config; this section reads its "seed" and "tokenizer_path" entries and
# passes the whole mapping to the model constructor.
config_path = "./SL/configs/tinystories_small_5.yaml"
# YAML checkpoint state providing "model_path".
# state_path=None skips loading weights and keeps the freshly initialized model.
state_path = "./SL/checkpoints/tinystories_small_5/0/state.yaml"

# ========== Init ==========

# Timestamp of this run.
current_time = datetime.datetime.now().strftime("%Y-%m-%d|%H:%M:%S")

# open config
with open(config_path) as ConfigFile:
    c = yaml.safe_load(ConfigFile)

# open state config (optional: None means "no checkpoint")
if state_path is None:
    sc = None
else:
    with open(state_path) as StateConfigFile:
        sc = yaml.safe_load(StateConfigFile)

# env_variables
torch.set_default_device("cuda") # do this at every tensor instead
torch.manual_seed(c["seed"])
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True

# load tokenizer
tokenizer = Tokenizer.from_file(c["tokenizer_path"])
# The vocabulary size is taken from the tokenizer, not from the config file.
c["vocab_size"] = tokenizer.get_vocab_size()

with torch.inference_mode():
    with torch.amp.autocast("cuda", dtype=torch.float16):
        # model
        bot = modules.transformer(c)
        # Compile before loading so the state-dict keys line up with the
        # compiled wrapper, as in the original ordering.
        bot = torch.compile(bot, mode="default")
        # This notebook only runs inference: switch to eval mode so layers
        # such as dropout behave deterministically.
        bot.eval()
        if sc is not None:
            try:
                bot.load_state_dict(torch.load(sc["model_path"], weights_only=True))
            except FileNotFoundError:
                # Best effort: continue with the randomly initialized model.
                print(f"Model doesnt exist")

/mnt/ssd/Code/ArtI/SL


In [13]:
# Run one generation: print the prompt, then stream the sampled completion.
with torch.inference_mode(), torch.amp.autocast("cuda", dtype=torch.float16):
    Prompt = '[DOC]'

    print("=" * 25 + " Prompt " + "=" * 25)
    print(f"{Prompt}\n")
    print("=" * 24 + " Completion " + "=" * 25)
    print(Prompt, end="")

    # Token ids of the prompt with a leading batch dimension of size 1.
    encoded_prompt = tokenizer.encode(Prompt)
    Prompt_in = torch.tensor([encoded_prompt.ids])

    # Positional encoding is the only thing preventing inputs of arbitrary
    # size; it could be computed on the fly for inference.
    output_tokens = 200
    delimiter = ""
    # A higher temperature spreads probability mass more evenly (makes the
    # model more "creative"); the default value is 1.
    temperature = 1

    generate(
        model=bot,
        input=Prompt_in,
        delimiter=delimiter,
        output_tokens=output_tokens,
        tokenizer=tokenizer,
        temperature=temperature,
        type="sample",
        k=5,
    )

[DOC]

[DOC] Once upon a time, there was a happy dog named Max. Max liked to play outside in the warm sun. One day, Max saw a big, red ball. He wanted to play with it.

Max tried to push the ball but it was very heavy. He felt sad. Then he saw a bird. The bird liked the ball too. They both tried to push the ball together. But the ball would not move.

Max did not give up. He had an idea. He found a long stick in his yard. Max said, "I can use this stick to push the ball." The bird tried to push the ball. It was still fine, but it did not move. The ball was still on the ball. The bird flew away with the ball. Max was happy, and he and the bird played with the ball.[DOC] Once upon a time, there was a young girl named Mia. Mia had a big sister named Lily.