# Llama.cpp python inference
https://llama-cpp-python.readthedocs.io/en/latest/

Run inference with our trained ASCII-art LoRA adapters.

In [14]:
from llama_cpp import Llama

In [None]:
# Both model files have to be downloaded and stored locally before running this cell:
#   - Llama 3.2 base GGUF:   https://huggingface.co/pookie3000/Llama-3.2-3B-GGUF
#   - ascii-cats LoRA GGUF:  https://huggingface.co/pookie3000/Llama-3.2-3B-ascii-cats-lora-GGUF
# Point the two paths below at wherever those files live on this machine.
base_model_path = "../../local_models/base_models/gguf/llama32.gguf"
lora_path = "../../local_models/adapters/gguf/Llama-3.2-3B-ascii-cats-lora.gguf"

# Build the Llama instance from the base model plus the LoRA adapter.
llm = Llama(
    n_ctx=400,
    verbose=False,
    lora_path=lora_path,
    model_path=base_model_path,
)

In [30]:
def generate_ascii_art(max_tokens: int, generation_config) -> str:
    """Stream one completion from the module-level ``llm``, echo it, and return it.

    The prompt is the empty string, so the output is shaped only by the loaded
    model/adapter and the sampling settings.

    Args:
        max_tokens: Forwarded to ``llm.create_completion`` as ``max_tokens``.
        generation_config: Mapping that must provide the keys ``temperature``,
            ``top_p``, ``min_p``, ``frequency_penalty``, ``presence_penalty``,
            ``repeat_penalty`` and ``top_k``; each is forwarded unchanged.

    Returns:
        The concatenation of every streamed text chunk, i.e. the same text that
        was printed to stdout while streaming.

    Raises:
        KeyError: If ``generation_config`` lacks one of the keys listed above.
    """
    prompt = ""
    pieces = []
    for chunk in llm.create_completion(
        prompt,
        max_tokens=max_tokens,
        stream=True,
        temperature=generation_config["temperature"],
        top_p=generation_config["top_p"],
        min_p=generation_config["min_p"],
        frequency_penalty=generation_config["frequency_penalty"],
        presence_penalty=generation_config["presence_penalty"],
        repeat_penalty=generation_config["repeat_penalty"],
        top_k=generation_config["top_k"],
    ):
        chunk_text = chunk["choices"][0]["text"]
        # Echo immediately so the art appears incrementally in the cell output.
        print(chunk_text, end="", flush=True)
        # Keep the piece as well, so the declared ``str`` return value is honoured.
        pieces.append(chunk_text)
    return "".join(pieces)
        

In [54]:
# Sampling settings handed to generate_ascii_art.
# Background on nucleus (top-p) sampling: https://arxiv.org/pdf/1904.09751
default_generation_config = dict(
    # Higher values give more random output. OpenAI recommends tuning either
    # temperature or top_p, but not both. A value of 1 leaves the distribution unscaled.
    temperature=1,
    # Restrict sampling to the smallest set of most probable tokens whose
    # cumulative probability exceeds top_p. A value of 1 disables the filter.
    top_p=0.95,
    # Minimum probability a token needs in order to be sampled.
    min_p=0.05,
    # Positive values penalize tokens according to how often they already occur in the text so far.
    frequency_penalty=0.0,
    # Positive values penalize tokens that have already appeared in the text so far.
    presence_penalty=0.0,
    # Penalty applied to repeated tokens.
    repeat_penalty=1.0,
    # Only the top_k highest-probability tokens are considered at each step.
    top_k=50,
)

In [None]:
# Number of samples to generate. The loop bound and the progress message share
# this value so they cannot drift apart (the message previously said "of 50"
# while the loop ran 100 times).
num_generations = 100

print("Generation config: ", default_generation_config)

for i in range(num_generations):
    print(f"Generating ascii art {i+1} of {num_generations}\n")
    generate_ascii_art(max_tokens=200, generation_config=default_generation_config)
    # Blank lines separate consecutive pieces in the cell output.
    print("\n\n")