# Text Generation using FT models

Notebooks for text generation using `torchtune` Fine-tuned LLMs

In [1]:
import torch

def clear_cache():
    with torch.no_grad():
        torch.cuda.empty_cache()


## 1. Huggingface Pipelines

`Pipeline` is a high-level API for text generation. It automatically handles model loading, tokenization, batching, and device placement.

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

True

In [3]:

from huggingface_hub import login

HF_TOKEN = os.getenv("HF_TOKEN")
login(HF_TOKEN)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/azureuser/.cache/huggingface/token
Login successful


In [4]:
from transformers import pipeline

2024-09-20 22:00:18.253836: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-20 22:00:18.354688: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /anaconda/envs/azureml_py38/lib/:/anaconda/envs/azureml_py38/lib/:/anaconda/envs/azureml_py38/lib/:/anaconda/envs/azureml_py38/lib/
2024-09-20 22:00:18.354705: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-09-20 22:00:18.914386: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loa

In [5]:
# pipe = pipeline("text-generation", model="microsoft/phi-1", trust_remote_code=True)

ft_model_dir = "../../tmp/finetune-test"

pipe = pipeline(
    "text-generation",
    model=ft_model_dir, 
    device_map="auto", 
    model_kwargs={"load_in_4bit": True}
)

KeyError: 'base_model_name_or_path'

In [None]:
output = pipe("Tell me a haiku about AI.", do_sample=True, top_p=0.95)
print(output[0]["generated_text"])

In [41]:
pipe = None

## 2. Huggingface Automodel
* AutoModalForCausalLM - model loading
* BitsAndBytes - quantization support 4bit / 8bit models
* Accelerate - when using very large models that needs to be loaded on multiple GPUs

In [15]:
!pwd

/mnt/batch/tasks/shared/LS_root/mounts/clusters/vm-a100-80gb/code/LLM-recipes/inference


In [4]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

ft_model_dir = "../../tmp/finetune-test/"
# ft_model_dir = "../../tmp/Meta-Llama-3.1-8B-Instruct/"
tokenizer_path = ft_model_dir

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForCausalLM.from_pretrained(ft_model_dir, device_map="auto", load_in_4bit=True)

KeyError: 'base_model_name_or_path'

In [5]:
from transformers import LlamaForCausalLM, LlamaTokenizer

# model_dir = "../../tmp/finetune-test/"
model_dir = "../../tmp/Meta-Llama-3.1-8B-Instruct/"


# Load the model configuration and weights
model = LlamaForCausalLM.from_pretrained(model_dir, device_map="auto", load_in_4bit=True, ignore_mismatched_sizes=True)

# Load the tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_dir)



The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [31]:
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    generated_ids = model.generate(**inputs, do_sample=True, top_p=0.95, max_new_tokens=200)
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return outputs[0]

In [8]:
# text completion format
res = generate_text("\n\nQuestion:" + "How does the brain work?")
print(res)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [41]:
def generate_chat(chat):
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer.encode(prompt, add_special_tokens=False, return_tensors="pt").to("cuda")
    generated_ids = model.generate(inputs, do_sample=True, top_p=0.95, max_new_tokens=200)
    outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return outputs[0]

In [43]:
# using chat template
query = "If you had a name, what would you like it to be?"

chat = [
    { "role": "user", "content": query },
]

res = generate_chat(chat)
print(res)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
