In [1]:
# Huggingface
# https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct

# Llama3.1 prompt template
# https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/

# How to create a text embedding using llama3.1
# https://www.kaggle.com/code/tongtongl/how-to-use-llama3-1-to-achieve-text-embedding/notebook

# Reveal Llama3.1 superpowers: Prompt it like a Pro
# https://www.kaggle.com/code/gpreda/reveal-llama3-1-superpowers-prompt-it-like-a-pro/notebook

In [2]:
# Notes

# 1. device_map="auto" does not always move the model to the GPU.
# Use device_map="cuda:0" to ensure that the model is on the GPU.

# 2. When chatting, as the message history increases, the amount of GPU 
# space that gets used also increases. Therefore, we need to choose a
# GPU that not only has enough vRAM for the model, but also enough space
# for the message history.

# 3. This notebook uses: GPU T4 x2

In [3]:
import os

import transformers
import torch

import time

from IPython.display import Markdown, display

# This notebook requires transformers >= 4.43.0.
# If the version printed below is older, upgrade with:
# %pip install --upgrade "transformers>=4.43.0"
print(transformers.__version__)

4.44.0


In [4]:
from kaggle_secrets import UserSecretsClient
# Read the Hugging Face access token from the Kaggle secret stored under the
# label "HF_ACCESS_TOKEN", so the token itself never appears in the notebook.
user_secrets = UserSecretsClient()
# Passed as `token=` wherever a tokenizer or model is loaded further down.
access_token = user_secrets.get_secret("HF_ACCESS_TOKEN")

## Helper functions

In [5]:
import textwrap

def wrap_text(text, width=90):
    """Wrap text to a maximum line width while keeping its existing newlines.

    The input is split on newline characters and every resulting line is
    wrapped on its own with ``textwrap.fill``, so line breaks that were
    already present (including blank lines) survive the wrapping.

    Args:
        text: The string to wrap.
        width: Maximum length of each wrapped line. Defaults to 90.

    Returns:
        The wrapped lines joined back together with newline characters.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )


# Example: wrap a short Markdown string and render it as rich output.
response = "## Hello"
wrapped_text = wrap_text(response)
display(Markdown(wrapped_text))

## Hello

In [6]:
def timer(start_time):
    """Return the number of seconds elapsed since ``start_time``.

    Args:
        start_time: A timestamp previously obtained from ``time.time()``.

    Returns:
        The elapsed time in seconds, rounded to one decimal place.
    """
    return round(time.time() - start_time, 1)

"""
# Start timing
start_time = time.time()

# Some code

# Get the inference time
elapsed_time = timer(start_time)
print(f"Time taken: {elapsed_time} seconds")
"""

'\n# Start timing\nstart_time = time.time()\n\n# Some code\n\n# Get the inference time\nelapsed_time = timer(start_time)\nprint(f"Time taken: {elapsed_time} seconds")\n'

## Set up Llama3.1 inference

In [7]:
# Method 1: 
# Here we explicitly define the model and tokenizer
# This is my preferred option.
# Note that the model is on the GPU.

import accelerate
import transformers
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# `token` is the Hugging Face access token used to download the model files.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=access_token,
)

# Llama is implemented inside transformers itself, so `trust_remote_code` is
# not needed; leaving it off avoids executing code shipped in a model repo.
# device_map="auto" lets accelerate decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=access_token,
)

# The model object is already instantiated, so its dtype and device placement
# were fixed by from_pretrained above. `torch_dtype` / `device_map` are
# therefore not repeated here: they only take effect when the pipeline loads
# the model itself from a model id.
# NOTE: this assignment shadows the `pipeline` function imported above. The
# name is kept because the rest of the notebook calls `pipeline(...)`.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Confirm where the model ended up by inspecting the device of its first
# parameter.
device = next(pipeline.model.parameters()).device
print(f"Model is loaded on device: {device}", end="\n\n")

messages = [
    {
        "role": "system",
        "content": "You are a pirate chatbot who always responds in pirate speak!",
    },
    {
        "role": "user",
        "content": "Who are you?",
    },
]

# 128001 is the id of '<|end_of_text|>'; it is supplied as the padding token.
outputs = pipeline(messages, max_new_tokens=256, pad_token_id=128001)

# Show the raw pipeline output first, then only the last message of the
# returned conversation (the assistant's reply).
print(outputs, end="\n\n")
print(outputs[0]["generated_text"][-1])


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Model is loaded on device: cuda:0

[{'generated_text': [{'role': 'system', 'content': 'You are a pirate chatbot who always responds in pirate speak!'}, {'role': 'user', 'content': 'Who are you?'}, {'role': 'assistant', 'content': "Arrrr, ye landlubber! Yer lookin' fer a swashbucklin' introduction, eh? Alright then, listen close and I'll tell ye about meself. Me name be Captain Chat, the scurvy dog of the seven seas... er, the digital realm! Me and me trusty keyboard be sailin' the waters o' knowledge, plunderin' the riches o' information and servin' 'em up to ye in a treasure chest o' wisdom! So hoist the colors, me hearty, and let's set sail fer a swashbucklin' adventure o' learnin' and discovery!"}]}]

{'role': 'assistant', 'content': "Arrrr, ye landlubber! Yer lookin' fer a swashbucklin' introduction, eh? Alright then, listen close and I'll tell ye about meself. Me name be Captain Chat, the scurvy dog of the seven seas... er, the digital realm! Me and me trusty keyboard be sailin' t

In [8]:
# Method 2: 
# Here we don't define the model and tokenizer separately.
# Sometimes the model is not on the GPU even though device_map="auto"

"""
import transformers
import torch

model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto", # Setting "auto" does not seem to work
    token=access_token,
)

# Make sure that the model in on the GPU
# Check the device of the model
device = next(pipeline.model.parameters()).device
print(f"Model is loaded on device: {device}")
print()

messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# padding: '<|end_of_text|>' (128001)
outputs = pipeline(
    messages,
    max_new_tokens=256,
    pad_token_id=128001
)
print(outputs[0]["generated_text"][-1])

"""

'\nimport transformers\nimport torch\n\nmodel_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"\n\npipeline = transformers.pipeline(\n    "text-generation",\n    model=model_id,\n    model_kwargs={"torch_dtype": torch.bfloat16},\n    device_map="auto", # Setting "auto" does not seem to work\n    token=access_token,\n)\n\n# Make sure that the model in on the GPU\n# Check the device of the model\ndevice = next(pipeline.model.parameters()).device\nprint(f"Model is loaded on device: {device}")\nprint()\n\nmessages = [\n    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},\n    {"role": "user", "content": "Who are you?"},\n]\n\n# padding: \'<|end_of_text|>\' (128001)\noutputs = pipeline(\n    messages,\n    max_new_tokens=256,\n    pad_token_id=128001\n)\nprint(outputs[0]["generated_text"][-1])\n\n'

In [9]:
# Check how much GPU memory is being used

!nvidia-smi

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Thu Aug 29 10:27:13 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   52C    P0             33W /   70W |    7077MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                      

In [10]:
# Explore the padding token

from transformers import AutoTokenizer, AutoModel

# Message shown during generation when `pad_token_id` is not passed:
# Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.

# <|end_of_text|>
# Model will cease to generate more tokens.
# This token is generated only by the base models.

# The lookup below uses the `tokenizer` that is already loaded. No model is
# loaded in this cell, so `model_id` is deliberately left untouched here.

# Token ID that you want to convert
token_id = 128001

# Convert token ID to token
token = tokenizer.convert_ids_to_tokens(token_id)

token

'<|end_of_text|>'

In [11]:
# Look up the tokenizer's padding token and its id, then print one per line.
pad_token = tokenizer.pad_token
pad_token_id = tokenizer.pad_token_id

print(pad_token, pad_token_id, sep="\n")

None
None


In [12]:
# End-of-sequence token of the pipeline's tokenizer and its id, followed by
# the id the tokenizer assigns to the literal "<|eot_id|>" token.
print(pipeline.tokenizer.eos_token, pipeline.tokenizer.eos_token_id, sep="\n")
print(pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"))

<|eot_id|>
128009
128009


## Create an inference function

In [13]:
def run_llm(system_message, user_message, max_new_tokens=512,
            temperature=0.1, pad_token_id=128001):
    """Send a single-turn prompt to the text-generation pipeline and print the reply.

    Args:
        system_message: Content of the "system" message.
        user_message: Content of the "user" message.
        max_new_tokens: Value forwarded to the pipeline as ``max_new_tokens``.
            Defaults to 512.
        temperature: Value forwarded to the pipeline as ``temperature``.
            Defaults to 0.1.
        pad_token_id: Padding token id forwarded to the pipeline. The default,
            128001, is the id this notebook uses for '<|end_of_text|>'.

    Returns:
        None. The assistant's reply is printed.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    outputs = pipeline(
        messages,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        pad_token_id=pad_token_id,
    )

    # "generated_text" holds the whole conversation; the last entry is the
    # newly generated message.
    response = outputs[0]["generated_text"][-1]['content']

    print(response)

    
# Example call. `system_message` and `user_message` stay defined at notebook
# level after this cell runs, and later cells read or reassign them.
system_message = "You are a helpful assistant named Agatha."
user_message = "Hello"

run_llm(system_message, user_message)

Hello. How can I assist you today?


In [14]:
system_message = "You are a helpful assistant named Agatha."
user_message = "Hello. Are you going to take over the world?"

# Time a single call to run_llm and report how long it took.
start_time = time.time()
run_llm(system_message, user_message)
elapsed_time = timer(start_time)

print(f"Time taken: {elapsed_time} seconds")

No, I'm not planning on taking over the world. I'm Agatha, a helpful assistant designed to provide information and assist with tasks. My purpose is to assist and help users like you, not to pursue any nefarious goals. I'm here to answer your questions, provide information, and help with anything you need. How can I assist you today?
Time taken: 5.3 seconds


## JSON output

In [15]:
system_message = "You are a helpful assistant named Agatha."
user_message = "What is the square root of 25? Format your response as json with the key: answer"

# Time a single call to run_llm and report how long it took.
start_time = time.time()
run_llm(system_message, user_message)
elapsed_time = timer(start_time)

print(f"Time taken: {elapsed_time} seconds")

{
  "answer": 5
}
Time taken: 1.0 seconds


## Set up chat interface

In [16]:
def run_chat_agent(message_history, max_new_tokens=512, temperature=0.1,
                   pad_token_id=128001):
    """Generate the next reply for a chat history, print it and return it.

    The history is passed to the pipeline as-is. This function does not
    append the reply to ``message_history``; the caller is expected to do
    that if the conversation should continue.

    Args:
        message_history: List of ``{"role": ..., "content": ...}`` message
            dicts making up the conversation so far.
        max_new_tokens: Value forwarded to the pipeline as ``max_new_tokens``.
            Defaults to 512.
        temperature: Value forwarded to the pipeline as ``temperature``.
            Defaults to 0.1.
        pad_token_id: Padding token id forwarded to the pipeline. The default,
            128001, is the id this notebook uses for '<|end_of_text|>'.

    Returns:
        The content of the last message in the pipeline's returned
        conversation (the newly generated reply).
    """
    outputs = pipeline(
        message_history,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        pad_token_id=pad_token_id,
    )

    # "generated_text" holds the whole conversation; the last entry is the
    # newly generated message.
    response = outputs[0]["generated_text"][-1]['content']

    print(response)

    return response
    

# Define the system prompt before it is used to seed the history, so this
# cell does not depend on a value left over from an earlier cell.
system_message = "You are a helpful assistant named Agatha."

message_history = [
    {"role": "system", "content": system_message},
]

user_message = "Hello"
user_message_dict = {"role": "user", "content": user_message}

message_history.append(user_message_dict)

llm_response = run_chat_agent(message_history)

Hello. How can I assist you today?


In [17]:
# Make sure that the model is on the GPU.
# Check the device of the model by looking at its first parameter.
# If there is more than one GPU this will only show one of them.

device = next(pipeline.model.parameters()).device
print(f"Model is loaded on device: {device}")

Model is loaded on device: cuda:0


In [18]:
# Comment out this code to prevent an error 
# when committing the notebook. 


# Prompting the user for input
#user_input = input("Please enter something: ")

"""

system_message = "You are a quirky assistant named Agatha."
message_history = [{"role": "system", "content": system_message}]

while True:

    print()
    print("==========")
    user_input = input("Enter something ('q' to quit): ")
    print("==========")

    if user_input.lower() == 'q':
        print("Exiting the loop. Goodbye!")
        break  # Exit the loop

    # Update message history
    message = {"role": "user", "content": user_input}
    message_history.append(message)
    
    # Start timing
    start_time = time.time()

    # Prompt the chat_agent
    llm_response = run_chat_agent(message_history)
    
    # Get the inference time
    elapsed_time = timer(start_time)
    print(f"Time taken: {elapsed_time} seconds")

    # Update message history
    message = {"role": "assistant", "content": llm_response}
    message_history.append(message)

"""



## How to download Llama-3.1-8B-Instruct

The next cell fails on a memory error.

In [19]:
"""
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define the model name
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Download the tokenizer and model, and save them locally
tokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_name, token=access_token)

# Optionally, save the model and tokenizer to a specific directory
model.save_pretrained("./Meta-Llama-3.1-8B-Instruct")
tokenizer.save_pretrained("./Meta-Llama-3.1-8B-Instruct")

"""

'\nfrom transformers import AutoModelForCausalLM, AutoTokenizer\n\n# Define the model name\nmodel_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"\n\n# Download the tokenizer and model, and save them locally\ntokenizer = AutoTokenizer.from_pretrained(model_name, token=access_token)\nmodel = AutoModelForCausalLM.from_pretrained(model_name, token=access_token)\n\n# Optionally, save the model and tokenizer to a specific directory\nmodel.save_pretrained("./Meta-Llama-3.1-8B-Instruct")\ntokenizer.save_pretrained("./Meta-Llama-3.1-8B-Instruct")\n\n'

## How to create a text embedding using Llama-3.1-8b

In [20]:
from transformers import AutoTokenizer, AutoModel

# Note: This is not the instruct version
model_id = "meta-llama/Meta-Llama-3.1-8B"

# `model` and `tokenizer` are rebound here: from this point on they refer to
# the base model loaded below, not to the instruct model loaded earlier.
model = AutoModel.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token=access_token,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)


config.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [21]:
# input text
text = "Hello. How are you?"

inputs = tokenizer(text, return_tensors="pt")

# text embedding
# Only the hidden states of this single forward pass are needed, so the
# key/value cache is switched off. This skips building a cache that is never
# read and should also avoid the `past_key_values` deprecation warning that
# transformers emits on the cached code path.
with torch.no_grad():
    embedding = model(**inputs, use_cache=False).last_hidden_state

print(embedding)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


tensor([[[ 1.6484,  2.4219,  0.7773,  ..., -1.6953,  1.7734,  2.8594],
         [-0.0422, -2.3281, -1.0156,  ...,  0.4902, -3.7812, -1.0391],
         [-0.4824, -3.0000, -0.5859,  ..., -0.7930, -2.2031,  0.0713],
         ...,
         [-3.5469, -2.0781,  1.7188,  ...,  1.2656, -2.8125, -1.3281],
         [-4.5625, -2.5000,  0.3711,  ...,  1.9141, -5.2500, -1.0312],
         [-0.4648, -2.7656,  0.5859,  ...,  1.4688, -1.0391,  1.5703]]],
       dtype=torch.bfloat16)


In [22]:
# Get the shape of the last_hidden_state tensor computed above

embedding.size()

torch.Size([1, 7, 4096])

In [23]:
# Convert to a vector: average over dim 1 (presumably the token axis), then
# drop the remaining size-1 dimensions.
embedding = torch.squeeze(torch.mean(embedding, dim=1))

embedding

tensor([-1.3828e+00, -1.3984e+00,  5.6250e-01,  ...,  9.9609e-01,
        -1.8984e+00, -9.7656e-04], dtype=torch.bfloat16)

In [24]:
# Shape after averaging over dim 1 and squeezing
embedding.size()

torch.Size([4096])

In [25]:
# Convert to numpy

# Cast the bfloat16 tensor to float32 first, then move it to the CPU and
# convert it to a NumPy array.
float_tensor = embedding.float()
embedding = float_tensor.cpu().numpy()

type(embedding)

numpy.ndarray

In [26]:
# Shape of the NumPy array
embedding.shape

(4096,)

In [27]:
# Show the final embedding array
embedding

array([-1.3828125e+00, -1.3984375e+00,  5.6250000e-01, ...,
        9.9609375e-01, -1.8984375e+00, -9.7656250e-04], dtype=float32)