In [4]:
from getpass import getpass

# Hugging Face access token, available from your Hugging Face account.
# getpass reads it without echoing the characters, so the secret is not
# left behind in the notebook output.
api_token = getpass(prompt='Enter your API token: ')

Enter your API token:  ········


In [5]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using the token collected via getpass above.
login(token=api_token)

In [6]:
import os
# Runtime environment variables, applied in one shot. They are set here,
# ahead of the cell that imports torch / ipex_llm.
os.environ.update({
    'OCL_ICD_VENDORS': '/etc/OpenCL/vendors',
    # Mirrors the active conda prefix; falls back to an empty string when
    # CONDA_PREFIX is not defined.
    'CCL_ROOT': os.environ.get('CONDA_PREFIX', ''),
    'USE_XETLA': 'OFF',
    'SYCL_CACHE_PERSISTENT': '1',
    'SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS': '1',
})

In [7]:
import torch
import time
import argparse

from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

from IPython.display import Markdown

In [8]:
def get_prompt(user_input: str, chat_history: list[tuple[str, str]], system_prompt: str) -> str:
    """
    Generate a formatted prompt for a LLaMA 3.1 chatbot conversation.

    The result starts with '<|begin_of_text|>', followed by an optional system
    turn, one user/assistant turn pair per history entry, the current user
    turn, and finally an open assistant header so the model continues with
    its reply.

    Every message body (system, user and assistant) is stripped of leading and
    trailing whitespace before it is inserted, so a multi-line system prompt
    does not leave a stray newline in front of '<|eot_id|>'.

    Parameters:
    user_input (str): The current input from the user.
    chat_history (list[tuple[str, str]]): Previous (user_input, assistant_response)
                                          pairs, oldest first. None is treated
                                          as an empty history.
    system_prompt (str): Initial instructions or context for the chatbot. An
                         empty, whitespace-only or None value omits the system
                         turn entirely.

    Returns:
    str: A formatted string containing the entire conversation history and current input.
    """

    def format_turn(role: str, content: str) -> str:
        # One complete chat turn: role header, blank line, trimmed body, end-of-turn token.
        return f'<|start_header_id|>{role}<|end_header_id|>\n\n{content.strip()}<|eot_id|>'

    # Start the prompt with the begin-of-text special token
    prompt_texts = ['<|begin_of_text|>']

    # Add the system turn only when there is real content; `or ''` guards against None
    system_text = (system_prompt or '').strip()
    if system_text:
        prompt_texts.append(format_turn('system', system_text))

    # Add each pair of user input and assistant response from the chat history
    for history_input, history_response in chat_history or []:
        prompt_texts.append(format_turn('user', history_input))
        prompt_texts.append(format_turn('assistant', history_response))

    # Add the current user input, then leave an assistant header open for the model's reply
    prompt_texts.append(format_turn('user', user_input))
    prompt_texts.append('<|start_header_id|>assistant<|end_header_id|>\n\n')

    # Join all parts of the prompt into a single string
    return ''.join(prompt_texts)

In [None]:
# Hugging Face repository of the checkpoint to run
model_path = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the model in 4 bit (the relevant layers are converted to INT4),
# then cast it with .half() and move it onto the 'xpu' device.
model = (
    AutoModelForCausalLM
    .from_pretrained(
        model_path,
        load_in_4bit=True,
        optimize_model=True,
        trust_remote_code=True,
        use_cache=True,
    )
    .half()
    .to('xpu')
)

# Load the tokenizer that belongs to the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

In [None]:
# System prompt handed to get_prompt(); it is empty by default. Replace it with
# a (possibly multi-line) string to give the model standing instructions.
DEFAULT_SYSTEM_PROMPT = ""

# The user message sent to the model
prompt_str = "Provide a clear, concise, and intuitive description of AI for beginners."

# Upper bound on the number of tokens generated per call
max_new_tokens = 512

In [None]:
# Run one warmup generation and one timed generation on the XPU, then show
# the prompt, the elapsed time and the model's reply.

# Disable gradient computation for inference
with torch.inference_mode():
    # Generate the input prompt using a custom function
    prompt = get_prompt(prompt_str, [], system_prompt=DEFAULT_SYSTEM_PROMPT)

    # Encode the prompt into token IDs and move to the XPU (Intel's GPU).
    # get_prompt() already writes '<|begin_of_text|>' into the text, so the
    # tokenizer must not add its own special tokens on top of it; otherwise the
    # sequence would start with a duplicated begin-of-text token.
    input_ids = tokenizer.encode(
        prompt,
        return_tensors="pt",
        add_special_tokens=False,
    ).to('xpu')

    # Perform a warmup run to optimize the model
    # This run's output is discarded
    output = model.generate(input_ids, max_new_tokens=max_new_tokens)

    # Make sure the warmup work has fully finished on the device so that none
    # of it is attributed to the timed run below
    torch.xpu.synchronize()

    # Start the actual inference; perf_counter() is a monotonic clock intended
    # for measuring elapsed time, unlike the wall-clock time.time()
    st = time.perf_counter()  # Record the start time

    # Generate the output using the language model
    output = model.generate(input_ids, max_new_tokens=max_new_tokens)

    # Ensure all XPU operations are completed before recording end time
    torch.xpu.synchronize()

    end = time.perf_counter()  # Record the end time

    # Move the output back to CPU for further processing
    output = output.cpu()

    # Decode the output tokens into a human-readable string
    # skip_special_tokens=False means we keep all special tokens in the output
    output_str = tokenizer.decode(output[0], skip_special_tokens=False)

    # Print the inference time
    print(f'Inference time: {end-st:.2f} seconds')

    # Print the original prompt
    print('-'*20, 'Prompt', '-'*20)
    print(prompt)

# Print the model's response
print('-'*20, 'Response (skip_special_tokens=False)', '-'*20)

# Extract the actual response from the output string
# This assumes the response is between the last '<|end_header_id|>\n\n' and '<|eot_id|>'
response = output_str.split('<|end_header_id|>\n\n')[-1].split('<|eot_id|>')[0]

# Display the response using Markdown formatting (last expression of the cell)
Markdown(response)