In [1]:
from getpass import getpass

# Get from you hugging face account
api_token = getpass('Enter your API token: ')

Enter your API token:  ········


In [2]:
from huggingface_hub import login
login(token=api_token)

In [6]:
import os
os.environ['OCL_ICD_VENDORS'] = '/etc/OpenCL/vendors'
os.environ['CCL_ROOT'] = os.environ.get('CONDA_PREFIX', '')
os.environ['USE_XETLA'] = 'OFF'
os.environ['SYCL_CACHE_PERSISTENT'] = '1'
os.environ['SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS'] = '1'

In [7]:
import torch
import time
import argparse

from ipex_llm.transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

from IPython.display import Markdown

In [8]:
def get_prompt(user_input: str, chat_history: list[tuple[str, str]], system_prompt: str) -> str:
    """
    Generate a formatted prompt for a LLaMA 3.1 chatbot conversation.

    This function takes the user's input, chat history, and system prompt,
    and combines them into a single formatted string for use in a LLaMA 3.1 chatbot system.

    Parameters:
    user_input (str): The current input from the user.
    chat_history (list[tuple[str, str]]): A list of tuples containing previous 
                                          (user_input, assistant_response) pairs.
    system_prompt (str): Initial instructions or context for the LLaMA 3.1 chatbot.

    Returns:
    str: A formatted string containing the entire conversation history and current input.
    """
    
    # Start the prompt with a special token
    prompt_texts = ['<|begin_of_text|>']

    # Add system prompt if it's not empty
    if system_prompt != '':
        prompt_texts.append(f'<|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>')

    # Add each pair of user input and assistant response from the chat history
    for history_input, history_response in chat_history:
        prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{history_input.strip()}<|eot_id|>')
        prompt_texts.append(f'<|start_header_id|>assistant<|end_header_id|>\n\n{history_response.strip()}<|eot_id|>')

    # Add the current user input and prepare for assistant's response
    prompt_texts.append(f'<|start_header_id|>user<|end_header_id|>\n\n{user_input.strip()}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n')
    
    # Join all parts of the prompt into a single string
    return ''.join(prompt_texts)

In [9]:
model_path = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load model in 4 bit, which converts the relevant layers in the model into INT4 format
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             load_in_4bit=True,
                                             optimize_model=True,
                                             trust_remote_code=True,
                                             use_cache=True)
model = model.half().to('xpu')

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2024-12-31 22:43:06,608 - INFO - Converting the current model to sym_int4 format......


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [10]:
DEFAULT_SYSTEM_PROMPT = """\
"""

prompt_str = "Provide a clear, concise, and intuitive description of AI for beginners."
max_new_tokens = 512

In [11]:
# Disable gradient computation for inference
with torch.inference_mode():
    # Generate the input prompt using a custom function
    prompt = get_prompt(prompt_str, [], system_prompt=DEFAULT_SYSTEM_PROMPT)
    
    # Encode the prompt into token IDs and move to the XPU (Intel's GPU)
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to('xpu')
    
    # Perform a warmup run to optimize the model
    # This run's output is discarded
    output = model.generate(input_ids, max_new_tokens=max_new_tokens)

    # Start the actual inference
    st = time.time()  # Record the start time
    
    # Generate the output using the language model
    output = model.generate(input_ids, max_new_tokens=max_new_tokens)
    
    # Ensure all XPU operations are completed before recording end time
    torch.xpu.synchronize()
    
    end = time.time()  # Record the end time
    
    # Move the output back to CPU for further processing
    output = output.cpu()
    
    # Decode the output tokens into a human-readable string
    # skip_special_tokens=False means we keep all special tokens in the output
    output_str = tokenizer.decode(output[0], skip_special_tokens=False)
    
    # Print the inference time
    print(f'Inference time: {end-st:.2f} seconds')
    
    # Print the original prompt
    print('-'*20, 'Prompt', '-'*20)
    print(prompt)

# Print the model's response
print('-'*20, 'Response (skip_special_tokens=False)', '-'*20)

# Extract the actual response from the output string
# This assumes the response is between the last '<|end_header_id|>\n\n' and '<|eot_id|>'
response = output_str.split('<|end_header_id|>\n\n')[-1].split('<|eot_id|>')[0]

# Display the response using Markdown formatting
Markdown(response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Inference time: 31.91 seconds
-------------------- Prompt --------------------
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Provide a clear, concise, and intuitive description of AI for beginners.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


-------------------- Response (skip_special_tokens=False) --------------------


Here's a simple and intuitive explanation of AI for beginners:

**What is AI?**

Artificial Intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence. These tasks include:

* Learning from data
* Recognizing patterns
* Making decisions
* Solving problems
* Understanding language

**How does AI work?**

AI uses algorithms, which are like recipes for computers, to analyze data and make decisions. These algorithms are based on machine learning, a type of AI that allows computers to improve their performance on a task over time by learning from experience.

**Types of AI:**

1. **Narrow or Weak AI**: This type of AI is designed to perform a specific task, like a calculator or a weather app. It's like a tool that's great at one thing, but not much else.
2. **General or Strong AI**: This type of AI is like a super-smart human who can understand and learn from a wide range of tasks. It's still in the early stages of development, but it's the goal of many researchers.
3. **Superintelligence**: This type of AI is like a super-powerful, all-knowing being that's far beyond human intelligence. This is still purely theoretical, and many experts debate whether it's even possible.

**Examples of AI in everyday life:**

* Virtual assistants like Siri, Alexa, or Google Assistant
* Image recognition apps like Google Photos
* Self-driving cars
* Recommendation systems like Netflix or Amazon
* Chatbots that help with customer service

**Key benefits of AI:**

* **Efficiency**: AI can automate repetitive tasks, freeing up time for humans to focus on more creative and strategic work.
* **Accuracy**: AI can analyze vast amounts of data and make decisions with high accuracy, reducing errors.
* **Personalization**: AI can tailor experiences to individual preferences and needs.

**Key challenges and concerns:**

* **Bias**: AI can perpetuate biases if trained on biased data.
* **Job displacement**: AI might replace certain jobs, potentially leading to unemployment.
* **Security**: AI can be vulnerable to cyber attacks and data breaches.

That's a basic overview of AI for beginners! I hope this helps you understand the concept and its implications.