<a href="https://colab.research.google.com/github/twhool02/ptm-quantization/blob/main/Llama_2_7b_hf_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run inference on Llama2 7B model

This notebook runs inference on the Llama 2 7B model.

The purpose of this notebook is to run inference prompts against the non-quantized model so that its responses can later be compared with those of a quantized version of the model.

## Setup

### Log into HuggingFace Hub

This code assumes that the user has a Hugging Face token set up as a notebook secret called `HF_TOKEN`.

In [None]:
# Required when interacting with HuggingFace Hub
!pip install -q --upgrade huggingface_hub

import huggingface_hub

print(f"Hugging Face Version is: {huggingface_hub.__version__}")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/388.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m378.9/388.9 kB[0m [31m11.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hHugging Face Version is: 0.22.2


In [None]:
from google.colab import userdata

# Read the Hugging Face access token from the Colab secret named HF_TOKEN
# (the stored token has write permissions to Hugging Face).
hftoken = userdata.get("HF_TOKEN")

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the HF_TOKEN secret and also
# store the token in the configured git credential helper.
login(token=hftoken, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Install Required Libraries

In [None]:
# The Transformers library provides APIs and tools to download and train pretrained model.
!pip install -q -U transformers

# Accelerate enables the same PyTorch code to be run across any distributed configuration
!pip install -q -U accelerate -q

# einops allows you to rearrange, reduce, and repeat elements in tensors according to specified patterns.
!pip install -q -U einops

# an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems
!pip install -q -U sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m60.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

### Check library versions

In [None]:
# Import the libraries whose versions are reported below.
import transformers
import accelerate
import tensorflow as tf

# Print one "version of <library>: <version>" line per library.
for library_label, library_module in (
    ("transformers", transformers),
    ("accelerate", accelerate),
    ("tensorflow", tf),
):
    print(f"version of {library_label}: {library_module.__version__}")

version of transformers: 4.39.3
version of accelerate: 0.28.0
version of tensorflow: 2.15.0


In [None]:
# for interacting with the operating system.
import os

# torch is the main package of PyTorch.
import torch

# base class of all warning category classes
import warnings

from transformers import (
    AutoModelForCausalLM, # Generic model class with a causal language modeling head
    AutoTokenizer, # Automatically selects correct tokenizer for a model.
    HfArgumentParser, # Used for parsing command-line arguments.
    TrainingArguments, # Defines the arguments used during training.
    pipeline, # Creates a pipeline to interact with a model.
    logging, # Logs events during training and evaluation.
    AutoModelForQuestionAnswering # Used to get a model to perform context-based question answering etc…
)

### Define the processor to use

Ensure the model will use a GPU if available

In [None]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

## Load Model and Tokenizer

### Define model name

In [None]:
# Hugging Face Hub repository id of the model used throughout this notebook
model_name = "meta-llama/Llama-2-7b-hf"

### Load the Model

In [None]:
# Load the model.
# Without an explicit `torch_dtype`, `from_pretrained` materialises the weights in
# float32 even though this checkpoint is stored in float16, which doubles the
# memory needed to hold the model. Request float16 on GPU; keep float32 on CPU,
# where half-precision kernels are not always available.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device.startswith("cuda") else torch.float32,
    device_map=device,  # place the whole model on the selected device
)

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

### Get Model details

In [None]:
# Gather the headline numbers first, then report them.
footprint_gb = model.get_memory_footprint() / 1e9  # bytes -> GB (decimal)
parameter_count = model.num_parameters()

print(f"Model size: {footprint_gb:.1f} GB")
print(f"Model params: {parameter_count:,}")

# Full configuration and module tree of the loaded model.
print(f"Model Config: \n{model.config}")
print(f"View model structure: \n{model}")

Model size: 27.1 GB
Model params: 6,738,415,616
Model Config: 
LlamaConfig {
  "_name_or_path": "meta-llama/Llama-2-7b-hf",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32000
}

View model structure: 
LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(i

### Load Tokenizer

In [None]:
# Load the Llama tokenizer that matches `model_name`.
# `trust_remote_code=True` is deliberately not passed: it allows code shipped in
# the model repository to be executed and is only needed for custom tokenizer
# classes that are not part of the transformers library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # reuse the EOS token as the padding token
tokenizer.padding_side = "right"  # pad on the right-hand side of each sequence

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

## Inference

This section collects responses from the non-quantized model so that they can be compared with the responses of the quantized model.

### Using transformers pipeline

In [None]:
# Import the factory under an alias: binding its result to the name `pipeline`
# would otherwise overwrite the imported factory function, and re-running this
# cell would then call the pipeline object instead of the factory.
from transformers import pipeline as build_pipeline

# Text-generation pipeline around the already-loaded model and tokenizer.
# The variable keeps the name `pipeline` because the following cells call it.
pipeline = build_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map=device,
)

# Sample one continuation, choosing each token from the 10 most likely candidates.
sequences = pipeline(
    "Write a poem about Ireland",
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
    truncation=True
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: Write a poem about Ireland.
Write a poem about the ocean.
Write a poem about a friend.
Write a poem about your favorite book.
Write a poem about your favorite food.
Write a poem about your favorite movie.
Write a poem about your favorite song.
Write a poem about your favorite place.
Write a poem about your favorite animal.
Write a poem about your favorite color.
Write a poem about your favorite season.
Write a poem about your favorite holiday.
Write a poem about your favorite memory.
Write a poem about your favorite person.
Write a poem about your favorite thing.
Write a poem about your favorite sport.
Write a poem about your favorite hobby.
Write a poem about your favorite pet.
Write a poem about your favorite place to visit.
Write a poem about your favorite vacation spot.
Write a poem about your favorite place to live.
Write a poem about your favorite


In [None]:
# Sampling settings: pick each token from the 10 most likely candidates and
# return a single sequence.
generation_kwargs = dict(
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    truncation=True,
    max_length=200,
)

sequences = pipeline("What is a large language model?", **generation_kwargs)

# Print every returned sequence.
for generated in sequences:
    print(f"Result: {generated['generated_text']}")

Result: What is a large language model?
What is a large language model?
A large language model (LLM) is a machine learning model that can generate human-like text. LLMs are trained on large datasets of text, which allows them to learn the patterns and relationships between words and phrases.
LLMs can be used for a variety of tasks, including natural language processing (NLP), text generation, and question answering. In NLP, LLMs can be used to identify the meaning of words and phrases, as well as to understand the relationships between them.
In text generation, LLMs can be used to generate new text based on a given prompt. This can be used for tasks such as summarizing articles, generating headlines, or writing emails.
In question answering, LLMs can be used to answer questions about a given text. This can be used for tasks such as providing explanations of text, or providing answers to questions


In [None]:
# Prompt and sampling settings for this run (top-10 sampling, one sequence).
donegal_prompt = "Tell me what you know about Co. Donegal in Ireland"
sampling_options = {
    "do_sample": True,
    "top_k": 10,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
    "truncation": True,
    "max_length": 200,
}

sequences = pipeline(donegal_prompt, **sampling_options)

# Print every returned sequence.
for item in sequences:
    print(f"Result: {item['generated_text']}")

Result: Tell me what you know about Co. Donegal in Ireland.
I'm from Co. Donegal in Ireland. I have been to the US many times and I have family in Chicago and New York.
What is your favorite place in Ireland?
I like the west coast of Ireland. I like to go to Galway, Sligo, Donegal, Mayo and Dublin.
What is your favorite place in America?
My favorite place in America is Chicago. I like the city and the people.
What is the most important thing you learned in America?
I learned a lot of things in America. I learned about the history of the USA. I learned about the culture and the people. I also learned a lot about myself.
What is the most important thing you learned in Ireland?
I learned a lot about myself. I learned about my culture and my family. I also learned about the history of Ireland.
What is your


### Using the system prompt

In [None]:
# An earlier cell rebinds the name `pipeline` to a text-generation pipeline
# object. Clear down that existing pipeline so a new instance can be created.
del pipeline

# Re-import the factory function so `pipeline(...)` builds pipelines again.
from transformers import pipeline

In [None]:
# Only show CRITICAL log messages from transformers (hides its warnings).
logging.set_verbosity(logging.CRITICAL)

# Run the text-generation pipeline with a system prompt.
system_prompt = """
You are a helpful, respectful and honest assistant.
Always answer as helpfully as possible, with short concise answers.
"""
prompt = "Can you name the all previous presidents of Ireland?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=2048)

# Llama 2 chat layout: <s>[INST] <<SYS>>\n{system}\n<</SYS>>\n\n{user} [/INST]
# The prompt must stop at [/INST]. `</s>` is the end-of-sequence token, so
# appending it marks the text as already finished and the completion that
# follows is unrelated to the question.
# NOTE(review): this is the template of the Llama 2 *chat* models, while
# `model_name` points at a non-chat checkpoint - confirm that is intended.
result = pipe(f"<s>[INST] <<SYS>>{system_prompt}<</SYS>>\n\n{prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST]<<SYS>>
You are a helpful, respectful and honest assistant.
Always answer as helpfully as possible, with short concise answers.
<</SYS>>Can you name the all previous presidents of Ireland? [/INST]</s> 5253757502_9b793e9274_o.jpg
Published December 19, 2018 at 500 × 333 in 5253757502_9b793e9274_o.jpg.


### Chat with the model

In [None]:
# Conversation history as (role, content) turns, expanded into the message
# dictionaries expected by the chat template.
messages = [
    {"role": role, "content": content}
    for role, content in (
        ("user", "What is your favourite TV show?"),
        ("assistant", "I liked Breaking Bad and Band of Brothers"),
        ("user", "Do you have any recommendations of other shows I might like?"),
    )
]

# Render the conversation with the tokenizer's chat template into token ids,
# then move the tensor to the selected device.
model_inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
model_inputs = model_inputs.to(device)

# Sample up to 1000 new tokens and print the decoded first sequence.
generated_ids = model.generate(
    model_inputs,
    max_new_tokens=1000,
    do_sample=True,
)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

<s> [INST] What is your favourite TV show? [/INST] I liked Breaking Bad and Band of Brothers </s><s> [INST] Do you have any recommendations of other shows I might like? [/INST]
I've been trying to get into some other shows, but I can't seem to find anything I like.
I'm looking for something with a good plot, good characters, and good acting. I'm not too picky on the genre, but I'm not a fan of horror or anything too bloody.
I've been watching The Flash, but I'm not too happy with it. I've watched the first 2 episodes, but I don't think I'll be continuing. I'm not sure if I'll be watching Arrow either.
I've also been watching Daredevil, but I'm not sure if I'll be watching the second season.
I've also been watching Sense8, but I'm not sure if I'll be watching the second season.
I'm currently watching Agents of S.H.I.E.L.D.
I'm not sure if I'll be watching the second season.
I'm also watching Gotham, but I'm not sure if I'll be watching the second season.
I'm also watching The 100, but I