<a href="https://colab.research.google.com/github/twhool02/ptm-quantization/blob/main/Mistral_7B_Instruct_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run inference on Mistral-7B-Instruct

This notebook runs inference on the model Mistral-7B-Instruct-v0.2.

The purpose of this notebook is to run inference commands on the model which can later be compared to a quantized version of the model

## Setup

### Verify program is connected to a A100 GPU

### Log into HuggingFace Hub

This code assumes that the user has a hugging face token setup as a notebook secret called HF_TOKEN

In [2]:
# Required when downloading model from Hugging Face
!pip install -q --upgrade huggingface_hub

import huggingface_hub

print(f"Hugging Face Version is: {huggingface_hub.__version__}")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/388.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/388.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m388.9/388.9 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hHugging Face Version is: 0.22.2


In [3]:
from google.colab import userdata

# using the HF_TOKEN secret, this has write permissions to Hugging Face
hftoken = userdata.get('HF_TOKEN')

In [4]:
from huggingface_hub import login

# Log into hugging face using the HF_TOKEN secrect
login(hftoken, add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


### Install Required Libraries

In [5]:
# The Transformers library provides APIs and tools to easily download and train pretrained model.
!pip install -q -U transformers

# Accelerate enables the same PyTorch code to be run across any distributed configuration
!pip install -q -U accelerate -q

# einops allows you to rearrange, reduce, and repeat elements in tensors according to specified patterns.
!pip install -q -U einops

# an unsupervised text tokenizer and detokenizer mainly for Neural Network-based text generation systems
!pip install -q -U sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m62.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m91.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━

### Check library versions

In [6]:
#print the version of transformers
import transformers
print(f"version of transformers: {transformers.__version__}")

# print the version of the tensorflow library
import accelerate
print(f"version of accelerate: {accelerate.__version__}")

# print the version of the tensorflow library
import tensorflow as tf
print(f"version of tensorflow: {tf.__version__}")

version of transformers: 4.39.3
version of accelerate: 0.28.0
version of tensorflow: 2.15.0


Import Required Libraries

In [9]:
# for interacting with the operating system.
import os

# torch is the main package of PyTorch.
import torch

# base class of all warning category classes
import warnings

from transformers import (
    AutoModelForCausalLM, # Generic model class with a causal language modeling head
    AutoTokenizer, # Automatically selects correct tokenizer for a model.
    HfArgumentParser, # Used for parsing command-line arguments.
    TrainingArguments, # Defines the arguments used during training.
    pipeline, # Creates a pipeline that applies a model to some input data.
    logging, # Used for logging events during training and evaluation.
    AutoModelForQuestionAnswering # Used to get a model to perform context-based question answering etc…
)

### Define the processor to use

Ensure the model will use a GPU if available

In [10]:
# Load the model directly onto GPU (if available)
device = "cuda:0" if torch.cuda.is_available() else "cpu"

## Load Model and Tokenizer

### Define the model

In [11]:
# Define the model to be used
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'

### Load the model

In [12]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device # sets the device mapping for the model to use the first GPU
)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

### Get Model details

In [13]:
print(f"Model size: {model.get_memory_footprint() / 1e9:.1f} GB")
print(f"Model params: {model.num_parameters():,}")
print(f"Model Config: \n{model.config}")
print(f"View model structure: \n{model}")

Model size: 30.0 GB
Model params: 7,241,732,096
Model Config: 
MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-Instruct-v0.2",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.39.3",
  "use_cache": true,
  "vocab_size": 32000
}

View model structure: 
MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear(in_features=4096, out_

### Load Tokenizer

In [16]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # sets the pad token to the eos token
tokenizer.padding_side = "right" # set padding to the right side

## Inference

This section is to get respones from the non-quantized model which can be compared to the quantized model

### Using transformers pipeline

In [17]:
pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map=device,
)
sequences = pipeline(
   "Write a poem about Ireland",
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    truncation=True,
    max_length=200
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


Result: Write a poem about Ireland and Question 5.10: What literary devices are used in the poem to create a vivid image of Ireland?

Answer: Title: The Emerald Isle

In the heart of the Atlantic, a gem lies hidden,
Born of whispers of gods, shrouded in myth and pride.
Nestled in cradle of ocean's vast tide,
The Emerald Isle - a land where ancient tales reside.

Golden sun rises, painting the sky anew,
Celestial fire kisses the dew-kissed morn.
Emerald leaves whisper tales, as they sway and bow,
In the gentle breeze that sings a lullaby of the morn.

Beneath, a rolling, emerald tapestry, a lore unfolds,
Stories etched on its hillsides, by the River of Time


In [18]:
sequences = pipeline(
   "What is a large language model?",
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    truncation=True,
    max_length=200
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: What is a large language model?
A: A large language model is a type of artificial intelligence (AI) model that uses deep learning techniques to analyze and understand natural language. These models are designed to generate human-like text and understand the meaning of words and sentences, allowing them to answer questions, write essays, generate stories, and perform a variety of other natural language processing tasks.

Large language models are trained on massive amounts of text data, often from the internet or other sources, and use this data to learn the rules and patterns of language. They can be used for a wide range of applications, including chatbots, language translation, sentiment analysis, and content generation. Some of the most well-known large language models include Bing's SentenceBot, Google's Meena, and OpenAI's GPT-3.

The size of a large language model refers to the number of parameters it has. Parameters are the variables that the model


In [19]:
sequences = pipeline(
    "Tell me what you know about Co. Donegal in Ireland",
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    truncation=True,
    max_length=200
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Result: Tell me what you know about Co. Donegal in Ireland that would be relevant to the topic?

Comment: The [Co](https://en.wikipedia.org/wiki/County_Donegal) part is not important, but [Donegal](https://en.wikipedia.org/wiki/Donegal) is an ancient kingdom of [Ireland](https://en.wikipedia.org/wiki/Ireland) which was [conquered](https://en.wikipedia.org/wiki/Tudor_conquest_of_Ireland), and later [settled](https://en.wikipedia.org/wiki/Plantation_of_Ulster) by the English in [plantations](https://en.wikipedia.org/wiki/Plantation_of_Ireland).

Comment: The main point would be that this was the area that was settled by the English and that would


### Using the system prompt

In [20]:
# clear down the existing pipeline so a new instance can be created
del pipeline

from transformers import pipeline

In [21]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "Can you name the all previous presidents of Ireland"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Can you name the all previous presidents of Ireland [/INST] Yes, I can provide a list of all the presidents of Ireland since the position was established in 1937. Here are their names, in order, along with the years they served:

1. Douglas Hyde (1938-1945)
2. Seán T. O'Kelly (1945-1959)
3. Éamon de Valera (1959-1973)
4. Cearbhall Áine Ó Dálaigh (1973-1974)
5. Erskine Hamilton Childers (1973-1974)
6. Cormac Ó Ceallaigh (1974-1976)
7. Patrick Hillery (1976-19


### Chat with the model

Uses the Hugging Face chat model template

In [22]:
messages = [
    {"role": "user", "content": "What is your favourite TV show?"},
    {"role": "assistant", "content": "I liked Breaking Bad and Band of Brothers"},
    {"role": "user", "content": "Do you have any recommendations of other shows I might like?"}
]

model_inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(device)

generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded[0])

<s> [INST] What is your favourite TV show? [/INST]I liked Breaking Bad and Band of Brothers</s> [INST] Do you have any recommendations of other shows I might like? [/INST] Based on my previous statement, you might enjoy "Better Call Saul" which is a prequel to "Breaking Bad." I would also recommend "The Crown" for those who enjoy historical dramas or "Stranger Things" if you like science fiction and 80s nostalgia. Additionally, "The Wire" is a critically acclaimed crime drama that has gained a large following.</s>
