In [1]:
!pip install auto-gptq
!pip install --upgrade optimum
!pip install --upgrade git+https://github.com/huggingface/transformers.git
!pip install --upgrade accelerate
!pip install bitsandbytes

Collecting auto-gptq
  Downloading auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (23.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.5/23.5 MB[0m [31m42.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate>=0.26.0 (from auto-gptq)
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from auto-gptq)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
Collecting rouge (from auto-gptq)
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Collecting gekko (from auto-gptq)
  Downloading gekko-1.0.6-py3-none-any.whl (12.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.2/12.2 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
Collecting

In [2]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"


In [3]:
model_name =  "mistralai/Mistral-7B-v0.1"

# Fine-tuned model name
#new_model = "Llama-2-7b-chat-finetune"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False
device_map = {"": 0}

In [4]:
import os
import torch
#from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)

In [5]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [6]:
import os

mk_dir = 'contents/model_path'
if not os.path.exists(mk_dir):
    os.makedirs(mk_dir)
save_folder = mk_dir  # Set save_folder to the created directory


In [7]:
save_folder = 'contents/model_path'
model.save_pretrained(save_folder)


In [8]:
torch.cuda.empty_cache()

In [9]:
from transformers import AutoModelForCausalLM

# Replace 'contents/model_path' with the path where the quantized model is saved
model_name = 'contents/model_path'

# Load the quantized model and set low_cpu_mem_usage to True
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True)

In [10]:
from accelerate import Accelerator
accelerator = Accelerator()
quantized_model = accelerator.prepare(quantized_model)

In [13]:
import torch

# Define and initialize the device variable
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
import time
def run_inference(quantized_model, tokenizer, device, prompts):
    input_data = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").to(device)

    # Generate text responses
    start_time = time.time()
    with torch.no_grad():
        generated_outputs = quantized_model.generate(input_ids=input_data["input_ids"],
                                           attention_mask=input_data["attention_mask"],
                                           max_length=128,
                                           repetition_penalty=1.355,  # Adjust repetition penalty
                                           temperature=0.1,  # Adjust temperature
                                           num_return_sequences=1,  # Limit to 1 sequence per prompt
                                           do_sample=True
                                           )
    output_time = time.time() - start_time

    generated_texts = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_outputs]

    # Measure GPU memory allocated
    gpu_memory_allocated = torch.cuda.max_memory_allocated() / (1024 ** 2)

    # Calculate token count for throughput calculation
    token_count = sum([input_data[k].nelement() for k in input_data.keys()])

    return {
        "outputs": generated_texts,
        "output_time": output_time,
        "throughput": token_count / output_time,
        "gpu_memory_allocated": gpu_memory_allocated
    }



In [14]:
# Define the list of prompts
prompts = [
    "What is the capital of Japan?",
    "Who wrote the novel 'To Kill a Mockingbird'?",
    "What is the boiling point of water?",
    "How do I make a chocolate cake?",
    "What are the symptoms of COVID-19?",
    "Can you explain the theory of relativity?",
    "Who won the Nobel Prize in Literature last year?",
    "What is the population of India?",
    "How do I change a flat tire?",
    "What is the square root of 144?",
    # Add additional prompts to ensure there are 32 questions
    "What is the capital of France?",
    "Who is the current President of the United States?",
    "What is the largest mammal on Earth?",
    "How do I bake a pizza?",
    "What are the types of clouds?",
    "Explain the process of photosynthesis.",
    "Who painted the Mona Lisa?",
    "What is the currency of Japan?",
    "How do I tie a necktie?",
    "What are the ingredients of a cheeseburger?",
    "Who discovered gravity?",
    "What is the speed of light?",
    "How do I cook rice?",
    "What is the tallest mountain in the world?",
    "Who wrote the play 'Romeo and Juliet'?",
    "What are the primary colors?",
    "What is the capital of Australia?",
    "How do I send an email?",
    "What is the distance between the Earth and the Moon?",
    "Who invented the telephone?",
    "What is the main component of air?"
]

# Run inference for the given prompts
inference_results = run_inference(quantized_model, tokenizer, device, prompts)



print("Outputs:")
for prompt, output in zip(prompts, inference_results["outputs"]):
    print(f"Prompt: {prompt}")

     # Filter out the repeated question
    filtered_output = output.replace(prompt, "", 1).strip()

    print(f"Generated Text: {filtered_output}")
    #print(f"Generated Text: {output}")


# Measure and print additional metrics
print(f"Output time: {inference_results['output_time']:.2f} seconds")
print(f"Throughput: {inference_results['throughput']:.2f} tokens/second")
print(f"GPU Memory Allocated: {inference_results['gpu_memory_allocated']:.2f} MB")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Outputs:
Prompt: What is the capital of Japan?
Generated Text: т

Tokyo. Tokyo, officially known as Tо̄kyō (東京), or Eastern Capital in English, and formerly Nihonbashi-to (日本橋都) during Edo period, is one of 47 prefectures of Japan. It was established on May 15th, 1869 by merging three former provinces: Musashino Province, Shimotsuke Province, Koshinetsu Province. The city has a population of over 30 million people making it
Prompt: Who wrote the novel 'To Kill a Mockingbird'?
Generated Text: Harper Lee. The author of To Kill A Mocking Bird is Harper Lee, who was born in 1926 and died on February 19th, 2016 at age ninety-four. She grew up with her sister Alice (who also became an acclaimed writer) near Monroeville Alabama where she attended Huntingdon College before moving to New York City after World War II ended; there are many stories about how this small town inspired some aspects from both books but it seems clear that most people agree they
Prompt: What is the boiling point of wat

# **Script for running the model**

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
from accelerate import Accelerator


torch.cuda.empty_cache()

def load_model(model_name_or_path):
    # Load the model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,low_cpu_mem_usage=True)
    accelerator = Accelerator()
    model = accelerator.prepare(model)
    ##tokenizer = AutoTokenizer.from_pretrained(model)
    return model, tokenizer

def run_inference(model, tokenizer, device, prompts):
    # Tokenize the prompts and move them to the specified device
    input_data = tokenizer(prompts, padding=True, truncation=True, return_tensors="pt").to(device)

    # Generate text responses
    start_time = time.time()
    with torch.no_grad():
        generated_outputs = model.generate(input_ids=input_data["input_ids"],
                                           attention_mask=input_data["attention_mask"],
                                           max_length=128,
                                           repetition_penalty=1.355,  # Adjust repetition penalty
                                           temperature=0.1,  # Adjust temperature
                                           num_return_sequences=1,  # Limit to 1 sequence per prompt
                                           do_sample=True
                                           )
    output_time = time.time() - start_time

    # Decode the generated outputs to text
    generated_texts = [tokenizer.decode(g, skip_special_tokens=True) for g in generated_outputs]

    # Measure GPU memory allocated
    gpu_memory_allocated = torch.cuda.max_memory_allocated() / (1024 ** 2)

    # Calculate token count for throughput calculation
    token_count = sum([input_data[k].nelement() for k in input_data.keys()])

    return {
        "outputs": generated_texts,
        "output_time": output_time,
        "throughput": token_count / output_time,
        "gpu_memory_allocated": gpu_memory_allocated
    }

def main():
    # Ask user to enter the model path or name
    model_name_or_path = input("Enter the model path or name: ")
    #tokenizer_name=input('enter tokenizer_name')

    # Load the model and tokenizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, tokenizer = load_model(model_name_or_path)
    #model#.to(device)

    # Ask user to enter the prompt
    #prompts = input("Enter the prompt: ")
    prompt_str = input("Enter the prompts (separated by commas): ")
    prompts = [prompt.strip() for prompt in prompt_str.split(",")]


    # Run inference for the given prompts
    inference_results = run_inference(model, tokenizer, device, prompts)


    print("Outputs:")
    for prompt, output in zip(prompts, inference_results["outputs"]):
      print(f"Prompt: {prompt}")

      # Filter out the repeated question
      filtered_output = output.replace(prompt, "", 1).strip()

      print(f"Generated Text: {filtered_output}")
    #print(f"Generated Text: {output}")

    # Run inference for the given prompt
    #inference_results = run_inference(model, tokenizer, device, [prompt])


    # Print the generated text and performance metrics
    print(f"Output time: {inference_results['output_time']:.2f} seconds")
    print(f"Throughput: {inference_results['throughput']:.2f} tokens/second")
    print(f"GPU Memory Allocated: {inference_results['gpu_memory_allocated']:.2f} MB")

if __name__ == "__main__":
    main()


Enter the model path or name: /content/contents/model_path
Enter the prompts (separated by commas): what is the capital of india,who is founder of microsoft,what is photosynthesis


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Outputs:
Prompt: what is the capital of india
Generated Text: ?
New Delhi. The city was built and designed by British architects, planners, engineers, and landscape artists in 1927 as India's new administrative center after a decision to move it from Calcutta (now Kolkata). It has been continuously inhabited since its founding; however many parts were rebuilt or renovated during various periods throughout history due mainly because they had fallen into disrepair over time while others simply needed modernization for practical reasons such as traffic flow improvement etcetera which led them being demolished completely only later replaced with newer structures instead
Prompt: who is founder of microsoft
Generated Text: м

# Who Is Founder Of Microsoft?

Who founded the company that would become one of America's most successful corporations, and what was his vision for it? Bill Gates. He co-founded Microsoft with Paul Allen in 1975 after they met at Harvard University while studying compu