In [None]:
!pip install git+https://github.com/huggingface/transformers

In [None]:
!pip install hqq

In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

# Load the pre-quantized checkpoint (with its adapter file) and the matching tokenizer
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = AutoHQQHFModel.from_quantized(
    model_id,
    cache_dir='.',
    compute_dtype=torch.float16,
    adapter='adapter_v0.1.lora',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from hqq.utils.patching import prepare_for_inference

# Select the PYTORCH backend for HQQLinear layers
HQQLinear.set_backend(HQQBackend.PYTORCH)

In [None]:
# Build a sampling generator (up to 100 new tokens, compile="partial"), warm it up, then run one prompt
gen = HFGenerator(model, tokenizer, max_new_tokens=100, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

gen.generate("Write an essay about large language models", print_tokens=True)


In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

# Load the pre-quantized checkpoint (with its adapter file) and the matching tokenizer.
# NOTE(review): this repeats the first load cell verbatim; re-running it reloads the model.
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = AutoHQQHFModel.from_quantized(
    model_id,
    cache_dir='.',
    compute_dtype=torch.float16,
    adapter='adapter_v0.1.lora',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from hqq.utils.patching import prepare_for_inference
# prepare_for_inference is left disabled in this variant
#prepare_for_inference(model)

# Select the PYTORCH_COMPILE backend for HQQLinear layers
HQQLinear.set_backend(HQQBackend.PYTORCH_COMPILE)

In [None]:
# Short run (max 5 new tokens) with compile="partial" and warm-up, on an arithmetic prompt
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

gen.generate("What is the result of the following addition operation 34+67?", print_tokens=True)


In [None]:
!pip install bitblas

In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Run patch_add_quant_config over the model with a 2-bit / group-size-64 / axis=1 config
patch_linearlayers(model, patch_add_quant_config,
                          BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

# Trailing ';' suppresses the notebook echo of the model repr
model.eval();
cleanup()

#Use optimized inference kernels
###################################################
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
# Patch the model for the "bitblas" backend with allow_merge=False
prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

gen.generate("Write an essay about large language models", print_tokens=True)


In [None]:
from hqq.utils.patching import prepare_for_inference

#Prepare the model for inference
#HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend

# Try the bitblas backend; on TypeError fall back to the PYTORCH backend with default preparation.
# NOTE(review): the previous cell already calls prepare_for_inference(..., backend="bitblas") on
# this same `model` — confirm a second call on an already-prepared model is intended.
try:
    prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...
except TypeError:
    # If TypeError occurs, fallback to the PyTorch backend
    print("Falling back to PyTorch backend due to TypeError in prepare_for_inference.")
    HQQLinear.set_backend(HQQBackend.PYTORCH)
    prepare_for_inference(model)

In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install hqq
!pip install bitblas

In [None]:


import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Run patch_add_quant_config over the model with a 2-bit / group-size-64 / axis=1 config
patch_linearlayers(model, patch_add_quant_config,
                          BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval()
cleanup()

#Use optimized inference kernels
###################################################
# Try using bitblas backend, fallback to PyTorch if TypeError occurs
# NOTE(review): only TypeError is handled; any other failure from the bitblas path propagates.
try:
    from hqq.utils.patching import prepare_for_inference
    HQQLinear.set_backend(HQQBackend.PYTORCH)
    prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...
except TypeError:
    print("Falling back to PyTorch backend due to TypeError in prepare_for_inference.")
    HQQLinear.set_backend(HQQBackend.PYTORCH)
    prepare_for_inference(model)

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

gen.generate("Write an essay about large language models", print_tokens=True)

In [None]:


import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
# This variant loads with compute_dtype=torch.float32 (the earlier cells use torch.float16)
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float32, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Run patch_add_quant_config over the model with a 2-bit / group-size-64 / axis=1 config
patch_linearlayers(model, patch_add_quant_config,
                          BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval()
cleanup()

#Use optimized inference kernels
###################################################
# Try using bitblas backend, fallback to PyTorch if TypeError occurs
# NOTE(review): whether the bitblas path supports float32 compute is not shown here — confirm.
try:
    from hqq.utils.patching import prepare_for_inference
    HQQLinear.set_backend(HQQBackend.PYTORCH)
    prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...
except TypeError:
    print("Falling back to PyTorch backend due to TypeError in prepare_for_inference.")
    HQQLinear.set_backend(HQQBackend.PYTORCH)
    prepare_for_inference(model)

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

gen.generate("Write an essay about large language models", print_tokens=True)

In [None]:
# Call prepare_for_inference on the current model with its default arguments
prepare_for_inference(model)


In [None]:
# Debug: print the model's repr
print(model)


In [None]:
# Debug: print the tokenizer's repr
print(tokenizer)


In [None]:
# Build the generator only; .warmup() is not called in this cell
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial")


In [None]:
# Debug: print the generator object and its type
print(gen)
print(type(gen))


In [None]:
# Take the dtype of the first floating-point parameter, else fall back to patch_param["dtype"].
# NOTE(review): fp_param is only assigned in a later cell and patch_param is never defined in
# this notebook, so on a fresh kernel this raises NameError — presumably copied from
# hqq/utils/patching.py for inspection; confirm.
dtype_ = fp_param[0].dtype if (len(fp_param) > 0) else patch_param["dtype"]


In [None]:
# Debug: print the model's repr and its type
print(model)
print(type(model))


In [None]:
# Collect every model parameter whose dtype is floating point.
# The print below dumps the full tensors, which can be very large.
fp_param = [p for p in model.parameters() if p.is_floating_point()]
print(fp_param)


In [None]:
# Re-apply the quant config, this time passing dtype=torch.float32.
# NOTE(review): every other cell builds BaseQuantizeConfig without dtype= — confirm the
# keyword is accepted by the installed hqq version.
patch_linearlayers(model, patch_add_quant_config,
                   BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False,
                                      quant_zero=False, axis=1, dtype=torch.float32))


In [None]:
# Retry prepare_for_inference with default arguments
prepare_for_inference(model)

In [None]:
# Select the PYTORCH backend, then retry prepare_for_inference with default arguments
HQQLinear.set_backend(HQQBackend.PYTORCH)
prepare_for_inference(model)


In [None]:
# Count (rather than dump) the floating-point parameters of the model
fp_param = [p for p in model.parameters() if p.is_floating_point()]
print(f"Floating-point parameters found: {len(fp_param)}")


In [None]:
# Warm up a compile="partial" generator and produce at most 5 new tokens
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial").warmup()
gen.generate("Write an essay about large language models", print_tokens=True)


In [None]:
# Same two calls as an earlier cell: PYTORCH backend, then default prepare_for_inference
HQQLinear.set_backend(HQQBackend.PYTORCH)
prepare_for_inference(model)


In [None]:
# Select the PYTORCH backend without calling prepare_for_inference
HQQLinear.set_backend(HQQBackend.PYTORCH)
model.eval()  # Ensure it's in evaluation mode


In [None]:
# Uncompiled generator (compile=None), a single new token; print whatever generate() returns
gen = HFGenerator(model, tokenizer, max_new_tokens=1, do_sample=True, compile=None)
output = gen.generate("Write an essay about large language models", print_tokens=True)
print(output)


In [None]:
# Uncompiled generator (compile=None), up to 5 new tokens
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile=None)
output = gen.generate("Write an essay about large language models", print_tokens=True)
print(output)


In [None]:
# Uncompiled generator (compile=None), up to 22 new tokens
gen = HFGenerator(model, tokenizer, max_new_tokens=22, do_sample=True, compile=None)
output = gen.generate("Write an essay about large language models", print_tokens=True)
print(output)


In [None]:
# Same settings as the previous cell (compile=None, up to 22 new tokens)
gen = HFGenerator(model, tokenizer, max_new_tokens=22, do_sample=True, compile=None)
output = gen.generate("Write an essay about large language models", print_tokens=True)
print(output)


In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

# Load the model (float32 compute in this variant) and its tokenizer
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = AutoHQQHFModel.from_quantized(
    model_id, cache_dir='.', compute_dtype=torch.float32, adapter='adapter_v0.1.lora'
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Run patch_add_quant_config over the model with a 2-bit / group-size-64 / axis=1 config
patch_linearlayers(
    model,
    patch_add_quant_config,
    BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1),
)

model.eval()
cleanup()

# Use optimized inference kernels (with fallback to PyTorch)
# Only TypeError is handled; the handler prints the traceback and the model before falling back.
try:
    HQQLinear.set_backend(HQQBackend.PYTORCH)
    prepare_for_inference(model, backend="bitblas", allow_merge=False)  # It takes a while...
except TypeError as e:
    print("Falling back to PyTorch backend due to TypeError in prepare_for_inference:", e)
    import traceback
    traceback.print_exc() # Print the full traceback
    print("Model state:", model) # Check if the model object itself is None or has unexpected attributes
    # Add more print statements to inspect potentially problematic variables
    HQQLinear.set_backend(HQQBackend.PYTORCH)
    prepare_for_inference(model)

# Generate
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True, compile="partial")
# Try to catch potential errors during warmup:
# A TypeError here is only reported; generate() below still runs on the un-warmed generator.
try:
    gen.warmup()  # Faster generation, but warm-up takes a while
except TypeError as e:
    print("Error during warmup:", e)
    import traceback
    traceback.print_exc() # Print the full traceback
    print("Generator state:", gen) # Check if the generator object has unexpected attributes
    # Add more print statements to inspect potentially problematic variables

gen.generate("Write an essay about large language models", print_tokens=True)

https://medium.com/@pies052022/typeerror-nonetype-object-is-not-subscriptable-solved-6658a2ec69c3










https://itsourcecode.com/typeerror/typeerror-nonetype-object-is-not-subscriptable/?source=post_page-----6658a2ec69c3--------------------------------

In [None]:
# Demonstration: indexing None raises "TypeError: 'NoneType' object is not subscriptable".
# The error is caught and displayed so that Restart & Run All does not stop at this cell.
my_list = None
try:
    print(my_list[0])
except TypeError as exc:
    print(f"TypeError: {exc}")

In [None]:

# Guarded variant: index the list only when it is not None
s_list = None
if s_list is None:
    print("The sample list is None!")
else:
    print(s_list[0])

In [None]:
# With a real list, indexing the first element works
s_list = list(range(1, 10))
print(s_list[0])

In [None]:
!pip install torchao

In [None]:
!pip install vllm

In [None]:
from vllm import LLM
from vllm import SamplingParams
import random

# Initialize the language model from a local snapshot directory.
# NOTE(review): this is an absolute path pinned to one snapshot hash; it only exists if the
# checkpoint was previously downloaded under /content. Whether vLLM can load this HQQ-format
# checkpoint is not shown here — confirm.
llm = LLM(model="/content/models--mobiuslabsgmbh--Llama-3-8b-instruct_2bitgs64_hqq/snapshots/ddd3622e909f63676b6e82d7d05ca6bb60c2c519", max_model_len=4096)

# Define sampling parameters
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=10)

# Define a prompt for the model to generate text
prompt = "Once upon a time"

# Generate text using the model
outputs = llm.generate(prompt, sampling_params=sampling_params)

# Print the generated text. Each result is a RequestOutput whose completions live in
# `.outputs`; printing the object itself would show its repr instead of the text.
for output in outputs:
    print(output.outputs[0].text)


In [None]:
import bitblas
# Show which file the imported bitblas package was loaded from
print(bitblas.__file__)

In [None]:
!pip show bitblas

In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator
import torch._dynamo
# Tell torch._dynamo to suppress its errors
torch._dynamo.config.suppress_errors = True
#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Run patch_add_quant_config over the model with a 2-bit / group-size-64 / axis=1 config
patch_linearlayers(model, patch_add_quant_config,
                          BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval();
cleanup()

#Use optimized inference kernels
###################################################
# Only the backend is selected here; both prepare_for_inference variants stay disabled
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
#prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
# Unlike the earlier cells, .warmup() is not called on this generator
gen = HFGenerator(model, tokenizer, max_new_tokens=20, do_sample=True, compile="partial") #Faster generation, but warm-up takes a while


gen.generate("How to make a yummy chocolate cake?", print_tokens=True)


In [None]:
# Count the floating-point parameters of the model and list their dtypes
fp_param = [p for p in model.parameters() if p.is_floating_point()]
print(f"عدد المعلمات العائمة: {len(fp_param)}")
print(f"أنواع البيانات: {[p.dtype for p in fp_param]}")
# If the count is 0, it means all parameters have been fully converted to a quantized format.



In [None]:
from hqq.utils.generation_hf import HFGenerator

# Generator without a compile= argument; up to 5 new tokens for an Arabic prompt
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True)
output = gen.generate("ما هو الذكاء الاصطناعي؟", print_tokens=True)
print(output)


### شغال

In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Run patch_add_quant_config over the model with a 2-bit / group-size-64 / axis=1 config
patch_linearlayers(model, patch_add_quant_config,
                          BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval();
cleanup()

#Use optimized inference kernels
###################################################
# Only the backend is selected here; both prepare_for_inference variants stay disabled
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
#prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

#gen.generate("Write an essay about large language models", print_tokens=True)
#gen.generate("Tell me a funny joke!", print_tokens=True)
#gen.generate("How to make a yummy chocolate cake?", print_tokens=True)



# Redundant re-import: HFGenerator is already imported at the top of this cell
from hqq.utils.generation_hf import HFGenerator

# Generator without a compile= argument; up to 5 new tokens
gen = HFGenerator(model, tokenizer, max_new_tokens=5, do_sample=True)
output = gen.generate("What is artificial intelligence?", print_tokens=True)
print(output)


In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Run patch_add_quant_config over the model with a 2-bit / group-size-64 / axis=1 config
patch_linearlayers(model, patch_add_quant_config,
                          BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval();
cleanup()

#Use optimized inference kernels
###################################################
# Only the backend is selected here; both prepare_for_inference variants stay disabled
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
#prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial").warmup() #Faster generation, but warm-up takes a while

#gen.generate("Write an essay about large language models", print_tokens=True)
#gen.generate("Tell me a funny joke!", print_tokens=True)
#gen.generate("How to make a yummy chocolate cake?", print_tokens=True)



# Redundant re-import: HFGenerator is already imported at the top of this cell
from hqq.utils.generation_hf import HFGenerator

# Same as the previous cell, but with up to 30 new tokens
gen = HFGenerator(model, tokenizer, max_new_tokens=30, do_sample=True)
output = gen.generate("What is artificial intelligence?", print_tokens=True)
print(output)


In [None]:
# Make torch.float32 the default when patch_param has no "dtype" entry.
# NOTE(review): patch_param is not defined anywhere in this notebook; the line appears to be
# lifted from hqq/utils/patching.py and raises NameError if executed here — confirm.
dtype_ = patch_param.get("dtype", torch.float32)
# 3) Modify patching.py to avoid the problem:
# if you need prepare_for_inference(), you can modify the line that causes the error in
# hqq/utils/patching.py (the replacement line is shown in the next cell).

In [None]:
# Proposed replacement: dtype of the first floating-point parameter, else patch_param's
# "dtype" entry, else torch.float32.
# NOTE(review): patch_param is not defined in this notebook — presumably this line is meant
# to be applied inside hqq/utils/patching.py rather than run here; confirm.
dtype_ = fp_param[0].dtype if (len(fp_param) > 0) else patch_param.get("dtype", torch.float32)


In [None]:
# Fail loudly when the floating-point parameter list is empty
if not fp_param:
    raise ValueError("fp_param is empty, meaning no floating-point parameters were found.")


In [None]:
# Debug: show patch_param and its keys (prints 'None' for the keys when patch_param is falsy).
# NOTE(review): patch_param is never assigned in this notebook — confirm where it should come from.
print(f"patch_param: {patch_param}")
print(f"patch_param keys: {patch_param.keys() if patch_param else 'None'}")


In [None]:
# Original (unpatched) form of the lookup: a missing "dtype" key raises KeyError here.
# NOTE(review): patch_param is never assigned in this notebook — confirm where it should come from.
dtype_ = fp_param[0].dtype if (len(fp_param) > 0) else patch_param["dtype"]


In [None]:
# Load through HQQModelForCausalLM and move the result to `device`.
# NOTE(review): HQQModelForCausalLM is not imported anywhere in this notebook and `device` is
# only defined in a later cell — on a fresh kernel this cell raises NameError.
model = HQQModelForCausalLM.from_quantized(model_id)
model = model.to(device)


In [None]:
# Debug: show the selected device. NOTE(review): `device` is only defined in a later cell.
print(f"Using device: {device}")


In [None]:
# Same class as above but via from_pretrained.
# NOTE(review): HQQModelForCausalLM is not imported anywhere in this notebook.
model = HQQModelForCausalLM.from_pretrained(model_id)


In [None]:
# Call model.generate directly on token ids.
# NOTE(review): input_ids is only assigned in a later cell.
result = model.generate(input_ids)
print("Generated result:", result)


الخطوات لتشخيص وحل المشكلة:
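تفعيل CUDA_LAUNCH_BLOCKING: لتحديد مكان الخطأ بدقة أكبر، يمكنك تفعيل المتغير البيئي CUDA_LAUNCH_BLOCKING=1. هذا سيجبر CUDA على تنفيذ العمليات بشكل متزامن ويظهر لك موقع الخطأ الفعلي. ملاحظة: يجب ضبط هذا المتغير قبل تهيئة CUDA (أي قبل أول عملية تُنفَّذ على وحدة معالجة الرسومات)، لذا يُفضَّل إعادة تشغيل النواة ثم ضبطه في أول خلية.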

In [None]:
import os
# Request synchronous CUDA launches for error localisation.
# NOTE(review): presumably this only takes effect if set before CUDA is first initialised —
# confirm, and consider moving it to the very first cell.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [None]:
# Debug: show the shape of input_ids. NOTE(review): input_ids is only assigned in a later cell.
print(f"Input tensor shape: {input_ids.shape}")


In [None]:
import torch
# torch has no `set_debug_mode`; the original call raised AttributeError.
# NOTE(review): anomaly detection is the closest documented debug switch, but it only adds
# checks to autograd — confirm this is the diagnostic that was intended.
torch.autograd.set_detect_anomaly(True)


In [None]:
import os
# NOTE(review): presumably this only takes effect if set before CUDA is first initialised — confirm.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"  # enable error tracing

# After setting the environment variable, try running the code again
gen = HFGenerator(model, tokenizer, max_new_tokens=22, do_sample=True, compile=None)
output = gen.generate("Write an essay about large language models", print_tokens=True)
print(output)


In [None]:
import torch
from transformers import AutoTokenizer
from hqq.utils.generation_hf import HFGenerator
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator
# Select the device (GPU when available, otherwise CPU). This cell used `device` below
# without defining it, which raised NameError on a fresh kernel.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model and the tokenizer (no adapter in this variant)
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Prepare input_ids and a matching all-ones attention mask.
# Neither tensor is passed to HFGenerator below; they are kept for other cells to inspect.
conversation = [{"role": "user", "content": "What is artificial intelligence?"}]
input_ids = tokenizer.encode(conversation[0]['content'], return_tensors="pt").to(device)
attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)

# Use `HFGenerator` for generation
gen = HFGenerator(model, tokenizer, max_new_tokens=30, do_sample=True)
output = gen.generate("What is artificial intelligence?", print_tokens=True)
print(output)


In [None]:
import torch
from transformers import AutoTokenizer
from hqq.utils.generation_hf import HFGenerator
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *

# Select the device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model and the tokenizer (no adapter in this variant)
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Move the model to the selected device (CPU or GPU)
# NOTE(review): whether .to() is supported on the object returned by from_quantized is not
# shown here — confirm.
model = model.to(device)

# Prepare input_ids and a matching all-ones attention mask
# (neither tensor is passed to HFGenerator below)
conversation = [{"role": "user", "content": "What is artificial intelligence?"}]
input_ids = tokenizer.encode(conversation[0]['content'], return_tensors="pt").to(device)
attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)

# Use `HFGenerator` for generation
gen = HFGenerator(model, tokenizer, max_new_tokens=30, do_sample=True)
output = gen.generate("What is artificial intelligence?", print_tokens=True)
print(output)


### شغال

In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

#Load the model
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model     = AutoHQQHFModel.from_quantized(model_id, cache_dir='.', compute_dtype=torch.float16, adapter='adapter_v0.1.lora')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Run patch_add_quant_config over the model with a 2-bit / group-size-64 / axis=1 config
patch_linearlayers(model, patch_add_quant_config,
                          BaseQuantizeConfig(nbits=2, group_size=64, quant_scale=False, quant_zero=False, axis=1))

model.eval();
cleanup()

#Use optimized inference kernels
###################################################
# Only the backend is selected here; both prepare_for_inference variants stay disabled
HQQLinear.set_backend(HQQBackend.PYTORCH)
#prepare_for_inference(model) #default backend
#prepare_for_inference(model, backend="bitblas", allow_merge=False) #It takes a while...

#Generate
###################################################
#For longer context, make sure to allocate enough cache via the cache_size= parameter
#gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile=None) #Slower generation but no warm-up
# One new token per prompt; .warmup() is not called on this generator
gen = HFGenerator(model, tokenizer, max_new_tokens=1, do_sample=True, compile="partial") #Faster generation, but warm-up takes a while

gen.generate("Write an essay about large language models", print_tokens=True)
gen.generate("Tell me a funny joke!", print_tokens=True)
gen.generate("How to make a yummy chocolate cake?", print_tokens=True)


In [None]:
# One new token, compile="partial", no warm-up call
gen = HFGenerator(model, tokenizer, max_new_tokens=1, do_sample=True, compile="partial") #Faster generation, but warm-up takes a while

gen.generate("who is ai?", print_tokens=True)

In [None]:
# Same prompt with up to 1000 new tokens, compile="partial", no warm-up call
gen = HFGenerator(model, tokenizer, max_new_tokens=1000, do_sample=True, compile="partial") #Faster generation, but warm-up takes a while

gen.generate("who is ai?", print_tokens=True)

In [None]:
# Uncompiled generator, up to 33 new tokens
gen = HFGenerator(model, tokenizer, max_new_tokens=33, do_sample=True, compile=None)  # or compile="none"


In [None]:
# Run the prompt on whichever `gen` was built last
gen.generate("who is ai?", print_tokens=True)

In [None]:
# Sampling generator: up to 200 new tokens, temperature=0.1, top_k=50
gen = HFGenerator(model, tokenizer, max_new_tokens=200, do_sample=True, temperature=0.1, top_k=50)


In [None]:
# Run the detailed-essay prompt on whichever `gen` was built last
gen.generate("Write a detailed essay about large language models, their uses in AI, and future developments.", print_tokens=True)


In [None]:
# NOTE(review): repetition_penalty is passed to the HFGenerator constructor only in this
# cell — confirm the installed hqq version accepts that keyword.
gen = HFGenerator(model, tokenizer, max_new_tokens=20, do_sample=True, temperature=0.1, top_k=50, repetition_penalty=1.2)


In [None]:
# Sampling generator: up to 150 new tokens, temperature=0.6, top_k=50
gen = HFGenerator(model, tokenizer, max_new_tokens=150, do_sample=True, temperature=0.6, top_k=50)


In [None]:
# Run the detailed-essay prompt on whichever `gen` was built last
gen.generate("Write a detailed essay about large language models, their uses in AI, and future developments.", print_tokens=True)

In [None]:
# Sampling generator: up to 150 new tokens, temperature=0.6, top_k=50
gen = HFGenerator(model, tokenizer, max_new_tokens=150, do_sample=True, temperature=0.6, top_k=50)

# Generate (the original note mentioned repetition_penalty, but none is passed here)
gen.generate("Write an essay about large language models", print_tokens=True)


In [None]:
# Sampling generator: up to 150 new tokens, temperature=0.7, top_k=50
gen = HFGenerator(model, tokenizer, max_new_tokens=150, do_sample=True, temperature=0.7, top_k=50)

# Generate a more focused response
gen.generate("Write an insightful essay about the future of AI and its impact on society", print_tokens=True)


In [None]:
# NOTE(review): top_p= (constructor) and repetition_penalty= (generate) are not used by any
# of the cells marked as working — confirm HFGenerator accepts them.
gen = HFGenerator(model, tokenizer, max_new_tokens=150, do_sample=True, temperature=0.7, top_k=50, top_p=0.9)
gen.generate("Write a detailed essay on the impact of technology on society", print_tokens=True, repetition_penalty=1.2)


In [None]:
# Same prompt with only max_new_tokens and do_sample set (no extra sampling keywords)
gen = HFGenerator(model, tokenizer, max_new_tokens=150, do_sample=True)
gen.generate("Write a detailed essay on the impact of technology on society", print_tokens=True)


In [None]:
# Pass the sampling settings to generate() instead of the constructor.
# NOTE(review): no other cell passes temperature/top_p/top_k/repetition_penalty/max_new_tokens
# to gen.generate — confirm that HFGenerator.generate accepts these keywords.
gen.generate(
    "Write a detailed essay on the impact of technology on society",
    print_tokens=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.2,
    max_new_tokens=150
)


In [None]:
# Back to the plain configuration: only max_new_tokens and do_sample are set
gen = HFGenerator(model, tokenizer, max_new_tokens=150, do_sample=True)
gen.generate("Write a detailed essay on the impact of technology on society", print_tokens=True)


In [None]:
# NOTE(review): chat_processor is only defined in a later cell, so this call raises
# NameError on a fresh top-to-bottom run.
outputs = chat_processor("How do I build a car?", max_new_tokens=1000, do_sample=False)

### شغال

#################################################################

In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

# Load the pre-quantized checkpoint (with its adapter file) and the matching tokenizer
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = AutoHQQHFModel.from_quantized(
    model_id,
    cache_dir='.',
    compute_dtype=torch.float16,
    adapter='adapter_v0.1.lora',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from hqq.utils.patching import prepare_for_inference
# The torchao_int4 preparation is left disabled
#prepare_for_inference(model, backend="torchao_int4", verbose=True)

# Select the PYTORCH_COMPILE backend for HQQLinear layers
HQQLinear.set_backend(HQQBackend.PYTORCH_COMPILE)

#Warmup
# Ten forward passes on a 1x1 int32 tensor of ones placed on 'cuda', with gradients disabled
for i in range(10):
    with torch.no_grad():
        out = model(torch.ones((1, 1), dtype=torch.int32, device='cuda'))
# Drop the last output before calling cleanup()
del out
cleanup()

In [None]:
import transformers
from threading import Thread

def chat_processor(chat, max_new_tokens=1, do_sample=True, top_p=0.90, top_k=50,
                   temperature=0.6, repetition_penalty=1.2):
    """Stream a reply to ``chat`` from the notebook-level ``model``, printing it as it arrives.

    ``model.generate`` runs in a background thread while this function consumes a
    ``TextIteratorStreamer`` and echoes every decoded piece of text.

    Args:
        chat: User message; it is wrapped as ``"<s> [INST] " + chat + " [/INST] "``.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        top_p, top_k, temperature, repetition_penalty: Forwarded to ``model.generate``.
            The defaults are the values that used to be hard-coded in this function.

    Returns:
        list: The text pieces in the order the streamer yielded them.
    """
    tokenizer.use_default_system_prompt = False
    # The streamer is built with a 10 s timeout; presumably iteration raises if no text
    # arrives within that window — confirm against the transformers docs.
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # NOTE(review): "[INST] ... [/INST]" looks like a Llama-2 style prompt while the checkpoint
    # name says Llama-3-instruct — consider tokenizer.apply_chat_template; confirm.
    inputs = tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to('cuda')

    generate_params = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )

    worker = Thread(target=model.generate, kwargs=generate_params)
    worker.start()

    print('------------------------------------------------------------')
    cleanup()
    print(chat)
    print()
    outputs = []
    for text in streamer:
        outputs.append(text)
        print(text, end="", flush=True)

    # The streamer is exhausted, so generation has finished: reap the worker thread
    # instead of leaving it dangling.
    worker.join()

    return outputs

################################################################################################
#Generation
# Up to 1000 new tokens with do_sample=False
outputs = chat_processor("How do I build a car?", max_new_tokens=1000, do_sample=False)

###############################################################

### شغال

#####################################################################

In [None]:
import torch
from transformers import AutoTokenizer
from hqq.models.hf.base import AutoHQQHFModel
from hqq.core.quantize import *
from hqq.utils.patching import *
from hqq.utils.generation_hf import HFGenerator

# Load the pre-quantized checkpoint (with its adapter file) and the matching tokenizer
###################################################
model_id = 'mobiuslabsgmbh/Llama-3-8b-instruct_2bitgs64_hqq'
model = AutoHQQHFModel.from_quantized(
    model_id,
    cache_dir='.',
    compute_dtype=torch.float16,
    adapter='adapter_v0.1.lora',
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from hqq.utils.patching import prepare_for_inference
# The torchao_int4 preparation is left disabled
#prepare_for_inference(model, backend="torchao_int4", verbose=True)

# Select the PYTORCH_COMPILE backend for HQQLinear layers
HQQLinear.set_backend(HQQBackend.PYTORCH_COMPILE)

#Warmup
# Ten forward passes on a 1x1 int32 tensor of ones placed on 'cuda', with gradients disabled
for i in range(10):
    with torch.no_grad():
        out = model(torch.ones((1, 1), dtype=torch.int32, device='cuda'))
# Drop the last output before calling cleanup()
del out
cleanup()

In [None]:
import transformers
from threading import Thread

def chat_processor(chat, max_new_tokens=100, do_sample=True, top_p=0.80, top_k=40,
                   temperature=0.2, repetition_penalty=1.3):
    """Stream a reply to ``chat`` from the notebook-level ``model``, printing it as it arrives.

    ``model.generate`` runs in a background thread while this function consumes a
    ``TextIteratorStreamer`` and echoes every decoded piece of text.

    Args:
        chat: User message; it is wrapped as ``"<s> [INST] " + chat + " [/INST] "``.
        max_new_tokens: Forwarded to ``model.generate``.
        do_sample: Forwarded to ``model.generate``.
        top_p, top_k, temperature, repetition_penalty: Forwarded to ``model.generate``.
            The defaults are the values that used to be hard-coded in this function.

    Returns:
        list: The text pieces in the order the streamer yielded them.
    """
    tokenizer.use_default_system_prompt = False
    # The streamer is built with a 10 s timeout; presumably iteration raises if no text
    # arrives within that window — confirm against the transformers docs.
    streamer = transformers.TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # NOTE(review): "[INST] ... [/INST]" looks like a Llama-2 style prompt while the checkpoint
    # name says Llama-3-instruct — consider tokenizer.apply_chat_template; confirm.
    inputs = tokenizer("<s> [INST] " + chat + " [/INST] ", return_tensors="pt").to('cuda')

    generate_params = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )

    worker = Thread(target=model.generate, kwargs=generate_params)
    worker.start()

    print('------------------------------------------------------------')
    cleanup()
    print(chat)
    print()
    outputs = []
    for text in streamer:
        outputs.append(text)
        print(text, end="", flush=True)

    # The streamer is exhausted, so generation has finished: reap the worker thread
    # instead of leaving it dangling.
    worker.join()

    return outputs

################################################################################################
#Generation
# Up to 100 new tokens with sampling enabled
outputs = chat_processor("How do I build a car?", max_new_tokens=100, do_sample=True)

###################################################