# 1. Загрузка базовой модели

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model_id = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# 2. Токенизация

In [2]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="left",
    add_bos_token=True,
    add_eos_token=True,
    truncation=True,
    max_length=256,
    padding="max_length"
)

tokenizer.pad_token = tokenizer.eos_token

# 3. Инференс

In [3]:
sent = "### English:\nVerify that products installed on endpoints are supported.\n\n### Spanish:\n"
model_input = tokenizer(sent, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    result = tokenizer.decode(model.generate(**model_input, max_new_tokens=60, repetition_penalty=1.15)[0], skip_special_tokens=True)
result




'### English:\nVerify that products installed on endpoints are supported.\n\n### Spanish:\n{check} confirmar que los productos instalados en los puntos de finalidad están soportados.\n\n### French:\nVérifiez que les produits installés sur les terminaux sont compatibles.\n\n### German:\nÜberprüfen Sie,'