In [1]:
%pip install transformers==4.34.1 accelerate==0.24.0 sentencepiece==0.1.99 optimum==1.13.2 peft==0.5.0 bitsandbytes==0.41.2.post2

Collecting transformers==4.34.1
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.24.0
  Downloading accelerate-0.24.0-py3-none-any.whl (260 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting optimum==1.13.2
  Downloading optimum-1.13.2.tar.gz (300 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.0/301.0 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting peft==0.5.0
  Downloading peft-0.5.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install datasets==2.17.0

In [6]:
!pip install onnxruntime onnx

Collecting onnxruntime
  Downloading onnxruntime-1.17.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnx
  Downloading onnx-1.16.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: onnx, onnxruntime
Successfully installed onnx-1.16.0 onnxruntime-1.17.1


In [1]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForLanguageModeling, GenerationConfig, StoppingCriteria, StoppingCriteriaList
import torch
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

In [None]:
from datasets import load_dataset

#Train

In [None]:
!pip install wandb



In [None]:
import wandb
wandb.init()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
!mkdir /kaggle/working/results

In [None]:
#Training params
common_args = {
    "run_name":"training",
    "seed":42,
    "data_seed":42,
    "remove_unused_columns":False
}
train_args = {
    "per_device_train_batch_size":1,
    "per_device_eval_batch_size":1,
    "gradient_accumulation_steps":1,
    "learning_rate":3e-4,
    "weight_decay":0,
    "num_train_epochs":1,
    "warmup_steps":100,
}
eval_args = {
    "evaluation_strategy":"steps",
    "jit_mode_eval":True,
    "eval_steps":20,
}
backup_args = {
    "output_dir":"/content/results",
    "overwrite_output_dir":True,
    "save_strategy":"steps",
    "save_steps":20,
    "load_best_model_at_end":True,
    "push_to_hub":True,
    "hub_model_id":"evgmaslov/Llama-2-7b-hf-fuction-calling-lora",
    "hub_strategy":"checkpoint",
    "hub_token":"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",
}
log_args = {
    "log_level":"warning",
    "logging_strategy":"steps",
    "logging_steps":20,
    "report_to":"wandb"
}
acceleration_args = {
    "fp16":True,
    "fp16_full_eval":True,
}
train_args = TrainingArguments(**common_args, **train_args, **eval_args, **backup_args, **log_args, **acceleration_args)

In [2]:
model_config = {
    "pretrained_model_name_or_path":"TheBloke/Llama-2-7B-fp16",
    "low_cpu_mem_usage":True,
    "torch_dtype":torch.float16,
    "device_map":"auto",
    "offload_state_dict":True,
    "quantization_config":BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16,
                                             bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)
}

In [3]:
base_model = AutoModelForCausalLM.from_pretrained(**model_config)
#base_model = prepare_model_for_kbit_training(base_model)
lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  return self.fget.__get__(instance, owner)()
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [None]:
model = get_peft_model(base_model, lora_config)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_config["pretrained_model_name_or_path"])
tokenizer.pad_token = tokenizer.eos_token

In [None]:
dataset = load_dataset("evgmaslov/glaive-function-calling-v2-parsed-ru")
dataset = dataset.map(
        lambda row: tokenizer(row["text"]),
        batched=False,
    )
dataset = dataset.remove_columns("text")

Downloading readme:   0%|          | 0.00/276 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4200 [00:00<?, ? examples/s]

Map:   0%|          | 0/4200 [00:00<?, ? examples/s]

In [None]:
dataset = dataset["train"].train_test_split(test_size=0.001)

In [None]:
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [None]:
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
20,1.4915,1.510139
40,1.2636,1.300152
60,0.8644,1.046914
80,0.5912,1.008726
100,0.6031,0.981471
120,0.5061,0.966544
140,0.4463,0.960621
160,0.5639,0.957785
180,0.3525,0.955217
200,0.5691,0.95363


  if input_shape[-1] > 1:
  if A.numel() == A.shape[-1] and A.requires_grad == False:
  if prod(A.shape) == 0:
  is_transposed = (True if A.shape[0] == 1 else False)
failed to use PyTorch jit mode due to: _Map_base::at.
  if input_shape[-1] > 1:
  if A.numel() == A.shape[-1] and A.requires_grad == False:
  if prod(A.shape) == 0:
  is_transposed = (True if A.shape[0] == 1 else False)
failed to use PyTorch jit mode due to: _Map_base::at.
  if input_shape[-1] > 1:
  if A.numel() == A.shape[-1] and A.requires_grad == False:
  if prod(A.shape) == 0:
  is_transposed = (True if A.shape[0] == 1 else False)
failed to use PyTorch jit mode due to: _Map_base::at.
  if input_shape[-1] > 1:
  if A.numel() == A.shape[-1] and A.requires_grad == False:
  if prod(A.shape) == 0:
  is_transposed = (True if A.shape[0] == 1 else False)
failed to use PyTorch jit mode due to: _Map_base::at.
  if input_shape[-1] > 1:
  if A.numel() == A.shape[-1] and A.requires_grad == False:
  if prod(A.shape) == 0:
  is_tran

#Generate

In [5]:
from peft import PeftModel
peft_model_name = "evgmaslov/Llama-2-7b-hf-fuction-calling-lora"
model = PeftModel.from_pretrained(base_model, peft_model_name)

In [None]:
def generate(model, tokenizer, prompt, generation_config, stopping_criteria):
    data = tokenizer(prompt, return_tensors="pt")
    data = {k: v.to(model.device) for k, v in data.items()}
    output_ids = model.generate(
        **data,
        generation_config=generation_config,
        stopping_criteria=StoppingCriteriaList([stopping_criteria])
    )[0]
    #output_ids = output_ids[len(data["input_ids"][0]):]
    print(output_ids)
    output = tokenizer.decode(output_ids, skip_special_tokens=True)
    return output

In [None]:
class KeywordsStoppingCriteria(StoppingCriteria):
    def __init__(self, keywords:list, tokenizer):
        self.keywords = keywords
        self.tokenizer = tokenizer

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        result = False
        for key in self.keywords:
          text = self.tokenizer.batch_decode(input_ids)[0]
          if text.endswith(key):
            result = True
        return result

In [None]:
criteria = KeywordsStoppingCriteria(["</TOOL_CALL>", "</s>"], tokenizer)

In [17]:
torch.cuda.empty_cache()

In [6]:
prompt = """Ты - незаменимый помощник, выполняющий задачу вызова функции."""

In [None]:
generation_config = GenerationConfig.from_pretrained(model_config["pretrained_model_name_or_path"])
generation_config.max_new_tokens = 1000
criteria = KeywordsStoppingCriteria(["</TOOL_CALL>", "</s>"], tokenizer)
generate(model, tokenizer, prompt, generation_config, criteria)

tensor([    1,     1, 29961, 25580, 29962,  3532, 14816, 29903,  6778,  1703,
        29982,   448, 18972,  1488,  5472, 11767, 24794,  3316, 29892, 27056,
         1200, 16059, 27341,  9797,  2771,  9718, 20153,  3540, 29889, 10744,
         3759,  2102,  1802,  3843, 24827,  3485, 29969,   477, 15934, 20153,
        15071, 29892,  1077,  8776,  4896,  3029,   490,  4903,  2399,  1892,
          529,  4986,  5607, 29903,  2565,  4986,  5607, 29903, 29958, 30041,
        29982, 19620, 25366,  2771, 10706,  1413,  3288,  1864,  7082, 20846,
        20153, 15071,   733,  1077,  5945,  2937, 18636,  9718, 11146, 29889,
         8616,  1695,  1520,  5588,   846, 29977,  6253, 12395,  5339,  3325,
         2387,  4820, 29892,  1694, 12068, 16642,  1538,   863,   642,  9661,
        29982, 18636,  9718, 17584, 29889, 30012, 29982, 10706, 29977,  3288,
         1864,  1866, 19139, 17200, 20153, 15071, 29901,   529,  4986,  5607,
        29903, 29958,   426,   376,   978,  1115,   376, 15807, 

'[INST] <<SYS>> Ты - незаменимый помощник, выполняющий задачу вызова функции. Тебе предоставлены сигнатуры функций, заключенные в xml теги <TOOLS></TOOLS>Ты можешь вызывать одну или несколько функций по запросу пользователя. Не придумывай значения аргументов, если они не указаны пользователем.Вызывай одну из следующих функций: <TOOLS> { "name": "calculate_bmi", "description": "Рассчитать индекс массы тела (ИОВ)", "parameters": { "type": "object", "properties": { "height": { "type": "number", "description": "Высота в метрах" }, "weight": { "type": "number", "description": "Вес в килограммах" } }, "required": [ "height", "weight" ] } } </TOOLS> Для каждого вызова функции возвращай названия функций и аргументы в формате JSON. Результат запиши внутри тегов <TOOL_CALL></TOOL_CALL> вот так: <TOOL_CALL> {"name": <function-name>, "arguments": <args-dict>} </TOOL_CALL> После вызова функции ты получишь результат вызова внутри тегов <TOOL_RESPONSE></TOOL_RESPONSE>.Ответь на запрос пользователя на

#Inference optimization

In [None]:
from optimum.onnxruntime import ORTModelForCausalLM

In [7]:
tokens = tokenizer(prompt, return_tensors="pt")

In [8]:
traced_model = torch.jit.trace(model, [tokens["input_ids"].to("cuda"), tokens["attention_mask"].to("cuda")])

  if input_shape[-1] > 1:
  if A.numel() == A.shape[-1] and A.requires_grad == False:
  if prod(A.shape) == 0:
  is_transposed = (True if A.shape[0] == 1 else False)


RuntimeError: _Map_base::at

In [9]:
!git clone https://github.com/ggerganov/llama.cpp.git

Cloning into 'llama.cpp'...
remote: Enumerating objects: 22288, done.[K
remote: Counting objects: 100% (20/20), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 22288 (delta 7), reused 11 (delta 5), pack-reused 22268[K
Receiving objects: 100% (22288/22288), 26.99 MiB | 10.04 MiB/s, done.
Resolving deltas: 100% (15710/15710), done.


In [10]:
!pip install -r /content/llama.cpp/requirements.txt

Collecting numpy~=1.24.4 (from -r /content/llama.cpp/./requirements/requirements-convert.txt (line 1))
  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<5.0.0,>=4.35.2 (from -r /content/llama.cpp/./requirements/requirements-convert.txt (line 3))
  Using cached transformers-4.39.3-py3-none-any.whl (8.8 MB)
Collecting gguf>=0.1.0 (from -r /content/llama.cpp/./requirements/requirements-convert.txt (line 4))
  Downloading gguf-0.6.0-py3-none-any.whl (23 kB)
Collecting protobuf<5.0.0,>=4.21.0 (from -r /content/llama.cpp/./requirements/requirements-convert.txt (line 5))
  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch~=2.1.1 (from -r /c

In [4]:
!pip install huggingface_hub



In [None]:
snapshot_download(repo_id="TheBloke/Llama-2-7B-GGUF", local_dir="llama-2-7B",
                  local_dir_use_symlinks=False, revision="main")