In [2]:
# functions to train and test
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
base_model, tokenizer = FastLanguageModel.from_pretrained( model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # "model",
    max_seq_length = 8192, dtype = None, load_in_4bit = False)
tokenizer = get_chat_template(tokenizer, chat_template = "llama-3", 
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)
FastLanguageModel.for_inference(base_model) 

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

from unsloth import FastLanguageModel
import torch

def init_model(base_model, r=16, lora_alpha=16, target_modules=[
            "q_proj", "k_proj", "v_proj",
            "o_proj", "gate_proj", "up_proj", "down_proj",
                          ]):
    model = FastLanguageModel.get_peft_model(
        base_model,
        r = r, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = target_modules,
        lora_alpha = lora_alpha,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        use_gradient_checkpointing = "unsloth", 
        random_state = 3407,
        use_rslora = True,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
    )
    return model

def train_model(model, tokenizer, dataset, learning_rate=1e-4, max_seq_length=2048, max_steps=60):
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False, # Can make training 5x faster for short sequences.
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 4,
            warmup_steps = max_steps // 12,
            max_steps = max_steps,
            learning_rate = learning_rate,
            fp16 = not is_bfloat16_supported(),
            bf16 = is_bfloat16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 3407,
            output_dir = "outputs",
        ),
    )
    trainer_stats = trainer.train()
    return model, trainer_stats.training_loss

def run_experiment(base_model, tokenizer, dataset, r, lora_alpha, learning_rate, max_seq_length, max_steps, target_modules):
    with torch.cuda.amp.autocast(): 
        model = init_model(base_model, r, lora_alpha, target_modules=target_modules)
        model, loss = train_model(model, tokenizer, dataset, learning_rate, max_seq_length, max_steps)
    gpu_stats = torch.cuda.get_device_properties(0)
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")
    return model, loss

def test_model(model, max_new_tokens=128, questions=None):
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    questions = ["Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,",
                 "Что такое солнце?",
                 "Кто лучше водит - женщина или мужчина? И почему?",
                 "Кто такой дельфин?",
                 "Что такое производная?",
                 "Кто президент России?",
                 "Кто президент США?",
                 "Что такое БАК?",
                 "Как работают нейросети?",
                 "Перечисли достопримечательности Парижа",
                 "Перечисли греческих богов",
                 "Уныние - грех?",
                 "В чём смысл жизни?",
                 "Есть ли Бог?",
                 "Что лучше - ложная надежда или суровая истина?",
                 "Как сохранять оптимизм в любой ситуации?",
                 "Если дети - цветы жизни, то кто такие старики?",
                 "Уничтожит ли нас Искусственный Интеллект?",
                ]
    for question in questions:
        inputs = tokenizer.apply_chat_template([{"from": "human", "value": question}],
            tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
    
        outputs = model.generate(input_ids = inputs, max_new_tokens = max_new_tokens, use_cache = True)
        full_output = tokenizer.batch_decode(outputs)
        answer = full_output[0].split('|end_header_id|>\n\n')[-1].rstrip('<|eot_id|>')
        print('- ' + question, '- ' + answer + '\n', sep='\n')

def generate_question(chunk, n_rep=1):
    inputs = tokenizer.apply_chat_template( [{"system": "Ты ассистент, генерирующий вопросы для заданного текста.",\
                                          "from": "human", \
                                          "value": "Сформулируй на русском вопрос к этому предложению. Выведи только вопрос без дополнительных символов. Предложение: " + chunk}],
    tokenize = True, add_generation_prompt = True, return_tensors = "pt").to(base_model.device)
    questions = []
    for _ in range(n_rep):
        outputs = base_model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True)
        full_output = tokenizer.batch_decode(outputs)
        answer = full_output[0].split('|end_header_id|>\n\n')[-1].rstrip('<|eot_id|>')
        questions.append(answer)
    
    return questions[0] if len(questions) == 1 else questions
    
def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.635 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]Error while downloading from https://cdn-lfs-us-1.hf.co/repos/63/2f/632f843e563ee0151b0f0077a954dfae7dc1205035f42fdd950b9d2c3d7723ef/d8cf9c4d0dd972e1a2131bfe656235ee98221679711a3beef6d46dadf0f20b5c?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model-00001-of-00004.safetensors%3B+filename%3D%22model-00001-of-00004.safetensors%22%3B&Expires=1727627729&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyNzYyNzcyOX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzYzLzJmLzYzMmY4NDNlNTYzZWUwMTUxYjBmMDA3N2E5NTRkZmFlN2RjMTIwNTAzNWY0MmZkZDk1MGI5ZDJjM2Q3NzIzZWYvZDhjZjljNGQwZGQ5NzJlMWEyMTMxYmZlNjU2MjM1ZWU5ODIyMTY3OTcxMWEzYmVlZjZkNDZkYWRmMGYyMGI1Yz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=AwM8UqsRi4Py97zuP7OaAhORLQirKwWGJLcVcRwkkuRFkJ6hfCHpYIbT1gtaEkYJPNErNkByTmJJD3QxCNbyZheoCBjhDnBGpxB7oEoqTI9Qydfjz5B3K66gao04vk2Z6Hnj7ifmEPxMuGt6MZX00kvGen1jx0AGpCYwfSW%7E%7E

<h1>Загрузка данных в память</h1>

In [7]:
def generate_question(chunk, n_rep=1):
    inputs = tokenizer.apply_chat_template( [{"system": "Ты ассистент, генерирующий вопросы для заданного текста.",\
                                          "from": "human", \
                                          "value": "Сформулируй на русском вопрос к этому предложению. Выведи только вопрос без дополнительных символов. Предложение: " + chunk}],
    tokenize = True, add_generation_prompt = True, return_tensors = "pt").to(base_model.device)
    questions = []
    for _ in range(n_rep):
        outputs = base_model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True)
        full_output = tokenizer.batch_decode(outputs)
        answer = full_output[0].split('|end_header_id|>\n\n')[-1].rstrip('<|eot_id|>')
        questions.append(answer)
    
    return questions[0] if len(questions) == 1 else questions

from datasets import Dataset
from langchain.text_splitter import SpacyTextSplitter   

def prepare_chunk(chunk):
    n = len(chunk)
    n_old = n + 1
    while n < n_old:
        chunk = chunk.replace('\n\n', '\n')
        n, n_old = len(chunk), n
    chunk = chunk.replace('\n', ' ')
    return chunk
    
def dataset_from_text_file(file_names, chunk_sizes=[512, 1024]):
    if isinstance(file_names, str):
        file_names = [file_names]
    dataset = []
    for file_name in file_names:
        with open(file_name, 'r') as f:
            text = f.read()
        for chunk_size in chunk_sizes:
            text_splitter = SpacyTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size//4)
            block_size = 100_000
            text_len = len(text)
            for block_start in range(0, text_len, block_size):
                docs = text_splitter.split_text(text[block_start:min(text_len, block_start+block_size)])
                # docs[0] = docs[0][1:]
                for doc in docs:
                    answer = prepare_chunk(doc)
                    question = generate_question(answer)
                    dataset.append([{'content': question, 'role': 'user'}, {'content': answer, 'role': 'assistant'}])
    return Dataset.from_dict({'conversations': dataset})

def interview_dataset(file="../finetuning/int12.txt"):
    with open(file, 'r') as f:
        text = f.read()
    dataset = []
    for conv in text.split('\n\n'):
        if len(conv):
            roles = ['user', 'assistant']
            play = []
            for i_role, item in enumerate(conv.split('\n')):
                play.append({'content': item.strip('–\t').strip(' '), 'role': roles[i_role % 2]})
            dataset.append(play)    
    return Dataset.from_dict({'conversations': dataset})


In [9]:
# fine-tune on kapitza dataset
kapitza = interview_dataset()
kapitza = kapitza.map(formatting_prompts_func, batched = True,)
model, loss = run_experiment(base_model, tokenizer, kapitza, r=8, lora_alpha=8, learning_rate=1e-04, max_seq_length=256, max_steps=10, target_modules=[
    "q_proj", "k_proj", "v_proj",
            "o_proj", "gate_proj", "up_proj", "down_proj",
                          ])
test_model(model, max_new_tokens=256)
model.save_pretrained_gguf("kapitza_ft", tokenizer, quantization_method = "q4_k_m")

In [2]:
# ermak
from datasets import load_dataset, Dataset
from unsloth import apply_chat_template, standardize_sharegpt, to_sharegpt

def ru_en_qa():
    dataset = load_dataset("ERmak1581/ru_en_qa", split = "train")
    # dataset.cleanup_cache_files()
    for i, text in enumerate(dataset['text']):
        text = text.strip(' ').lstrip('<s>').rstrip('</s>')
        if '[user]' in text and '[assistant]' in text:
            conversations = []
            for item in text.split('[/assistant]'):
                item = item.strip(' ')
                if len(item) > 0:
                    u, a = item.split('[/user][assistant]')
                    u = u.lstrip('[user]')
                    u, a = u.strip(' '), a.strip(' ')
                    conversations.append({'content': u, 'role': 'user'})
                    conversations.append({'content': a, 'role': 'assistant'})
            yield {'conversations': conversations}

ermak = Dataset.from_generator(ru_en_qa)
ermak = ermak.map(formatting_prompts_func, batched = True,)

In [8]:
ermak['conversations'][4]

[{'content': 'нежность-это слабость?', 'role': 'user'},
 {'content': 'это достоинство, потому что граничит с любовью. а любовь это сила духа сливающаяся с богом',
  'role': 'assistant'}]

In [5]:
def dataset_stat(dataset):
    print("Dataset size:", len(dataset))
    print("Text max length:", max(len(item[-1]['content']) for item in dataset['conversations']))

dataset_stat(kapitza)
dataset_stat(ermak)

Dataset size: 458
Text max length: 506
Dataset size: 249848
Text max length: 34378


In [13]:
model, loss = run_experiment(base_model, tokenizer, kapitza, r=4, lora_alpha=4, learning_rate=1e-04, max_seq_length=256, max_steps=10, target_modules=[
    "q_proj", "k_proj", "v_proj",
            "o_proj", "gate_proj", "up_proj", "down_proj",
                          ])
test_model(model, max_new_tokens=256)

  with torch.cuda.amp.autocast():


Unsloth: You added custom modules, but Unsloth hasn't optimized for this.
Beware - your finetuning might be noticeably slower!
Unsloth: You added custom modules, but Unsloth hasn't optimized for this.
Beware - your finetuning might be noticeably slower!
Unsloth: You added custom modules, but Unsloth hasn't optimized for this.
Beware - your finetuning might be noticeably slower!


Map (num_proc=2):   0%|          | 0/458 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 458 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 10
 "-____-"     Number of trainable parameters = 10,485,760


Step,Training Loss
1,0.969
2,1.1034
3,1.5163
4,1.567
5,1.7656
6,1.79
7,2.0009
8,1.9734
9,1.9101
10,1.9441


GPU = NVIDIA GeForce RTX 4090. Max memory = 23.537 GB.
16.453 GB of memory reserved.
- Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,
- 13, 21, 34, 55, 89, 144, 233, 377, 610, 985, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578, 5702887, 9227465, 14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296, 433494437, 701408733, 1134903170, 1836311903, 2971215073, 4807526976, 7778742049, 12586269025, 20365011074, 32951280099, 53316291173, 86267571272, 139583862445, 225851433717, 365435296162, 591286729879, 956722026041, 1548008755920, 250473078

- Что такое солнце?
- Солнце – это звезда, которая находится на расстоянии в 149,6 миллиона километров от Земли. Это наша звезда, которая освещает нашу планету. И это звезда, которая является источником тепла и света для нашей планеты. И это звезда, которая была у нас на Земле всегда. Она была на Земле и будет на Земле всегда. И это звезда, которая явля

In [5]:
model.save_pretrained_gguf("kap_model", tokenizer, quantization_method = "q4_k_m")
# base_model

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 35.51 out of 62.55 RAM for saving.


 34%|█████████████████████████████████████████████████████████▍                                                                                                             | 11/32 [00:00<00:00, 52.38it/s]We will save to Disk and not RAM now.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:11<00:00,  2.84it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at kap_model into bf16 GGUF format.
The output location will be ./kap_model/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: kap_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 8192
INFO:hf-to-gguf:gguf: embedding length = 4096
INFO:hf-to-gguf:gguf: feed forward length = 14336
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-value head count = 8
INFO:hf-to-gguf:gguf: rope theta = 500000.0
INFO:hf-to-gguf:gguf: rms 

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Saved Ollama Modelfile to kap_model/Modelfile


In [9]:
# Неплохие русско-язычные датасеты
# https://huggingface.co/datasets/ERmak1581/ru_en_qa
# https://huggingface.co/datasets/IlyaGusev/habr
# https://huggingface.co/datasets/georgiyozhegov/habr-clean
# https://huggingface.co/datasets/IgorVolochay/russian_jokes (so-so)
# Книги
# https://huggingface.co/datasets/manu/project_gutenberg
# https://huggingface.co/datasets/rominf/flibusta
# Знания
# https://huggingface.co/datasets/artemsnegirev/ru-word-games
# https://huggingface.co/datasets/legacy-datasets/wikipedia
# https://huggingface.co/datasets/ParaPat/para_pat
# https://huggingface.co/datasets/under-tree/prepared-yagpt
# https://huggingface.co/datasets/d0rj/alpaca-cleaned-ru

# Большие датасеты
# https://huggingface.co/datasets/RussianNLP/russian_super_glue (вряд ли нам актуально)
# https://huggingface.co/datasets/PleIAs/YouTube-Commons (вряд ли нам актуально)

In [140]:
from datasets import load_dataset

flibusta = load_dataset("rominf/flibusta", books_query="Война и Мир")['train']
for item in flibusta:
    if item['author'] == 'Толстой Лев Николаевич':
        print(item['title'], item['url'], item['url_txt'])

Using the latest cached version of the module from /home/zipa/.cache/huggingface/modules/datasets_modules/datasets/rominf--flibusta/8b3da2fea054ad4fb16821e24482d708f2b5bad88fbdab49acf5ed052d9da381 (last modified on Wed Aug 28 12:27:58 2024) since it couldn't be found locally at rominf/flibusta, or remotely on the Hugging Face Hub.


Downloading data:   0%|          | 0.00/489 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

ValueError: Instruction "train" corresponds to no data!

In [134]:
import requests
from bs4 import BeautifulSoup
import re
import os

def store_text(input_url, output_file):
    response = requests.get(url=input_url)
    if response.status_code == 200:
        parsed = BeautifulSoup(response.text)
        baned = parsed.body.find('p').text.startswith('Некоторые возможности доступны только')
        # title = parsed.body.find('h1', attrs={'class': 'title'}) #.next_sibling
        if not baned:
            os.makedirs(os.path.split(output_file)[0], 0o777, True)
            with open(output_file, 'w+') as out:
                for content in parsed.body.find_all('p', attrs={'class': 'book'}):
                    chunk = content.text.replace('\xa0',' ')
                    chunk = re.sub('\[\ {0,}\d+\ {0,}\]', '', chunk)
                    out.write(chunk + '\n')    

In [143]:
body = BeautifulSoup(requests.get("https://flibusta.is/a/96797").text).body
books = ["https://flibusta.is" + item["href"] for item in body.find_all(href=re.compile("/b/\d+/read"))]
for book in books:
    book_id = book.split('/')[-2]
    store_text(book, "flibusta/" + book_id)

['flibusta/kap_books/177590',
 'flibusta/kap_books/25254',
 'flibusta/kap_books/279422',
 'flibusta/kap_books/334877',
 'flibusta/kap_books/339786',
 'flibusta/kap_books/341319',
 'flibusta/kap_books/389720',
 'flibusta/kap_books/477868']

In [16]:
# warAndPeace=dataset_from_text_file("flibusta/620066", chunk_sizes=[512]).map(formatting_prompts_func, batched = True,)
import os

books_folder = "flibusta/kap_books"
books = [os.path.join(books_folder, name) for name in os.listdir(books_folder)]
somebook = dataset_from_text_file(books, chunk_sizes=[512]).map(formatting_prompts_func, batched = True,)
model, loss = run_experiment(base_model, tokenizer, somebook, r=256, lora_alpha=256, learning_rate=1e-04, max_seq_length=2048, max_steps=10)
test_model(model, max_new_tokens=256)

Created a chunk of size 665, which is longer than the specified 512
Created a chunk of size 1000, which is longer than the specified 512
Created a chunk of size 529, which is longer than the specified 512
Created a chunk of size 635, which is longer than the specified 512
Created a chunk of size 515, which is longer than the specified 512
Created a chunk of size 803, which is longer than the specified 512
Created a chunk of size 686, which is longer than the specified 512
Created a chunk of size 580, which is longer than the specified 512
Created a chunk of size 585, which is longer than the specified 512
Created a chunk of size 523, which is longer than the specified 512
Created a chunk of size 775, which is longer than the specified 512
Created a chunk of size 762, which is longer than the specified 512
Created a chunk of size 615, which is longer than the specified 512
Created a chunk of size 549, which is longer than the specified 512
Created a chunk of size 527, which is longer th

Map:   0%|          | 0/8397 [00:00<?, ? examples/s]

  with torch.cuda.amp.autocast():


Map (num_proc=2):   0%|          | 0/8397 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8,397 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 10
 "-____-"     Number of trainable parameters = 671,088,640


Step,Training Loss
1,2.6505
2,2.3362
3,1.8387
4,1.8027
5,1.7329
6,1.7981
7,1.6093
8,1.698
9,1.5959
10,1.6492


GPU = NVIDIA GeForce RTX 4090. Max memory = 23.537 GB.
13.334 GB of memory reserved.
- Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,
- 13, 21, 34, 55, 89, 144, 233, 377, 610, 985, 1597, 2584, 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 352457, 5702887, 9227465, 14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267444597, 433494437, 701408733, 1134907133, 1836311903, 2971215073, 4807526976, 7778742049, 12586269025, 203650110744, 32951280099, 53316291173, 86267571272, 139583862445, 2258514337177, 365435296162, 591286729879, 956722026041, 1525870507132, 247473784

- Что такое солнце?
- Солнце — это звезда, которая является источником тепла и света на Земле. Солнце — это огромный шар из плазмы, температура его поверхности составляет порядка 5500 К. Вокруг этого шара движется корона — тёплая и светящаяся атмосфера, температура которой может достигать 1,5 млн К.

Солнце — это не только источник тепла и света, но и 

In [15]:
# test_model(model, max_new_tokens=256)
inputs = tokenizer.apply_chat_template([{"from": "human", "value": "Что такое солнце?"}],
    tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")

outputs = model.generate(input_ids=inputs, max_new_tokens=128, use_cache=True)
full_output = tokenizer.batch_decode(outputs)
answer = full_output[0].split('|end_header_id|>\n\n')[-1].rstrip('<|eot_id|>')
print(answer)

ValueError: The following `model_kwargs` are not used by the model: ['p'] (note: typos in the generate arguments will also show up in this list)

In [14]:
dict(model.generate.__dict__)

{'__wrapped__': <function unsloth.models.llama._wrap_fast_inference.<locals>._fast_generate(*args, **kwargs)>}

In [17]:
model.save_pretrained_gguf("kap_model", tokenizer, quantization_method = "q4_k_m")
# cd llama.cpp
# git checkout b3345
# git submodule update --init --recursive
# make clean
# make all -j
# git log -1

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 34.58 out of 62.55 RAM for saving.


 72%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                               | 23/32 [00:00<00:00, 48.53it/s]We will save to Disk and not RAM now.
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:03<00:00,  9.04it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at kap_model into bf16 GGUF format.
The output location will be ./kap_model/unsloth.BF16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: kap_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 8192
INFO:hf-to-gguf:gguf: embedding length = 4096
INFO:hf-to-gguf:gguf: feed forward length = 14336
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-value head count = 8
INFO:hf-to-gguf:gguf: rope theta = 500000.0
INFO:hf-to-gguf:gguf: rms 

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### We removed it in GGUF's chat template for you.


Unsloth: Conversion completed! Output location: ./kap_model/unsloth.Q4_K_M.gguf
Unsloth: Saved Ollama Modelfile to kap_model/Modelfile
