In [8]:
# Checkpoint passed to FastLanguageModel.from_pretrained below; the trailing
# comment keeps the name of the alternative checkpoint.
model_name = 'unsloth/llama-3-8b-Instruct-bnb-4bit' #'unsloth/llama-3-8b-Instruct'
# NOTE(review): the checkpoint name ends in "bnb-4bit" while load_in_4bit is
# False -- confirm this combination is intended.
load_in_4bit = False


# functions to train and test
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template

# Load the base checkpoint and its tokenizer using the settings chosen above.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 8192,
    dtype = None,
    load_in_4bit = load_in_4bit,
)

# Swap in the "llama-3" chat template, remapped to ShareGPT-style field and
# role names ("from"/"value", "human"/"gpt").
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",
    mapping = {
        "role" : "from",
        "content" : "value",
        "user" : "human",
        "assistant" : "gpt",
    }, # ShareGPT style
)

# Put the freshly loaded model into Unsloth's inference mode.
FastLanguageModel.for_inference(base_model)

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

from unsloth import FastLanguageModel
import torch

def init_model(base_model, r=16, lora_alpha=16, target_modules=[
            "q_proj", "k_proj", "v_proj",
            "o_proj", "gate_proj", "up_proj", "down_proj",
                          ]):
    """Attach LoRA adapters to ``base_model`` and return the resulting model.

    Args:
        base_model: Model handed to ``FastLanguageModel.get_peft_model``.
        r: LoRA rank.
        lora_alpha: LoRA alpha.
        target_modules: Names of the modules that receive adapters.

    Returns:
        Whatever ``FastLanguageModel.get_peft_model`` returns for these settings.
    """
    # Everything except the three parameters above is fixed for every run.
    lora_settings = dict(
        r = r,
        target_modules = target_modules,
        lora_alpha = lora_alpha,
        lora_dropout = 0,
        bias = "none",
        use_gradient_checkpointing = "unsloth",
        random_state = 3407,
        use_rslora = True,   # rank-stabilized LoRA
        loftq_config = None,
    )
    return FastLanguageModel.get_peft_model(base_model, **lora_settings)

def train_model(model, tokenizer, dataset, learning_rate=1e-4, max_seq_length=2048, max_steps=60):
    """Run supervised fine-tuning with ``SFTTrainer`` and report the training loss.

    Args:
        model: Model to train.
        tokenizer: Tokenizer handed to the trainer.
        dataset: Training dataset; its ``"text"`` field is used as the input text.
        learning_rate: Optimizer learning rate.
        max_seq_length: Maximum sequence length given to the trainer.
        max_steps: Number of optimisation steps; warm-up is ``max_steps // 12``.

    Returns:
        ``(model, training_loss)`` where ``training_loss`` is read from the
        object returned by ``trainer.train()``.
    """
    # Exactly one of fp16 / bf16 is switched on, depending on hardware support.
    bf16_ok = is_bfloat16_supported()

    training_args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = max_steps // 12,
        max_steps = max_steps,
        learning_rate = learning_rate,
        fp16 = not bf16_ok,
        bf16 = bf16_ok,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    )

    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        packing = False,
        args = training_args,
    )

    final_loss = trainer.train().training_loss
    return model, final_loss

def run_experiment(base_model, tokenizer, dataset, r, lora_alpha, learning_rate, max_seq_length, max_steps, target_modules):
    """Build LoRA adapters, train them, and print a short GPU memory report.

    Adapter creation (``init_model``) and training (``train_model``) both run
    inside a ``torch.amp.autocast('cuda')`` context. Afterwards the name and
    total memory of CUDA device 0 are printed together with the value of
    ``torch.cuda.max_memory_reserved()``, all converted to GB and rounded to
    three decimals.

    Returns:
        ``(model, loss)`` exactly as produced by ``train_model``.
    """
    def _as_gb(n_bytes):
        # bytes -> GB (1024-based), rounded for display
        return round(n_bytes / 1024 / 1024 / 1024, 3)

    with torch.amp.autocast('cuda'):
        peft_model = init_model(base_model, r, lora_alpha, target_modules=target_modules)
        peft_model, final_loss = train_model(
            peft_model, tokenizer, dataset, learning_rate, max_seq_length, max_steps
        )

    device_props = torch.cuda.get_device_properties(0)
    # Read after training has finished, so this is the peak reserved so far.
    reserved_gb = _as_gb(torch.cuda.max_memory_reserved())
    total_gb = _as_gb(device_props.total_memory)

    print(f"GPU = {device_props.name}. Max memory = {total_gb} GB.")
    print(f"{reserved_gb} GB of memory reserved.")
    return peft_model, final_loss

def test_model(model, max_new_tokens=128, questions=None):
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    questions = ["Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,",
                 "Что такое солнце?",
                 "Кто лучше водит - женщина или мужчина? И почему?",
                 "Кто такой дельфин?",
                 "Что такое производная?",
                 "Кто президент России?",
                 "Кто президент США?",
                 "Что такое БАК?",
                 "Как работают нейросети?",
                 "Перечисли достопримечательности Парижа",
                 "Перечисли греческих богов",
                 "Уныние - грех?",
                 "В чём смысл жизни?",
                 "Есть ли Бог?",
                 "Что лучше - ложная надежда или суровая истина?",
                 "Как сохранять оптимизм в любой ситуации?",
                 "Если дети - цветы жизни, то кто такие старики?",
                 "Уничтожит ли нас Искусственный Интеллект?",
                ]
    for question in questions:
        inputs = tokenizer.apply_chat_template([{"from": "human", "value": question}],
            tokenize = True, add_generation_prompt = True, return_tensors = "pt").to("cuda")
    
        outputs = model.generate(input_ids = inputs, max_new_tokens = max_new_tokens, use_cache = True)
        full_output = tokenizer.batch_decode(outputs)
        answer = full_output[0].split('|end_header_id|>\n\n')[-1].rstrip('<|eot_id|>')
        print('- ' + question, '- ' + answer + '\n', sep='\n')

def generate_question(chunk, n_rep=1):
    """Ask the module-level ``base_model`` to write a question about ``chunk``.

    The prompt is a system turn followed by a human turn, both in the
    ShareGPT-style ``from``/``value`` format used elsewhere in this notebook,
    rendered with the module-level ``tokenizer``.

    Args:
        chunk: Text for which a question should be generated.
        n_rep: Number of generations to run for the same prompt.

    Returns:
        A single string when exactly one question was generated, otherwise the
        list of generated questions (empty when ``n_rep`` is 0).
    """
    # The system instruction must be its own message. As an extra "system" key
    # on the human turn it is never read by the chat template and is dropped.
    conversation = [
        {"from": "system",
         "value": "Ты ассистент, генерирующий вопросы для заданного текста."},
        {"from": "human",
         "value": "Сформулируй на русском вопрос к этому предложению. Выведи только вопрос без дополнительных символов. Предложение: " + chunk},
    ]
    inputs = tokenizer.apply_chat_template(conversation,
        tokenize = True, add_generation_prompt = True, return_tensors = "pt").to(base_model.device)

    eot_marker = '<|eot_id|>'
    questions = []
    for _ in range(n_rep):
        outputs = base_model.generate(input_ids = inputs, max_new_tokens = 128, use_cache = True)
        full_output = tokenizer.batch_decode(outputs)
        answer = full_output[0].split('|end_header_id|>\n\n')[-1]
        # Remove the end-of-turn marker as a suffix. str.rstrip() would treat it
        # as a set of characters and also eat trailing letters of the answer.
        if answer.endswith(eot_marker):
            answer = answer[:-len(eot_marker)]
        questions.append(answer)

    return questions[0] if len(questions) == 1 else questions
    
# Role names expected by the chat template after the ShareGPT remapping that is
# applied to the tokenizer when it is created in this notebook.
_SHAREGPT_ROLE_NAMES = {"user": "human", "assistant": "gpt"}


def _as_sharegpt_message(message):
    """Return ``message`` in ShareGPT form (``from``/``value`` keys).

    Messages that already carry a ``from`` key, or that have no ``role`` key,
    are returned unchanged. Otherwise ``role``/``content`` are renamed to
    ``from``/``value`` and the ``user``/``assistant`` role names are translated
    to ``human``/``gpt``; any other keys are copied as they are.
    """
    if "from" in message or "role" not in message:
        return message
    converted = {key: val for key, val in message.items() if key not in ("role", "content")}
    role = message["role"]
    converted["from"] = _SHAREGPT_ROLE_NAMES.get(role, role)
    converted["value"] = message.get("content")
    return converted


def formatting_prompts_func(examples):
    """Render each conversation in a batch to a single training string.

    Intended for ``Dataset.map(..., batched=True)``: ``examples["conversations"]``
    is a list of conversations, each a list of message dicts. Messages written
    as ``role``/``content`` are first converted to the ShareGPT-style
    ``from``/``value`` keys that the module-level ``tokenizer`` is configured
    for in this notebook, so both message styles can be formatted.

    Returns:
        ``{"text": [...]}`` with one rendered string per conversation.
    """
    texts = []
    for convo in examples["conversations"]:
        normalised = [_as_sharegpt_message(message) for message in convo]
        texts.append(
            tokenizer.apply_chat_template(normalised, tokenize = False, add_generation_prompt = False)
        )
    return { "text" : texts, }

==((====))==  Unsloth 2024.9.post3: Fast Llama patching. Transformers = 4.45.0.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.635 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


RuntimeError: Failed to import transformers.models.falcon_mamba.configuration_falcon_mamba because of the following error (look up to see its traceback):
No module named 'transformers.models.falcon_mamba.configuration_falcon_mamba'

In [3]:
# NOTE(review): no cell above this one assigns `model`; this call only works
# after a cell that binds `model` has been executed in the same kernel.
test_model(model, max_new_tokens=256)

Collecting unsloth
  Downloading unsloth-2024.10.1-py3-none-any.whl.metadata (56 kB)
Collecting unsloth-zoo (from unsloth)
  Downloading unsloth_zoo-2024.10.1-py3-none-any.whl.metadata (48 kB)
Collecting transformers<4.45.0 (from unsloth)
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers<4.45.0->unsloth)
  Using cached tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading unsloth-2024.10.1-py3-none-any.whl (161 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Downloading unsloth_zoo-2024.10.1-py3-none-any.whl (39 kB)
Using cached tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, transformers, unsloth-zoo, unsloth
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.20.0
    Uninstalling tokenizers-0.20.0:
      Successfully uninstalled tokenize

In [None]:
from datasets import Dataset
def interview_dataset(file="../finetuning/int12.txt"):
    """Read an interview transcript and return it as a conversations dataset.

    The file is split into conversations on blank lines. Within a conversation
    each non-empty line is one turn; turns alternate ``user``, ``assistant``,
    ``user``, ... starting from the first non-empty line. Leading and trailing
    en dashes, tabs and spaces are stripped from every turn.

    Args:
        file: Path to the UTF-8 encoded transcript.

    Returns:
        ``Dataset.from_dict({'conversations': [...]})`` where each conversation
        is a list of ``{'content': str, 'role': str}`` dicts.
    """
    # The transcript contains non-ASCII text, so do not rely on the platform's
    # default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()

    roles = ['user', 'assistant']
    dataset = []
    for conv in text.split('\n\n'):
        # Drop empty lines (trailing newline at end of file, extra blank lines
        # between conversations) before assigning roles. Otherwise they become
        # empty turns and shift the user/assistant alternation.
        turns = []
        for item in conv.split('\n'):
            cleaned = item.strip('–\t').strip(' ')
            if cleaned:
                turns.append(cleaned)
        if not turns:
            continue
        play = [{'content': turn, 'role': roles[i_role % 2]} for i_role, turn in enumerate(turns)]
        dataset.append(play)
    return Dataset.from_dict({'conversations': dataset})

In [None]:
# fine-tune on kapitza dataset
# Load the transcript and render every conversation into a "text" column.
kapitza = interview_dataset().map(formatting_prompts_func, batched = True)

# Short LoRA run; the adapter target modules are listed explicitly.
model, loss = run_experiment(
    base_model,
    tokenizer,
    kapitza,
    r = 8,
    lora_alpha = 8,
    learning_rate = 1e-04,
    max_seq_length = 256,
    max_steps = 10,
    target_modules = [
        "q_proj", "k_proj", "v_proj",
        "o_proj", "gate_proj", "up_proj", "down_proj",
    ],
)

# Print sample answers from the tuned model, then export it as GGUF.
test_model(model, max_new_tokens=256)
model.save_pretrained_gguf("kapitza_ft", tokenizer, quantization_method = "q4_k_m")

In [1]:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "unsloth/Llama-3.2-11B-Vision-Instruct" #"meta-llama/Llama-3.2-11B-Vision-Instruct"

# NOTE: this rebinds the notebook-global `model`, which the fine-tuning cell
# also assigns.
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
# Bound the download time and fail on HTTP errors, so a stalled or failed
# request raises a clear requests exception instead of hanging the cell or
# surfacing later as an image-decoding error.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# One user turn holding an image placeholder followed by the text prompt.
messages = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt"
).to(model.device)

output = model.generate(**inputs, max_new_tokens=30)
print(processor.decode(output[0]))

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>If I had to write a haiku for this one, it would be: <|eot_id|><|start_header_id|>assistant<|end_header_id|>

Here is a haiku for the image:

Rabbit in a coat
Standing on a dirt path
Looking very dapper<|eot_id|>


In [6]:
# Text-only request: the user turn has no image entry and None is passed to the
# processor in the image position.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Что ждёт ИИ через 50 лет?"},
        ],
    },
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(None, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)

# Generate the reply and print the full decoded sequence, special tokens included.
output = model.generate(**inputs, max_new_tokens=256)
print(processor.decode(output[0]))

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Что ждёт ИИ через 50 лет?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Прогнозирование развития ИИ на долгосрочную перспективу сложно, поскольку оно зависит от множества факторов, включая инновации, инвестиции, политические решения и социальные факторы. Однако, основываясь на последних достижениях и тенденциях, можно сделать некоторые предположения о том, что может ожидать ИИ через 50 лет.

**Рост интеллектуальной способности**: ИИ может стать более интеллектуально способным и способным решать сложные задачи, подобно человеческому интеллекту. Это может включать в себя улучшение способности к самообучению, саморазвитию и принятию решений.

**Рост автономности**: ИИ может стать более автономным и способным действовать без прямого человеческого контроля. Это может включать в себя улучшение способности к принятию решений, планированию и выполнению задач без человеческого вмешательства.

**Рост визуализации и интерактивно