In [1]:
%%capture
!mamba install --force-reinstall aiohttp -y
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0

import os
os.environ["WANDB_DISABLED"] = "true"

## Load base model

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 8000 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

base_llama_3 = "pookie3000/llama-3-8b-bnb-4bit-for-chat-training"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = base_llama_3,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit
)

#### (If you add new special tokens, you also need to train embedding I think.... (experimental))

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = max_seq_length,
)

Unsloth: Offloading input_embeddings to disk to save VRAM
Unsloth: Offloading output_embeddings to disk to save VRAM


Unsloth 2024.6 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Casting embed_tokens to float32
Unsloth: Casting lm_head to float32


#### (If you do not add special tokens)

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = max_seq_length,
)

## Load dataset

In [4]:
from datasets import load_dataset

def formatting_prompts_func(examples):
    convos = examples["conversations"]
    texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
    return { "text" : texts, }
pass

hf_dataset_path = "pookie3000/trump-interviews"
dataset = load_dataset(hf_dataset_path, split="train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

Downloading readme:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/316k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/348 [00:00<?, ? examples/s]

In [5]:
print(dataset[20]["conversations"])
print("----------------------------------------------")
print(dataset[20]["text"])

[{'role': 'user', 'content': 'But he doesn’t have the authority to do that as you know.'}, {'role': 'assistant', 'content': 'What? Okay, are you ready? Let’s have this now, just now, because it’s interest – let’s keep it interesting, right? I like Mike Pence very much. He’s a very fine man. He’s a very nice man. He made a mistake. His lawyer said, you cannot move. I call them the human conveyer belt. I said, even if the votes you mean – I talked to his lawyer – even if the votes are absolutely fraudulent, he can’t send – yes, sir, he can’t send them back. And the Democrats played it and the RINOs played it. And then, the election was over. They told them he couldn’t do it. And Mike said to me, I can’t do it. The lawyers told me I can’t do it, they can’t do it. But the lawyers were wrong because right after the election, they all met, the RINOs and the Democrats, and they worked out a plan to make sure that future vice presidents don’t do what I said you could do.'}, {'role': 'user', 'c

## Train

### Normal

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 2,
        #max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

### If you train new embeddings (you need to lower batch size and gradient accum steps)

In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1,
        warmup_steps = 5,
        num_train_epochs = 5,
        #max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
    ),
)

In [None]:
trainer_stats = trainer.train()

## Inference

In [39]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"role": "user", "content": "Is Joe Biden competent as president?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 2000, use_cache = True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Is Joe Biden competent as president?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

I think so. I think so. He’s a very man of God. He’s a very good man. I think he’s a very good president. I’d like to see the country do better. And we have the same thoughts in that. We want to see the country do better. We want to see the country rocking and rolling. And we can do it. We can do it.<|eot_id|>


In [11]:
hf_token = "YOUR_HF_TOKEN"
model.push_to_hub_merged("pookie3000/trump_lora", tokenizer, save_method = "lora", token = hf_token)

Unsloth: Saving LoRA adapters. Please wait...


adapter_model.safetensors:   0%|          | 0.00/4.37G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Saved lora model to https://huggingface.co/pookie3000/trump_lora
