In [1]:
import os
from typing import List, Dict

# unsloth stays ahead of trl/transformers, as in the original import order.
from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer
from transformers import TrainingArguments
import pandas as pd
from datasets import Dataset

In [2]:
# Load the two summary training splits and stack them into one frame with a
# fresh 0..n-1 index.
TRAIN_CSV_PATHS = ("2st_summary_train.csv", "2.1st_summary_train.csv")
df1, df2 = (pd.read_csv(csv_path) for csv_path in TRAIN_CSV_PATHS)
df = pd.concat([df1, df2], ignore_index=True)

In [3]:
# Preview the first two rows to check the `text` / `summary` columns.
# NOTE(review): the recorded output also shows an "Unnamed: 0" column —
# presumably an index saved into the CSVs; confirm and consider dropping it.
df.head(2)

Unnamed: 0,text,summary
0,- —è –ø–æ–Ω—è–ª\n- —É –º–µ–Ω—è –≤–æ–ø—Ä–æ—Å\n- –µ—Å–ª–∏ —è –¥–µ–ª–∞—é —Å–∞–π...,"–ü–æ–ª–æ–∂–∏—Ç–µ–ª—å–Ω—ã–µ –∞—Å–ø–µ–∫—Ç—ã: –±–ª–∞–≥–æ–¥–∞—Ä–Ω–æ—Å—Ç—å, —É—Å–ø–µ—Ö–∏ –≤..."
1,- –Ω–µ —è —Å —Ç–µ–ª–∞ —É–∂–µ –∑–∞—à–µ–ª; 2024-03-09T14:08:05.3...,–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–µ –∞—Å–ø–µ–∫—Ç—ã: –æ—Ç—Å—É—Ç—Å—Ç–≤–∏–µ –≤—ã–ø–æ–ª–Ω–µ–Ω–∏—è –¥–æ–º–∞...


In [4]:
# Wrap the combined DataFrame in a `datasets.Dataset` so it can be mapped and passed to the trainer.
dataset = Dataset.from_pandas(df)

In [5]:

# --- Loader settings (reused by the trainer cell further down) ---------------
max_seq_length = 1024  # sequence length passed to both the loader and the trainer
dtype = None           # None = auto-detect; float16 on T4/V100, bfloat16 on Ampere+
load_in_4bit = True    # 4-bit quantization to reduce memory usage; may be False

# Base checkpoint: pre-quantized 4-bit Gemma 2B.
# (Gated models would additionally need a `token=` argument here.)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-2b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Layers that receive LoRA adapters: attention projections plus the MLP projections.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Attach the LoRA adapters to the base model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,                                  # LoRA rank; any value > 0 (8-128 suggested)
    lora_alpha=16,
    lora_dropout=0,                        # 0 is the optimized setting
    bias="none",                           # "none" is the optimized setting
    target_modules=lora_target_modules,
    use_gradient_checkpointing="unsloth",  # True or "unsloth" (lower-VRAM variant)
    random_state=3407,
    use_rslora=False,                      # rank-stabilized LoRA disabled
    loftq_config=None,                     # LoftQ disabled
)

==((====))==  Unsloth: Fast Gemma patching release 2024.4
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.691 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.0+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.24. FA = False.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Unsloth 2024.4 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


In [6]:
# Prompt template with two positional slots: the first receives the dialogue
# text ("Input"), the second the summary ("Response"). At inference time the
# second slot is filled with "" so the model writes the response itself.
llama_prompt = """
### Input:
{}

### Response:
{}"""


EOS_TOKEN = tokenizer.eos_token # End-of-sequence marker appended to every training prompt
def formatting_prompts_func(examples):
    """Build the training prompts for one batch of a batched ``Dataset.map``.

    Pairs each entry of ``examples["text"]`` with the entry of
    ``examples["summary"]`` at the same position, fills the module-level
    prompt template with them, and appends ``EOS_TOKEN`` to every prompt.

    Returns:
        A dict with the single key ``"prompt"`` mapping to the list of
        formatted strings, in input order.
    """
    dialogues = examples["text"]
    summaries = examples["summary"]
    # The EOS token must be appended, otherwise generation will go on forever.
    prompts = [
        llama_prompt.format(dialogue, summary) + EOS_TOKEN
        for dialogue, summary in zip(dialogues, summaries)
    ]
    return {"prompt": prompts}

# Run the formatter over the dataset in batches; the "prompt" key it returns
# is the column the trainer reads via dataset_text_field.
dataset = dataset.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/897 [00:00<?, ? examples/s]

In [7]:
# Choose the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # Schedule
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step
    # Optimizer
    optim="adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    weight_decay=0.01,
    # Precision
    fp16=not use_bf16,
    bf16=use_bf16,
    # Reporting
    logging_steps=1,
)

# Supervised fine-tuning on the "prompt" column produced by the formatting step.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="prompt",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # packing can make training faster for short sequences
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/897 [00:00<?, ? examples/s]

In [8]:
# Run the fine-tuning loop; the object returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 897 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 224
 "-____-"     Number of trainable parameters = 19,611,648


Step,Training Loss
1,3.2099
2,2.1826
3,3.0836
4,2.4303
5,2.9705
6,2.5988
7,1.6849
8,2.489
9,2.2226
10,2.2755


In [9]:
messages = ['–¥–æ–ø–æ–ª–Ω–µ–Ω–∏–µ –∫ –∫–∞—Ç–µ', 
'–ø–æ–∏–≥—Ä–∞—Ç—å', 
'—è –º–æ–≥—É –Ω–µ –Ω–∞–¥–æ–ª–≥–æ –æ—Ç–æ–π—Ç–∏?', 
'–Ω–∞ –∫–æ–º–ø—å—é—Ç–µ—Ä–µ —Å—Ç–∏–∫–µ—Ä—ã —Å—Ç—Ä–∞–Ω–Ω—ã–µ –ø–æ—á–µ–º—É —Ç–∞–∫', 
'–ê —É –º–µ–Ω—è –∏–Ω—Ç–µ—Ä–Ω–µ—Ç —Å–µ–≥–æ–¥–Ω—è –ø—Ä—è–º –≤ —É–¥–∞—Ä–µ', 
'–º–æ–∂–Ω–æ –≤ –∫—Ä–æ–∫–æ–¥–∏–ª–∞ –ø–æ–∏–≥—Ä–∞—Ç—å', 
'–ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ —á—Ç–æ –∑–∞ —Å—Ç–∏–∫–µ—Ä', 
'—Ä–µ–∞–ª—å–Ω–æ, –æ–∫–∞–Ω—Ç–æ–≤–∫–∞ —Å–ª–∏—à–∫–æ–º –±–æ–ª—å—à–∞—è', 
'–Ω–∞ —Ç–µ–ª–µ—Ñ–æ–Ω–µ –æ–Ω–∏ –∫–∞–∫–∏–µ-—Ç–æ –∞–∫–∫—É—Ä–∞—Ç–Ω–µ–Ω—å–∫–∏–µ, –∞ —Ç—É—Ç —Ç—è–ø –ª—è–ø –∫–∞–∫–æ–π —Ç–æ', 
'–∫–æ—Ä–º', 
'—ç–º–æ–¥–∑–∏', 
'–≤–æ—Ä', 
'–ß–µ–ª–æ–≤–µ–∫', 
'–ø—É—Ö–ª—è—à—å', 
'–ª—É–Ω—Ç–∏–∫', 
'—á–µ–ª–æ–≤–µ–∫ –ø—å–µ—Ç –∫–æ—Ñ–µ', 
'—à–ª–µ–ø–∞' 
]

def create_text_message(messages: List[str]) -> str:
    """Render chat messages as a dash-prefixed list, one message per line.

    Every message is emitted as "- <message>" followed by a newline, in the
    order given. An empty list produces an empty string.
    """
    return "".join(f"- {message}\n" for message in messages)

In [10]:
# Switch the fine-tuned model to Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)

In [11]:
# Preview the exact prompt text that the generation cell feeds to the model.
print(llama_prompt.format(
    create_text_message(messages),
    "", # output - leave this blank for generation!
))


### Input:
- –¥–æ–ø–æ–ª–Ω–µ–Ω–∏–µ –∫ –∫–∞—Ç–µ
- –ø–æ–∏–≥—Ä–∞—Ç—å
- —è –º–æ–≥—É –Ω–µ –Ω–∞–¥–æ–ª–≥–æ –æ—Ç–æ–π—Ç–∏?
- –Ω–∞ –∫–æ–º–ø—å—é—Ç–µ—Ä–µ —Å—Ç–∏–∫–µ—Ä—ã —Å—Ç—Ä–∞–Ω–Ω—ã–µ –ø–æ—á–µ–º—É —Ç–∞–∫
- –ê —É –º–µ–Ω—è –∏–Ω—Ç–µ—Ä–Ω–µ—Ç —Å–µ–≥–æ–¥–Ω—è –ø—Ä—è–º –≤ —É–¥–∞—Ä–µ
- –º–æ–∂–Ω–æ –≤ –∫—Ä–æ–∫–æ–¥–∏–ª–∞ –ø–æ–∏–≥—Ä–∞—Ç—å
- –ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ —á—Ç–æ –∑–∞ —Å—Ç–∏–∫–µ—Ä
- —Ä–µ–∞–ª—å–Ω–æ, –æ–∫–∞–Ω—Ç–æ–≤–∫–∞ —Å–ª–∏—à–∫–æ–º –±–æ–ª—å—à–∞—è
- –Ω–∞ —Ç–µ–ª–µ—Ñ–æ–Ω–µ –æ–Ω–∏ –∫–∞–∫–∏–µ-—Ç–æ –∞–∫–∫—É—Ä–∞—Ç–Ω–µ–Ω—å–∫–∏–µ, –∞ —Ç—É—Ç —Ç—è–ø –ª—è–ø –∫–∞–∫–æ–π —Ç–æ
- –∫–æ—Ä–º
- —ç–º–æ–¥–∑–∏
- –≤–æ—Ä
- –ß–µ–ª–æ–≤–µ–∫
- –ø—É—Ö–ª—è—à—å
- –ª—É–Ω—Ç–∏–∫
- —á–µ–ª–æ–≤–µ–∫ –ø—å–µ—Ç –∫–æ—Ñ–µ
- —à–ª–µ–ø–∞


### Response:



In [12]:
 # Enable native 2x faster inference
# Build the prompt once; the response slot stays empty so the model completes it.
prompt = llama_prompt.format(create_text_message(messages), "")

# Tokenize as a batch of one and move the tensors to the GPU.
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

# Special tokens are not stripped, so <bos>/<eos> remain visible in the displayed text.
tokenizer.batch_decode(outputs)

['<bos>\n### Input:\n- –¥–æ–ø–æ–ª–Ω–µ–Ω–∏–µ –∫ –∫–∞—Ç–µ\n- –ø–æ–∏–≥—Ä–∞—Ç—å\n- —è –º–æ–≥—É –Ω–µ –Ω–∞–¥–æ–ª–≥–æ –æ—Ç–æ–π—Ç–∏?\n- –Ω–∞ –∫–æ–º–ø—å—é—Ç–µ—Ä–µ —Å—Ç–∏–∫–µ—Ä—ã —Å—Ç—Ä–∞–Ω–Ω—ã–µ –ø–æ—á–µ–º—É —Ç–∞–∫\n- –ê —É –º–µ–Ω—è –∏–Ω—Ç–µ—Ä–Ω–µ—Ç —Å–µ–≥–æ–¥–Ω—è –ø—Ä—è–º –≤ —É–¥–∞—Ä–µ\n- –º–æ–∂–Ω–æ –≤ –∫—Ä–æ–∫–æ–¥–∏–ª–∞ –ø–æ–∏–≥—Ä–∞—Ç—å\n- –ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ —á—Ç–æ –∑–∞ —Å—Ç–∏–∫–µ—Ä\n- —Ä–µ–∞–ª—å–Ω–æ, –æ–∫–∞–Ω—Ç–æ–≤–∫–∞ —Å–ª–∏—à–∫–æ–º –±–æ–ª—å—à–∞—è\n- –Ω–∞ —Ç–µ–ª–µ—Ñ–æ–Ω–µ –æ–Ω–∏ –∫–∞–∫–∏–µ-—Ç–æ –∞–∫–∫—É—Ä–∞—Ç–Ω–µ–Ω—å–∫–∏–µ, –∞ —Ç—É—Ç —Ç—è–ø –ª—è–ø –∫–∞–∫–æ–π —Ç–æ\n- –∫–æ—Ä–º\n- —ç–º–æ–¥–∑–∏\n- –≤–æ—Ä\n- –ß–µ–ª–æ–≤–µ–∫\n- –ø—É—Ö–ª—è—à—å\n- –ª—É–Ω—Ç–∏–∫\n- —á–µ–ª–æ–≤–µ–∫ –ø—å–µ—Ç –∫–æ—Ñ–µ\n- —à–ª–µ–ø–∞\n\n\n### Response:\n–ù–µ–≥–∞—Ç–∏–≤–Ω—ã–µ –∞—Å–ø–µ–∫—Ç—ã: —Ç–µ—Ö–Ω–∏—á–µ—Å–∫–∏–µ –Ω–µ–ø–æ–ª–∞–¥–∫–∏ —Å –∏–Ω—Ç–µ—Ä–Ω–µ—Ç–æ–º, –ø—Ä–æ–±–ª–µ–º—ã —Å –∫–∞—á–µ—Å—Ç–≤–æ–º —Å—Ç–∏–∫–µ—Ä–æ–≤ –Ω–∞ —Ç–µ–ª–µ—Ñ–æ–Ω–µ.<eos>']

In [None]:
# Persist the fine-tuned model to the local directory "lora_gemma_model_v3".
model.save_pretrained("lora_gemma_model_v3") # Local saving
# Optional Hub upload, left disabled. If enabled, supply the token from the
# environment rather than as a string literal.
# model.push_to_hub("hf_repo/lora_model", token = "hf_") # Online saving

In [15]:
# Export the model and tokenizer to GGUF with q4_k_m quantization under "gemma_gguf_q4_v5".
# NOTE(review): the recorded output ends in KeyboardInterrupt, so this export
# was apparently cancelled — re-run to completion before relying on the local files.
model.save_pretrained_gguf("gemma_gguf_q4_v5", tokenizer, quantization_method = "q4_k_m")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 195.45 out of 251.77 RAM for saving.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:00<00:00, 100.92it/s]

Unsloth: Saving tokenizer...




 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
make: Entering directory '/workspace/llama.cpp'
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE 
I NVCCFLAGS: -std=c++11 -O3 
I LDFLAGS:    
I CC:        cc (Ubuntu 11.

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f9d4f48b460>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f9d4f48b460>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 195.23 out of 251.77 RAM for saving.


 39%|‚ñà‚ñà‚ñà‚ñâ      | 7/18 [00:00<00:00, 103.56it/s]
Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f9d4f48b460>>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 

KeyboardInterrupt



In [14]:
# Publish the q4_k_m GGUF export to the Hugging Face Hub.
#
# The access token is read from the HF_TOKEN environment variable so that it
# never appears in the notebook source or its saved outputs.
# SECURITY: a literal token was previously committed in this cell; treat it as
# compromised and revoke it in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    # Fail before the lengthy merge/convert step rather than at upload time.
    raise RuntimeError(
        "HF_TOKEN is not set. Export a Hugging Face write token in the "
        "HF_TOKEN environment variable before running this cell."
    )

model.push_to_hub_gguf("gromoboy/gemma_gguf_v2", tokenizer, quantization_method = "q4_k_m",
                       token = hf_token)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 195.49 out of 251.77 RAM for saving.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 18/18 [00:00<00:00, 101.82it/s]

Unsloth: Saving tokenizer...




 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GUUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to q4_k_m will take 20 minutes.
 "-____-"     In total, you will have to wait around 26 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: llama.cpp error code = 0.
Unsloth will DELETE the broken directory and install a new one.
Press CTRL + C / cancel this if this is wrong. We shall wait 10 seconds.



Cloning into 'llama.cpp'...


HEAD is now at fa0b4ad2 cmake : remove obsolete ANDROID check
make: Entering directory '/workspace/llama.cpp'
I ccache not found. Consider installing it for faster compilation.
I llama.cpp build info: 
I UNAME_S:   Linux
I UNAME_P:   x86_64
I UNAME_M:   x86_64
I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion 
I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE 
I NVCCFLAGS: -std=c++11 -O3 
I LDFLAGS:    
I CC:        cc (Ubuntu 11.4.0-1ubuntu1

gemma_gguf_v2-unsloth.Q4_K_M.gguf:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/gromoboy/gemma_gguf_v2


In [18]:
# Show the text form of the tokenizer's end-of-sequence token.
# NOTE(review): the recorded output here is '</s>', while the generation output
# earlier in the notebook ends with '<eos>' — this cell was possibly run with a
# different tokenizer loaded; re-run on a fresh kernel to confirm.
tokenizer.decode(tokenizer.eos_token_id)

'</s>'

Now, if you want to load the LoRA adapters we just saved for inference, change `False` to `True`:

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)
2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)
3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)
5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗 HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!
7. `ChatML` for ShareGPT datasets, [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing)
8. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)
9. Gemma 6 trillion tokens is 2.5x faster! [free Colab](https://colab.research.google.com/drive/10NbwlsRChbma1v55m8LAPYG15uQv6HLo?usp=sharing)

<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/u54VK8m8tk"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://ko-fi.com/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Kofi%20button.png" width="145"></a> Support our work if you can! Thanks!
</div>