<a href="https://colab.research.google.com/github/wayneotemah/Eng2Sheng-notebooks/blob/main/sheng_finetune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports and setup

In [None]:
%%capture
# Install the training stack. `%%capture` (which must stay the first line of
# the cell) hides pip's output.
# 1) Unsloth, installed straight from its GitHub repository with the
#    `colab-new` extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# 2) Companion libraries, installed with --no-deps so pip does not resolve their
#    own dependency trees. xformers and trl carry upper version bounds;
#    peft, accelerate and bitsandbytes are unpinned.
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
# 3) Hugging Face datasets/transformers (unpinned).
!pip install datasets transformers

In [None]:
from unsloth import FastLanguageModel
import torch
# Shared loading settings; later cells (trainer setup, model reload) read these names.
max_seq_length = 2048   # maximum sequence length handed to the loader and the trainer
dtype = None            # no explicit dtype requested (presumably auto-selected by Unsloth; confirm)
load_in_4bit = True     # load the 4-bit quantized weights

# Fetch the pre-quantized Llama-3 8B checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

## We now add LoRA adapters so we only need to update 1 to 10% of all parameters

In [None]:
# Wrap the base model with LoRA adapters; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    # Layers that receive adapters: the attention projections (q/k/v/o) and
    # the MLP projections (gate/up/down).
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    # LoRA hyper-parameters.
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # Training/runtime options.
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## Data Prep

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset, Dataset


In [None]:
# Download the English/Sheng parallel corpus from the Hugging Face Hub.
ds = load_dataset("EgesaWO/Sheng_sml")

Downloading readme:   0%|          | 0.00/183 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/247k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1042 [00:00<?, ? examples/s]

In [None]:
# Display the loaded DatasetDict (splits, column names, row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['English', 'Sheng'],
        num_rows: 1042
    })
})

In [None]:
# Alpaca-style prompt template with a fixed translation instruction.
# The two `{}` slots are filled positionally via `.format(...)`:
#   1st slot -> the English source text (the "Input" section)
#   2nd slot -> the Sheng translation (the "Response" section); the inference
#               cells pass "" here so the model generates the response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
translate the following from english into sheng.

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the tokenizer; formatting_prompts_func
# appends it to every training example.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples, prompt_template=None, eos_token=None):
    """Build the training text for a batch of English/Sheng pairs.

    Intended for `Dataset.map(..., batched=True)`: `examples` is a mapping of
    column name to a list of values.

    Args:
        examples: Batch with parallel "English" and "Sheng" lists.
        prompt_template: Format string with two positional `{}` slots
            (source text, then translation). Defaults to the notebook-level
            `alpaca_prompt`.
        eos_token: String appended to every formatted example. Defaults to
            the notebook-level `EOS_TOKEN`.

    Returns:
        dict with a single "text" key holding one formatted string per pair.
    """
    # Resolve the notebook-level defaults at call time so the function still
    # works when called with the batch alone (as `ds.map` does).
    if prompt_template is None:
        prompt_template = alpaca_prompt
    if eos_token is None:
        eos_token = EOS_TOKEN

    # The EOS token must terminate every example, otherwise generation will
    # go on forever. Loop names avoid shadowing the builtin `input`.
    texts = [
        prompt_template.format(english, sheng) + eos_token
        for english, sheng in zip(examples["English"], examples["Sheng"])
    ]
    return {"text": texts}

In [None]:
# Add a "text" column holding the fully formatted prompt for every row
# (batched, so the formatter receives lists of values per column).
dataset = ds.map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1042 [00:00<?, ? examples/s]

In [None]:
# Display the mapped DatasetDict; it should now list the extra 'text' column.
dataset

DatasetDict({
    train: Dataset({
        features: ['English', 'Sheng', 'text'],
        num_rows: 1042
    })
})

In [None]:
# Spot-check the first row: 'text' should contain the filled-in prompt and end
# with the tokenizer's EOS token.
dataset["train"][0]

{'English': "Meanwhile, Saul was still breathing out murderous threats against the Lord's disciples. He went to the high priest",
 'Sheng': 'So niaje aje, Paulo alikuwa bado akispit vitisho za kumada ma disciple. Akaishia kwa kuhani mumeja',
 'text': "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\ntranslate the following from english into sheng.\n\n### Input:\nMeanwhile, Saul was still breathing out murderous threats against the Lord's disciples. He went to the high priest\n\n### Response:\nSo niaje aje, Paulo alikuwa bado akispit vitisho za kumada ma disciple. Akaishia kwa kuhani mumeja<|end_of_text|>"}

In [None]:
# Hold out 30% of the rows for evaluation. The split is seeded (same seed as the
# LoRA and trainer configuration) so train/test membership is reproducible
# across runs instead of changing on every execution.
# NOTE: this rebinds `dataset`; re-running the cell without re-running the
# mapping cell first would split the already-reduced train split again.
dataset = dataset["train"].train_test_split(test_size=0.3, seed=3407)

In [None]:
# Display the resulting train/test splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['English', 'Sheng', 'text'],
        num_rows: 729
    })
    test: Dataset({
        features: ['English', 'Sheng', 'text'],
        num_rows: 313
    })
})

## Train the model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Supervised fine-tuning setup over the formatted "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    # NOTE(review): an eval split is supplied, but no evaluation strategy is
    # configured in the arguments below, so presumably no evaluation runs
    # during training. Confirm and enable it if eval loss is wanted.
    eval_dataset=dataset["test"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,  # total batch size = 2 * 4 = 8
        warmup_steps=5,
        # Training length is controlled by max_steps alone. The previous
        # `num_train_epochs=4` was removed: when max_steps is set it overrides
        # num_train_epochs, so that value was silently ignored and misleading.
        # To train by epochs instead, drop max_steps and set num_train_epochs.
        max_steps=60,  # increase this to make the model learn "better"
        learning_rate=2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="tensorboard",
    ),
)

max_steps is given, it will override any value given in num_train_epochs


## Show current memory stats

In [None]:
#@title Show current memory stats
# Report the GPU's total memory and the peak memory reserved so far, both
# converted from bytes to GiB and rounded to three decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024**3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
5.613 GB of memory reserved.


In [None]:
# Run fine-tuning and keep the object returned by `train()` for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 729 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,3.0989
2,3.4404
3,3.3233
4,3.058
5,2.8642
6,2.6345
7,2.4369
8,2.048
9,1.5729
10,1.5094


## Inference

In [None]:
# Switch the fine-tuned model into Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# Fill the template with the English sentence and an empty response slot so
# the model generates the translation.
inputs = tokenizer(
    [alpaca_prompt.format("the king is sick", "")],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=128, use_cache=True)
# Last expression of the cell: shows the decoded text (prompt + generated response).
tokenizer.batch_decode(outputs)

['<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\ntranslate the following from english into sheng.\n\n### Input:\nthe king is sick\n\n### Response:\nmwanaume huyu ni sick<|end_of_text|>']

In [None]:
# Save the fine-tuned model and its tokenizer to the local
# "sheng_model_small" directory.
model.save_pretrained("sheng_model_small")
tokenizer.save_pretrained("sheng_model_small")

('sheng_model_small/tokenizer_config.json',
 'sheng_model_small/special_tokens_map.json',
 'sheng_model_small/tokenizer.json')

In [None]:
# Publish the fine-tuned adapter and tokenizer to the Hugging Face Hub.
# The access token is never written into the notebook: it is read from the
# HF_TOKEN environment variable, and if that is unset the user is prompted for
# it without echoing the input.
import os
from getpass import getpass

HF_REPO_ID = "EgesaWO/sheng_model"
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: ")

model.push_to_hub(HF_REPO_ID, token=hf_token)
tokenizer.push_to_hub(HF_REPO_ID, token=hf_token)

README.md:   0%|          | 0.00/574 [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/EgesaWO/sheng_model


In [None]:
## Load the saved model to test

In [None]:
# Reload the published model from the Hub to verify that it works end to end.
import gc

import torch
from unsloth import FastLanguageModel

# The recorded run of this cell failed with "Some modules are dispatched on the
# CPU or the disk": the training model was still resident and too little GPU
# memory was reported as free. Release unreferenced objects and PyTorch's
# cached-but-unused CUDA blocks before loading the second copy.
# NOTE(review): if this still fails, delete the training model/trainer or
# restart the runtime before running this cell. Confirm on the target GPU.
gc.collect()
torch.cuda.empty_cache()

loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
    model_name="EgesaWO/sheng_model",  # repository the fine-tuned model was pushed to
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Enable native 2x faster inference. The result is assigned to `_` so the cell
# prints nothing for it, matching the original, where the call sat inside an
# `if` block.
_ = FastLanguageModel.for_inference(loaded_model)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
from transformers import TextStreamer

# Build the prompt for the reloaded model; the response slot stays empty so the
# model produces the translation.
inputs = loaded_tokenizer(
    [alpaca_prompt.format("And he trained Llama 3 to undersand sheng, its quite impressive", "")],
    return_tensors="pt",
).to("cuda")

# Stream the generated text as it is produced; the returned tensor is
# discarded into `_`.
text_streamer = TextStreamer(loaded_tokenizer)
_ = loaded_model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

<|begin_of_text|>Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
translate the following from english into sheng.

### Input:
And he trained Llama 3 to undersand sheng, its quite impressive

### Response:
Na alimtrain Llama 3 ku understand sheng, ni impressive sana<|end_of_text|>
