# Setup

In [None]:
# Standard-library helpers, one import per line.
import shutil
import os
import subprocess

# Colab-only helper used to attach Google Drive to this runtime.
from google.colab import drive

# Mount Drive at /content/drive so later cells can copy artifacts there.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Print the interpreter version of this runtime so it is recorded with the results.
!python --version

Python 3.11.11


In [None]:
# !pip install evaluate
# !pip install rouge_score

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
# NOTE(review): %%capture hides every line of output from this cell, so a
# failed pip install is not visible here.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# !pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes
# NOTE(review): the install below is unpinned. The saved model-loading log in
# this notebook reports an xformers build made for PyTorch 2.6.0 while the
# runtime had 2.5.1, and the Unsloth banner shows "Xformers = None".
# Consider pinning an xformers release built for the runtime's torch -
# TODO confirm the matching version.
!pip install --no-deps xformers trl peft accelerate bitsandbytes

# Install Flash Attention 2 for softcapping support
# Only attempted when the first component of the GPU's compute capability is >= 8.
import torch
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install --no-deps packaging ninja einops "flash-attn>=2.6.3"

# Load model

In [None]:
# Configuration for model loading: sequence length, dtype, quantization flag
# and the Hugging Face model id.
from unsloth import FastLanguageModel
# import xformers
import torch
max_seq_length = 1024 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# NOTE(review): the from_pretrained call in the next cell passes a literal
# `load_in_4bit = False` and does not read this variable - confirm which
# value is intended and keep only one source of truth.
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
# model_name = "unsloth/gemma-2-2b-bnb-4bit"
model_name = "unsloth/gemma-2-2b"


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.5.1+cu124)
    Python  3.11.11 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
# Load the base model and its tokenizer through Unsloth, using the name,
# sequence length and dtype set in the configuration cell.
# NOTE(review): load_in_4bit is hard-coded to False here, so the
# `load_in_4bit` variable from the configuration cell has no effect on this
# call - confirm which value is intended.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = False,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Wrap the loaded model with LoRA adapters (rank 16, alpha 16) on the
# projection modules listed in target_modules. The result replaces `model`.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)


==((====))==  Unsloth 2025.3.9: Fast Gemma2 patching. Transformers: 4.48.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.23G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/46.4k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Unsloth 2025.3.9 patched 26 layers with 26 QKV layers, 26 O layers and 26 MLP layers.


# Prepare dataset

In [None]:
import os
import pandas as pd
from pprint import pprint

data_dir = '/content/txt_data'

# Each non-empty line of every .txt file is expected to look like
#   <input> -> <output>
# and becomes one {'input': ..., 'output': ...} record in list_ds.
list_ds = []

# os.listdir returns names in arbitrary order. Sorting keeps the order of
# list_ds stable across runs, so the seeded split made from it is reproducible.
txt_files = sorted(f for f in os.listdir(data_dir) if f.endswith(".txt"))
print(txt_files)
print('-------' * 10)

for txt_file in txt_files:
    # The files hold Vietnamese text, so the encoding is stated explicitly
    # instead of relying on the platform default.
    with open(os.path.join(data_dir, txt_file), 'r', encoding='utf-8') as f:
        raw_data = f.read().strip().replace('"','')

    # Lines are parsed one at a time so a single bad line no longer discards
    # every sample of the file.
    malformed_lines = 0
    for sample in raw_data.split('\n'):
        if not sample.strip():
            continue  # blank separator line
        parts = sample.split('->')
        if len(parts) < 2:
            malformed_lines += 1
            continue
        list_ds.append({'input': parts[0].strip(), 'output': parts[1].strip()})

    # Report files with unusable lines instead of swallowing the problem.
    if malformed_lines:
        print(f"{txt_file}: skipped {malformed_lines} line(s) without '->'")

['part_13_deepseek.txt', 'part_4_gpt.txt', 'part_12_deepseek.txt', 'part_3_gpt.txt', 'part_5_gemini.txt', 'part_14_grok.txt', 'part_10_qwen.txt', 'part_6_gemini.txt', 'part_15_grok.txt', 'part_9_qwen.txt', 'part_8_grok.txt', 'part_11_qwen.txt', 'part_2_gpt.txt', 'part_7_grok.txt', 'part_1_gpt.txt']
----------------------------------------------------------------------


In [None]:
# Instruction text attached to every sample.
# NOTE(review): the phrase "3 câu câu trả lời" repeats "câu". It is left
# unchanged because this string is part of the training prompt.
default_instruction = "Dựa vào câu hỏi (có hoặc không) và phần gợi ý cho câu trả lời ở đầu vào để đề xuất 3 câu câu trả lời hoàn chỉnh."

# Rewrite each raw input of the form "<question> - <hint>" (or just "<hint>")
# into "câu hỏi: ... | gợi ý: ..." and attach the instruction.
for sample in list_ds:
    # A sample that already has an instruction was formatted by a previous run
    # of this cell. Skipping it keeps the cell safe to re-run.
    if 'instruction' in sample:
        continue

    # Split on the first '-' only, so hyphens inside the hint are kept rather
    # than silently dropped.
    question, separator, hint = sample['input'].partition('-')
    if separator:
        sample['input'] = f"câu hỏi: {question} | gợi ý: {hint}"
    else:
        # No '-' present: the whole text is the hint and the question is empty.
        sample['input'] = f"câu hỏi: | gợi ý: {question}"
    sample['instruction'] = default_instruction

In [None]:
# Preview only the first few records; printing the whole list floods the
# saved notebook output.
pprint(list_ds[:3])

[{'input': 'câu hỏi: Bạn có cần giúp đỡ mở hộp sữa không?  | gợi ý:  hộp sữa',
  'instruction': 'Dựa vào câu hỏi (có hoặc không) và phần gợi ý cho câu trả '
                 'lời ở đầu vào để đề xuất 3 câu câu trả lời hoàn chỉnh.',
  'output': '1. tôi cần được mở hộp sữa để uống. 2. tôi cần được đổ sữa vào '
            'ly. 3. tôi cần được kiểm tra hộp sữa còn hạn không.'},
 {'input': 'câu hỏi: Bạn có muốn ăn bánh quy không?  | gợi ý:  có',
  'instruction': 'Dựa vào câu hỏi (có hoặc không) và phần gợi ý cho câu trả '
                 'lời ở đầu vào để đề xuất 3 câu câu trả lời hoàn chỉnh.',
  'output': '1. tôi muốn ăn bánh quy socola. 2. tôi muốn ăn bánh quy bơ. 3. '
            'tôi muốn ăn bánh quy với sữa.'},
 {'input': 'câu hỏi: | gợi ý: đi, phòng tập',
  'instruction': 'Dựa vào câu hỏi (có hoặc không) và phần gợi ý cho câu trả '
                 'lời ở đầu vào để đề xuất 3 câu câu trả lời hoàn chỉnh.',
  'output': '1. tôi cần được đi vào phòng tập thể dục. 2. tôi cần được sử dụng

In [None]:
from sklearn.model_selection import train_test_split

# First hold out 30% of the records, then cut that held-out part in half to
# get the validation and test sets. Both splits use the same fixed seed.
train_list_ds, val_test_list_ds = train_test_split(
    list_ds,
    test_size=0.3,
    random_state=42,
)
val_list_ds, test_list_ds = train_test_split(
    val_test_list_ds,
    test_size=0.5,
    random_state=42,
)

# Report the size of each split.
for split_label, split_records in (
    ("Train:", train_list_ds),
    ("Validation:", val_list_ds),
    ("Test:", test_list_ds),
):
    print(split_label, len(split_records), 'samples')


Train: 778 samples
Validation: 167 samples
Test: 167 samples


In [None]:

# Prompt template written in Vietnamese with three positional `{}` slots, in
# order: instruction ("Hướng dẫn"), input ("Đầu vào") and response ("Phản hồi").
# The inference cell leaves the response slot empty so the model completes it.
alpaca_prompt = """Dưới đây là một hướng dẫn mô tả một nhiệm vụ, kèm theo một đầu vào cung cấp ngữ cảnh bổ sung. Hãy viết một phản hồi phù hợp để hoàn thành yêu cầu.

### Hướng dẫn:
{}

### Đầu vào:
{}

### Phản hồi:
{}"""


# End-of-sequence token string of the loaded tokenizer; it is appended to
# every formatted training text.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Render a batch of examples into full training prompts.

    Args:
        examples: mapping with parallel "instruction", "input" and "output"
            sequences (one entry per example).

    Returns:
        A dict with a single "text" key holding one string per example:
        `alpaca_prompt` filled with the three fields, followed by `EOS_TOKEN`.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN is appended so generation learns where a response ends.
    return {
        "text": [
            alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
            for instruction, context, response in rows
        ],
    }

import datasets
import pandas as pd

def create_dataset(list_ds):
    """Turn a list of sample records into a dataset with a "text" column.

    `list_ds` is loaded into a pandas DataFrame, converted with
    `datasets.Dataset.from_pandas`, and mapped in batches through
    `formatting_prompts_func`, which adds the rendered prompt as "text".
    """
    frame = pd.DataFrame(list_ds)
    return datasets.Dataset.from_pandas(frame).map(
        formatting_prompts_func,
        batched=True,
    )

In [None]:
# Build the training and validation datasets (in that order) from their
# record lists.
train_dataset, val_dataset = map(create_dataset, (train_list_ds, val_list_ds))

# Show the schema and row count of both datasets.
print(train_dataset, val_dataset, sep="\n")

Map:   0%|          | 0/778 [00:00<?, ? examples/s]

Map:   0%|          | 0/167 [00:00<?, ? examples/s]

Dataset({
    features: ['input', 'output', 'instruction', 'text'],
    num_rows: 778
})
Dataset({
    features: ['input', 'output', 'instruction', 'text'],
    num_rows: 167
})


In [None]:
# split_dataset = dataset.train_test_split(test_size=0.01, seed=42)

# train_val_dataset = split_dataset["train"]
# test_dataset = split_dataset["test"]

# split_train_dataset = train_val_dataset.train_test_split(test_size=0.01, seed=42)

# train_dataset = split_train_dataset["train"]
# val_dataset = split_train_dataset["test"]

# print("Train:")
# print(train_dataset)

# print("Validation:")
# print(val_dataset)

# print("Test:")
# print(test_dataset)

# Init Trainer

In [None]:
# import evaluate
# import numpy as np

# # Load ROUGE metric
# rouge = evaluate.load("rouge")

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred

#     # Ensure predictions are extracted properly (handle tuple output)
#     if isinstance(predictions, tuple):
#         predictions = predictions[0]

#     # Convert predictions and labels to Python lists
#     predictions = predictions.tolist() if isinstance(predictions, np.ndarray) else predictions
#     labels = labels.tolist() if isinstance(labels, np.ndarray) else labels

#     # Ensure labels are correctly formatted (replace -100 with 0 for padding)
#     labels = [[token if token != -100 else 0 for token in label] for label in labels]

#     # Convert token IDs to text (batch_decode requires a list of lists)
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     # Compute ROUGE scores
#     result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

#     return {k: round(v, 4) for k, v in result.items()}



In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Mixed precision: bf16 when Unsloth reports support for it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation, logging and evaluation settings for the run.
training_args = TrainingArguments(
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = 4,
    # eval_accumulation_steps = 2,
    warmup_steps = 5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 40,
    learning_rate = 1e-5,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_strategy = "steps",
    logging_steps = 7,
    # eval_steps is left unset; the saved training log shows an evaluation
    # every 7 steps, i.e. at the same cadence as logging_steps.
    eval_strategy = "steps",
    # eval_steps = 5,
    # save_strategy = "steps",
    # save_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    report_to = "none",
    seed = 3407,
    output_dir = "outputs",
)

# Supervised fine-tuning trainer over the rendered "text" column of the train
# and validation datasets.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    # compute_metrics=compute_metrics,
    packing = True, # Can make training 5x faster for short sequences.
    args = training_args,
)

Tokenizing to ["text"] (num_proc=2):   0%|          | 0/778 [00:00<?, ? examples/s]

Packing train dataset (num_proc=2):   0%|          | 0/778 [00:00<?, ? examples/s]

Tokenizing to ["text"] (num_proc=2):   0%|          | 0/167 [00:00<?, ? examples/s]

Packing eval dataset (num_proc=2):   0%|          | 0/167 [00:00<?, ? examples/s]

# Training

In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
5.16 GB of memory reserved.


In [None]:
from unsloth import unsloth_train
# Training is launched through unsloth_train(trainer); the plain
# trainer.train() call is kept below for reference.
# NOTE(review): presumably chosen for Unsloth's gradient-accumulation
# handling - confirm against the Unsloth docs.
# trainer_stats = trainer.train()
trainer_stats = unsloth_train(trainer)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 108 | Num Epochs = 7 | Total steps = 40
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 20,766,720/2,635,108,608 (0.79% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
7,1.1994,1.159412
14,1.1179,1.073008
21,1.0418,1.011119
28,0.9901,0.968957
35,0.9537,0.945335


Unsloth: Not an error, but Gemma2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


# Save model to gguf format

In [None]:
# Export the fine-tuned model together with the tokenizer to GGUF under the
# local "finetuned_model" directory.
# No quantization_method is passed; the saved log of this run shows a q8_0
# file (finetuned_model/unsloth.Q8_0.gguf) being produced.
trainer.model.save_pretrained_gguf("finetuned_model", tokenizer)  #, quantization_method="f16")

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.67 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 26/26 [00:01<00:00, 21.76it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving finetuned_model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving finetuned_model/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting gemma2 model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q8_0'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at finetuned_model into q8_0 GGUF format.
The output location will be /content/finetuned_model/unsloth.Q8_0.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: finetuned_model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00002.bin'
INFO:hf-to-gguf:token_embd.weight,               

In [None]:
# Copy the exported model directory from the Colab VM to the mounted Google Drive.
# NOTE(review): if the destination directory already exists, `cp -r` places
# finetuned_model inside it rather than replacing it - confirm the intended layout.
!cp -r /content/finetuned_model /content/drive/MyDrive/Project/CaNhan/chat_completion_dataset

# Inference Test Dataset

In [None]:
# #model.save_pretrained_merged("model_gemma2b_mna", tokenizer, save_method = "merged_16bit",)
# model.save_pretrained("model_gemma2b_mna") # Local saving
# tokenizer.save_pretrained("model_gemma2b_mna")

In [None]:
# Switch the model to Unsloth's inference mode before generating.
# The trailing semicolon stops the notebook from echoing the call's return
# value, which otherwise prints the model's full module tree as cell output.
FastLanguageModel.for_inference(model);

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Gemma2ForCausalLM(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 2304, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2304, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2304, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(

In [None]:
from tqdm import tqdm
import json

# Header that introduces the answer section in alpaca_prompt. It is used only
# to cut the prediction off if the model starts emitting another answer section.
response_marker = "\n\n### Phản hồi:\n"

text_preds = []
text_labels = []

# Generate one prediction per test sample and collect it next to its reference.
for sample in tqdm(test_list_ds):
    prompt = alpaca_prompt.format(
        sample['instruction'],
        sample['input'],
        "", # output - leave this blank for generation!
    )
    inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens = 256, use_cache = True)

    # generate() returns the prompt tokens followed by the continuation.
    # Decoding only the continuation avoids searching the decoded text for the
    # prompt header, which raised IndexError (and aborted the whole loop)
    # whenever the header was not found verbatim.
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(outputs[0][prompt_length:])

    # As before, keep only the text up to a repeated answer header, if any.
    text_pred = generated_text.split(response_marker)[0]

    text_preds.append(text_pred.replace('<eos>',''))
    text_labels.append(sample['output'])

100%|██████████| 167/167 [42:03<00:00, 15.11s/it]


In [None]:
# with open("/content/drive/MyDrive/text_preds.json", "w", encoding="utf-8") as f:
#     json.dump(text_preds, f, ensure_ascii=False, indent=4)

In [None]:
# with open("/content/drive/MyDrive/text_labels.json", "w", encoding="utf-8") as f:
#     json.dump(text_labels, f, ensure_ascii=False, indent=4)

In [None]:
# Persist the generated answers to Drive as UTF-8 JSON, keeping non-ASCII
# characters readable and using a 4-space indent.
preds_path = "/content/drive/MyDrive/text_preds_finetuned.json"
with open(preds_path, "w", encoding="utf-8") as preds_file:
    preds_file.write(json.dumps(text_preds, ensure_ascii=False, indent=4))

In [None]:
# mna_news_instruction
# mna_news_input

# inputs = tokenizer(
# [
#     alpaca_prompt.format(
#        mna_news_instruction,
#        mna_news_input,
#         "", # output - leave this blank for generation!
#     )
# ], return_tensors = "pt").to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
# # tokenizer.batch_decode(outputs)[0].split("\n\n### Response:\n")[1].split("\n\n### Explanation:\n")[0]
# tokenizer.batch_decode(outputs)[0].split("\n\n### Phản hồi:\n")[1]

'tối nay tôi muốn ăn cơm<eos>'