In [1]:
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl 
print("Done")

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0
Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb
from datasets import load_dataset
from trl import SFTTrainer

In [3]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
secret_wandb = user_secrets.get_secret("wandb")

In [4]:
!huggingface-cli login --token $secret_hf

  pid, fd = os.forkpty()


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `vhak` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `vhak`


In [5]:
# Monitering the LLM
wandb.login(key = secret_wandb)
run = wandb.init(
    project='Fine tuning vietcuna', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbonpaul[0m ([33mbonpaul-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
base_model = "Viet-Mistral/Vistral-7B-Chat"  # Hugging Face model repo

dataset_name = "tranthaihoa/math_test"
new_model = "vistral7B-finetuned"

In [7]:
from datasets import load_dataset, Dataset, load_from_disk
from pathlib import Path
import google.generativeai as genai

# Dataset details
dataset_name = "tranthaihoa/math_test"
output_dir = Path("/kaggle/working")  # Specify output directory for saving datasets

# Preprocessing function
def preprocess_meta_math(batch):
    result = {
        "instruction": [],
        "response": []
    }
    
    for i in range(len(batch["id"])):
        # Construct instruction and response
        instruction = (
            f"[ID: {batch['id'][i]}] "
            f"[Question: {batch['Question'][i]}] "
            f"[Explanation: {batch['Explanation'][i]}] "
            f"[Inference Steps: {batch['Inference Steps'][i]}] "
            f"[Grade: {batch['Grade'][i]}] "
            f"[Source: {batch['Source'][i]}] "
            f"[Instruction: {batch['Instruction'][i]}] "
            f"[Response Type: {batch['Response Type'][i]}] "
            f"[Math Type: {batch['Math Type'][i]}]"
        )
        response = batch["Answer"][i]

        result["instruction"].append(instruction)
        result["response"].append(response)
    
    return result

# Load and preprocess the dataset
print("Loading dataset...")
ds = load_dataset(dataset_name)
print("Preprocessing dataset...")
preprocessed_ds = ds.map(preprocess_meta_math, batched=True, remove_columns=ds["train"].column_names)

# Shuffle the dataset
print("Shuffling dataset...")
shuffled_dataset = preprocessed_ds["train"].shuffle(seed=42)

# Limit to the total rows in the dataset
total_rows = 821
limited_dataset = shuffled_dataset.select(range(total_rows))

# Calculate split sizes (80% train, 10% validation, 10% test)
train_size = int(0.8 * total_rows)  # 656 rows for train
val_size = int(0.1 * total_rows)    # 82 rows for validation
test_size = total_rows - train_size - val_size  # 83 rows for test

# Split the dataset
print("Splitting dataset...")
train_dataset = limited_dataset.select(range(train_size))
val_dataset = limited_dataset.select(range(train_size, train_size + val_size))
test_dataset = limited_dataset.select(range(train_size + val_size, total_rows))

# Save splits in Hugging Face Dataset format
print("Saving datasets to disk...")
train_dataset.save_to_disk(output_dir / "train_dataset")
val_dataset.save_to_disk(output_dir / "validation_dataset")
test_dataset.save_to_disk(output_dir / "test_dataset")

print(f"Datasets saved to {output_dir}")
print(f"Train set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")

# Print first instruction of the train dataset
print("\nPrinting the first instruction from the train dataset:")
first_instruction = train_dataset[0]["instruction"]
print("First instruction:", first_instruction)


Loading dataset...


README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/330k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/821 [00:00<?, ? examples/s]

Preprocessing dataset...


Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Shuffling dataset...
Splitting dataset...
Saving datasets to disk...


Saving the dataset (0/1 shards):   0%|          | 0/656 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/82 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/83 [00:00<?, ? examples/s]

Datasets saved to /kaggle/working
Train set size: 656
Validation set size: 82
Test set size: 83

Printing the first instruction from the train dataset:
First instruction: [ID: 98857616] [Question: hân dịp khai trương, một cửa hàng bánh Pizza giảm giá 10% tất cả các sản phẩm và giảm thêm 5% trên tổng hóa đơn khi mua từ hai sản phẩm trở lên. Bác Lan mua một Pizza rau củ size vừa giá 139 000 đồng và một Pizza thập cẩm size lớn giá 289 000 đồng. Hỏi nếu bác Lan đưa cho nhân viên thu ngân 500 000 đồng thì bác được trả lại bao nhiêu tiền?] [Explanation: Số tiền bác Lan phải trả khi mua 2 bánh pizza được giảm giá 10% là:
(139000 + 289000) - (139000 + 289000).10% = 385200 (đồng)
Số tiền bác Lan phải trả sau khi được giảm giá 5% trên tổng hóa đơn là:
385200 - 385200.5% = 365940 (đồng)
Số tiền bác Lan được trả lại là:
500000 - 365940 = 134060 (đồng)] [Inference Steps: 2.0] [Grade: 7.0] [Source: 76_de-thi-giua-hk1-toan-7_Toán 7] [Instruction: Solve the following mathematical problem, ensuring to

FIXXXXXXXXX

In [8]:
from datasets import load_from_disk

output_dir = Path("/kaggle/working")

train_dataset = load_from_disk(output_dir / "train_dataset")
val_dataset = load_from_disk(output_dir / "validation_dataset")
test_dataset = load_from_disk(output_dir / "test_dataset")

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from safetensors.torch import load_file  # Import for loading safetensors

# Configuration for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Define the base model and adapter checkpoint paths
base_model_name = base_model  # Replace with the actual base model name

# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto"
)


# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("Model and tokenizer loaded successfully!")


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.52k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/597k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.15M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/169 [00:00<?, ?B/s]

Model and tokenizer loaded successfully!


In [10]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):

    return tokenizer(
        examples["instruction"],  # Use preformatted 'instruction' as the input
        text_pair=examples["response"],  # Use 'response' for paired input-output sequences
        padding="max_length",  # Pad sequences to the maximum length
        truncation=True,       # Truncate sequences longer than the max length
        max_length=512,        # Set the maximum length for tokenization
    )

# Tokenize the train and validation datasets
tokenized_train_dataset = train_dataset.map(
    tokenize_function, 
    batched=True  # Process multiple examples at once for efficiency
)

tokenized_val_dataset = val_dataset.map(
    tokenize_function, 
    batched=True  # Similarly process the validation dataset
)

Map:   0%|          | 0/656 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

In [11]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print(model)


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(38369, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNo

In [12]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)


trainable params: 85244960 || all params: 3889490976 || trainable%: 2.1916739369239253


In [13]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [14]:
from accelerate import Accelerator

# Initialize the accelerator
accelerator = Accelerator()
model = accelerator.prepare_model(model)

In [None]:
from pathlib import Path
from transformers import Trainer, TrainingArguments
import transformers
from datetime import datetime

# Define a custom callback to print train loss
class PrintLossCallback(transformers.TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if "loss" in logs:
            print(f"Train Loss: {logs['loss']}")

cache_dir = Path("F:/ML")  # Kaggle's writable directory
project = "vi-math-finetune"
base_model_name = "vietcuna"
run_name = f"{base_model_name}-{project}"
output_dir = cache_dir / f"vietcuna-fine-tuned-{run_name}"

# Ensure the output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Dataset parameters
num_train_samples = 656  # Train set size
num_val_samples = 82     # Validation set size
num_test_samples = 83    # Test set size
batch_size = 4           # Set a larger batch size
num_epochs = 6           # Set number of epochs to 6

# Calculate steps
steps_per_epoch = (num_train_samples + batch_size - 1) // batch_size  # Round up
total_steps = steps_per_epoch * num_epochs
print("steps per epoch:", steps_per_epoch)

# Define trainer
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=TrainingArguments(
        output_dir=output_dir,
        warmup_steps=100,  # Adjust warmup steps to fit the smaller dataset
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=total_steps,  # Set calculated total steps
        learning_rate=2.5e-5,
        bf16=True,  # Enable bf16 if supported
        optim="paged_adamw_8bit",
        logging_steps=steps_per_epoch,  # Log once per epoch
        logging_dir=str(cache_dir / "logs"),
        save_strategy="steps",
        save_steps=steps_per_epoch,  # Save every epoch
        evaluation_strategy="steps",  # Evaluate after every epoch
        eval_steps=steps_per_epoch,  # Evaluate every epoch
        do_eval=True,
        report_to="wandb",  # Logs to WandB for monitoring
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[PrintLossCallback]  # Add the custom callback to print loss
)

# Disable cache for training
model.config.use_cache = False

# Start training
trainer.train()

# Print out the model and tokenizer saving path
print(f"Model and tokenizer saved to {output_dir}")


steps per epoch: 164


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
164,0.9336,0.690677
328,0.5928,0.661081
492,0.4283,0.694747
656,0.2832,0.776974
820,0.1789,0.853082


Train Loss: 0.9336


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Train Loss: 0.5928


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Train Loss: 0.4283


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Train Loss: 0.2832


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Train Loss: 0.1789


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
