In [1]:
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl 
print("Done")

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0
Note: you may need to restart the kernel to use updated packages.
Collecting transformers
  Downloading transformers-4.47.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.47.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
import os,torch, wandb
from datasets import load_dataset
from trl import SFTTrainer

In [3]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_hf = user_secrets.get_secret("HUGGINGFACE_TOKEN")
secret_wandb = user_secrets.get_secret("wandb")

In [4]:
!huggingface-cli login --token $secret_hf

  pid, fd = os.forkpty()


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `vhak` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `vhak`


In [5]:
# Monitering the LLM
wandb.login(key = secret_wandb)
run = wandb.init(
    project='Fine tuning mistral 7B', 
    job_type="training", 
    anonymous="allow"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbonpaul[0m ([33mbonpaul-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
base_model = "meta-math/MetaMath-Mistral-7B"  # Hugging Face model repo

dataset_name = "tranthaihoa/math_test"
new_model = "mistral_7b_vi-math_v2"

In [7]:
from datasets import load_dataset, Dataset
from pathlib import Path

# Dataset details
dataset_name = "tranthaihoa/math_test"
output_dir = Path("/kaggle/working")  # Specify output directory for saving datasets

# Preprocessing function
def preprocess_meta_math(batch):
    result = {
        "instruction": [],
        "response": []
    }
    
    for i in range(len(batch["id"])):
        # Construct instruction and response
        instruction = (
            f"[ID: {batch['id'][i]}] "
            f"[Question: {batch['Question'][i]}] "
            f"[Explanation: {batch['Explanation'][i]}] "
            f"[Inference Steps: {batch['Inference Steps'][i]}] "
            f"[Grade: {batch['Grade'][i]}] "
            f"[Source: {batch['Source'][i]}] "
            f"[Instruction: {batch['Instruction'][i]}] "
            f"[Response Type: {batch['Response Type'][i]}] "
            f"[Math Type: {batch['Math Type'][i]}]"
        )
        response = batch["Answer"][i]

        result["instruction"].append(instruction)
        result["response"].append(response)
    
    return result

# Load and preprocess the dataset
ds = load_dataset(dataset_name)
preprocessed_ds = ds.map(preprocess_meta_math, batched=True, remove_columns=ds["train"].column_names)

# Shuffle the dataset
shuffled_dataset = preprocessed_ds["train"].shuffle(seed=42)

# Limit to the total rows in the dataset
total_rows = 821
limited_dataset = shuffled_dataset.select(range(total_rows))

# Calculate split sizes (80% train, 10% validation, 10% test)
train_size = int(0.8 * total_rows)  # 656 rows for train
val_size = int(0.1 * total_rows)    # 82 rows for validation
test_size = total_rows - train_size - val_size  # 83 rows for test

# Split the dataset
train_dataset = limited_dataset.select(range(train_size))
val_dataset = limited_dataset.select(range(train_size, train_size + val_size))
test_dataset = limited_dataset.select(range(train_size + val_size, total_rows))

# Print the sizes of each split
print(f"Total dataset size: {total_rows}")
print(f"Train set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")

# Save splits in Hugging Face Dataset format
train_dataset.save_to_disk(output_dir / "train_dataset")
val_dataset.save_to_disk(output_dir / "validation_dataset")
test_dataset.save_to_disk(output_dir / "test_dataset")

print(f"Datasets saved to {output_dir}")


README.md:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/330k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/821 [00:00<?, ? examples/s]

Map:   0%|          | 0/821 [00:00<?, ? examples/s]

Total dataset size: 821
Train set size: 656
Validation set size: 82
Test set size: 83


Saving the dataset (0/1 shards):   0%|          | 0/656 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/82 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/83 [00:00<?, ? examples/s]

Datasets saved to /kaggle/working


FIXXXXXXXXX

In [8]:
from datasets import load_from_disk

output_dir = Path("/kaggle/working")

train_dataset = load_from_disk(output_dir / "train_dataset")
val_dataset = load_from_disk(output_dir / "validation_dataset")
test_dataset = load_from_disk(output_dir / "test_dataset")

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from safetensors.torch import load_file  # Import for loading safetensors

# Configuration for 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Define the base model and adapter checkpoint paths
base_model_name = base_model  # Replace with the actual base model name
adapter_checkpoint_path = "/kaggle/input/checkpoint/checkpoint-2500/adapter_model.safetensors"

# Load the base model
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Load adapter weights using safetensors
adapter_state_dict = load_file(adapter_checkpoint_path)
model.load_state_dict(adapter_state_dict, strict=False)  # Load adapter into the model

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("Model and tokenizer loaded successfully!")


config.json:   0%|          | 0.00/620 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

  self.pid = os.fork()


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

  self.pid = os.fork()


pytorch_model-00002-of-00002.bin:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

Model and tokenizer loaded successfully!


In [10]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):

    return tokenizer(
        examples["instruction"],  # Use preformatted 'instruction' as the input
        text_pair=examples["response"],  # Use 'response' for paired input-output sequences
        padding="max_length",  # Pad sequences to the maximum length
        truncation=True,       # Truncate sequences longer than the max length
        max_length=512,        # Set the maximum length for tokenization
    )

# Tokenize the train and validation datasets
tokenized_train_dataset = train_dataset.map(
    tokenize_function, 
    batched=True  # Process multiple examples at once for efficiency
)

tokenized_val_dataset = val_dataset.map(
    tokenize_function, 
    batched=True  # Similarly process the validation dataset
)

Map:   0%|          | 0/656 [00:00<?, ? examples/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

In [11]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print(model)


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32001, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNo

In [12]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, config)
print_trainable_parameters(model)


trainable params: 85041184 || all params: 3837120544 || trainable%: 2.216276059739029


In [13]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [14]:
from accelerate import Accelerator

# Initialize the accelerator
accelerator = Accelerator()
model = accelerator.prepare_model(model)

In [None]:
!pip install evaluate

from pathlib import Path
from transformers import Trainer, TrainingArguments, TrainerCallback
import transformers
import torch
from datetime import datetime
import shutil
from IPython.display import FileLink
import evaluate

# Paths and project configuration
cache_dir = Path("/kaggle/working")  # Kaggle's writable directory
project = "vi-math-finetune"
base_model_name = "mistral"
run_name = f"{base_model_name}-{project}"
output_dir = cache_dir / f"mistral-fine-tuned-{run_name}"

# Ensure the output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Dataset parameters
num_train_samples = 656  # Train set size
num_val_samples = 82     # Validation set size
num_test_samples = 83    # Test set size
batch_size = 10           # Set a larger batch size
num_epochs = 6           # Set number of epochs to 6

# Calculate steps
steps_per_epoch = (num_train_samples + batch_size - 1) // batch_size  # Round up
total_steps = steps_per_epoch * num_epochs
print("steps per epoch:", steps_per_epoch)

# Custom callback for weight decay adjustment and logging
class CustomWeightDecayCallback(TrainerCallback):
    def __init__(self, optimizer, initial_weight_decay=0.01, decay_factor=0.9):
        self.optimizer = optimizer
        self.weight_decay = initial_weight_decay
        self.decay_factor = decay_factor

    def on_epoch_end(self, args, state, control, **kwargs):
        # Adjust weight decay
        self.weight_decay *= self.decay_factor
        for group in self.optimizer.param_groups:
            group['weight_decay'] = self.weight_decay

        # Log current learning rate and weight decay
        current_lr = self.optimizer.param_groups[0]['lr']
        print(f"Epoch {state.epoch}: Learning Rate: {current_lr:.6f}, Weight Decay: {self.weight_decay:.6f}")
        
        # Optional: Log first layer weights for monitoring
        first_layer_weights = next(iter(self.optimizer.param_groups[0]['params'])).data
        weight_norm = torch.norm(first_layer_weights).item()
        print(f"Epoch {state.epoch}: First Layer Weight Norm: {weight_norm:.6f}")

# Metric definitions
metric = evaluate.load("squad")  # or other metrics like exact_match or f1

def compute_metrics(p):
    # p.predictions contains the model's predictions, and p.label_ids contains the ground truth labels
    predictions, labels = p
    # Decode the predictions and labels if they are tokenized
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Compute exact match and F1 scores
    results = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return results

# Model, tokenizer, and optimizer setup
trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=TrainingArguments(
        output_dir=output_dir,
        warmup_steps=100,  # Adjust warmup steps to fit the smaller dataset
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=total_steps,  # Set calculated total steps
        learning_rate=2.5e-5,
        weight_decay=0.01,  # Initial weight decay
        bf16=True,  # Enable bf16 if supported
        optim="paged_adamw_8bit",
        logging_steps=steps_per_epoch,  # Log once per epoch
        logging_dir=str(cache_dir / "logs"),
        save_strategy="steps",
        save_steps=steps_per_epoch,  # Save every epoch
        evaluation_strategy="steps",  # Evaluate after every epoch
        eval_steps=steps_per_epoch,  # Evaluate every epoch
        do_eval=True,
        report_to="wandb",  # Logs to WandB for monitoring
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    compute_metrics=compute_metrics,  # Add the compute_metrics function
    callbacks=[CustomWeightDecayCallback(optimizer=None)]  # Add custom callback
)

# Attach the optimizer to the callback
trainer.optimizer = torch.optim.AdamW(
    model.parameters(), lr=2.5e-5, weight_decay=0.01
)
trainer.callback_handler.callbacks[0].optimizer = trainer.optimizer

# Disable cache for training
model.config.use_cache = False

# Start training
trainer.train()

# Compress the output directory into a ZIP file
zip_file_path = f"{output_dir}.zip"
shutil.make_archive(str(output_dir), 'zip', str(output_dir))
print(f"Model files zipped at: {zip_file_path}")

# Display download link for the zip file
display(FileLink(zip_file_path))

# Print out the model and tokenizer saving path
print(f"Model and tokenizer saved to {output_dir}")


  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
steps per epoch: 66


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
