<a href="https://colab.research.google.com/github/syllerim/hyrox-performance-buddy/blob/main/2.Fine-Tuning/Hyrox_Fine_tuning_Mistral7B_LoRA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tuning

In [2]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# paths to dataset files stored in Google Drive
DATA_DIR = '/content/drive/MyDrive/KeepCoding/Bootcamp_AI/8.LLMs/Assignment'

# instructed dataset
PATH_ENVIRONMENT = f'{DATA_DIR}/.env'
PATH_HYROX_SPLIT_DATASET = f'{DATA_DIR}/hyrox_split_dataset'

PATH_HYROX_MODEL_MISTRAL_LORA = f'{DATA_DIR}/hyrox_mistral_lora_model'
PATH_HYROX_MISTRAL_LORA = f'{DATA_DIR}/hyrox_mistral_lora'

PATH_HYROX_OFFLOAD = f'{DATA_DIR}/hyrox_offload'
PATH_HYROX_MODEL_MISTRAL_LORA_2 = f'{DATA_DIR}/hyrox_mistral_lora_2'

In [5]:
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.0


In [10]:
!pip install accelerate bitsandbytes datasets peft transformers trl -qU

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m36.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m366.3/366.3 kB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [6]:
import os
import pandas as pd

from datasets import load_from_disk
from dotenv import load_dotenv

from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

In [7]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
# load the split_hyrox_dataset from disk, saved in the Fine-tuning gpt-2 exercise.
split_hyrox_dataset = load_from_disk('file://' + PATH_HYROX_SPLIT_DATASET)

# Load Mistral + Prepare Tokenizer

✅ I will Fine-tune Mistral-7B-Instruct using: Hugging Face transformers, peft for efficient LoRA fine-tuning (GPU-friendly), text column from your split_hyrox_dataset


In [26]:
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    offload_folder=PATH_HYROX_OFFLOAD,
    load_in_4bit=False,
    torch_dtype="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Prepare Mistral for LoRA Fine-Tuning (with peft)
This step uses Parameter-Efficient Fine-Tuning (LoRA) to only train a small set of weights, which:

	•	Saves memory (works well on Colab GPUs)

	•	Speeds up training
  
	•	Keeps the base Mistral model frozen

In [None]:
# skipped on CPU
model = prepare_model_for_kbit_training(model) # Uncomment this line

In [13]:
# configure LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)



In [None]:
# confirm LoRA is working

model.print_trainable_parameters()

trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.0470


In [14]:
def tokenize_prompt(ds):
    tokenized = tokenizer(
        ds['text'],
        padding='max_length',
        truncation=True,
        max_length=512,
        return_attention_mask=True,
    )
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

tokenized_dataset = split_hyrox_dataset.map(tokenize_prompt, batched=True, remove_columns=split_hyrox_dataset["train"].column_names)

Map:   0%|          | 0/4204 [00:00<?, ? examples/s]

Map:   0%|          | 0/526 [00:00<?, ? examples/s]

Map:   0%|          | 0/526 [00:00<?, ? examples/s]

In [15]:
# verify include: input_ids, attention_mask
tokenized_dataset['train'][0].keys()
print(tokenized_dataset["train"][0]["attention_mask"])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Define TrainingArguments and run training

In [None]:
# ready to train with HuggingFace Trainer

In [None]:
# first attempt, it took 2 hours for the model to be trained
training_args = TrainingArguments(
    output_dir=PATH_HYROX_MODEL_MISTRAL_LORA,     # where to save model checkpoints
    per_device_train_batch_size=2,                # small batch size for Colab
    gradient_accumulation_steps=4,                # simulate larger batch size
    num_train_epochs=1,
    learning_rate=1e-4,                           # LoRA likes higher LR
    # fp16=True,                                    # enable mixed-precision if using GPU
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    # evaluation_strategy="epoch",                  # use val split
    report_to="none",                             # or "wandb" if using experiment tracking
)

In [21]:
# Attempt with CPU
# training_args = TrainingArguments(
#     output_dir=PATH_HYROX_MODEL_MISTRAL_LORA_2,
#     learning_rate=1e-5,
#     num_train_epochs=3,
#     logging_steps=50,
#     push_to_hub=True,
#     hub_model_id="syllerim/mistral-hyrox",
#     hub_strategy="every_save",
#     save_total_limit=1,
#     use_cpu=True,
#     gradient_accumulation_steps=2,
#     per_device_train_batch_size=1,
#     per_device_eval_batch_size=1
# )

In [24]:
small_train = tokenized_dataset['train'].select(range(200))
small_val = tokenized_dataset['val'].select(range(50))

In [None]:
# define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train,     # tokenized_dataset["train"],
    eval_dataset=small_val,        # tokenized_dataset["val"],
    tokenizer=tokenizer
)

In [None]:
# add 'labels' fields to the tokenized dataset
def add_labels(ds):
    ds["labels"] = ds["input_ids"]
    return ds

tokenized_dataset = tokenized_dataset.map(add_labels)

Map:   0%|          | 0/4204 [00:00<?, ? examples/s]

Map:   0%|          | 0/526 [00:00<?, ? examples/s]

Map:   0%|          | 0/526 [00:00<?, ? examples/s]

- Fine-tune Mistral-7B-Instruct with LoRA on your Hyrox feedback prompts
- Save checkpoints
- Evaluate after each epoch (using your validation set)

In [None]:
trainer.train()

Step,Training Loss
10,0.4955
20,0.1554
30,0.0873
40,0.074
50,0.0674
60,0.0666
70,0.0644
80,0.065
90,0.0605
100,0.0597


TrainOutput(global_step=526, training_loss=0.06529331258267958, metrics={'train_runtime': 8708.3191, 'train_samples_per_second': 0.483, 'train_steps_per_second': 0.06, 'total_flos': 9.187596821751398e+16, 'train_loss': 0.06529331258267958, 'epoch': 1.0})

In [None]:
# model.save_pretrained(PATH_HYROX_MISTRAL_LORA)
# tokenizer.save_pretrained(PATH_HYROX_MISTRAL_LORA)

('/content/drive/MyDrive/KeepCoding/Bootcamp_AI/8.LLMs/Assignment/hyrox_mistral_lora/tokenizer_config.json',
 '/content/drive/MyDrive/KeepCoding/Bootcamp_AI/8.LLMs/Assignment/hyrox_mistral_lora/special_tokens_map.json',
 '/content/drive/MyDrive/KeepCoding/Bootcamp_AI/8.LLMs/Assignment/hyrox_mistral_lora/chat_template.jinja',
 '/content/drive/MyDrive/KeepCoding/Bootcamp_AI/8.LLMs/Assignment/hyrox_mistral_lora/tokenizer.model',
 '/content/drive/MyDrive/KeepCoding/Bootcamp_AI/8.LLMs/Assignment/hyrox_mistral_lora/added_tokens.json',
 '/content/drive/MyDrive/KeepCoding/Bootcamp_AI/8.LLMs/Assignment/hyrox_mistral_lora/tokenizer.json')

In [None]:
trainer.push_to_hub()

In [None]:
# load the fine-tuned checkpoint
model_path = PATH_HYROX_MISTRAL_LORA

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

# use the generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = """<|startoftext|>

### Instruction:
You are a performance analysis assistant for Hyrox athletes. Your job is to analyze performance data provided by atheles once they complete a Hyrox race and generate personalized, insightful feedback that helps the athlete improve.

### Input:
Generate performance feedback based on prediction and true time

### Context:
gender: 1, age: 30-34, total_time: 3800, predicted: 3600, residual: 200, cluster: 2

### Response:
"""

outputs = generator(prompt, max_new_tokens=150)
print(outputs[0]['generated_text'])