# Fine-Tune Llama Model

In [1]:
# Standard library
import gc
import json
import os
import posixpath
import re
import sys
from datetime import datetime
from functools import partial

# Unsloth goes first: it patches the training libraries and warns
# ("Please restructure your imports with 'import unsloth' at the top of your
# file") when it is imported after them.
from unsloth import FastLanguageModel # FastLanguageModel for LLMs
from unsloth import is_bfloat16_supported

# Third-party
import torch
from datasets import Dataset
from peft import prepare_model_for_kbit_training
from pymongo import MongoClient
from tqdm import tqdm
from transformers import TrainingArguments
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm

Please restructure your imports with 'import unsloth' at the top of your file.
  from unsloth import is_bfloat16_supported


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


## Params

In [2]:
# Project root relative to the notebook's starting directory; the notebook
# changes into it with os.chdir right after this cell.
app_path = '../'
# S3 location for model artifacts; only consulted when use_s3 is True.
s3_bucket = "watspeed-data-gr-project"
s3_prefix = "models"
use_s3 = True
# MongoDB connection settings handed to BioRxivDataset.
mongo_uri = "mongodb://localhost:27017/"
mongo_db_name = "biorxiv"
mongo_db_collection = "abstracts"
# Local directory for model files (a relative path, so it is resolved
# against the working directory in effect after the chdir).
local_model_path = "models"
# Base model identifier: loaded when use_adapted_model is False, and also
# used to name the output folder when the LoRA weights are saved.
base_model_name = "unsloth/Llama-3.2-1B"
# When True, a previously saved model is loaded from adapter_path instead of
# base_model_name (adapter_path must then be set).
use_adapted_model = False
adapter_path = None # path is relative to local_model_path or s3_prefix
# Options forwarded to dataset.train_test_split.
use_time_series_split = False
test_size = 0.2

In [3]:
# Change into the project root so the relative paths used later (e.g.
# local_model_path) are resolved from there.
# app_path is relative, so a plain os.chdir(app_path) would climb one more
# directory every time this cell is re-run. Remember the directory the kernel
# started in and always resolve app_path against that instead.
if "_notebook_start_dir" not in globals():
    _notebook_start_dir = os.getcwd()
os.chdir(os.path.join(_notebook_start_dir, app_path))

In [4]:
# Project-local helper. It is imported here rather than in the top import
# cell, presumably because `utils` only becomes importable once the working
# directory has been switched to app_path — TODO confirm.
from utils.aws import get_boto3_client

# Always bind `s3`, so later cells can check `s3 is not None` without hitting
# a NameError when use_s3 is False.
s3 = get_boto3_client("s3") if use_s3 else None

Loaded .env — assuming local environment


In [5]:
# Make sure the local model directory exists. exist_ok=True replaces the
# separate exists() check (no check-then-create race) and keeps the cell safe
# to re-run.
os.makedirs(local_model_path, exist_ok=True)

## Model Prep

In [6]:
## Model Setup
print('Model Setup')
print(datetime.now())
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.


def _download_s3_prefix(s3_client, bucket, prefix, local_dir):
    """Download every object stored under ``prefix`` in ``bucket`` into ``local_dir``.

    Each object's path relative to ``prefix`` is kept on disk, so objects that
    live in nested "folders" land in matching sub-directories instead of being
    flattened into ``local_dir`` (where same-named files would overwrite each
    other).

    Args:
        s3_client: boto3 S3 client.
        bucket: Name of the bucket to read from.
        prefix: Key prefix ("folder") whose contents are downloaded.
        local_dir: Destination directory; created if missing.

    Returns:
        Number of files downloaded.
    """
    # Anchor the prefix on "/" so that e.g. "models/run1" does not also match
    # objects under the sibling "models/run10/".
    prefix = prefix.rstrip("/") + "/"
    os.makedirs(local_dir, exist_ok=True)

    downloaded = 0
    paginator = s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        for obj in page.get('Contents', []):
            key = obj['Key']
            if key.endswith('/'):  # Skip folder placeholder objects
                continue
            # S3 keys are "/"-separated; rebuild the relative path with the
            # local OS separator.
            relative_parts = key[len(prefix):].split("/")
            local_path = os.path.join(local_dir, *relative_parts)
            os.makedirs(os.path.dirname(local_path), exist_ok=True)

            print(f"Downloading {key} to {local_path}")
            s3_client.download_file(bucket, key, local_path)
            downloaded += 1
    return downloaded


if use_adapted_model:
    # Resume from a previously saved model located at adapter_path, fetching
    # it from S3 first when use_s3 is set.
    assert adapter_path is not None, "Adapter path must be specified when using adapted model."
    full_local_model_path = os.path.join(local_model_path, adapter_path)
    if use_s3:
        assert s3 is not None, "S3 client is not initialized."
        s3_model_path = f"{s3_prefix}/{adapter_path}"
        n_downloaded = _download_s3_prefix(s3, s3_bucket, s3_model_path, full_local_model_path)
        if n_downloaded == 0:
            # Do not abort: files from an earlier download may already be present locally.
            print(f"Warning: no objects found under s3://{s3_bucket}/{s3_model_path}")

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = full_local_model_path,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
else:
    # Start from the base model and attach fresh LoRA adapters.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = base_model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
    )

Model Setup
2025-08-11 21:22:20.862443
==((====))==  Unsloth 2025.8.4: Fast Llama patching. Transformers: 4.55.0.
   \\   /|    NVIDIA GeForce RTX 4060 Laptop GPU. Num GPUs = 1. Max memory: 7.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.1+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.3.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.31.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.8.4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [7]:
# Report how many parameters are trainable versus the total parameter count.
model.print_trainable_parameters()

trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039


## Data Setup

In [8]:
# Project-local dataset class, built from the MongoDB settings in Params.
from utils.pytorch_dataset import BioRxivDataset

dataset = BioRxivDataset(
    mongo_uri=mongo_uri,
    db_name=mongo_db_name,
    collection_name=mongo_db_collection,
)

# Split into train / eval subsets; the seed is fixed so the split is repeatable.
train_dataset, eval_dataset = dataset.train_test_split(
    test_size=test_size,
    random_state=42,
    use_time_series_split=use_time_series_split,
)


In [9]:
# Display the training split object (plain repr) to confirm its type.
train_dataset

<utils.pytorch_dataset.BioRxivDataset at 0x7981f42826f0>

In [10]:
# Number of examples in the training split.
len(train_dataset)

34789

In [11]:
# Number of examples in the evaluation split.
len(eval_dataset)

8698

In [12]:
# Peek at the first two training records to check the available fields.
train_dataset.to_dict()[0:2]

[{'_id': '68982f433c834e4e5e104618',
  'doi': '10.1101/2024.01.25.577194',
  'text': 'In recent years, a vast number of novel antiphage defense mechanisms were uncovered. To facilitate the exploration of mechanistic, ecological, and evolutionary aspects related to antiphage defense systems, we released DefenseFinder in 2021 (Tesson et al., 2022). DefenseFinder is a bioinformatic program designed for the systematic identification of known antiphage defense mechanisms. The initial release of DefenseFinder v1.0.0 included 60 systems. Over the past three years, the number of antiphage systems incorporated into DefenseFinder has grown to 152. The increasing number of known systems makes it a challenge to enter the field and makes the interpretation of detections of antiphage systems difficult. Moreover, the rapid development of sequence-based predictions of structures offers novel possibilities of analysis and should be easily available. To overcome these challenges, we present a hub of res

In [13]:
def add_eos(example, eos_token=None):
    """Return a ``{"text": ...}`` record with the EOS token appended to the text.

    Args:
        example: Mapping that may contain a ``"text"`` entry.
        eos_token: EOS string to append. When omitted, ``tokenizer.eos_token``
            of the notebook-level ``tokenizer`` is used.

    Returns:
        ``{"text": text + eos_token}``. If the example has no text (missing,
        ``None`` or empty) the result is ``{"text": ""}``. Text that already
        ends with the EOS token is returned unchanged, so applying the
        function twice does not stack EOS tokens.

    Raises:
        ValueError: If no ``eos_token`` is given and the tokenizer defines none.
    """
    if eos_token is None:
        # Fall back to the notebook-level tokenizer.
        eos_token = tokenizer.eos_token
    if eos_token is None:
        raise ValueError("Tokenizer does not define an EOS token.")

    text = example.get("text", "")
    if not text:
        return {"text": ""}

    if text.endswith(eos_token):
        return {"text": text}
    return {"text": text + eos_token}

In [14]:
def _records_with_eos(split, split_name):
    """Build the list of ``add_eos`` records for one dataset split.

    Items without a "text" key are skipped and reported by index.

    Args:
        split: Indexable dataset split supporting ``len()`` and ``split[i]``.
        split_name: Label used in progress and skip messages ("train" / "eval").

    Returns:
        List of ``{"text": ...}`` dicts, one per item that has a "text" key.
    """
    print('converting {} data'.format(split_name))
    records = []
    for i in tqdm(range(len(split))):
        item = split[i]
        if "text" in item.keys():
            records.append(add_eos(item))
        else:
            print("skipping for index {} in {} dataset".format(i, split_name))
    return records


# Plain lists of records at this point; they are wrapped as Hugging Face
# datasets in the next cell.
train_hf_dataset = _records_with_eos(train_dataset, "train")
eval_hf_dataset = _records_with_eos(eval_dataset, "eval")

converting train data


100%|█████████████████████████████████████████████████████████████████████████| 34789/34789 [00:00<00:00, 529812.43it/s]
100%|███████████████████████████████████████████████████████████████████████████| 8698/8698 [00:00<00:00, 438734.09it/s]


In [15]:
# Wrap the prepared record lists as Hugging Face Dataset objects for the trainer.
train_hf_dataset, eval_hf_dataset = (
    Dataset.from_list(records)
    for records in (train_hf_dataset, eval_hf_dataset)
)

## Training Setup

In [16]:
# SFTTrainer, TrainingArguments and is_bfloat16_supported come from the
# notebook's first (imports) cell; they are not re-imported here.

# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,  # effective batch size = 2 x 4 per device
    warmup_steps = 5,
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 10,  # short run; switch to num_train_epochs for a full pass
    learning_rate = 1e-5,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
    report_to = "none", # Use this for WandB etc
)

# NOTE(review): no evaluation schedule is set in training_args, so eval_dataset
# is presumably tokenized but not evaluated during training — confirm.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_hf_dataset,
    eval_dataset = eval_hf_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

Unsloth: Tokenizing ["text"]: 100%|██████████████████████████████████████| 34789/34789 [00:03<00:00, 9882.82 examples/s]
Unsloth: Tokenizing ["text"]: 100%|████████████████████████████████████████| 8698/8698 [00:00<00:00, 9582.92 examples/s]


## Run Training

In [17]:
# @title Show current memory stats
# Snapshot GPU memory before training; start_gpu_memory and max_memory are
# reused by the final stats cell.
_bytes_per_gib = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / _bytes_per_gib, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / _bytes_per_gib, 3)
for _line in (
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
):
    print(_line)

GPU = NVIDIA GeForce RTX 4060 Laptop GPU. Max memory = 7.996 GB.
2.41 GB of memory reserved.


In [18]:
# Timestamp marker: wall-clock time just before training starts.
datetime.now().strftime("%A, %B %d, %Y at %I:%M %p")

'Monday, August 11, 2025 at 09:22 PM'

In [19]:
# Run fine-tuning. The returned stats are kept because the final stats cell
# reads trainer_stats.metrics['train_runtime'].
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 34,789 | Num Epochs = 1 | Total steps = 10
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 11,272,192 of 1,247,086,592 (0.90% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.2069
2,2.664
3,2.5513
4,2.5998
5,2.5316
6,2.4529
7,2.359
8,2.3027
9,2.4856
10,2.4716


In [20]:
# Timestamp marker: wall-clock time right after training finishes.
datetime.now().strftime("%A, %B %d, %Y at %I:%M %p")

'Monday, August 11, 2025 at 09:22 PM'

In [21]:
# @title Show final memory and time stats
# Compare peak reserved GPU memory with the pre-training snapshot
# (start_gpu_memory / max_memory) and report the training runtime.
train_seconds = trainer_stats.metrics['train_runtime']
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

for _line in (
    f"{train_seconds} seconds used for training.",
    f"{round(train_seconds/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
):
    print(_line)

11.514 seconds used for training.
0.19 minutes used for training.
Peak reserved memory = 3.236 GB.
Peak reserved memory for training = 0.826 GB.
Peak reserved memory % of max memory = 40.47 %.
Peak reserved memory for training % of max memory = 10.33 %.


## Save LoRA Weights

In [22]:
# Save LoRA Weights locally and to S3 if required
print("Saving LoRA Weights...")

# The output folder is named after the base model plus a timestamp, so
# successive runs get separate folders.
base_model_folder = base_model_name.replace("/", "_") + "_{}".format(datetime.now().strftime("%Y%m%d_%H%M%S"))

model_subdir = os.path.join(local_model_path, base_model_folder)
lora_weights_path = os.path.join(model_subdir, "lora_weights")
# Creates model_subdir as well; exist_ok makes the separate exists() checks unnecessary.
os.makedirs(lora_weights_path, exist_ok=True)

trainer.save_model(lora_weights_path)
tokenizer.save_pretrained(lora_weights_path)

if use_s3:
    print("Uploading LoRA Weight Files to S3...")
    # Only regular files directly inside lora_weights_path are uploaded;
    # sub-directories are skipped.
    for fname in os.listdir(lora_weights_path):
        fpath = os.path.join(lora_weights_path, fname)
        if os.path.isfile(fpath):
            print(fpath)
            s3.upload_file(
                Filename=fpath,
                Bucket=s3_bucket,
                # S3 keys are always "/"-separated, so build the key with
                # posixpath rather than the OS-dependent os.path.
                Key=posixpath.join(s3_prefix, base_model_folder, "lora_weights", fname),
            )

Saving LoRA Weights...
Uploading LoRA Weight Files to S3...
models/unsloth_Llama-3.2-1B_20250811_212242/lora_weights/special_tokens_map.json
models/unsloth_Llama-3.2-1B_20250811_212242/lora_weights/tokenizer_config.json
models/unsloth_Llama-3.2-1B_20250811_212242/lora_weights/training_args.bin
models/unsloth_Llama-3.2-1B_20250811_212242/lora_weights/README.md
models/unsloth_Llama-3.2-1B_20250811_212242/lora_weights/adapter_model.safetensors
models/unsloth_Llama-3.2-1B_20250811_212242/lora_weights/tokenizer.json
models/unsloth_Llama-3.2-1B_20250811_212242/lora_weights/adapter_config.json


In [23]:
# Timestamp marker: wall-clock time after the weights have been saved.
datetime.now().strftime("%A, %B %d, %Y at %I:%M %p")

'Monday, August 11, 2025 at 09:22 PM'