# Fine tune an LLM for Bible Assistant AI agent

In [1]:
import os
from pathlib import Path
import json
import numpy as np
import pandas as pd
import sklearn

import subprocess

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
import huggingface_hub
import torch
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling
from transformers import DataCollatorForSeq2Seq
import ollama

In [2]:
x = torch.tensor([1,2,3])
print(x.numpy())   # should work now

[1 2 3]


In [3]:
import transformers, accelerate
print("Transformers:", transformers.__version__)
print("Accelerate:", accelerate.__version__)


Transformers: 4.57.3
Accelerate: 1.12.0


In [4]:
huggingface_hub.login(os.environ["HUGGING_FACE_TOKEN"])

In [5]:
def do_generate(use_tokenizer, use_model, prompt, max_new_tokens=100, temperature=0.7):
    inputs = use_tokenizer(prompt, return_tensors="pt")
    input_len = inputs["input_ids"].shape[1]
    if temperature > 0:
        outputs = use_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=temperature)
    else:
        outputs = use_model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    output = outputs[0][input_len:] # Remove the repeated input token sequence. Keep only the newly generated tokens
    outtext = use_tokenizer.decode(output, skip_special_tokens=True)
    return outtext

def do_generate_response(use_tokenizer, use_model, messages, max_new_tokens=100, temperature=0.7):
    prompt = use_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return do_generate(use_tokenizer, use_model, prompt, max_new_tokens=max_new_tokens, temperature=temperature)

# def do_generate_response(use_tokenizer, use_model, user_message, max_new_tokens=100, temperature=0.7):
#     prompt = use_tokenizer.apply_chat_template([
#         {"role":"user", "content":user_message}],
#     tokenize=False,
#     add_generation_prompt=True)
#     return do_generate(use_tokenizer, use_model, prompt, max_new_tokens=max_new_tokens, temperature=temperature)

## Load train/test data

In [6]:
dev_folder = os.path.abspath("../data/dev")
os.listdir(dev_folder)

['lookup_verse.3.test.jsonl', 'lookup_verse.3.train.jsonl']

In [7]:
trainset_file = os.path.join(dev_folder, "lookup_verse.3.train.jsonl")
testset_file = os.path.join(dev_folder, "lookup_verse.3.test.jsonl")
train_ds = load_dataset("json", data_files=trainset_file, split="train")
test_ds = load_dataset("json", data_files=testset_file, split="train")

## Base model

In [8]:
model_name = "google/gemma-3-1b-it"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(model_name)

In [9]:
train_ds[10]['messages']

[{'role': 'system',
  'content': 'You are a research assistant that always responds using a JSON object with fields "tool" and "arguments".\n\nTo respond normally to the user, use:\n{"tool": "respond_to_user", "arguments":{"text": "<text to show the user>"}}\n\nTo call a tool, use:\n{"tool": "<tool_name>", "arguments":{ ... }}\n\nAfter you call a tool, you will receive a message with role "user" containing a JSON object.\nThe tool message always includes "tool_name" and "status".\n\nIf "status" is "ok":\n- The message will include a "result" object.\n- Read "result.text".\n- Respond using "respond_to_user" and copy "result.text" exactly as-is.\n\nIf "status" is "error":\n- The message will include an "error_message".\n- If the error message is clear enough (e.g., if the user spelled a book name wrong and it is clear which book the user intended), you can call the tool again with the corrected arguments.\n- Otherwise, respond using "respond_to_user" and copy "error_message" exactly as-i

In [10]:
input_messages = train_ds[11]['messages'][:2]
for msg in input_messages:
    print(msg)
print("-"*40)
# resp = do_generate_response(tokenizer, base_model, input_messages)
# print(resp)

{'role': 'system', 'content': 'You are a research assistant that always responds using a JSON object with fields "tool" and "arguments".\n\nTo respond normally to the user, use:\n{"tool": "respond_to_user", "arguments":{"text": "<text to show the user>"}}\n\nTo call a tool, use:\n{"tool": "<tool_name>", "arguments":{ ... }}\n\nAfter you call a tool, you will receive a message with role "user" containing a JSON object.\nThe tool message always includes "tool_name" and "status".\n\nIf "status" is "ok":\n- The message will include a "result" object.\n- Read "result.text".\n- Respond using "respond_to_user" and copy "result.text" exactly as-is.\n\nIf "status" is "error":\n- The message will include an "error_message".\n- If the error message is clear enough (e.g., if the user spelled a book name wrong and it is clear which book the user intended), you can call the tool again with the corrected arguments.\n- Otherwise, respond using "respond_to_user" and copy "error_message" exactly as-is (

## Prepare data for training

In [11]:
chat = train_ds[10]['messages']
print(tokenizer.apply_chat_template(chat, tokenize=False))
for ss in ['<start_of_turn>model', '<start_of_turn>user', '<start_of_turn>', '<end_of_turn>']:
    print("-"*40)
    print(f"{ss} --> {tokenizer(ss)}")
print("-"*40)
print(tokenizer.apply_chat_template(chat, add_generation_prompt=False))

<bos><start_of_turn>user
You are a research assistant that always responds using a JSON object with fields "tool" and "arguments".

To respond normally to the user, use:
{"tool": "respond_to_user", "arguments":{"text": "<text to show the user>"}}

To call a tool, use:
{"tool": "<tool_name>", "arguments":{ ... }}

After you call a tool, you will receive a message with role "user" containing a JSON object.
The tool message always includes "tool_name" and "status".

If "status" is "ok":
- The message will include a "result" object.
- Read "result.text".
- Respond using "respond_to_user" and copy "result.text" exactly as-is.

If "status" is "error":
- The message will include an "error_message".
- If the error message is clear enough (e.g., if the user spelled a book name wrong and it is clear which book the user intended), you can call the tool again with the corrected arguments.
- Otherwise, respond using "respond_to_user" and copy "error_message" exactly as-is (to let the user tell you 

In [12]:
def tokenize_for_gemma_token_level(example):
    # Convert the list of messages into a single text sequence
    text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False
    )
    print(text)
    tokens = tokenizer(
        text,
        truncation=True,
        max_length=2048,
        padding=False,
        return_tensors=None
    )
    input_ids = tokens["input_ids"]
    labels = [-100] * len(input_ids)

    assistant_marker = tokenizer("<start_of_turn>model")["input_ids"][1:] # Assume the first is a <bos> token that will not appear in the middle of a chat
    endturn_marker = tokenizer("<end_of_turn>")["input_ids"][1:]
    print(f"assistant marker: {assistant_marker}")
    print(f"end-turn marker: {endturn_marker}")
    i = 0
    while i < len(input_ids):
        # Look for assistant start:
        if input_ids[i:i+len(assistant_marker)] == assistant_marker:
            # Move index to the first token *after* the start marker
            i += len(assistant_marker)
            # Copy the target tokens until reaching end of assistant message:
            while i < len(input_ids) and input_ids[i:i+len(endturn_marker)] != endturn_marker:
                labels[i] = input_ids[i]
                i += 1
            i += len(endturn_marker)
        else:
            i += 1

    tokens["labels"] = labels
    return tokens

def tokenize_for_gemma(example, max_length=2048):
    """
    Tokenize a multi-turn chat example for Gemma using text-level markers.
    Only assistant spans (<start_of_turn>model ... <end_of_turn>) are labeled.
    Everything else is masked with -100.
    """

    # 1. Render the conversation using Gemma's chat template (text only)
    text = tokenizer.apply_chat_template(
        example["messages"],
        tokenize=False,
        add_generation_prompt=False
    )

    # 2. Find all assistant spans in the raw text
    assistant_spans = []
    start_tag = "<start_of_turn>model"
    end_tag = "<end_of_turn>"

    search_pos = 0
    while True:
        start_idx = text.find(start_tag, search_pos)
        if start_idx == -1:
            break

        # content begins *after* the start tag
        content_start = start_idx + len(start_tag)

        end_idx = text.find(end_tag, content_start)
        if end_idx == -1:
            # malformed example; ignore
            break

        assistant_spans.append((content_start, end_idx))
        search_pos = end_idx + len(end_tag)

    # 3. Tokenize the entire text with offsets
    encoded = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
        padding=False,
    )

    input_ids = encoded["input_ids"]
    offsets = encoded["offset_mapping"]

    # 4. Initialize all labels as masked
    labels = [-100] * len(input_ids)

    # 5. For each assistant span, unmask tokens whose offsets fall inside it
    for (span_start, span_end) in assistant_spans:
        for i, (tok_start, tok_end) in enumerate(offsets):
            # token overlaps the assistant span
            if tok_end > span_start and tok_start < span_end:
                labels[i] = input_ids[i]

    encoded["labels"] = labels
    encoded.pop("offset_mapping", None)

    return encoded

In [13]:
train_tok = train_ds.map(tokenize_for_gemma, remove_columns=train_ds.column_names)
test_tok = test_ds.map(tokenize_for_gemma, remove_columns=train_ds.column_names)

In [14]:
#train_tok = train_tok.select(range(20))

#test_tok = test_tok.select(range(10))
test_tok = test_tok.select(list(range(10)) + list(range(100,130)))

In [15]:
print(f"Train {len(train_tok)}")
print(f"Test {len(test_tok)}")

Train 1322
Test 40


In [16]:
train_tok[0].keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

## Train with LoRA

In [17]:
output_dir = os.path.abspath("../models/gemma3-lora-8")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
os.listdir(output_dir)

[]

In [18]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
)

model = get_peft_model(base_model, lora_config)
model

PeftModel(
  (base_model): LoraModel(
    (model): Gemma3ForCausalLM(
      (model): Gemma3TextModel(
        (embed_tokens): Gemma3TextScaledWordEmbedding(262144, 1152, padding_idx=0)
        (layers): ModuleList(
          (0-25): 26 x Gemma3DecoderLayer(
            (self_attn): Gemma3Attention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=1152, out_features=1024, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1152, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=1024, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
   

In [None]:
args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    learning_rate=5e-5,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    save_only_model=True,
    save_safetensors=False,
    remove_unused_columns=False
)

use_train_set = train_tok
use_eval_set = test_tok

#collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=use_train_set,
    eval_dataset=use_eval_set,
    data_collator=collator
)

trainer.train()

In [27]:
input_messages = test_ds[11]['messages'][:4]
for msg in input_messages:
    print(msg)
print("-"*40)
resp = do_generate_response(tokenizer, model, input_messages)
print(resp)

{'role': 'system', 'content': 'You are a research assistant that always responds using a JSON object with fields "tool" and "arguments".\n\nTo respond normally to the user, use:\n{"tool": "respond_to_user", "arguments":{"text": "<text to show the user>"}}\n\nTo call a tool, use:\n{"tool": "<tool_name>", "arguments":{ ... }}\n\nAfter you call a tool, you will receive a message with role "user" containing a JSON object.\nThe tool message always includes "tool_name" and "status".\n\nIf "status" is "ok":\n- The message will include a "result" object.\n- Read "result.text".\n- Respond using "respond_to_user" and copy "result.text" exactly as-is.\n\nIf "status" is "error":\n- The message will include an "error_message".\n- If the error message is clear enough (e.g., if the user spelled a book name wrong and it is clear which book the user intended), you can call the tool again with the corrected arguments.\n- Otherwise, respond using "respond_to_user" and copy "error_message" exactly as-is (

## Merge and save model after fine tuning

In [28]:
def merge_lora_checkpoint(base_model_name, training_dir, checkpoint):
    adapter_dir = f"{training_dir}/checkpoint-{checkpoint}"
    merged_dir = f"{training_dir}/merged-checkpoint-{checkpoint}"

    # Keep in mind you need to get a fresh base every time before attaching to the adaptation (otherwise you may mutate the same model with different adaptations)
    base_tmp = AutoModelForCausalLM.from_pretrained(base_model_name)
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    adapted = PeftModel.from_pretrained(base_tmp, adapter_dir)
    merged = adapted.merge_and_unload()
    merged.save_pretrained(merged_dir)
    tokenizer.save_pretrained(merged_dir)
    return merged_dir

In [29]:
merged_dir = merge_lora_checkpoint(model_name, output_dir, 632)
print(merged_dir)
os.listdir(merged_dir)

c:\Users\Yonatan\Documents\coding_projects\new_computer\models\gemma3-lora-7/merged-checkpoint-632


['added_tokens.json',
 'chat_template.jinja',
 'config.json',
 'generation_config.json',
 'model.safetensors',
 'special_tokens_map.json',
 'tokenizer.json',
 'tokenizer.model',
 'tokenizer_config.json']

In [30]:
loaded_hf_model = AutoModelForCausalLM.from_pretrained(merged_dir)

In [32]:
input_messages = test_ds[0]['messages'][:4]
for msg in input_messages:
    print(msg)
print("-"*40)
resp = do_generate_response(tokenizer, loaded_hf_model, input_messages)
print(resp)

{'role': 'system', 'content': 'You are a research assistant that always responds using a JSON object with fields "tool" and "arguments".\n\nTo respond normally to the user, use:\n{"tool": "respond_to_user", "arguments":{"text": "<text to show the user>"}}\n\nTo call a tool, use:\n{"tool": "<tool_name>", "arguments":{ ... }}\n\nAfter you call a tool, you will receive a message with role "user" containing a JSON object.\nThe tool message always includes "tool_name" and "status".\n\nIf "status" is "ok":\n- The message will include a "result" object.\n- Read "result.text".\n- Respond using "respond_to_user" and copy "result.text" exactly as-is.\n\nIf "status" is "error":\n- The message will include an "error_message".\n- If the error message is clear enough (e.g., if the user spelled a book name wrong and it is clear which book the user intended), you can call the tool again with the corrected arguments.\n- Otherwise, respond using "respond_to_user" and copy "error_message" exactly as-is (

## Store model for ollama:

In [33]:
# def write_gemma_modelfile(merged_dir, model_name="gemma-3-1b-ft"):
#     """
#     Create an Ollama Modelfile inside the merged_dir folder.
    
#     Parameters
#     ----------
#     merged_dir : str or Path
#         Path to the folder containing the merged Gemma model files.
#     model_name : str
#         Optional name to embed in comments or metadata.
#     """

#     merged_dir = Path(merged_dir)

#     # Safety check
#     required_files = [
#         "model.safetensors",
#         "config.json",
#         "tokenizer.json",
#         "tokenizer.model"
#     ]
#     missing = [f for f in required_files if not (merged_dir / f).exists()]
#     if missing:
#         raise FileNotFoundError(
#             f"The following required model files are missing in {merged_dir}: {missing}"
#         )

#     # Standard Gemma chat template for Ollama
#     modelfile_text = f"""# Modelfile for {model_name}
# # This file was generated programmatically.

# FROM {merged_dir.as_posix()}

# TEMPLATE \"\"\"{{- range $i, $_ := .Messages }}
# {{- $last := eq (len (slice $.Messages $i)) 1 }}
# {{- if or (eq .Role "user") (eq .Role "system") }}<start_of_turn>user
# {{ .Content }}<end_of_turn>
# {{ if $last }}<start_of_turn>model
# {{ end }}
# {{- else if eq .Role "assistant" }}<start_of_turn>model
# {{ .Content }}{{ if not $last }}<end_of_turn>
# {{ end }}
# {{- end }}
# {{- end }}\"\"\"

# PARAMETER stop <end_of_turn>
# PARAMETER temperature 1
# PARAMETER top_k 64
# PARAMETER top_p 0.95
# """

#     modelfile_path = merged_dir / "Modelfile"
#     with open(modelfile_path, "w", encoding="utf-8") as f:
#         f.write(modelfile_text)

#     return modelfile_path

def write_gemma_modelfile(merged_dir, ollama_base_model_name):
    """
    Create a correct Modelfile for a fine‑tuned Gemma model by copying the
    base model's Modelfile and replacing only the FROM line.

    Parameters
    ----------
    merged_dir : str or Path
        Path to the folder containing the merged model files.
    ollama_base_model_name : str
        Name of the base model as registered in Ollama (e.g., "gemma3:1b").

    Returns
    -------
    Path
        Path to the newly written Modelfile.
    """

    merged_dir = Path(merged_dir)
    modelfile_path = merged_dir / "Modelfile"

    # 1. Get the base model's Modelfile text
    result = subprocess.run(
        ["ollama", "show", "--modelfile", ollama_base_model_name],
        capture_output=True,
        text=True,
        check=True
    )
    base_modelfile = result.stdout

    # 2. Replace the FROM line with a local directory reference
    new_from = f"FROM {merged_dir.as_posix()}"
    modelfile_lines = base_modelfile.splitlines()

    for i, line in enumerate(modelfile_lines):
        if line.startswith("FROM "):
            modelfile_lines[i] = new_from
            break

    updated_modelfile = "\n".join(modelfile_lines)

    # 3. Write the updated Modelfile into merged_dir
    with open(modelfile_path, "w", encoding="utf-8") as f:
        f.write(updated_modelfile)

    return modelfile_path

def build_ollama_model_cli(model_name: str, merged_dir: str):
    merged_dir = Path(merged_dir)
    modelfile = merged_dir / "Modelfile"

    subprocess.run(
        ["ollama", "create", model_name, "-f", str(modelfile)],
        check=True
    )


In [34]:
ollama_base_model_name = "gemma3:1b"
new_model_name = f"{ollama_base_model_name}-ft2"
print(new_model_name)

gemma3:1b-ft2


In [35]:
modelfile_path = write_gemma_modelfile(merged_dir, ollama_base_model_name)
print(f"==> {modelfile_path}")
build_ollama_model_cli(new_model_name, merged_dir)

==> c:\Users\Yonatan\Documents\coding_projects\new_computer\models\gemma3-lora-7\merged-checkpoint-632\Modelfile


In [36]:
ollama.list()

ListResponse(models=[Model(model='gemma3:1b-ft2', modified_at=datetime.datetime(2026, 1, 23, 11, 51, 6, 306399, tzinfo=TzInfo(-18000)), digest='252321fba99d14a9cba449fa8f045c92e97bdec3fadcbfa9ab446766caca55a8', size=2015833891, details=ModelDetails(parent_model='', format='gguf', family='gemma3', families=['gemma3'], parameter_size='999.89M', quantization_level='F16')), Model(model='gemma3:1b-ft1', modified_at=datetime.datetime(2026, 1, 16, 10, 38, 25, 375681, tzinfo=TzInfo(-18000)), digest='f246014b90b8cb36ca293a97c7b5c5ce2e9cfa4790050d6a659c02401dca8c9c', size=2015833891, details=ModelDetails(parent_model='', format='gguf', family='gemma3', families=['gemma3'], parameter_size='999.89M', quantization_level='F16')), Model(model='google/gemma-3-1b-it-ft1:latest', modified_at=datetime.datetime(2026, 1, 16, 10, 8, 13, 449228, tzinfo=TzInfo(-18000)), digest='b615ad8f3a73023d47771e856d21f4069d053d3e745b000a5752f5b858b0586d', size=2015825284, details=ModelDetails(parent_model='', format='ggu

In [51]:
for msg in input_messages:
    print("-"*30)
    print(msg['role'])
    print(msg['content'])

------------------------------
system
You are a research assistant that always responds using a JSON object with fields "tool" and "arguments".

To respond normally to the user, use:
{"tool": "respond_to_user", "arguments":{"text": "<text to show the user>"}}

To call a tool, use:
{"tool": "<tool_name>", "arguments":{ ... }}

After you call a tool, you will receive a message with role "user" containing a JSON object.
The tool message always includes "tool_name" and "status".

If "status" is "ok":
- The message will include a "result" object.
- Read "result.text".
- Respond using "respond_to_user" and copy "result.text" exactly as-is.

If "status" is "error":
- The message will include an "error_message".
- Respond using "respond_to_user" and copy "error_message" exactly as-is.

Rules:
- Do not call a tool immediately after a tool message.
- Never modify, translate, summarize, or explain text returned by the tool.
- Never add commentary or extra text.
- Never guess missing arguments.
- 