In [1]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset, Features, Value, load_dataset
import json
import os
import torch
import re
import html
import gc

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 1024
dtype = None
load_in_4bit = True

In [3]:
def clean_text(text: str):
    # Your cleaning logic here
    if not isinstance(text, str):
        return ""
    text = html.unescape(text)
    text = re.sub(r"--.*", "", text)
    text = re.sub(r"[^\x00-\x7F]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [4]:
def read_json_file(file_path: str):
    training_data = []

    with open(file_path, "r") as file:
        buffer: str = ""
        for line in file:
            buffer += line.strip()
            try:
                item: dict = json.loads(buffer)
                buffer: str = ""
                title: str = clean_text(item.get("title", ""))
                content: str = clean_text(item.get("content", ""))
                if title and content:
                    yield {"title": title, "content": content}
            except json.JSONDecodeError:
                continue
    return training_data

In [5]:
def format_for_training(example: dict, tokenizer) -> dict:
    title = example["title"]
    content = example["content"]

    # conversation = [
    #     {'role': 'user', 'content': f'Write a piece in the style of a book titled "{title}".'},
    #     {'role': 'assistant', 'content': content}
    # ]
    # formatted_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
    formatted_text = f"### Title:\n{title}\n\n### Content:\n{content}"
    return {"text": formatted_text + tokenizer.eos_token}

In [6]:
fourbit_models = [
    #"unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    #"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    #"unsloth/Phi-3-medium-4k-instruct",
    #"unsloth/gemma-2-9b-bnb-4bit"
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

==((====))==  Unsloth 2025.9.5: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    NVIDIA GeForce RTX 3060 Laptop GPU. Num GPUs = 1. Max memory: 6.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.9.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
dataset_path = "/home/thiagofernandes101/projects/fiap/FineTunningTechChallenge/datasets/trn.json"
streaming_dataset = load_dataset("json", data_files=dataset_path, split="train", streaming=True)
filtered_dataset = streaming_dataset.filter(
    lambda example: example.get("title") and example.get("content")
)
processed_dataset = filtered_dataset.map(
    lambda example: format_for_training(example, tokenizer)
)

In [8]:
training_args = TrainingArguments(
    output_dir="results",
    per_device_train_batch_size=2,  # Aumente se sua GPU permitir para acelerar
    gradient_accumulation_steps=4, # Ajuste para manter um lote efetivo de 8 (2*4)
    warmup_steps=5,
    max_steps = 100, # Remova esta linha
    # num_train_epochs=1, # Adicione esta linha para treinar em todo o dataset uma vez
    learning_rate=2e-4,
    logging_steps=10,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = processed_dataset, # Convert to iterable dataset
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = training_args,
    packing = False, # Keep packing as it's efficient with streaming
)

In [9]:
def test_model(title_for_testing):
  # title_for_testing = "Jane's Battleships of the 20th Century"
  message_for_testing = [
      {"role": "user", "content": f'Tell me about the book "{title_for_testing}".'},
  ]
  input_ids = tokenizer.apply_chat_template(
      message_for_testing,
      tokenize=True,
      add_generation_prompt=True, # This is crucial for inference
      return_tensors="pt"
  ).to("cuda")
  outputs = model.generate(input_ids=input_ids, max_new_tokens=256, use_cache=True)
  results = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
  print(results)

In [10]:
print("\n--- TEST (BEFORE) TRAINING ---")
test_model("Jane's Battleships of the 20th Century")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- TEST (BEFORE) TRAINING ---
Tell me about the book "Jane's Battleships of the 20th Century". "Jane's Battleships of the 20th Century" does not appear to be a widely recognized book or a well-known publication in the public domain as of my knowledge cutoff in 2023. It's possible that the book could be a niche or specialized publication, perhaps a work of historical interest or a fictional narrative.

If you are looking for information on a specific topic related to battleships or naval history, I can certainly help with that. However, for specific details about the book in question, I would need more context or information.

If you have more details or context about the book, please provide them, and I'll do my best to assist you. Given the lack of information about "Jane's Battleships of the 20th Century," I can't provide a detailed summary or analysis of the book. However, I can offer a general overview of the historical context of battleships in the 20th century, which might be r

In [11]:
print("Iniciando o fine-tuning...")
trainer_stats = trainer.train()
print("Fine-tuning concluído!")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 32009}.


Iniciando o fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 800 | Num Epochs = 9,223,372,036,854,775,807 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 29,884,416 of 3,850,963,968 (0.78% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
10,2.6637
20,2.47
30,2.2903
40,2.2473
50,2.2239
60,2.246
70,2.3418
80,2.2605
90,2.0748
100,2.172


Fine-tuning concluído!


In [12]:
print("\n--- TEST (AFTER) TRAINING ---")
test_model("Jane's Battleships of the 20th Century")


--- TEST (AFTER) TRAINING ---
Tell me about the book "Jane's Battleships of the 20th Century". "Jane's Battleships of the 20th Century" is a comprehensive reference guide published by Jane's Information Group, a company known for its specialized military and defense publications. The book provides detailed information on battleships from the 20th century, including their design, armament, and operational history.

The book is organized chronologically, with each entry providing a brief overview of the ship's history, followed by a more detailed description of its design, armament, and notable actions. The book also includes numerous illustrations, including photographs, drawings, and diagrams.

"Jane's Battleships of the 20th Century" is an essential reference for anyone interested in naval history, maritime warfare, or military strategy. It is also a valuable resource for naval historians, military officers, and anyone with an interest in the history of naval warfare.
