In [1]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset, Features, Value
import json
import os
import torch
import re
import html
import gc

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
max_seq_length = 1024
dtype = None
load_in_4bit = True

In [5]:
def clean_text(text: str):
    text = html.unescape(text)
    text = re.sub(r"--.*", "", text) # Remove lines starting with '--' or similar patterns
    text = re.sub(r"[^\x00-\x7F]+", "", text) # Remove non-ASCII characters (optional, depending on your dataset)
    text = re.sub(r"\s+", " ", text) # Remove excessive whitespace
    text = text.strip()

    return text

In [6]:
def read_json_file(file_path: str):
    training_data = []

    with open(file_path, "r") as file:
        buffer: str = ""
        for line in file:
            buffer += line.strip()
            try:
                item: dict = json.loads(buffer)
                buffer: str = ""
                title: str = clean_text(item.get("title", ""))
                content: str = clean_text(item.get("content", ""))
                if title and content:
                    yield {"title": title, "content": content}
            except json.JSONDecodeError:
                continue
    return training_data

In [7]:
def format_for_training(example: dict, tokenizer) -> dict:
    """
    Formats a title and content into a chat-like format using the tokenizer's chat template.
    This teaches the model to respond with the book's content when asked about its title.
    """
    conversation = [
        {'role': 'user', 'content': f'Tell me about the book titled "{example["title"]}". What is its content?'},
        {'role': 'assistant', 'content': example['content']}
    ]
    formatted_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)
    return {"text": formatted_text + tokenizer.eos_token}

In [8]:
fourbit_models = [
    #"unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    #"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    #"unsloth/Phi-3-medium-4k-instruct",
    #"unsloth/gemma-2-9b-bnb-4bit"
]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

==((====))==  Unsloth 2025.9.10: Fast Llama patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/140 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Unsloth 2025.9.10 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [9]:
dataset_path = "/content/drive/MyDrive/FineTuningTechChallenge/trn.json"

raw_data = read_json_file(dataset_path)
training_data_list = [{"title": item["title"], "content": item["content"]} for item in raw_data]
base_dataset = Dataset.from_list(training_data_list)
dataset = base_dataset.map(lambda x: format_for_training(x, tokenizer))

Map:   0%|          | 0/336568 [00:00<?, ? examples/s]

In [10]:
training_args = TrainingArguments(
    output_dir="/results",
    per_device_train_batch_size=2,  # Aumente se sua GPU permitir para acelerar
    gradient_accumulation_steps=4, # Ajuste para manter um lote efetivo de 8 (2*4)
    warmup_steps=10,
    # max_steps = 70, # Remova esta linha
    num_train_epochs=1, # Adicione esta linha para treinar em todo o dataset uma vez
    learning_rate=2e-4,
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=42,
    save_strategy="epoch", # Salvar ao final de cada época
    save_total_limit=2,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    args = training_args,
    packing = True,
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/336568 [00:00<?, ? examples/s]

In [11]:
title_for_testing = "A Day in the Life of China"
message_for_testing = [
    {"role": "user", "content": f"Regarding the book {title_for_testing}, what was the author’s primary goal in writing it?"},
]
input_ids = tokenizer.apply_chat_template(message_for_testing, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
outputs = model.generate(input_ids=input_ids, max_new_tokens=256, use_cache=True)
results = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

print("\n--- TESTE ANTES DO TREINO ---")
print(results)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- TESTE ANTES DO TREINO ---
Regarding the book A Day in the Life of China, what was the author’s primary goal in writing it? As an AI language model, I don't have direct access to specific books or their authors' intentions. However, I can provide a general idea of what an author's primary goal might be when writing a book like "A Day in the Life of China."

1. To inform: The author might aim to provide readers with a detailed and accurate portrayal of daily life in China, offering insights into the culture, customs, and social dynamics.

2. To educate: The author might want to educate readers about the complexities of Chinese society, including its history, politics, and economic development.

3. To entertain: The author might aim to engage readers with a compelling narrative that captures the essence of daily life in China, making the experience both enjoyable and memorable.

4. To raise awareness: The author might want to draw attention to specific issues or challenges facing Chi

In [12]:
print("Iniciando o fine-tuning...")
trainer.train()
print("Fine-tuning concluído!")

Iniciando o fine-tuning...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 336,568 | Num Epochs = 1 | Total steps = 42,071
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 29,884,416 of 3,850,963,968 (0.78% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbezaleelhur101[0m ([33mbezaleelhur101-fiap[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,3.2114
2,3.7546
3,3.0279
4,3.1249
5,3.6026
6,2.8525
7,3.0237
8,2.9533
9,2.8018
10,2.6531


KeyboardInterrupt: 

In [None]:
# del model
# del tokenizer
# del trainer
# del dataset

# gc.collect()
# torch.cuda.empty_cache()

In [None]:
title_for_testing = "A Day in the Life of China"
message_for_testing = [
    {"role": "user", "content": f"Regarding the book {title_for_testing}, what was the author’s primary goal in writing it?"},
]
input_ids = tokenizer.apply_chat_template(message_for_testing, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
outputs = model.generate(input_ids=input_ids, max_new_tokens=256, use_cache=True)
results = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

print("\n--- TESTE DE INFERÊNCIA ---")
print(results)