In [None]:
%%capture
# Install the fine-tuning stack. `%%capture` has to stay on the first line of
# the cell; it silences the installer output.
import os, re
# Colab is detected by looking for the substring "COLAB_" in the concatenated
# names of all environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `v` is the leading run (3+ characters) of digits/dots of the installed
    # torch version; it selects which xformers release gets pinned.
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    # `{xformers}` is interpolated by IPython from the Python variable above.
    # `--no-deps` installs exactly the listed packages without pulling in
    # their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The two pins below sit outside the if/else, so they run in both environments.
!pip install transformers==4.55.4
!pip install --no-deps trl==0.22.2

In [None]:
# Mount Google Drive at /content/drive. The dataset path used later in this
# notebook lives under /content/drive/MyDrive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments
from datasets import Dataset, Features, Value
import json
import os
import torch
import re
import html
import gc

In [None]:
# Settings shared by the model-loading and trainer cells.
load_in_4bit = True    # forwarded to FastLanguageModel.from_pretrained
dtype = None           # forwarded to from_pretrained; presumably None lets the library choose -- TODO confirm
max_seq_length = 1024  # forwarded to both from_pretrained and SFTTrainer

In [None]:
def clean_text(text: str):
    """Normalize a raw text field into a trimmed, single-line, ASCII-only string.

    The steps run in this order:
      1. HTML entities are decoded (``&amp;`` becomes ``&``).
      2. On every line, everything from the first ``--`` to the end of that
         line is dropped.
      3. Every non-ASCII character is deleted (not replaced).
      4. Runs of whitespace, including newlines, collapse to one space and the
         result is stripped.

    Args:
        text: Raw value to clean. ``None`` yields ``""``; any other non-string
            value is converted with ``str()`` first, so a JSON ``null`` or a
            numeric field no longer raises ``TypeError``.

    Returns:
        The cleaned string, which may be empty.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = html.unescape(text)
    # NOTE(review): this cuts each line at the first "--" wherever it occurs,
    # not only on lines that start with "--". "well--known" becomes "well".
    # Confirm that this truncation is really what the dataset needs.
    text = re.sub(r"--.*", "", text)
    # Non-ASCII characters are removed outright, so a non-breaking space
    # between two words joins them instead of becoming a regular space.
    text = re.sub(r"[^\x00-\x7F]+", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
def read_json_file(file_path: str):
    """Lazily yield cleaned ``{"title", "content"}`` records from a JSON file.

    The file is read line by line. Stripped lines are appended to a buffer
    until the buffer parses as JSON, so both one-object-per-line files and
    objects spread over several lines are accepted. Each parsed object has its
    ``title`` and ``content`` passed through ``clean_text``; a record is
    yielded only when both are non-empty afterwards.

    Skipped silently: blank lines, JSON values that are not objects, and
    objects whose ``title`` or ``content`` is missing or not a string.

    NOTE(review): a line that can never become valid JSON stays in the buffer,
    so every record after it is lost. This matches the previous behaviour.

    Args:
        file_path: Path to the UTF-8 encoded input file.

    Yields:
        dict: ``{"title": str, "content": str}`` with cleaned, non-empty values.

    Raises:
        OSError: If the file cannot be opened (raised on first iteration,
            because this is a generator).
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        buffer: str = ""
        for line in file:
            buffer += line.strip()
            if not buffer:
                continue
            try:
                item = json.loads(buffer)
            except json.JSONDecodeError:
                # Presumably an object that continues on the next line.
                continue
            buffer = ""

            # Bare arrays, strings or numbers have no fields to extract.
            if not isinstance(item, dict):
                continue
            raw_title = item.get("title")
            raw_content = item.get("content")
            # Covers absent keys, JSON null and non-text values.
            if not isinstance(raw_title, str) or not isinstance(raw_content, str):
                continue

            title: str = clean_text(raw_title)
            content: str = clean_text(raw_content)
            if title and content:
                yield {"title": title, "content": content}

In [None]:
def format_for_training(example: dict, tokenizer) -> dict:
    """Render one title/content record as a single chat-formatted training string.

    A two-turn conversation is built: the user asks about the book by title
    and the assistant answers with the book's content. It is rendered with the
    tokenizer's chat template, without a generation prompt.

    The result is guaranteed to end with exactly one trailing EOS token when
    the tokenizer defines one. Some chat templates already emit the EOS token
    when ``add_generation_prompt=False``; appending it unconditionally would
    duplicate it, so it is added only when missing.

    Args:
        example: Mapping with string values under ``"title"`` and ``"content"``.
        tokenizer: Object exposing ``apply_chat_template(...)`` and ``eos_token``.

    Returns:
        dict: ``{"text": <rendered conversation>}``.
    """
    conversation = [
        {'role': 'user', 'content': f'Tell me about the book titled "{example["title"]}". What is its content?'},
        {'role': 'assistant', 'content': example['content']}
    ]
    formatted_text = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=False)

    # A tokenizer without an EOS token leaves the text untouched instead of
    # failing on `str + None`.
    eos_token = tokenizer.eos_token or ""
    # Trailing whitespace is ignored so a template ending in "<eos>\n" is
    # still recognised as already terminated.
    if eos_token and not formatted_text.rstrip().endswith(eos_token):
        formatted_text += eos_token
    return {"text": formatted_text}

In [None]:
# Checkpoints that can be swapped in. This list is informational only: the
# call below names its checkpoint explicitly and does not read the list.
fourbit_models = [
    # "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    # "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",             # Phi-3.5 2x faster!
    # "unsloth/Phi-3-medium-4k-instruct",
    # "unsloth/gemma-2-9b-bnb-4bit"
]

# Load the base model and its tokenizer using the shared settings cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-3.5-mini-instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Wrap the base model with LoRA adapters on the attention and MLP projections.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
    loftq_config=None,
    random_state=3407,
)

In [None]:
# Location of the raw training file on the mounted Google Drive.
dataset_path = "/content/drive/MyDrive/FineTuningTechChallenge/trn.json"

# read_json_file is lazy; materialise its records so they can be counted and
# handed to Dataset.from_list.
raw_data = read_json_file(dataset_path)
training_data_list = [{"title": item["title"], "content": item["content"]} for item in raw_data]

# Fail here with a clear message instead of letting an empty dataset surface
# later as an obscure error inside the trainer.
if not training_data_list:
    raise ValueError(
        f"No usable records (non-empty 'title' and 'content') were read from {dataset_path}"
    )

base_dataset = Dataset.from_list(training_data_list)
# Add the chat-formatted "text" column that the trainer is configured to read.
dataset = base_dataset.map(lambda x: format_for_training(x, tokenizer))

In [None]:
# Hyperparameters for the supervised fine-tuning run.
training_args = TrainingArguments(
    output_dir="/results",
    # Batching: 2 sequences per device x 4 accumulation steps gives an
    # effective batch of 8. Raise the per-device size if the GPU allows it.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Duration: one full pass over the dataset. A fixed step budget
    # (max_steps = 70) was used previously and is intentionally not set.
    num_train_epochs=1,
    # Optimisation schedule.
    learning_rate=2e-4,
    warmup_steps=10,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    weight_decay=0.01,
    # Mixed precision: bf16 when the GPU reports support, otherwise fp16.
    bf16=torch.cuda.is_bf16_supported(),
    fp16=not torch.cuda.is_bf16_supported(),
    # Logging and checkpoints: log every step, save at the end of each epoch
    # and keep at most two checkpoints.
    logging_steps=1,
    save_strategy="epoch",
    save_total_limit=2,
    seed=42,
)

# Trainer over the pre-formatted "text" column, with sequence packing enabled.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=True,
)

In [None]:
# Baseline: ask the model about one book before any fine-tuning, so the answer
# can be compared with the same question asked after training.
title_for_testing = "A Day in the Life of China"
message_for_testing = [
    {
        "role": "user",
        "content": f"Regarding the book {title_for_testing}, what was the author’s primary goal in writing it?",
    },
]

# Render the conversation with the generation prompt appended and move the
# token ids to the GPU.
input_ids = tokenizer.apply_chat_template(
    message_for_testing,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=input_ids,
    max_new_tokens=256,
    use_cache=True,
)

# Only one sequence was generated; take the first decoded string.
results = tokenizer.batch_decode(
    outputs,
    skip_special_tokens=True,
)[0]

# Header text is Portuguese for "TEST BEFORE TRAINING".
print("\n--- TESTE ANTES DO TREINO ---")
print(results)

In [None]:
# Run the fine-tuning loop on the trainer built earlier. The two messages are
# Portuguese for "Starting fine-tuning..." and "Fine-tuning finished!".
print("Iniciando o fine-tuning...")
trainer.train()
print("Fine-tuning concluído!")

In [None]:
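# Optional cleanup, intentionally disabled: uncomment to drop the large objects
# and release GPU memory once they are no longer needed. Leave it commented out
# while the inference cell below still uses `model` and `tokenizer`.
# del model
# del tokenizer
# del trainer
# del dataset

# gc.collect()
# torch.cuda.empty_cache()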

In [None]:
# Post-training check: repeat the question asked before fine-tuning so the two
# answers can be compared side by side.
title_for_testing = "A Day in the Life of China"
message_for_testing = [
    {
        "role": "user",
        "content": f"Regarding the book {title_for_testing}, what was the author’s primary goal in writing it?",
    },
]

# Tokenise through the chat template (generation prompt included) and move
# the ids to the GPU.
input_ids = tokenizer.apply_chat_template(
    message_for_testing,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=input_ids,
    max_new_tokens=256,
    use_cache=True,
)

# A single sequence comes back; decode it without special tokens.
results = tokenizer.batch_decode(
    outputs,
    skip_special_tokens=True,
)[0]

# Header text is Portuguese for "INFERENCE TEST".
print("\n--- TESTE DE INFERÊNCIA ---")
print(results)