# Fine-Tuning de Modelo LLM para Domínio Médico (ChatML)

Este notebook implementa o pipeline completo de fine-tuning de um modelo LLM para tarefas de question-answering médico usando o formato **ChatML**.

## ⚠️ Opções de Pré-processamento

Você pode usar este notebook de duas maneiras:
1. **Upload do arquivo já processado**: Faça upload de `formatted_medical_dataset.jsonl` (se já rodou o pipeline localmente).
2. **Upload do arquivo bruto**: Faça upload de `ori_pqal.json` e o notebook processará os dados para você.

## Objetivos:
1. Carregar/Processar dataset médico no padrão ChatML
2. Carregar modelo base Meta-Llama-3-8B-Instruct pré-quantizado
3. Configurar LoRA (QLoRA) para treinamento eficiente
4. Treinar modelo com dados médicos
5. Testar e salvar modelo treinado

## Requisitos:
- **Google Colab** com GPU
- GPU com pelo menos 8GB VRAM


In [None]:
# CÉLULA 1: INSTALAÇÃO DE DEPENDÊNCIAS
print("üì¶ Instalando depend√™ncias...")
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install transformers datasets
print("\n‚úÖ Depend√™ncias instaladas!")

In [None]:
# CÉLULA 2: IMPORTAÇÕES
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import json
import re
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from pathlib import Path

print("‚úÖ Bibliotecas importadas!")

In [None]:
# CELL 3: SETTINGS
# Options used when the base model is loaded.
DEFAULT_MODEL = "unsloth/llama-3-8b-Instruct-bnb-4bit"
MAX_SEQ_LENGTH = 2048
LOAD_IN_4BIT = True
DTYPE = None  # None means auto-detect

# Single seed value reused by the LoRA setup and by the trainer.
_SEED = 3407

# Layers that receive LoRA adapters, listed in two groups for readability.
_ATTENTION_PROJECTIONS = ["q_proj", "k_proj", "v_proj", "o_proj"]
_MLP_PROJECTIONS = ["gate_proj", "up_proj", "down_proj"]

# Keyword arguments later unpacked into FastLanguageModel.get_peft_model.
LORA_CONFIG = {
    "r": 16,
    "lora_alpha": 16,
    "lora_dropout": 0,
    "bias": "none",
    "target_modules": [*_ATTENTION_PROJECTIONS, *_MLP_PROJECTIONS],
    "use_gradient_checkpointing": "unsloth",
    "random_state": _SEED,
    "use_rslora": False,
}

# Keyword arguments later unpacked into TrainingArguments.
TRAINING_CONFIG = {
    # Batching: 2 samples per device, accumulated over 4 steps.
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    # Schedule.
    "warmup_steps": 5,
    "max_steps": 100,
    "learning_rate": 2e-4,
    # Optimizer.
    "optim": "adamw_8bit",
    "weight_decay": 0.01,
    "lr_scheduler_type": "linear",
    # Reproducibility and bookkeeping.
    "seed": _SEED,
    "output_dir": "outputs",
    "logging_steps": 1,
}

print(f"Configurado para usar modelo: {DEFAULT_MODEL}")

In [None]:
# CELL 4: LOADING AND PRE-PROCESSING
# ChatML-formatted JSONL file that the training dataset is loaded from.
FORMATTED_DATASET_PATH = Path("formatted_medical_dataset.jsonl")
# Raw JSON file; converted into the formatted file when that one is missing.
RAW_DATASET_PATH = Path("ori_pqal.json")

# Ordered (compiled pattern, replacement) pairs applied by anonymize_text.
_ANONYMIZATION_RULES = (
    # Slash-separated dates such as 3/14/2021.
    (re.compile(r'\d{1,2}/\d{1,2}/\d{4}'), '[DATA]'),
    # ISO-style dates such as 2021-03-14.
    (re.compile(r'\d{4}-\d{2}-\d{2}'), '[DATA]'),
    # "ID: 123". The leading \b keeps words that merely end in "id"
    # (e.g. "acid: 400 mcg") from being rewritten as a patient id.
    (re.compile(r'\bID:\s*\d+', re.IGNORECASE), 'ID: [PACIENTE_ID]'),
    # Kept for parity with the previous rule set; the generic ID rule above
    # has normally consumed the digits already.
    (re.compile(r'\bPatient ID:\s*\d+', re.IGNORECASE), 'Patient ID: [PACIENTE_ID]'),
    # 10-digit phone numbers with optional "-" or "." separators. The
    # lookarounds stop the rule from biting into longer digit runs.
    (re.compile(r'(?<!\d)\d{3}[-.]?\d{3}[-.]?\d{4}(?!\d)'), '[TELEFONE]'),
    # E-mail addresses. The TLD class is letters only (no stray "|").
    (re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'), '[EMAIL]'),
)


def anonymize_text(text):
    """Replace dates, patient ids, phone numbers and e-mails with placeholders.

    Args:
        text: The text to scrub. Non-string values are returned unchanged.

    Returns:
        The scrubbed string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    for pattern, replacement in _ANONYMIZATION_RULES:
        text = pattern.sub(replacement, text)
    return text

def process_raw_to_chatml(input_path, output_path):
    """Convert the raw question/context/answer JSON file into ChatML JSONL.

    The raw file is read as a JSON object whose values are records. Each
    record's ``QUESTION``, ``CONTEXTS`` and ``LONG_ANSWER`` fields (missing or
    null ones fall back to empty values) become one system/user/assistant
    conversation, written as a single JSON line.

    Every free-text field, including the question, goes through
    ``anonymize_text`` so no identifier reaches the training file.

    Args:
        input_path: Path of the raw JSON file to read.
        output_path: Path of the JSONL file to write (opened in ``'w'`` mode).
    """
    print(f"‚è≥ Processando arquivo bruto: {input_path}...")
    with open(input_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    with open(output_path, 'w', encoding='utf-8') as f_out:
        # Only the records are needed; the record ids are not written out.
        for content in raw_data.values():
            # `or` also covers keys that are present but hold null.
            question = anonymize_text(content.get("QUESTION") or "")
            context = anonymize_text(" ".join(content.get("CONTEXTS") or []))
            answer = anonymize_text(content.get("LONG_ANSWER") or "")

            input_text = f"Contexto: {context}\nPergunta: {question}"

            chatml_entry = {
                "messages": [
                    {"role": "system", "content": "Responda √† pergunta baseando-se nos contextos fornecidos."},
                    {"role": "user", "content": input_text},
                    {"role": "assistant", "content": answer},
                ]
            }
            # ensure_ascii=False keeps accented characters readable in the file.
            f_out.write(json.dumps(chatml_entry, ensure_ascii=False) + "\n")
    print(f"‚úÖ Arquivo ChatML gerado: {output_path}")

# Make sure the formatted dataset file exists, building it from the raw file
# or asking for an upload (Colab only) when it does not.
if FORMATTED_DATASET_PATH.exists():
    pass  # Already prepared; nothing to build.
elif RAW_DATASET_PATH.exists():
    process_raw_to_chatml(RAW_DATASET_PATH, FORMATTED_DATASET_PATH)
else:
    try:
        from google.colab import files
        print("üì§ Arquivo n√£o encontrado. Fa√ßa upload de 'ori_pqal.json' ou 'formatted_medical_dataset.jsonl':")
        uploaded = files.upload()
        # A raw upload still has to be converted; a formatted upload is used as is.
        if "ori_pqal.json" in uploaded:
            process_raw_to_chatml("ori_pqal.json", FORMATTED_DATASET_PATH)
    except ImportError:
        print("‚ö†Ô∏è  Erro: Arquivos de dataset n√£o encontrados.")

# Fail early when none of the paths above produced the formatted file.
if not FORMATTED_DATASET_PATH.exists():
    raise FileNotFoundError("Dataset n√£o encontrado para carregar.")

dataset = load_dataset("json", data_files=str(FORMATTED_DATASET_PATH), split="train")
print(f"\n‚úÖ Dataset pronto: {len(dataset)} exemplos")
print(f"üìÑ Exemplo: {dataset[0]['messages']}")

In [None]:
# CELL 5: BASE MODEL LOADING
# Collect the loader options first, then unpack them into the call.
base_load_options = {
    "model_name": DEFAULT_MODEL,
    "max_seq_length": MAX_SEQ_LENGTH,
    "dtype": DTYPE,
    "load_in_4bit": LOAD_IN_4BIT,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_load_options)

print("‚úÖ Modelo base carregado!")

In [None]:
# CELL 6: LoRA SETUP
# Wrap the loaded model with adapters described by LORA_CONFIG.
model = FastLanguageModel.get_peft_model(model, **LORA_CONFIG)
print("‚úÖ LoRA configurado!")

In [None]:
# CELL 7: CHATML FORMATTING
from unsloth import get_chat_template

# Every name maps to itself: the dataset messages already use the
# "role"/"content" keys with "user"/"assistant" values.
chatml_field_mapping = {
    "role": "role",
    "content": "content",
    "user": "user",
    "assistant": "assistant",
}
tokenizer = get_chat_template(tokenizer, chat_template="chatml", mapping=chatml_field_mapping)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into a single template string.

    Args:
        examples: Batch with a ``"messages"`` entry holding one conversation
            per example.

    Returns:
        A dict whose ``"text"`` entry lists the rendered strings, one per
        conversation and in the same order.
    """
    rendered = []
    for conversation in examples["messages"]:
        # tokenize=False yields text; no generation prompt is appended.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

# Add the rendered "text" column, processing the dataset in batches.
formatted_dataset = dataset.map(function=formatting_prompts_func, batched=True)
print("‚úÖ Dataset formatado com ChatML template!")

In [None]:
# CELL 8: TRAINING
# Decide the mixed-precision mode once: bf16 when supported, fp16 otherwise.
use_bf16 = is_bfloat16_supported()
training_args = TrainingArguments(
    **TRAINING_CONFIG,
    fp16=not use_bf16,
    bf16=use_bf16,
)

# The trainer reads the pre-rendered "text" column of the formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_dataset,
    dataset_text_field="text",
    max_seq_length=MAX_SEQ_LENGTH,
    args=training_args,
)

trainer.train()
print("‚úÖ Treinamento conclu√≠do!")

In [None]:
# CELL 9: QUICK TEST
FastLanguageModel.for_inference(model)

# One system turn followed by the user question.
system_turn = {"role": "system", "content": "Voc√™ √© um assistente m√©dico prestativo."}
user_turn = {"role": "user", "content": "Qual a import√¢ncia da vitamina D para os ossos?"}
messages = [system_turn, user_turn]

# Tokenize with the chat template and leave the prompt open for the reply.
prompt_ids = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
inputs = prompt_ids.to("cuda")

outputs = model.generate(input_ids=inputs, max_new_tokens=256)
decoded = tokenizer.batch_decode(outputs)
print(decoded[0])

In [None]:
# CELL 10: SAVING
MODEL_OUTPUT_DIR = Path("lora_model_medical")

# Model and tokenizer are written to the same directory.
output_dir_str = str(MODEL_OUTPUT_DIR)
model.save_pretrained(output_dir_str)
tokenizer.save_pretrained(output_dir_str)
print(f"‚úÖ Modelo salvo em: {MODEL_OUTPUT_DIR}")

# ============================================================================
# MODELO DE UTILIZAÇÃO (INFERÊNCIA)
# ============================================================================
Este formato é ideal para rodar o modelo em GPUs domésticas para teste rápido.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# 1. Quantization settings (so the model fits on a consumer GPU)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# 2. Load the BASE model
base_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="auto"
)

# 3. Apply the fine-tuned adapter (no merge)
# adapter_id = "vitateje/biobyia"
# NOTE(review): the training notebook saves to "lora_model_medical" relative
# to its working directory; confirm this relative path from where this runs.
adapter_id = "../lora_model_medical"
print(f"Injetando conhecimento m√©dico de: {adapter_id}")
model = PeftModel.from_pretrained(model, adapter_id)

# Load the tokenizer saved next to the adapter rather than the base one:
# it carries the ChatML chat template used during fine-tuning. The adapter
# location must therefore also contain the tokenizer files.
tokenizer = AutoTokenizer.from_pretrained(adapter_id)

# 4. Quick test
# Build the prompt exactly like the training conversations: the same system
# message (kept identical to the literal in process_raw_to_chatml) and a
# user turn shaped as "Contexto: ...\nPergunta: ...".
system_prompt = "Responda √† pergunta baseando-se nos contextos fornecidos."
context = "Paciente com dores agudas..."
question = "Qual o procedimento?"
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": f"Contexto: {context}\nPergunta: {question}"},
]
# return_dict=True also yields the attention mask; model.device follows
# whatever placement device_map="auto" chose instead of assuming "cuda".
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=200)

# Decode only the newly generated tokens so the prompt is not echoed back.
prompt_length = inputs["input_ids"].shape[-1]
print("\nRESPOSTA DO AGENTE:\n")
print(tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True))