## 1. Libs

In [26]:
import gc
import platform
import re
import subprocess
import time
from pathlib import Path

import numpy as np
import pandas as pd
import psutil
import torch
from bs4 import BeautifulSoup
from datasets import Dataset
from peft import (LoraConfig, PeftModel, get_peft_model,
                  prepare_model_for_kbit_training)
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          BitsAndBytesConfig, DataCollatorForLanguageModeling,
                          TextStreamer, Trainer, TrainingArguments)

In [2]:
if torch.cuda.is_available():
    print("CUDA is available!")
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
else:
    print("‚ö†Ô∏è CUDA not available ‚Äî running on CPU.")


CUDA is available!
Using GPU: NVIDIA GeForce RTX 4070 Laptop GPU
CUDA version: 12.6


## 2. Data

### Import

In [23]:
# Read the TSV file
path = Path("data/zenodo_wikipedia-pt/media_resource.tab")
df = pd.read_csv(path, sep="\t", encoding="utf-8")

print(df.shape)
print(df.columns)
df.head(3)

(219674, 11)
Index(['identifier', 'taxonID', 'type', 'format', 'CVterm', 'title',
       'description', 'furtherInformationURL', 'language', 'UsageTerms',
       'Owner'],
      dtype='object')


Unnamed: 0,identifier,taxonID,type,format,CVterm,title,description,furtherInformationURL,language,UsageTerms,Owner
0,72fb7e4d377fc9601bac5f4e5f3b63a2,Q140,http://purl.org/dc/dcmitype/Text,text/html,http://rs.tdwg.org/ontology/voc/SPMInfoItems#D...,Le√£o,"><div lang=""pt"" dir=""ltr""> <p>O <b>le√£o</b><su...",http://pt.wikipedia.org/w/index.php?title=Le%C...,pt,http://creativecommons.org/licenses/by-sa/3.0/,Autores e editores de Wikipedia
1,facbaa84c032fa36235f7d9e8fa34192,Q140,http://purl.org/dc/dcmitype/Text,text/html,http://rs.tdwg.org/ontology/voc/SPMInfoItems#T...,Le√£o: Brief Summary,"> <p>O le√£o [feminino: leoa] (<a href=""http://...",http://pt.wikipedia.org/w/index.php?title=Le%C...,pt,http://creativecommons.org/licenses/by-sa/3.0/,Autores e editores de Wikipedia
2,52427a332e31b690daf46ef212302f1c,Q764,http://purl.org/dc/dcmitype/Text,text/html,http://rs.tdwg.org/ontology/voc/SPMInfoItems#D...,Fungi,"lang=""pt"" dir=""ltr""><div> <figure typeof=""mw:F...",http://pt.wikipedia.org/w/index.php?title=Fung...,pt,http://creativecommons.org/licenses/by-sa/3.0/,Autores e editores de Wikipedia


### Clean-up

In [24]:
# Clean HTML from descriptions
def clean_html_wikipedia(text):
    """Limpa HTML e res√≠duos textuais t√≠picos de dumps da Wikip√©dia"""
    if pd.isna(text):
        return None
    
    # Remove everything after 'Refer√™ncias' section
    html = str(text)
    html = re.split(r'id="Refer', html, flags=re.IGNORECASE)[0]

    # Remove tags HTML
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(" ", strip=True)

    # Remove references like [ 1 ], [ 23 ], [ note 5 ]
    text = re.sub(r"\[\s*\d+\s*\]", "", text)
    # Remove patterns like 'editar c√≥digo-fonte'
    text = re.sub(r"editar c√≥digo[- ]fonte", "", text, flags=re.IGNORECASE)
    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)
    # Remove spaces before punctuation
    text = re.sub(r"\s([?.!,;:])", r"\1", text)
    # Remove duplicated quotes and occasional unbalanced parentheses
    text = text.replace("¬´ ", "¬´").replace(" ¬ª", "¬ª").strip()

    return text

df["text"] = df["description"].apply(clean_html_wikipedia)

In [25]:
# Remove duplicates
df = df.drop_duplicates(subset=["identifier"])

# Select relevant columns
df_clean = df[[
    "identifier", "title", "text", "furtherInformationURL"
]]

# Rename columns
df_clean = df_clean.rename(columns={
    "identifier": "id",
    "furtherInformationURL": "url"
})

df_clean.head(3)


Unnamed: 0,id,title,text,url
0,72fb7e4d377fc9601bac5f4e5f3b63a2,Le√£o,> O le√£o [feminino: leoa ] ( nome cient√≠fico: ...,http://pt.wikipedia.org/w/index.php?title=Le%C...
1,facbaa84c032fa36235f7d9e8fa34192,Le√£o: Brief Summary,> O le√£o [feminino: leoa] ( nome cient√≠fico: P...,http://pt.wikipedia.org/w/index.php?title=Le%C...
2,52427a332e31b690daf46ef212302f1c,Fungi,"lang=""pt"" dir=""ltr""> Estrutura√ß√£o filogen√©tica...",http://pt.wikipedia.org/w/index.php?title=Fung...


### Prepare Data

In [26]:
# Prepare dataset
dataset = Dataset.from_pandas(df_clean[["text"]])

# Save dataset
dataset_path = Path("data/animal_wikipedia_pt_dataset")
dataset.save_to_disk(dataset_path)

Saving the dataset (1/1 shards): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 219674/219674 [00:00<00:00, 1557412.98 examples/s]


## 3. SLM

In [3]:
# Choose model
model_name = "./models/gemma-3-1b-it"

In [4]:
# Import dataset
dataset = Dataset.load_from_disk('data/animal_wikipedia_pt_dataset')

# Sample dataset
dataset_split = dataset.shuffle(seed=42).train_test_split(test_size=0.1)
train_dataset = dataset_split['train'].select(range(min(9000, len(dataset_split['train']))))
eval_dataset = dataset_split['test'].select(range(min(1000, len(dataset_split['test']))))

print(f"Train size: {len(train_dataset)}")
print(f"Eval size: {len(eval_dataset)}")


Train size: 9000
Eval size: 1000


### Tokenization 

In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Gemma needs this

def tokenize_function(examples):
    # Add EOS token to help model learn when to stop
    texts = [text + tokenizer.eos_token for text in examples["text"]]
    
    result = tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding=False,  # Don't pad here, let collator handle it
        return_attention_mask=True
    )
    
    return result

# Tokenize dataset
train_tokenized = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names, desc="Tokenizing")
eval_tokenized = eval_dataset.map(tokenize_function, batched=True, remove_columns=eval_dataset.column_names, desc="Tokenizing")

# Count tokens for train and eval sets
def count_tokens(tokenized_dataset):
    return sum(len(input_ids) for input_ids in tokenized_dataset["input_ids"])

train_total_tokens = count_tokens(train_tokenized)
train_avg_tokens = train_total_tokens / len(train_tokenized)

eval_total_tokens = count_tokens(eval_tokenized)
eval_avg_tokens = eval_total_tokens / len(eval_tokenized)

print(f"\nTrain set size: {len(train_tokenized)} examples")
print(f"Train set total tokens: {train_total_tokens:,}")
print(f"Train set average tokens per example: {train_avg_tokens:.1f}")

print(f"\nEval set size: {len(eval_tokenized)} examples")
print(f"Eval set total tokens: {eval_total_tokens:,}")
print(f"Eval set average tokens per example: {eval_avg_tokens:.1f}")


Tokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9000/9000 [00:00<00:00, 17405.11 examples/s]
Tokenizing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 18189.76 examples/s]



Train set size: 9000 examples
Train set total tokens: 844,928
Train set average tokens per example: 93.9

Eval set size: 1000 examples
Eval set total tokens: 90,494
Eval set average tokens per example: 90.5


### Data Collator

In [6]:
# Create data collator for masked language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,              # Causal LM (GPT-style)
    pad_to_multiple_of=8    # Optimization for GPU
)

### Prepare Model

In [7]:
# Clear any existing models from memory
if 'model' in globals():
    del model
if 'trainer' in globals():
    del trainer

# Force garbage collection
gc.collect()
torch.cuda.empty_cache()

print("Cleared previous models from memory.")
print(f"Current allocated VRAM: {torch.cuda.memory_allocated() / (1024**3):.2f} GB")

Cleared previous models from memory.
Current allocated VRAM: 0.00 GB


In [8]:
# Define 4-bit quantization configuration
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",           
    bnb_4bit_use_double_quant=True,
)

# Load model with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

print(f"Model loaded!")
print(f"   - Device: {model.device}")
print(f"   - Dtype: {model.dtype}")
print(f"   - VRAM: {torch.cuda.memory_allocated(0) / (1024**3):.2f}GB")

Model loaded!
   - Device: cuda:0
   - Dtype: torch.float16
   - VRAM: 0.90GB


In [9]:
# Prepare model for k-bit training
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

print(f"Gradient setup done: {model.dtype} | VRAM: {torch.cuda.memory_allocated(0) / (1024**3):.2f}GB")

Gradient setup done: torch.float16 | VRAM: 0.90GB


In [10]:
# Configure LoRA
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"],
    lora_alpha=32,
    lora_dropout=0.05,   
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print(f"\nLoRA applied: {model.dtype} | VRAM: {torch.cuda.memory_allocated(0) / (1024**3):.2f}GB")


trainable params: 13,045,760 || all params: 1,012,931,712 || trainable%: 1.2879

LoRA applied: torch.float16 | VRAM: 0.95GB


### Training

In [11]:
# Set up training arguments and trainer
training_args = TrainingArguments(
    output_dir=f"./results/{model_name.split("/")[-1]}-animals-lora",
    
    # Training hyperparameters
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,   # Effective batch size = 4*4=16
    
    # Optimization
    learning_rate=2e-4,              # Higher LR works well with LoRA
    warmup_steps=100,
    lr_scheduler_type="cosine",

    # Evaluation
    eval_strategy="steps",              
    eval_steps=250,                     
    per_device_eval_batch_size=8, 
    
    # Logging & Saving
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,              # Keep only 2 checkpoints
    logging_dir="./logs",
    load_best_model_at_end=True,     # Load best checkpoint at end
    metric_for_best_model="loss",    # Use loss to evaluate best model
    
    # Performance
    fp16=True,                       # Mixed precision training
    optim="paged_adamw_8bit",        # Memory-efficient optimizer
    gradient_checkpointing=True,     # Saves memory
    
    # Other
    report_to="none",                # or "tensorboard"/"wandb" if you want
    dataloader_pin_memory=True,
    remove_unused_columns=False,
)

In [12]:
# --- 1. System/GPU Info Functions ---
def get_gpu_memory():
    # Keep the original function to get Used/Total VRAM via nvidia-smi
    try:
        result = subprocess.run(
            ['nvidia-smi', '--query-gpu=memory.used,memory.total', 
             '--format=csv,nounits,noheader'], 
            capture_output=True, text=True, check=True
        )
        used, total = map(int, result.stdout.strip().split(','))
        return used, total
    except subprocess.CalledProcessError:
        print("‚ö†Ô∏è Warning: 'nvidia-smi' command failed. Are you on an NVIDIA GPU machine?")
        return 0, 0
    except FileNotFoundError:
        print("‚ö†Ô∏è Warning: 'nvidia-smi' not found. Ensure NVIDIA drivers are installed.")
        return 0, 0

def get_cpu_memory():
    # Get total system memory (RAM) and usage
    ram = psutil.virtual_memory()
    # Convert bytes to GB and return
    return ram.total / (1024**3), ram.used / (1024**3)

# --- 2. Print Initial Configuration ---
print("--- üíª System & Environment Check ---")
print(f"OS: {platform.system()} {platform.release()}")
print(f"Python: {platform.python_version()}")

# CUDA/PyTorch checks
if torch.cuda.is_available():
    print(f"PyTorch CUDA: ‚úÖ Enabled (Device: {torch.cuda.get_device_name(0)})")
    print(f"CUDA Version: {torch.version.cuda}")
    cuda_total_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"Total CUDA VRAM: {cuda_total_memory:.2f} GB")
else:
    print("PyTorch CUDA: ‚ùå Not Enabled. Check installation/drivers.")

# --- 3. Model Loading & Verification (Your original code) ---
print("\n--- üß† Model Loading & Verification ---")
print(f"Model loaded on: {model.device}")
print(f"Model dtype: {model.dtype}")

# Try one forward pass
sample = tokenizer("O le√£o √© um animal", return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model(**sample)
print("‚úÖ Model works! Ready to train.")

# --- 4. Detailed Memory Check ---
print("\n--- üìä Memory Usage Report ---")

# GPU VRAM usage (from nvidia-smi)
gpu_used_smi, gpu_total_smi = get_gpu_memory()
gpu_free = gpu_total_smi - gpu_used_smi

print(f"GPU VRAM (Model + System): {gpu_used_smi}MB / {gpu_total_smi}MB (Free: {gpu_free}MB)")

# PyTorch Allocated/Cached Memory (More accurate for the current process)
if torch.cuda.is_available():
    allocated = torch.cuda.memory_allocated(0) / (1024**3)
    cached = torch.cuda.memory_reserved(0) / (1024**3)
    print(f"PyTorch VRAM Allocated: {allocated:.2f} GB")
    print(f"PyTorch VRAM Cached: {cached:.2f} GB")
    print(f"PyTorch Max VRAM Used: {torch.cuda.max_memory_allocated(0) / (1024**3):.2f} GB (Since beginning)")

# CPU RAM usage
ram_total_gb, ram_used_gb = get_cpu_memory()
print(f"System RAM (Used): {ram_used_gb:.2f} GB / {ram_total_gb:.2f} GB")

# --- 5. Text Generation Example (Your original code) ---
print("\n--- ‚úçÔ∏è Text Generation Test ---")
input_text = "A baleia azul √©"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_length=50,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True
    )
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Input: {input_text}")
print(f"Generated: {generated_text}")

--- üíª System & Environment Check ---
OS: Linux 6.6.87.2-microsoft-standard-WSL2
Python: 3.12.3
PyTorch CUDA: ‚úÖ Enabled (Device: NVIDIA GeForce RTX 4070 Laptop GPU)
CUDA Version: 12.6
Total CUDA VRAM: 8.00 GB

--- üß† Model Loading & Verification ---
Model loaded on: cuda:0
Model dtype: torch.float16
‚úÖ Model works! Ready to train.

--- üìä Memory Usage Report ---
GPU VRAM (Model + System): 1791MB / 8188MB (Free: 6397MB)
PyTorch VRAM Allocated: 0.96 GB
PyTorch VRAM Cached: 1.59 GB
PyTorch Max VRAM Used: 1.12 GB (Since beginning)
System RAM (Used): 8.22 GB / 15.37 GB

--- ‚úçÔ∏è Text Generation Test ---
Input: A baleia azul √©
Generated: A baleia azul √© uma das maiores e mais belas criaturas do oceano.

**Caracter√≠sticas:**

*   **Tamanho:** As baleias azuis podem variar de 1,5 a 3 metros de comprimento.
*


In [13]:
# Create Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=eval_tokenized, 
    data_collator=data_collator,
)
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
250,1.848,1.841163
500,1.8266,1.705029
750,1.6139,1.660208
1000,1.5711,1.619913
1250,1.3863,1.634028
1500,1.4349,1.624888


TrainOutput(global_step=1689, training_loss=1.671677661125585, metrics={'train_runtime': 16081.569, 'train_samples_per_second': 1.679, 'train_steps_per_second': 0.105, 'total_flos': 2.393596658029363e+16, 'train_loss': 1.671677661125585, 'epoch': 3.0})

## 4. Test

In [15]:
# 1. Base Model Path
BASE_MODEL_ID = "./models/gemma-3-1b-it"

# 2. Fine-Tuned (LoRA) Model Path
LORA_ADAPTER_PATH = "./results/gemma-3-1b-it-animals-lora/checkpoint-1689"

In [18]:
# Load the base model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Loading Base Model: {BASE_MODEL_ID}")
base_tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map=device,
)
base_model.eval()
print("Base Model loaded.")

# Load the base model again for the fine-tuned version
ft_tokenizer = AutoTokenizer.from_pretrained(LORA_ADAPTER_PATH) # Load the tokenizer from the checkpoint
ft_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID, # Use the base model as the foundation
    dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map=device,
)

# Load the LoRA adapter weights onto the base model
print(f"Attaching LoRA Adapter from: {LORA_ADAPTER_PATH}")
ft_model = PeftModel.from_pretrained(ft_model, LORA_ADAPTER_PATH)
ft_model.eval()
print("Fine-Tuned Model loaded.")

Loading Base Model: ./models/gemma-3-1b-it
Base Model loaded.
Attaching LoRA Adapter from: ./results/gemma-3-1b-it-animals-lora/checkpoint-1689
Fine-Tuned Model loaded.


In [31]:
from transformers import TextStreamer

def test_models(prompt):
    messages = [{"role": "user", "content": prompt}]
    ft_prompt = ft_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    base_prompt = base_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    generation_kwargs = {
        "max_new_tokens": 500,
        "do_sample": True,
        "temperature": 0.7,
        "top_k": 50,
        "pad_token_id": ft_tokenizer.eos_token_id
    }

    # ===== USER =====
    print("=" * 50)
    print("USER:")
    print("-" * 50)
    print(prompt)

    # ===== BASE MODEL =====
    print("=" * 50)
    print("BASE MODEL RESPONSE:")
    print("-" * 50)
    
    base_input_ids = base_tokenizer(base_prompt, return_tensors="pt").to(device)
    
    # Create streamer for real-time output
    base_streamer = TextStreamer(base_tokenizer, skip_prompt=True, skip_special_tokens=True)
    
    start_time = time.time()
    with torch.no_grad():
        base_output = base_model.generate(
            **base_input_ids, 
            **generation_kwargs,
            streamer=base_streamer
        )
    end_time = time.time()
    
    # Calculate metrics
    base_tokens_generated = len(base_output[0]) - len(base_input_ids['input_ids'][0])
    base_duration = end_time - start_time
    base_tokens_per_sec = base_tokens_generated / base_duration if base_duration > 0 else 0
    
    print(f"\nTokens: {base_tokens_generated} | Time: {base_duration:.2f}s | Speed: {base_tokens_per_sec:.2f} tok/s")

    # ===== FINE-TUNED MODEL =====
    print("\n" + "=" * 50)
    print("FINE-TUNED (LoRA) MODEL RESPONSE:")
    print("-" * 50)
    
    ft_input_ids = ft_tokenizer(ft_prompt, return_tensors="pt").to(device)
    
    # Create streamer for real-time output
    ft_streamer = TextStreamer(ft_tokenizer, skip_prompt=True, skip_special_tokens=True)
    
    start_time = time.time()
    with torch.no_grad():
        ft_output = ft_model.generate(
            **ft_input_ids, 
            **generation_kwargs,
            streamer=ft_streamer
        )
    end_time = time.time()
    
    # Calculate metrics
    ft_tokens_generated = len(ft_output[0]) - len(ft_input_ids['input_ids'][0])
    ft_duration = end_time - start_time
    ft_tokens_per_sec = ft_tokens_generated / ft_duration if ft_duration > 0 else 0
    
    print(f"\nTokens: {ft_tokens_generated} | Time: {ft_duration:.2f}s | Speed: {ft_tokens_per_sec:.2f} tok/s")
    
    # ===== COMPARISON =====
    print("\n" + "=" * 50)
    print("PERFORMANCE COMPARISON:")
    print(f"Base Model:       {base_tokens_per_sec:.2f} tok/s")
    print(f"Fine-Tuned Model: {ft_tokens_per_sec:.2f} tok/s")
    speedup = ((ft_tokens_per_sec - base_tokens_per_sec) / base_tokens_per_sec * 100) if base_tokens_per_sec > 0 else 0
    print(f"Difference:       {speedup:+.1f}%")
    print("=" * 50)

In [32]:
# Define your test prompt
# prompt = "A baleia azul √©"
# prompt = "Fale sobre o le√£o"
prompt = "O que s√£o canidae?"

test_models(prompt)

USER:
--------------------------------------------------
O que s√£o canidae?
BASE MODEL RESPONSE:
--------------------------------------------------
Canidae √© o nome da fam√≠lia de mam√≠feros carn√≠voros que inclui c√£es, lobos, raposas, cotes, e outros. Eles s√£o animais fascinantes e complexos, com uma rica hist√≥ria evolutiva e uma variedade impressionante de caracter√≠sticas. Aqui est√° um resumo detalhado sobre as canid√©s:

**1. Origem e Evolu√ß√£o:**

*   **Ancestrais:** Canid√©s surgiram h√° cerca de 40 a 60 milh√µes de anos, em uma regi√£o que hoje √© a √Åsia Central.
*   **Diversifica√ß√£o:** A fam√≠lia Canidae se diversificou em diferentes grupos ao longo de milh√µes de anos, adaptando-se a diferentes nichos ecol√≥gicos.
*   **Origem dos Canis:** O g√™nero *Canis* √© considerado o ancestral comum de todos os canid√©s modernos.

**2. Caracter√≠sticas Distintivas:**

*   **Dentes:** Possuem dentes especializados para raspar carne, que se encaixam em um formato √∫nico, ideal p

---
USER:
---
O que s√£o canidae?

---
BASE MODEL RESPONSE:

---
Canidae √© o nome da fam√≠lia de mam√≠feros carn√≠voros que inclui c√£es, lobos, raposas, cotes, e outros. Eles s√£o animais fascinantes e complexos, com uma rica hist√≥ria evolutiva e uma variedade impressionante de caracter√≠sticas. Aqui est√° um resumo detalhado sobre as canid√©s:

**1. Origem e Evolu√ß√£o:**

*   **Ancestrais:** Canid√©s surgiram h√° cerca de 40 a 60 milh√µes de anos, em uma regi√£o que hoje √© a √Åsia Central.
*   **Diversifica√ß√£o:** A fam√≠lia Canidae se diversificou em diferentes grupos ao longo de milh√µes de anos, adaptando-se a diferentes nichos ecol√≥gicos.
*   **Origem dos Canis:** O g√™nero *Canis* √© considerado o ancestral comum de todos os canid√©s modernos.

**2. Caracter√≠sticas Distintivas:**

*   **Dentes:** Possuem dentes especializados para raspar carne, que se encaixam em um formato √∫nico, ideal para a mordida em carne.
*   **Olfato:** S√£o incrivelmente sens√≠veis ao olfato, o que lhes permite detectar presas e outros animais √† dist√¢ncia. O olfato √© considerado um sentido prim√°rio na fam√≠lia Canidae.
*   **Estrutura Social:** A maioria dos canid√©s vive em grupos sociais complexos, com hierarquias sociais bem definidas.
*   **Comportamento:**  S√£o animais altamente inteligentes, com comportamentos complexos, incluindo comunica√ß√£o vocal, marca√ß√£o territorial e coopera√ß√£o.

**3. Subgrupos Principais:**

*   **C√£es (Canidae):**  A maior e mais conhecida subfam√≠lia, incluindo ra√ßas diversas com diferentes caracter√≠sticas f√≠sicas e comportamentais.
*   **Lobos (Canini):**  Com a cabe√ßa mais larga e uma longa cauda, geralmente mais herb√≠voros.
*   **Rapos (Canini):**  Carregam uma "cabe√ßa de raio" (um pequeno "l√¢mina" na parte de tr√°s da cabe√ßa) que usam para capturar presas.
*   **Cotes (Canidae):**  Possuem uma pelagem densa e um focinho longo, geralmente com uma "cabe√ßa de raio" (um pequeno "l√¢mina") na parte de tr√°s.

Tokens: 500 | Time: 54.21s | Speed: 9.22 tok/s

---
FINE-TUNED (LoRA) MODEL RESPONSE:

---
As can√≠deos s√£o um grupo de mam√≠feros pertencentes √† ordem Carnivora. Eles s√£o animais que chegam a medir at√© 1,8 metros de comprimento e podem ter at√© 2,4 kg de peso. S√£o um dos grupos maiores de mam√≠feros terrestres existentes e constituem a maior parte da fauna do planeta. O grupo abrange uma grande diversidade de esp√©cies, com adapta√ß√µes diversas para diferentes ambientes. Tamb√©m √© o grupo que inclui os cachorros, c√£es, le√µes, tigres, lobos, ursos, coiotes, raposas, gatos, coiotes e, recentemente, o urso polar, que √© a √∫nica esp√©cie do grupo que vive na Ant√°rtida. √â um grupo de animais que vivem em cardumes, al√©m de serem animais sociais, e muitas esp√©cies vivem em grupos familiares ou de matilha. Obtida de " https://pt.wikipedia.org/w/index.php?title=Canidae&oldid=46671420 " A maioria das esp√©cies de can√≠deos s√£o predadores. S√£o animais carn√≠voros, ou seja, se alimentam principalmente de carne de outros animais. Eles possuem dentes adaptados para rasgar e triturar ossos e carne, al√©m de uma l√≠ngua comprida e afiada para agarrar as presas. Eles tamb√©m possuem uma vis√£o agu√ßada, audi√ß√£o e olfato muito desenvolvidos. Apesar de serem animais carn√≠voros, algumas esp√©cies de can√≠deos s√£o on√≠voras ou herb√≠voras, sendo o urso pardo o exemplo mais not√≥rio. Eles tamb√©m s√£o conhecidos pelos seus comportamentos complexos, como a comunica√ß√£o atrav√©s de latidos, uivos, linguagem corporal e at√© mesmo a cria√ß√£o de sociedades. As rela√ß√µes entre as diferentes esp√©cies de can√≠deos ainda s√£o objeto de estudo e existem muitas esp√©cies ainda a serem descobertas. H√° tamb√©m a quest√£o do parentesco dos felinos, com muitos estudos indicando que eles descendem de um √∫nico ancestral comum que viveu no final do Jur√°ssico. Obtida de " https://pt.wikipedia.org/w/index.php?title=Canidae&oldid=46671420 " Classifica√ß√£o do grupo Os can√≠deos podem ser divididos em dois grupos principais: os can√≠deos primitivos e os can√≠deos modernos. Os can√≠deos

Tokens: 500 | Time: 148.70s | Speed: 3.36 tok/s

---
PERFORMANCE COMPARISON:
- Base Model:       9.22 tok/s
- Fine-Tuned Model: 3.36 tok/s
- Difference:       -63.5%

---

## Notebook topics ‚Äî demo.ipynb

- **Environment & libraries**
  - Listed and imported core Python libraries used across the notebook: numpy, pandas, psutil, torch, BeautifulSoup, datasets, transformers, peft, etc.
  - Performed a quick CUDA/PyTorch availability check and printed device/CUDA info.

- **Data import**
  - Read media_resource.tab (TSV) into a DataFrame.
  - Printed shape and columns to inspect the dataset.

- **Data cleaning**
  - Implemented `clean_html_wikipedia(text)` to:
    - Strip HTML using BeautifulSoup.
    - Remove "Refer√™ncias" and typical Wikipedia artifacts (edit links, bracketed references).
    - Normalize whitespace and punctuation.
  - Applied the cleaning function to create a `text` column.

- **Deduplication & column selection**
  - Dropped duplicate rows by `identifier`.
  - Selected and renamed relevant fields to a cleaned DataFrame (`id`, `title`, `text`, `url`).

- **Dataset preparation & saving**
  - Converted the cleaned DataFrame into a Hugging Face `Dataset`.
  - Saved the dataset to animal_wikipedia_pt_dataset (disk cache / arrow files present in the repo).

- **Model selection**
  - Set base model path: gemma-3-1b-it (local model files present).

- **Train / Eval split**
  - Loaded dataset from disk and created a train/test split (90/10).
  - Sampled up to 9k training examples and up to 1k evaluation examples.

- **Tokenization**
  - Loaded tokenizer from the base model and set `pad_token` to `eos_token`.
  - Tokenization function appends EOS, truncates to 512 tokens, and returns attention masks.
  - Tokenized train and eval splits and computed total / average token counts for each.

- **Data collator**
  - Created a `DataCollatorForLanguageModeling` for causal LM (mlm=False) with padding optimizations.

- **Model loading & quantization**
  - Defined a 4-bit quantization configuration (`BitsAndBytesConfig` with nf4, double quant).
  - Loaded the model with 4-bit weights using device auto-mapping.
  - Enabled gradient checkpointing and input-requires-grad for k-bit training.

- **LoRA setup**
  - Configured LoRA with `r=16`, target projection modules, alpha, dropout, and CAUSAL_LM task.
  - Applied LoRA (`get_peft_model`) and printed trainable parameter stats.

- **Training configuration**
  - Prepared `TrainingArguments` for LoRA training:
    - 3 epochs, effective batch size via gradient accumulation, learning rate, warmup, cosine scheduler.
    - Evaluation every 250 steps, checkpointing, logging, mixed precision (fp16), optimizer (paged_adamw_8bit), gradient checkpointing.
    - Output dir pattern: `./results/{model_name}-animals-lora`.
  - Created a `Trainer` and invoked `trainer.train()` (training cell present).

- **System & memory diagnostics**
  - Added helper functions to report GPU (via `nvidia-smi`) and system RAM usage (psutil).
  - Printed detailed PyTorch CUDA memory stats (allocated, cached, max) and a short forward/generation sanity test to confirm model behavior.

- **Saving / checkpoints**
  - Notebook uses Hugging Face Trainer defaults plus `save_steps` and `save_total_limit`; results are stored under results (several checkpoints and adapter files visible in the repo).

- **Loading & testing fine-tuned LoRA adapter**
  - Demonstrated loading the base model and reloading it to attach a LoRA adapter from a checkpoint (e.g., checkpoint-1689).
  - Showed how to apply the model-specific chat template and generate text from both base and fine-tuned models, printing only the assistant response.
  - Generation parameters used: sampling, temperature, top-k, and pad/eos handling.

- **Notable artifacts & files**
  - Cleaned dataset saved at animal_wikipedia_pt_dataset (arrow cache files present).
  - Model files in gemma-3-1b-it and gemma-2b.
  - Training outputs and LoRA checkpoints under results (including adapter_model.safetensors, tokenizer files, trainer_state, and checkpoint meta like `chat_template.jinja`).

- **Purpose summary (one-line)**
  - Prepare a Portuguese Wikipedia-derived animal text dataset, fine-tune a Gemma causal LM with LoRA on the dataset using 4-bit quantization and memory-efficient training, and validate generation from base vs. fine-tuned adapters.