<a href="https://colab.research.google.com/github/scadriano/lia1_2025_2/blob/main/Aula%2008%20-%20Modelo%20para%20Identificar%20Cats%20or%20Dogs/LIA_Cats%26Dogs_com_PyTorch_ViT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dogs & Cats com Vision Transformer (ViT) 🐶 🐱

Este projeto tem como objetivo implementar um classificador de cães e gatos usando um modelo pré-treinado do tipo Vision Transformer (ViT), com PyTorch e a biblioteca Hugging Face Transformers.

O dataset tem 1002 imagens para treino (501 dogs; 501 cats) e 200 imagens para teste (100 dogs; 100 cats), mais o duno!

## Preparação dos dados

In [None]:
# Install the required packages (transformers, datasets, evaluate) via pip
!pip install -q transformers datasets evaluate

In [None]:
# Download the training dataset archive (dataset_treino.zip) from GitHub
!wget -q https://github.com/scadriano/lia1_2025_2/raw/main/Aula%2008%20-%20Modelo%20para%20Identificar%20Cats%20or%20Dogs/dataset_treino.zip

In [None]:
# Download the test dataset archive (dataset_teste.zip) from GitHub
!wget -q https://github.com/scadriano/lia1_2025_2/raw/main/Aula%2008%20-%20Modelo%20para%20Identificar%20Cats%20or%20Dogs/dataset_teste.zip

In [None]:
# Unpack the training archive into the current working directory
!unzip -q dataset_treino.zip

In [None]:
# Unpack the test archive into the current working directory
!unzip -q dataset_teste.zip

In [None]:
# Importação de bibliotecas
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.datasets import ImageFolder
from transformers import ViTForImageClassification, ViTImageProcessor
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [None]:
# Pick the compute device: the CUDA GPU when one is available, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'✅ Usando dispositivo: {device}')

In [None]:
# Identifier of the pretrained checkpoint shared by the processor and the model
model_name = "google/vit-base-patch16-224"

# Image processor matching the checkpoint; it produces the `pixel_values` tensors
processor = ViTImageProcessor.from_pretrained(model_name)

# Classification model with a 2-label head (cat or dog).
# ignore_mismatched_sizes=True allows the final layers to differ in size from
# the ones stored in the checkpoint. The model is moved to `device` right away.
model = ViTForImageClassification.from_pretrained(
    model_name, ignore_mismatched_sizes=True, num_labels=2
).to(device)

In [None]:
# Custom dataset: an ImageFolder whose images are pre-processed for the ViT
class DogsAndCatsDataset(Dataset):
    """Dataset that yields ``(pixel_values, label)`` pairs ready for the ViT.

    Wraps an ``ImageFolder`` built from ``root_dir`` and runs every image
    through ``processor`` when it is accessed.
    """

    def __init__(self, root_dir: str, processor) -> None:
        """Index the images under ``root_dir`` and keep the processor.

        Args:
            root_dir: Directory handed to ``ImageFolder``.
            processor: Callable invoked as
                ``processor(images=image, return_tensors="pt")``; its result
                must expose a ``'pixel_values'`` entry.
        """
        self.dataset = ImageFolder(root_dir)
        self.processor = processor

    def __len__(self) -> int:
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx: int):
        """Return the processed image tensor and the label for sample ``idx``."""
        image, label = self.dataset[idx]
        # Pre-process the image for the ViT
        inputs = self.processor(images=image, return_tensors="pt")
        # Drop only the leading batch axis added by the processor. A bare
        # squeeze() would also collapse any other size-1 dimension.
        pixel_values = inputs['pixel_values'].squeeze(0)
        return pixel_values, label

In [None]:
# Split the dataset into training and validation subsets

# Training directory
train_dir = './dataset_treino'

# Build the full dataset
full_dataset = DogsAndCatsDataset(train_dir, processor)

# Split into training (80%) and validation (20%)
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
# A seeded generator keeps the split identical across re-runs; without it,
# re-executing this cell would move already-trained-on images into validation.
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

print(f'Total de imagens: {len(full_dataset)}')
print(f'Treinamento: {len(train_dataset)} | Validação: {len(val_dataset)}')

# DataLoaders: only the training loader shuffles
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

## Treinamento

In [None]:
# Training

# Training configuration
num_epochs = 10
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop: each epoch does one optimisation pass over train_loader
# followed by one gradient-free evaluation pass over val_loader.
for epoch in range(num_epochs):
    model.train()
    total_loss, correct, total = 0, 0, 0

    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}')
    for step, (images, labels) in enumerate(progress_bar, start=1):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits
        outputs = model(pixel_values=images, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, predicted = outputs.logits.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()

        # Running mean over the batches seen so far. Dividing by the total
        # number of batches would under-report the loss until the last step.
        progress_bar.set_postfix(
            loss=total_loss / step,
            accuracy=100.*correct/total
        )

    # Validation
    model.eval()
    val_correct, val_total, val_loss = 0, 0, 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(pixel_values=images, labels=labels)
            val_loss += outputs.loss.item()
            _, predicted = outputs.logits.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()

    # Epoch summary: losses are averaged per batch, accuracies per sample
    print(f"📊 Epoch {epoch+1}/{num_epochs}")
    print(f"  Treino: Loss {total_loss/len(train_loader):.4f} | Acc {100.*correct/total:.2f}%")
    print(f"  Val   : Loss {val_loss/len(val_loader):.4f} | Acc {100.*val_correct/val_total:.2f}%\n")


In [None]:
# Build the confusion matrix from predictions over the validation loader
all_preds, all_labels = [], []
model.eval()
with torch.no_grad():
    for images, labels in val_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(pixel_values=images, labels=labels)
        # Index of the highest logit is the predicted class
        _, predicted = torch.max(outputs.logits, 1)
        all_preds += list(predicted.cpu().numpy())
        all_labels += list(labels.cpu().numpy())

# Confusion matrix (0 cat; 1 dog): true labels first, predictions second
cm = confusion_matrix(all_labels, all_preds)
disp = ConfusionMatrixDisplay(cm, display_labels=['cat','dog'])
disp.plot(cmap='Blues')
plt.title("Matriz de Confusão")
plt.show()

## Deploy - inferência em nova imagem

In [None]:
from PIL import Image

# Load the test image
test_image_path = './dataset_teste/100.jpg'
# Convert to 3-channel RGB so grayscale/RGBA/palette files reach the model in
# the same format as the training images (ImageFolder's default loader also
# converts to RGB). The with-block closes the file once the pixels are decoded.
with Image.open(test_image_path) as raw_image:
    image = raw_image.convert('RGB')

# Pre-process into model inputs and move them to the model's device
inputs = processor(images=image, return_tensors="pt").to(device)

# Inference (no gradients needed)
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_class = torch.argmax(probabilities, dim=-1).item()

# Index-to-name mapping; assumes class 0 is cat and 1 is dog, as in the
# confusion-matrix cell — verify against the training folder names
class_names = ['cat', 'dog']
predicted_label = class_names[predicted_class]
confidence = probabilities[0][predicted_class].item()

print(f"🐾 Classe predita: {predicted_label} | Confiança: {confidence:.2%}")
# Trailing expression so the notebook renders the image below the cell
image