In [None]:
!pip install kaggle

In [None]:
import kagglehub

# Fetch the most recent version of the dataset and remember where it was stored
DATASET_HANDLE = "andrewmvd/dog-and-cat-detection"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

In [None]:
import os
import random
import xml.etree.ElementTree as ET

import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from torchvision.models.resnet import ResNet18_Weights

In [None]:
class MyDataset(Dataset):
    """Single-object detection dataset read from images plus per-image XML annotations.

    Every file in ``image_dir`` is paired with ``<stem>.xml`` in
    ``annotations_dir``. Only images whose annotation contains exactly one
    ``<object>`` element are kept.

    Each sample is ``(image, bbox, label)`` where ``bbox`` is a float32 tensor
    ``[xmin, ymin, xmax, ymax]`` normalised by the image size stored in the
    annotation, and ``label`` is a float32 scalar: 0 for "cat", 1 for "dog",
    -1 for any other name.
    """

    def __init__(self, annotations_dir, image_dir, transform=None):
        """Store the directories/transform and index the usable image files.

        Args:
            annotations_dir: Directory holding the ``.xml`` annotation files.
            image_dir: Directory holding the image files.
            transform: Optional callable applied to the PIL image in ``__getitem__``.
        """
        self.annotations_dir = annotations_dir
        self.image_dir = image_dir
        self.transform = transform
        self.image_files = self.filter_images_with_multiple_objects()

    def _annotation_path(self, img_name):
        """Return the path of the XML annotation that belongs to ``img_name``."""
        annotation_name = os.path.splitext(img_name)[0] + ".xml"
        return os.path.join(self.annotations_dir, annotation_name)

    def filter_images_with_multiple_objects(self):
        """Return the names of images whose annotation contains exactly one object.

        Images with several objects, no objects, or no annotation file are dropped.
        """
        valid_image_files = []
        # os.listdir returns entries in arbitrary order; sorting keeps sample
        # indices (and therefore any seeded train/val split) reproducible.
        for img_name in sorted(os.listdir(self.image_dir)):
            if not os.path.isfile(os.path.join(self.image_dir, img_name)):
                continue
            if self.count_objects_in_annotation(self._annotation_path(img_name)) == 1:
                valid_image_files.append(img_name)
        return valid_image_files

    def count_objects_in_annotation(self, annotation_path):
        """Return the number of ``<object>`` elements, or 0 if the file is missing."""
        try:
            root = ET.parse(annotation_path).getroot()
        except FileNotFoundError:
            return 0
        return len(root.findall("object"))

    def parse_annotation(self, annotation_path):
        """Read the first object of an annotation file.

        Returns:
            ``(label, bbox)``: ``label`` is the object's name and ``bbox`` is
            ``[xmin, ymin, xmax, ymax]`` divided by the annotated image
            width/height. Both are ``None`` when the file has no object.
        """
        root = ET.parse(annotation_path).getroot()

        # Image size is needed to normalise the coordinates
        image_width = int(root.find("size/width").text)
        image_height = int(root.find("size/height").text)

        # Only the first object is used
        obj = root.find("object")
        if obj is None:
            return None, None

        label = obj.find("name").text
        xmin = int(obj.find("bndbox/xmin").text)
        ymin = int(obj.find("bndbox/ymin").text)
        xmax = int(obj.find("bndbox/xmax").text)
        ymax = int(obj.find("bndbox/ymax").text)

        bbox = [
            xmin / image_width,
            ymin / image_height,
            xmax / image_width,
            ymax / image_height,
        ]
        return label, bbox

    def __len__(self):
        """Return the number of usable images."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load sample ``idx`` as ``(image, bbox_tensor, label_tensor)``."""
        img_file = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_file)

        img = Image.open(img_path).convert("RGB")
        label, bbox = self.parse_annotation(self._annotation_path(img_file))

        if self.transform:
            img = self.transform(img)

        # Encode the label as a number (0 for cat, 1 for dog, -1 otherwise)
        label_num = 0 if label == "cat" else 1 if label == "dog" else -1

        # Convert bbox and label to tensors
        bbox_tensor = torch.tensor(bbox, dtype=torch.float32)
        label_tensor = torch.tensor(label_num, dtype=torch.float32)

        return img, bbox_tensor, label_tensor

    def merge_images(self):
        """Paste two randomly chosen images side by side and merge their annotations.

        The second image is placed to the right of the first, both aligned to
        the top edge.

        Returns:
            ``(merged_image, merged_annotations)`` where ``merged_annotations``
            is a list of two dicts ``{"bbox": [xmin, ymin, xmax, ymax],
            "label": str}`` with coordinates normalised to the merged image.
        """
        samples = []
        for _ in range(2):
            idx = random.randint(0, len(self.image_files) - 1)
            img_file = self.image_files[idx]
            img = Image.open(os.path.join(self.image_dir, img_file)).convert("RGB")
            # PIL images carry no annotations; they must be read from the XML file.
            label, bbox = self.parse_annotation(self._annotation_path(img_file))
            samples.append((img, label, bbox))

        (img1, label1, bbox1), (img2, label2, bbox2) = samples

        # Build the side-by-side canvas
        merged_width = img1.width + img2.width
        merged_height = max(img1.height, img2.height)
        merged_image = Image.new("RGB", (merged_width, merged_height))
        merged_image.paste(img1, (0, 0))
        merged_image.paste(img2, (img1.width, 0))

        def rescale(bbox, img, x_offset):
            # bbox is normalised to its own image: convert to pixels, shift
            # horizontally, then normalise by the merged canvas size.
            xmin, ymin, xmax, ymax = bbox
            return [
                (xmin * img.width + x_offset) / merged_width,
                ymin * img.height / merged_height,
                (xmax * img.width + x_offset) / merged_width,
                ymax * img.height / merged_height,
            ]

        merged_annotations = [
            {"bbox": rescale(bbox1, img1, 0), "label": label1},
            {"bbox": rescale(bbox2, img2, img1.width), "label": label2},
        ]

        return merged_image, merged_annotations

In [None]:
# Data directories. Prefer a local ./data folder; otherwise fall back to the
# kagglehub download location from the earlier cell.
# NOTE(review): assumes the downloaded dataset has the same annotations/ and
# images/ sub-folders - confirm against the contents of `path`.
DATA_ROOT = 'data' if os.path.isdir('data') else path
annotations_dir = os.path.join(DATA_ROOT, 'annotations')
image_dir = os.path.join(DATA_ROOT, 'images')

# Define the preprocessing transform
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Build the dataset and the dataloaders.
# Split the sample *indices* rather than the dataset itself: passing the dataset
# to train_test_split would load and decode every image into memory up front.
dataset = MyDataset(annotations_dir, image_dir, transform=transform)
train_indices, val_indices = train_test_split(
    list(range(len(dataset))), test_size=0.2, random_state=42
)
train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [None]:
class SimpleYOLO(nn.Module):
    """ResNet-50 feature extractor followed by one linear YOLO-style head.

    The head emits, for every cell of a ``grid_size x grid_size`` grid,
    4 bounding-box values followed by ``num_classes`` class scores, flattened
    into one vector per image.
    """

    def __init__(self, num_classes, grid_size=2):
        """Build the backbone and the prediction head.

        Args:
            num_classes: Number of object classes scored per grid cell.
            grid_size: Number of grid cells along each image side.
        """
        super(SimpleYOLO, self).__init__()
        self.num_classes = num_classes
        self.grid_size = grid_size

        # Drop ResNet's final pooling and classification layers
        resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])

        # YOLO head: one (4 box values + class scores) vector per grid cell.
        # The size must match the (grid, grid, 4 + num_classes) reshape applied
        # to the model output by the loss and inference code.
        self.fcs = nn.Linear(2048, grid_size * grid_size * (4 + num_classes))

    def forward(self, x):
        """Map images ``(batch, C, H, W)`` to ``(batch, grid*grid*(4+num_classes))``."""
        features = self.backbone(x)
        features = torch.nn.functional.adaptive_avg_pool2d(features, (1, 1))  # (batch_size, 2048, 1, 1)
        features = features.view(features.size(0), -1)  # (batch_size, 2048)
        return self.fcs(features)

In [None]:
# Initialise the device, class mapping, model and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Must match the label encoding used by MyDataset.__getitem__ (cat -> 0, dog -> 1);
# otherwise inference prints the wrong class name for each prediction.
class_to_idx = {'cat': 0, 'dog': 1}
num_classes = len(class_to_idx)  # two classes: cat and dog

model = SimpleYOLO(num_classes=num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
def calculate_loss(output, targets, device, num_classes):
    """Compute the detection loss on a 2x2 grid, averaged over the batch.

    For every target object the grid cell containing the box centre is
    "responsible": it receives a cross-entropy loss on its class scores and an
    MSE loss on its 4 box values. The box values of every other cell are pushed
    towards zero with an MSE "no object" loss.

    Args:
        output: Model output of shape ``(batch, 2 * 2 * (4 + num_classes))``.
        targets: Indexable as ``targets[i][j]`` giving a 5-element tensor
            ``[xmin, ymin, xmax, ymax, class_index]`` for object ``j`` of image
            ``i``, with coordinates normalised to ``[0, 1]``.
        device: Device on which the loss terms are computed.
        num_classes: Number of class scores per grid cell.

    Returns:
        Scalar tensor: the summed per-object losses divided by the batch size.
    """
    mse_loss = nn.MSELoss()
    ce_loss = nn.CrossEntropyLoss()

    grid_size = 2
    batch_size = output.shape[0]
    # Tensor accumulator so the result supports .item()/.backward() uniformly
    total_loss = output.new_zeros(())

    # Reshape output to (batch_size, grid_y, grid_x, 4 + num_classes)
    output = output.view(batch_size, grid_size, grid_size, 4 + num_classes)
    no_object_target = torch.zeros(4, device=device)

    for i in range(batch_size):  # Iterate through each image in the batch
        for j in range(len(targets[i])):  # Iterate through objects in the image
            target = targets[i][j]
            bbox_center_x = (target[0] + target[2]) / 2
            bbox_center_y = (target[1] + target[3]) / 2
            # Scale the normalised centre *before* truncating; clamp so that a
            # centre of exactly 1.0 (or a slightly out-of-range box) stays in the grid.
            grid_x = min(max(int(bbox_center_x * grid_size), 0), grid_size - 1)
            grid_y = min(max(int(bbox_center_y * grid_size), 0), grid_size - 1)

            responsible_cell = output[i, grid_y, grid_x]

            # Classification loss for the responsible grid cell
            class_target = torch.tensor([int(target[4])], dtype=torch.long, device=device)
            classification_loss = ce_loss(responsible_cell[4:].unsqueeze(0), class_target)

            # Regression loss for the responsible grid cell
            bbox_target = target[:4].to(device)
            regression_loss = mse_loss(responsible_cell[:4], bbox_target)

            # No Object Loss for other grid cells
            no_obj_loss = output.new_zeros(())
            for other_grid_y in range(grid_size):
                for other_grid_x in range(grid_size):
                    if other_grid_y != grid_y or other_grid_x != grid_x:
                        no_obj_loss = no_obj_loss + mse_loss(
                            output[i, other_grid_y, other_grid_x, :4], no_object_target
                        )

            total_loss = total_loss + classification_loss + regression_loss + no_obj_loss

    return total_loss / max(batch_size, 1)  # Average loss over the batch

In [None]:
def evaluate_model(model, data_loader, device, num_classes):
    """Compute the mean loss and classification accuracy of ``model`` on a loader.

    A prediction counts as correct when the class with the highest score in the
    grid cell containing the ground-truth box centre equals the target class.

    Args:
        model: Network whose output has shape ``(batch, 2 * 2 * (4 + num_classes))``.
        data_loader: Yields ``(images, bboxes, labels)`` batches as produced by
            ``MyDataset`` (bboxes ``(batch, 4)`` normalised, labels ``(batch,)``).
        device: Device on which to run the evaluation.
        num_classes: Number of class scores per grid cell.

    Returns:
        ``(val_loss, val_accuracy)``: loss averaged over batches and the
        fraction of correctly classified samples.
    """
    model.eval()
    grid_size = 2  # must match the grid used by calculate_loss
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, bboxes, labels in data_loader:
            images = images.to(device)
            bboxes = bboxes.to(device)
            labels = labels.to(device)
            # calculate_loss expects targets[i][j] = [xmin, ymin, xmax, ymax, class];
            # every image has exactly one object -> shape (batch, 1, 5).
            targets = torch.cat(
                [bboxes, labels.unsqueeze(1).to(bboxes.dtype)], dim=1
            ).unsqueeze(1)

            output = model(images)

            total_loss = calculate_loss(output, targets, device, num_classes)
            running_loss += total_loss.item()

            # Class prediction of the cell responsible for each ground-truth box
            batch_size = images.shape[0]
            grid = output.view(batch_size, grid_size, grid_size, 4 + num_classes)
            centers = (bboxes[:, :2] + bboxes[:, 2:]) / 2  # (batch, [x, y])
            cells = (centers * grid_size).long().clamp(0, grid_size - 1)
            rows = torch.arange(batch_size, device=grid.device)
            predictions = grid[rows, cells[:, 1], cells[:, 0], 4:].argmax(dim=1)

            correct += (predictions == labels.long()).sum().item()
            total += batch_size

    # Guard against an empty loader
    val_loss = running_loss / max(len(data_loader), 1)
    val_accuracy = correct / max(total, 1)

    return val_loss, val_accuracy

In [None]:
def train_model(model, train_loader, val_loader, optimizer, num_epochs, device, num_classes):
    """Train ``model`` and keep the checkpoint with the best validation accuracy.

    After every epoch the model is evaluated on ``val_loader``; whenever the
    validation accuracy improves, the weights are saved to ``best_model.pth``.

    Args:
        model: Network whose output has shape ``(batch, 2 * 2 * (4 + num_classes))``.
        train_loader: Yields ``(images, bboxes, labels)`` training batches as
            produced by ``MyDataset``.
        val_loader: Loader of the same form used for validation.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device on which to train.
        num_classes: Number of class scores per grid cell.

    Returns:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)``, each a
        list with one entry per epoch.
    """
    grid_size = 2  # must match the grid used by calculate_loss
    # Start below any reachable accuracy so the first epoch always writes a checkpoint
    best_val_accuracy = float("-inf")
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        seen = 0

        for images, bboxes, labels in train_loader:
            images = images.to(device)
            bboxes = bboxes.to(device)
            labels = labels.to(device)
            # calculate_loss expects targets[i][j] = [xmin, ymin, xmax, ymax, class];
            # every image has exactly one object -> shape (batch, 1, 5).
            targets = torch.cat(
                [bboxes, labels.unsqueeze(1).to(bboxes.dtype)], dim=1
            ).unsqueeze(1)

            optimizer.zero_grad()
            output = model(images)

            total_loss = calculate_loss(output, targets, device, num_classes)
            total_loss.backward()
            optimizer.step()

            running_loss += total_loss.item()

            # Running training accuracy from the cell responsible for each box
            with torch.no_grad():
                batch_size = images.shape[0]
                grid = output.view(batch_size, grid_size, grid_size, 4 + num_classes)
                centers = (bboxes[:, :2] + bboxes[:, 2:]) / 2  # (batch, [x, y])
                cells = (centers * grid_size).long().clamp(0, grid_size - 1)
                rows = torch.arange(batch_size, device=grid.device)
                predictions = grid[rows, cells[:, 1], cells[:, 0], 4:].argmax(dim=1)
                correct += (predictions == labels.long()).sum().item()
                seen += batch_size

        epoch_loss = running_loss / max(len(train_loader), 1)
        train_losses.append(epoch_loss)
        train_accuracies.append(correct / max(seen, 1))

        # Validation
        val_loss, val_accuracy = evaluate_model(model, val_loader, device, num_classes)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        # Save the best model
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), "best_model.pth")

    return train_losses, val_losses, train_accuracies, val_accuracies

In [None]:
def inference(model, image_path, transform, device, class_to_idx, threshold=0.5):
    """Run the detector on one image and plot the predicted boxes.

    The image is resized to 448x448 for display, passed through ``transform``
    and the model, and the output is read as a 2x2 grid of
    ``[xmin, ymin, xmax, ymax, class scores...]`` cells. Box values are treated
    as fractions of the whole image, which is how the training targets are
    normalised.

    Args:
        model: Trained network with output size ``2 * 2 * (4 + len(class_to_idx))``.
        image_path: Path of the image to analyse.
        transform: Same preprocessing transform used for training.
        device: Device on which to run the model.
        class_to_idx: Mapping from class name to class index.
        threshold: Minimum softmax confidence required to draw a box.
    """
    model.eval()
    grid_size = 2
    # Look class names up by index value rather than by dict insertion order
    idx_to_class = {idx: name for name, idx in class_to_idx.items()}

    image = Image.open(image_path).convert("RGB")

    # Resize the image for display
    resized_image = image.resize((448, 448))
    resized_width, resized_height = resized_image.size

    # Apply the same transforms as during training
    transformed_image = transform(resized_image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Reshape to the 2x2 grid
        output = model(transformed_image).view(1, grid_size, grid_size, 4 + len(class_to_idx))

    fig, ax = plt.subplots(1)
    ax.axis("off")
    ax.imshow(resized_image)  # Show the resized image

    for grid_y in range(grid_size):
        for grid_x in range(grid_size):
            # Predicted class and bounding box of the current grid cell
            class_scores = torch.softmax(output[0, grid_y, grid_x, 4:], dim=0)
            class_pred = class_scores.argmax().item()
            confidence = class_scores[class_pred].item()
            bbox = output[0, grid_y, grid_x, :4].tolist()

            if confidence <= threshold:
                continue

            # Box values are image-normalised: scale by the displayed image
            # size (no per-cell offset) and clip to the image.
            x_min = min(max(bbox[0] * resized_width, 0), resized_width)
            y_min = min(max(bbox[1] * resized_height, 0), resized_height)
            x_max = min(max(bbox[2] * resized_width, 0), resized_width)
            y_max = min(max(bbox[3] * resized_height, 0), resized_height)

            # Cells without an object are trained towards all-zero boxes;
            # skip boxes that collapse to less than one pixel.
            if x_max - x_min < 1 or y_max - y_min < 1:
                continue

            # Draw the bounding box and its label
            rect = patches.Rectangle(
                (x_min, y_min),
                x_max - x_min,
                y_max - y_min,
                linewidth=1,
                edgecolor="r",
                facecolor="none"
            )
            ax.add_patch(rect)
            ax.text(
                x_min,
                y_min,
                f"{idx_to_class.get(class_pred, class_pred)}: {confidence:.2f}",
                color="white",
                fontsize=12,
                bbox=dict(facecolor="red", alpha=0.5)
            )

    plt.show()

BEST_MODEL_PATH = "best_model.pth"  # file written by train_model
NUM_EPOCHS = 10  # only used when no checkpoint exists yet

# Train first when no checkpoint is available, so the notebook also runs on a fresh kernel
if not os.path.exists(BEST_MODEL_PATH):
    train_model(model, train_loader, val_loader, optimizer, NUM_EPOCHS, device, num_classes)

# Load the best model; map_location lets a checkpoint saved on GPU load on a CPU-only machine
model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))

# Path of the image to check; fall back to the first dataset image when the
# personal sample file is not present on this machine
image_path = "/mnt/c/Study/DD Project/good_1.jpg"
if not os.path.isfile(image_path):
    image_path = os.path.join(image_dir, dataset.image_files[0])
inference(model, image_path, transform, device, class_to_idx, threshold=0.5)