<a href="https://colab.research.google.com/github/ykitaguchi77/CorneAI/blob/main/yolov5_gradCAM_corneAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**YOLOv5_GradCAM_CorneAI**

https://github.com/pooya-mohammadi/yolov5-gradcam

##**Setup YOLOv5**

In [1]:
!pip uninstall deep_utils -y
!pip install -U git+https://github.com/pooya-mohammadi/deep_utils.git
!pip install torch
!pip install torchvision
!pip install -U opencv-python
print("[INFO] To use new installed version of opencv, the session should be restarted!!!!")

!git clone https://github.com/pooya-mohammadi/yolov5-gradcam

[0mCollecting git+https://github.com/pooya-mohammadi/deep_utils.git
  Cloning https://github.com/pooya-mohammadi/deep_utils.git to /tmp/pip-req-build-7f00sh4q
  Running command git clone --filter=blob:none --quiet https://github.com/pooya-mohammadi/deep_utils.git /tmp/pip-req-build-7f00sh4q
  Resolved https://github.com/pooya-mohammadi/deep_utils.git to commit 676177f45cab804253103a03cdbc8133f8580ed6
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: deep-utils
  Building wheel for deep-utils (setup.py) ... [?25l[?25hdone
  Created wheel for deep-utils: filename=deep_utils-1.3.30-py3-none-any.whl size=534410 sha256=f61d187521263c51f977e24cd31e8e5f39d2735eff87258132eb02d7de00316e
  Stored in directory: /tmp/pip-ephem-wheel-cache-a13c_wyv/wheels/8f/0a/f4/5e2b92d9573699e3e30ce319a4b06218eb281695935d0b8b54
Successfully built deep-utils
Installing collected packages: deep-utils
Successfully installed deep-utils-1.3.30
Collecting nvidia-cuda-nvrtc

In [2]:
import os
os.chdir('/content/yolov5-gradcam')

model_path = "/gdrive/MyDrive/Deep_learning/CorneAI_nagoya/yolo5_forcresco/weights/eye_nii_2202_onecaseoneimage2_doctorcompare_yolov5s_epoch200_batch16_89.8p/last.pt"
img_path = "/gdrive/MyDrive/研究/進行中の研究/角膜スマートフォンAIプロジェクト/前原の240問/フォトスリット_serial/3.jpg"

In [3]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
!python main.py --model-path $model_path --img-path $img_path --output-dir out

#**単一画像解析**

In [14]:
import time
import torch
import torch.nn.functional as F


def find_yolo_layer(model, layer_name):
    """Find yolov5 layer to calculate GradCAM and GradCAM++

    Args:
        model: yolov5 model.
        layer_name (str): the name of layer with its hierarchical information.

    Return:
        target_layer: found layer
    """
    hierarchy = layer_name.split('_')
    target_layer = model.model._modules[hierarchy[0]]

    for h in hierarchy[1:]:
        target_layer = target_layer._modules[h]
    return target_layer


class YOLOV5GradCAM:

    def __init__(self, model, layer_name, img_size=(640, 640)):
        self.model = model
        self.gradients = dict()
        self.activations = dict()

        def backward_hook(module, grad_input, grad_output):
            self.gradients['value'] = grad_output[0]
            return None

        def forward_hook(module, input, output):
            self.activations['value'] = output
            return None

        target_layer = find_yolo_layer(self.model, layer_name)
        target_layer.register_forward_hook(forward_hook)
        target_layer.register_backward_hook(backward_hook)

        device = 'cuda' if next(self.model.model.parameters()).is_cuda else 'cpu'
        self.model(torch.zeros(1, 3, *img_size, device=device))
        print('[INFO] saliency_map size :', self.activations['value'].shape[2:])

    def forward(self, input_img, class_idx=True):
        """
        Args:
            input_img: input image with shape of (1, 3, H, W)
        Return:
            mask: saliency map of the same spatial dimension with input
            logit: model output
            preds: The object predictions
        """
        saliency_maps = []
        b, c, h, w = input_img.size()
        tic = time.time()
        preds, logits = self.model(input_img)
        print("[INFO] model-forward took: ", round(time.time() - tic, 4), 'seconds')
        #for logit, cls, cls_name in zip(logits[0], preds[1][0], preds[2][0]):
        for cls, cls_name in zip(preds[1][0], preds[2][0]):
            print(cls_name)
            if class_idx:
                score = logits[0][0][cls]
            else:
                score = logits[0][0].max()
            self.model.zero_grad()
            tic = time.time()
            score.backward(retain_graph=True)
            print(f"[INFO] {cls_name}, model-backward took: ", round(time.time() - tic, 4), 'seconds')
            gradients = self.gradients['value']
            activations = self.activations['value']
            b, k, u, v = gradients.size()
            alpha = gradients.view(b, k, -1).mean(2)
            weights = alpha.view(b, k, 1, 1)
            saliency_map = (weights * activations).sum(1, keepdim=True)
            saliency_map = F.relu(saliency_map)
            saliency_map = F.upsample(saliency_map, size=(h, w), mode='bilinear', align_corners=False)
            saliency_map_min, saliency_map_max = saliency_map.min(), saliency_map.max()
            saliency_map = (saliency_map - saliency_map_min).div(saliency_map_max - saliency_map_min).data
            saliency_maps.append(saliency_map)
        return saliency_maps, logits, preds

    def __call__(self, input_img):
        return self.forward(input_img)

In [16]:
import os
import time
import numpy as np
# from models.gradcam import YOLOV5GradCAM
from models.yolo_v5_object_detector import YOLOV5TorchObjectDetector
import cv2
from deep_utils import Box, split_extension

# パラメータ
model_path = "/gdrive/MyDrive/Deep_learning/CorneAI_nagoya/yolo5_forcresco/weights/eye_nii_2202_onecaseoneimage2_doctorcompare_yolov5s_epoch200_batch16_89.8p/last.pt"
img_path = "/gdrive/MyDrive/研究/進行中の研究/角膜スマートフォンAIプロジェクト/前原の240問/フォトスリット_serial/32.jpg"
output_dir = 'out'  # 出力ディレクトリ
img_size = 640  # 入力画像サイズ
target_layer = 'model_23_cv3_act'  # GradCAM を適用するレイヤー
method = 'gradcam'  # 'gradcam' または 'gradcampp'
device = 'cpu'  # 'cuda' または 'cpu'

# クラス名リストを指定
names = ["infection","normal","non-infection","scar","tumor","deposit","APAC","lens opacity","bullous"]

os.chdir('/content/yolov5-gradcam')

def get_res_img(bbox, mask, res_img):
    mask = mask.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy().astype(
        np.uint8)
    heatmap = cv2.applyColorMap(mask, cv2.COLORMAP_JET)
    n_heatmat = (Box.fill_outer_box(heatmap, bbox) / 255).astype(np.float32)
    res_img = res_img / 255
    res_img = cv2.add(res_img, n_heatmat)
    res_img = (res_img / res_img.max())
    return res_img, n_heatmat


def put_text_box(bbox, cls_name, res_img):
    x1, y1, x2, y2 = bbox
    # this is a bug in cv2. It does not put box on a converted image from torch unless it's buffered and read again!
    cv2.imwrite('temp.jpg', (res_img * 255).astype(np.uint8))
    res_img = cv2.imread('temp.jpg')
    res_img = Box.put_box(res_img, bbox)
    res_img = Box.put_text(res_img, cls_name, (x1, y1))
    return res_img


def concat_images(images):
    w, h = images[0].shape[:2]
    width = w
    height = h * len(images)
    base_img = np.zeros((width, height, 3), dtype=np.uint8)
    for i, img in enumerate(images):
        base_img[:, h * i:h * (i + 1), ...] = img
    return base_img


def main(img_path):
    input_size = (img_size, img_size)
    img = cv2.imread(img_path)
    print('[INFO] Loading the model')
    model = YOLOV5TorchObjectDetector(model_path, device, img_size=input_size,
                                      names=names)
    torch_img = model.preprocessing(img[..., ::-1])
    if method == 'gradcam':
        saliency_method = YOLOV5GradCAM(model=model, layer_name=target_layer, img_size=input_size)
    tic = time.time()
    masks, logits, [boxes, _, class_names, _] = saliency_method(torch_img)
    print("total time:", round(time.time() - tic, 4))
    result = torch_img.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy()
    result = result[..., ::-1]  # convert to bgr
    images = [result]
    for i, mask in enumerate(masks):
        res_img = result.copy()
        bbox, cls_name = boxes[0][i], class_names[0][i]
        res_img, heat_map = get_res_img(bbox, mask, res_img)
        res_img = put_text_box(bbox, cls_name, res_img)
        images.append(res_img)
    final_image = concat_images(images)
    img_name = split_extension(os.path.split(img_path)[-1], suffix='-res')
    output_path = f'{output_dir}/{img_name}'
    os.makedirs(output_dir, exist_ok=True)
    print(f'[INFO] Saving the final image at {output_path}')
    cv2.imwrite(output_path, final_image)


def folder_main(folder_path):
    input_size = (img_size, img_size)
    print('[INFO] Loading the model')
    model = YOLOV5TorchObjectDetector(model_path, device, img_size=input_size,
                                      classes=classes)
    for item in os.listdir(folder_path):
        img_path = os.path.join(folder_path, item)
        img = cv2.imread(img_path)
        torch_img = model.preprocessing(img[..., ::-1])
        if method == 'gradcam':
            saliency_method = YOLOV5GradCAM(model=model, layer_name=target_layer, img_size=input_size)
        tic = time.time()
        masks, logits, [boxes, _, class_names, _] = saliency_method(torch_img)
        print("total time:", round(time.time() - tic, 4))
        result = torch_img.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy()
        result = result[..., ::-1]  # convert to bgr
        images = [result]
        for i, mask in enumerate(masks):
            res_img = result.copy()
            bbox, cls_name = boxes[0][i], class_names[0][i]
            res_img, heat_map = get_res_img(bbox, mask, res_img)
            res_img = put_text_box(bbox, cls_name, res_img)
            images.append(res_img)
        final_image = concat_images(images)
        img_name = split_extension(os.path.split(img_path)[-1], suffix='-res')
        output_path = f'{output_dir}/{img_name}'
        os.makedirs(output_dir, exist_ok=True)
        print(f'[INFO] Saving the final image at {output_path}')
        cv2.imwrite(output_path, final_image)


if __name__ == '__main__':
    if os.path.isdir(img_path):
        folder_main(img_path)
    else:
        main(img_path)

INFO:models.yolo:Fusing layers... 
Fusing layers... 


[INFO] Loading the model


INFO:utils.torch_utils:Model Summary: 213 layers, 7034398 parameters, 0 gradients
Model Summary: 213 layers, 7034398 parameters, 0 gradients


[INFO] Model is loaded




[INFO] saliency_map size : torch.Size([20, 20])
[INFO] model-forward took:  0.1295 seconds
normal
[INFO] normal, model-backward took:  0.1786 seconds
total time: 0.3135
[INFO] Saving the final image at out/32-res.jpg




#**Top3 analysis**

In [8]:
import time
import torch
import torch.nn.functional as F

def get_res_img(bbox, masks, res_img):
    for mask in masks:
        mask = mask.squeeze().mul(255).add_(0.5).clamp_(0, 255).detach().cpu().numpy().astype(np.uint8)
        heatmap = cv2.applyColorMap(mask, cv2.COLORMAP_JET)
        n_heatmat = (Box.fill_outer_box(heatmap, bbox) / 255).astype(np.float32)
        res_img = res_img / 255
        res_img = cv2.add(res_img, n_heatmat)
        res_img = (res_img / res_img.max())
    return res_img, n_heatmat


# def put_text_box(bbox, cls_name, res_img):
#     x1, y1, x2, y2 = bbox
#     # this is a bug in cv2. It does not put box on a converted image from torch unless it's buffered and read again!
#     cv2.imwrite('temp.jpg', (res_img * 255).astype(np.uint8))
#     res_img = cv2.imread('temp.jpg')
#     res_img = Box.put_box(res_img, bbox)
#     res_img = Box.put_text(res_img, cls_name, (x1, y1))
#     return res_img

def put_text_box(bbox, cls_name, res_img):
    x1, y1, x2, y2 = bbox
    # this is a bug in cv2. It does not put box on a converted image from torch unless it's buffered and read again!
    cv2.imwrite('temp.jpg', (res_img * 255).astype(np.uint8))
    res_img = cv2.imread('temp.jpg')
    res_img = Box.put_box(res_img, bbox)

    # Set the font size and other parameters
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 2.0
    color = (0, 255, 0)  # Green color in BGR format
    thickness = 2

    # Calculate the text size and position
    text_size, _ = cv2.getTextSize(cls_name, font, font_scale, thickness)
    text_x = 10  # Adjust the horizontal position to place the text near the left edge
    text_y = text_size[1] + 10  # Adjust the vertical position to place the text near the top edge

    # Put the text on the image
    cv2.putText(res_img, cls_name, (text_x, text_y), font, font_scale, color, thickness)

    return res_img

def concat_images(images):
    w, h = images[0].shape[:2]
    width = w
    height = h * len(images)
    base_img = np.zeros((width, height, 3), dtype=np.uint8)
    for i, img in enumerate(images):
        base_img[:, h * i:h * (i + 1), ...] = img
    return base_img
def find_yolo_layer(model, layer_name):
    """Find yolov5 layer to calculate GradCAM and GradCAM++

    Args:
        model: yolov5 model.
        layer_name (str): the name of layer with its hierarchical information.

    Return:
        target_layer: found layer
    """
    hierarchy = layer_name.split('_')
    target_layer = model.model._modules[hierarchy[0]]

    for h in hierarchy[1:]:
        target_layer = target_layer._modules[h]
    return target_layer


class YOLOV5GradCAM:
    def __init__(self, model, layer_name, img_size=(640, 640)):
        self.model = model
        self.gradients = dict()
        self.activations = dict()
        self.cls_names = []  # 追加: Top1~3のクラス名を保存するリスト

        def backward_hook(module, grad_input, grad_output):
            self.gradients['value'] = grad_output[0]
            return None

        def forward_hook(module, input, output):
            self.activations['value'] = output
            return None

        target_layer = find_yolo_layer(self.model, layer_name)
        target_layer.register_forward_hook(forward_hook)
        target_layer.register_backward_hook(backward_hook)

        device = 'cuda' if next(self.model.model.parameters()).is_cuda else 'cpu'
        self.model(torch.zeros(1, 3, *img_size, device=device))
        print('[INFO] saliency_map size :', self.activations['value'].shape[2:])

    def forward(self, input_img, class_idx=True):
        saliency_maps = []
        b, c, h, w = input_img.size()
        tic = time.time()
        preds, logits = self.model(input_img)
        print("[INFO] model-forward took: ", round(time.time() - tic, 4), 'seconds')
        print(f"preds: {preds}")
        print(f"logits[0]: {logits[0]}")

        _, top3_indices = torch.topk(logits[0], k=3)
        preds[1][0] = top3_indices.tolist()[0]
        preds[2][0] = [names[i] for i in preds[1][0]]
        print(f"preds[1][0]: {preds[1][0]}")
        print(f"preds[2][0]: {preds[2][0]}")

        self.cls_names = preds[2][0]  # 修正: Top1~3のクラス名を保存

        for cls, cls_name in zip(preds[1][0], preds[2][0]):
            print(f"cls_name: {cls_name}")

            if class_idx:
                score = logits[0][0][cls]
            else:
                score = logits[0][0].max()
            self.model.zero_grad()
            tic = time.time()
            score.backward(retain_graph=True)
            print(f"[INFO] {cls_name}, model-backward took: ", round(time.time() - tic, 4), 'seconds')
            gradients = self.gradients['value']
            activations = self.activations['value']
            b, k, u, v = gradients.size()
            alpha = gradients.view(b, k, -1).mean(2)
            weights = alpha.view(b, k, 1, 1)
            saliency_map = (weights * activations).sum(1, keepdim=True)
            saliency_map = F.relu(saliency_map)
            saliency_map = F.upsample(saliency_map, size=(h, w), mode='bilinear', align_corners=False)
            saliency_map_min, saliency_map_max = saliency_map.min(), saliency_map.max()
            saliency_map = (saliency_map - saliency_map_min).div(saliency_map_max - saliency_map_min).data
            saliency_maps.append(saliency_map)

        return saliency_maps, logits, preds, self.cls_names  # 修正: cls_names を返り値に追加
    def __call__(self, input_img):
        return self.forward(input_img)

In [9]:
import os
import time
import numpy as np
# from models.gradcam import YOLOV5GradCAM
from models.yolo_v5_object_detector import YOLOV5TorchObjectDetector
import cv2
from deep_utils import Box, split_extension


# パラメータ
model_path = "/gdrive/MyDrive/Deep_learning/CorneAI_nagoya/yolo5_forcresco/weights/eye_nii_2202_onecaseoneimage2_doctorcompare_yolov5s_epoch200_batch16_89.8p/last.pt"
img_path = "/gdrive/MyDrive/研究/進行中の研究/角膜スマートフォンAIプロジェクト/前原の240問/フォトスリット_serial/7.jpg"
# output_dir = 'out'  # 出力ディレクトリ
output_dir = '/gdrive/MyDrive/研究/進行中の研究/角膜スマートフォンAIプロジェクト/前原の240問_GradCAM/GradCam_img'
img_size = 640  # 入力画像サイズ
target_layer = 'model_23_cv3_act'  # GradCAM を適用するレイヤー
method = 'gradcam'  # 'gradcam' または 'gradcampp'
device = 'cpu'  # 'cuda' または 'cpu'

# クラス名リストを指定
names = ["infection","normal","non-infection","scar","tumor","deposit","APAC","lens opacity","bullous"]

#ここからがメイン
input_size = (img_size, img_size)
img = cv2.imread(img_path)
print('[INFO] Loading the model')
model = YOLOV5TorchObjectDetector(model_path, device, img_size=input_size,
                                  names=names)
torch_img = model.preprocessing(img[..., ::-1])
if method == 'gradcam':
    saliency_method = YOLOV5GradCAM(model=model, layer_name=target_layer, img_size=input_size)
tic = time.time()

masks, logits, [boxes, _, _, _], cls_names = saliency_method(torch_img)
print(F"cls_names: {cls_names}")
print("total time:", round(time.time() - tic, 4))
result = torch_img.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy()
result = result[..., ::-1]  # convert to bgr
images = [result]

for i in range(len(masks)):
    res_img = result.copy()
    h, w, _ = res_img.shape  # 画像の高さと幅を取得
    for j, mask in enumerate(masks[i]):
        bbox = boxes[0][j]
        bbox = [max(0, coord) for coord in bbox]  # negative座標を0に変更
        bbox = [min(coord, max_val) for coord, max_val in zip(bbox, [w, h, w, h])]  # 画像サイズを超える座標を画像サイズに合わせる
        res_img, _ = get_res_img(bbox, [mask], res_img)
        res_img = put_text_box(bbox, cls_names[i], res_img)
    images.append(res_img)

final_image = concat_images(images)
img_name = split_extension(os.path.split(img_path)[-1], suffix='-res')
output_path = f'{output_dir}/{img_name}'
os.makedirs(output_dir, exist_ok=True)
print(f'[INFO] Saving the final image at {output_path}')
cv2.imwrite(output_path, final_image)

INFO:models.yolo:Fusing layers... 
Fusing layers... 
INFO:utils.torch_utils:Model Summary: 213 layers, 7034398 parameters, 0 gradients
Model Summary: 213 layers, 7034398 parameters, 0 gradients


[INFO] Loading the model
[INFO] Model is loaded




[INFO] saliency_map size : torch.Size([20, 20])
[INFO] model-forward took:  0.1345 seconds
preds: [[[[58, 1, 470, 575]]], [[1]], [['normal']], [[0.75]]]
logits[0]: tensor([[-3.57789,  1.37006, -4.83992, -3.20287, -2.91802, -5.00547, -7.60832, -5.65962, -7.46785]], grad_fn=<IndexBackward0>)
preds[1][0]: [1, 4, 3]
preds[2][0]: ['normal', 'tumor', 'scar']
cls_name: normal
[INFO] normal, model-backward took:  0.1932 seconds
cls_name: tumor
[INFO] tumor, model-backward took:  0.142 seconds
cls_name: scar




[INFO] scar, model-backward took:  0.1417 seconds
cls_names: ['normal', 'tumor', 'scar']
total time: 0.6287
[INFO] Saving the final image at /gdrive/MyDrive/研究/進行中の研究/角膜スマートフォンAIプロジェクト/前原の240問_GradCAM/GradCam_img/7-res.jpg


True

In [10]:
import numpy as np
from deep_utils.utils.box_utils.boxes import Box
import torch
from models.experimental import attempt_load
from utils.general import xywh2xyxy
from utils.datasets import letterbox
import cv2
import time
import torchvision
import torch.nn as nn
from utils.metrics import box_iou

class YOLOV5TorchObjectDetector(nn.Module):
    def __init__(self,
                 model_weight,
                 device,
                 img_size,
                 names=None,
                 mode='eval',
                 confidence=0.4,
                 iou_thresh=0.45,
                 agnostic_nms=False):
        super(YOLOV5TorchObjectDetector, self).__init__()
        self.device = device
        self.model = None
        self.img_size = img_size
        self.mode = mode
        self.confidence = confidence
        self.iou_thresh = iou_thresh
        self.agnostic = agnostic_nms
        self.model = attempt_load(model_weight, device=device)
        print("[INFO] Model is loaded")
        self.model.requires_grad_(True)
        self.model.to(device)
        if self.mode == 'train':
            self.model.train()
        else:
            self.model.eval()
        # fetch the names
        if names is None:
            print('[INFO] fetching names from coco file')
            self.names = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
                          'traffic light',
                          'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
                          'cow',
                          'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
                          'frisbee',
                          'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
                          'surfboard',
                          'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana',
                          'apple',
                          'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
                          'couch',
                          'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote',
                          'keyboard', 'cell phone',
                          'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors',
                          'teddy bear',
                          'hair drier', 'toothbrush']
        else:
            self.names = names

        # preventing cold start
        img = torch.zeros((1, 3, *self.img_size), device=device)
        self.model(img)

    @staticmethod
    def non_max_suppression(prediction, logits, conf_thres=0.6, iou_thres=0.45, classes=None, agnostic=False,
                            multi_label=False, labels=(), max_det=300):
        """Runs Non-Maximum Suppression (NMS) on inference and logits results

        Returns:
             list of detections, on (n,6) tensor per image [xyxy, conf, cls] and pruned input logits (n, number-classes)
        """

        nc = prediction.shape[2] - 5  # number of classes
        xc = prediction[..., 4] > conf_thres  # candidates

        # Checks
        assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
        assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

        # Settings
        min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
        max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
        time_limit = 10.0  # seconds to quit after
        redundant = True  # require redundant detections
        multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
        merge = False  # use merge-NMS

        t = time.time()
        output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
        logits_output = [torch.zeros((0, 80), device=logits.device)] * logits.shape[0]
        for xi, (x, log_) in enumerate(zip(prediction, logits)):  # image index, image inference
            # Apply constraints
            # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
            x = x[xc[xi]]  # confidence
            log_ = log_[xc[xi]]
            # Cat apriori labels if autolabelling
            if labels and len(labels[xi]):
                l = labels[xi]
                v = torch.zeros((len(l), nc + 5), device=x.device)
                v[:, :4] = l[:, 1:5]  # box
                v[:, 4] = 1.0  # conf
                v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
                x = torch.cat((x, v), 0)

            # If none remain process next image
            if not x.shape[0]:
                continue

            # Compute conf
            x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf
            # log_ *= x[:, 4:5]
            # Box (center x, center y, width, height) to (x1, y1, x2, y2)
            box = xywh2xyxy(x[:, :4])

            # Detections matrix nx6 (xyxy, conf, cls)
            if multi_label:
                i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
                x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
            else:  # best class only
                conf, j = x[:, 5:].max(1, keepdim=True)
                # log_ = x[:, 5:]
                x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
                log_ = log_[conf.view(-1) > conf_thres]
            # Filter by class
            if classes is not None:
                x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

            # Check shape
            n = x.shape[0]  # number of boxes
            if not n:  # no boxes
                continue
            elif n > max_nms:  # excess boxes
                x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

            # Batched NMS
            c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
            boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
            i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
            if i.shape[0] > max_det:  # limit detections
                i = i[:max_det]
            if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
                # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
                iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
                weights = iou * scores[None]  # box weights
                x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
                if redundant:
                    i = i[iou.sum(1) > 1]  # require redundancy

            output[xi] = x[i]
            logits_output[xi] = log_[i]
            assert log_[i].shape[0] == x[i].shape[0]
            if (time.time() - t) > time_limit:
                print(f'WARNING: NMS time limit {time_limit}s exceeded')
                break  # time limit exceeded

        return output, logits_output

    @staticmethod
    def yolo_resize(img, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True):

        return letterbox(img, new_shape=new_shape, color=color, auto=auto, scaleFill=scaleFill, scaleup=scaleup)

    def forward(self, img):
        prediction, logits, _ = self.model(img, augment=False)
        prediction, logits = self.non_max_suppression(prediction, logits, self.confidence, self.iou_thresh,
                                                      classes=None,
                                                      agnostic=self.agnostic)
        self.boxes, self.class_names, self.classes, self.confidences = [[[] for _ in range(img.shape[0])] for _ in
                                                                        range(4)]
        for i, det in enumerate(prediction):  # detections per image
            if len(det):
                for *xyxy, conf, cls in det:
                    # xyxyの値を修正
                    xyxy[0] = max(0, xyxy[0])
                    xyxy[1] = max(0, xyxy[1])
                    xyxy[2] = min(img_size, xyxy[2])
                    xyxy[3] = min(img_size, xyxy[3])

                    bbox = Box.box2box(xyxy,
                                       in_source=Box.BoxSource.Torch,
                                       to_source=Box.BoxSource.Numpy,
                                       return_int=True)
                    self.boxes[i].append(bbox)
                    self.confidences[i].append(round(conf.item(), 2))
                    cls = int(cls.item())
                    self.classes[i].append(cls)
                    if self.names is not None:
                        self.class_names[i].append(self.names[cls])
                    else:
                        self.class_names[i].append(cls)
        return [self.boxes, self.classes, self.class_names, self.confidences], logits

    def preprocessing(self, img):
        if len(img.shape) != 4:
            img = np.expand_dims(img, axis=0)
        im0 = img.astype(np.uint8)
        img = np.array([self.yolo_resize(im, new_shape=self.img_size)[0] for im in im0])
        img = img.transpose((0, 3, 1, 2))
        img = np.ascontiguousarray(img)
        img = torch.from_numpy(img).to(self.device)
        img = img / 255.0
        return img

In [13]:
import os
import time
import numpy as np
# from models.gradcam import YOLOV5GradCAM
#from models.yolo_v5_object_detector import YOLOV5TorchObjectDetector
import cv2
from deep_utils import Box, split_extension
import gc



# パラメータ
model_path = "/gdrive/MyDrive/Deep_learning/CorneAI_nagoya/yolo5_forcresco/weights/eye_nii_2202_onecaseoneimage2_doctorcompare_yolov5s_epoch200_batch16_89.8p/last.pt"
img_path = "/gdrive/MyDrive/研究/進行中の研究/角膜スマートフォンAIプロジェクト/前原の240問/フォトスリット_serial"
output_dir = 'out'  # 出力ディレクトリ
img_size = 640  # 入力画像サイズ
target_layer = 'model_23_cv3_act'  # GradCAM を適用するレイヤー
method = 'gradcam'  # 'gradcam' または 'gradcampp'
device = 'cpu'  # 'cuda' または 'cpu'

# クラス名リストを指定
names = ["infection","normal","non-infection","scar","tumor","deposit","APAC","lens opacity","bullous"]

import time
import torch
import torch.nn.functional as F

def get_res_img(bbox, masks, res_img):
    for mask in masks:
        mask = mask.squeeze().mul(255).add_(0.5).clamp_(0, 255).detach().cpu().numpy().astype(np.uint8)
        heatmap = cv2.applyColorMap(mask, cv2.COLORMAP_JET)
        n_heatmat = (Box.fill_outer_box(heatmap, bbox) / 255).astype(np.float32)
        res_img = res_img / 255
        res_img = cv2.add(res_img, n_heatmat)
        res_img = (res_img / res_img.max())
    return res_img, n_heatmat


# def put_text_box(bbox, cls_name, res_img):
#     x1, y1, x2, y2 = bbox
#     # this is a bug in cv2. It does not put box on a converted image from torch unless it's buffered and read again!
#     cv2.imwrite('temp.jpg', (res_img * 255).astype(np.uint8))
#     res_img = cv2.imread('temp.jpg')
#     res_img = Box.put_box(res_img, bbox)
#     res_img = Box.put_text(res_img, cls_name, (x1, y1))
#     return res_img

def put_text_box(bbox, cls_name, res_img):
    x1, y1, x2, y2 = bbox
    # this is a bug in cv2. It does not put box on a converted image from torch unless it's buffered and read again!
    cv2.imwrite('temp.jpg', (res_img * 255).astype(np.uint8))
    res_img = cv2.imread('temp.jpg')
    res_img = Box.put_box(res_img, bbox)

    # Set the font size and other parameters
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 2.0
    color = (0, 255, 0)  # Green color in BGR format
    thickness = 2

    # Calculate the text size and position
    text_size, _ = cv2.getTextSize(cls_name, font, font_scale, thickness)
    text_x = 10  # Adjust the horizontal position to place the text near the left edge
    text_y = text_size[1] + 10  # Adjust the vertical position to place the text near the top edge

    # Put the text on the image
    cv2.putText(res_img, cls_name, (text_x, text_y), font, font_scale, color, thickness)

    return res_img

def concat_images(images):
    w, h = images[0].shape[:2]
    width = w
    height = h * len(images)
    base_img = np.zeros((width, height, 3), dtype=np.uint8)
    for i, img in enumerate(images):
        base_img[:, h * i:h * (i + 1), ...] = img
    return base_img
def find_yolo_layer(model, layer_name):
    """Find yolov5 layer to calculate GradCAM and GradCAM++

    Args:
        model: yolov5 model.
        layer_name (str): the name of layer with its hierarchical information.

    Return:
        target_layer: found layer
    """
    hierarchy = layer_name.split('_')
    target_layer = model.model._modules[hierarchy[0]]

    for h in hierarchy[1:]:
        target_layer = target_layer._modules[h]
    return target_layer


class YOLOV5GradCAM:
    def __init__(self, model, layer_name, img_size=(640, 640)):
        self.model = model
        self.gradients = dict()
        self.activations = dict()
        self.cls_names = []  # 追加: Top1~3のクラス名を保存するリスト

        def backward_hook(module, grad_input, grad_output):
            self.gradients['value'] = grad_output[0]
            return None

        def forward_hook(module, input, output):
            self.activations['value'] = output
            return None

        target_layer = find_yolo_layer(self.model, layer_name)
        target_layer.register_forward_hook(forward_hook)
        target_layer.register_backward_hook(backward_hook)

        device = 'cuda' if next(self.model.model.parameters()).is_cuda else 'cpu'
        self.model(torch.zeros(1, 3, *img_size, device=device))
        # print('[INFO] saliency_map size :', self.activations['value'].shape[2:])

    def forward(self, input_img, class_idx=True):
        saliency_maps = []
        b, c, h, w = input_img.size()
        tic = time.time()
        preds, logits = self.model(input_img)
        # print("[INFO] model-forward took: ", round(time.time() - tic, 4), 'seconds')

        _, top3_indices = torch.topk(logits[0], k=3)
        preds[1][0] = top3_indices.tolist()[0]
        preds[2][0] = [names[i] for i in preds[1][0]]

        self.cls_names = preds[2][0]  # 修正: Top1~3のクラス名を保存

        for cls, cls_name in zip(preds[1][0], preds[2][0]):

            if class_idx:
                score = logits[0][0][cls]
            else:
                score = logits[0][0].max()
            self.model.zero_grad()
            tic = time.time()
            score.backward(retain_graph=True)
            # print(f"[INFO] {cls_name}, model-backward took: ", round(time.time() - tic, 4), 'seconds')
            gradients = self.gradients['value']
            activations = self.activations['value']
            b, k, u, v = gradients.size()
            alpha = gradients.view(b, k, -1).mean(2)
            weights = alpha.view(b, k, 1, 1)
            saliency_map = (weights * activations).sum(1, keepdim=True)
            saliency_map = F.relu(saliency_map)
            saliency_map = F.upsample(saliency_map, size=(h, w), mode='bilinear', align_corners=False)
            saliency_map_min, saliency_map_max = saliency_map.min(), saliency_map.max()
            saliency_map = (saliency_map - saliency_map_min).div(saliency_map_max - saliency_map_min).data
            saliency_maps.append(saliency_map)

        return saliency_maps, logits, preds, self.cls_names  # 修正: cls_names を返り値に追加
    def __call__(self, input_img):
        return self.forward(input_img)


def main(img_path):
   input_size = (img_size, img_size)
   img = cv2.imread(img_path)
  #  print('[INFO] Loading the model')
   model = YOLOV5TorchObjectDetector(model_path, device, img_size=input_size,
                                     names=names)
   torch_img = model.preprocessing(img[..., ::-1])
   if method == 'gradcam':
       saliency_method = YOLOV5GradCAM(model=model, layer_name=target_layer, img_size=input_size)
   tic = time.time()
   masks, logits, [boxes, _, _, _], cls_names = saliency_method(torch_img)
   print("total time:", round(time.time() - tic, 4))
   result = torch_img.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy()
   result = result[..., ::-1]  # convert to bgr
   images = [result]

   for i in range(len(masks)):
       res_img = result.copy()
       for j, mask in enumerate(masks[i]):
           bbox = boxes[0][j]
           res_img, _ = get_res_img(bbox, [mask], res_img)
           res_img = put_text_box(bbox, cls_names[i], res_img)
       images.append(res_img)

   final_image = concat_images(images)
   img_name = split_extension(os.path.split(img_path)[-1], suffix='-res')
   output_path = f'{output_dir}/{img_name}'
   os.makedirs(output_dir, exist_ok=True)
   print(f'[INFO] Saving the final image at {output_path}')
   cv2.imwrite(output_path, final_image)


def folder_main(folder_path):
   input_size = (img_size, img_size)
  #  print('[INFO] Loading the model')
   model = YOLOV5TorchObjectDetector(model_path, device, img_size=input_size,
                                     names=names)

   for item in os.listdir(folder_path):
       img_path = os.path.join(folder_path, item)
       img_basename = os.path.basename(img_path)
       print(f"Processing image: {img_basename}")  # 追加: 画像のファイル名を表示

       img = cv2.imread(img_path)
       torch_img = model.preprocessing(img[..., ::-1])
       if method == 'gradcam':
           saliency_method = YOLOV5GradCAM(model=model, layer_name=target_layer, img_size=input_size)
       tic = time.time()
       masks, logits, [boxes, _, _, _], cls_names = saliency_method(torch_img)
       print(f"boxes: {boxes}")
      #  # boxes[0][0][0]とboxes[0][0][1]が0未満の場合は0に修正
      #  boxes[0][0][0] = max(1, boxes[0][0][0])
      #  boxes[0][0][1] = max(1, boxes[0][0][1])

      #  # boxes[0][0][2]とboxes[0][0][3]が640より大きい場合は640に修正
      #  boxes[0][0][2] = min(639, boxes[0][0][2])
      #  boxes[0][0][3] = min(639, boxes[0][0][3])

       #  print("total time:", round(time.time() - tic, 4))
       result = torch_img.squeeze(0).mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).detach().cpu().numpy()
       result = result[..., ::-1]  # convert to bgr
       images = [result]

       for i in range(len(masks)):
           res_img = result.copy()
           for j, mask in enumerate(masks[i]):
               bbox = boxes[0][j]
               res_img, _ = get_res_img(bbox, [mask], res_img)
               res_img = put_text_box(bbox, cls_names[i], res_img)
           images.append(res_img)

       final_image = concat_images(images)
       img_name = split_extension(os.path.split(img_path)[-1], suffix='-res')
       output_path = f'{output_dir}/{img_name}'
       os.makedirs(output_dir, exist_ok=True)
       print(f'[INFO] Saving the final image at {output_path}')
       cv2.imwrite(output_path, final_image)

           # オブジェクトを明示的に削除
       del saliency_method
       del masks
       del logits
       del boxes
       del images
       del final_image
       gc.collect()


if __name__ == '__main__':
   if os.path.isdir(img_path):
       folder_main(img_path)
   else:
       main(img_path)

INFO:models.yolo:Fusing layers... 
Fusing layers... 
INFO:utils.torch_utils:Model Summary: 213 layers, 7034398 parameters, 0 gradients
Model Summary: 213 layers, 7034398 parameters, 0 gradients


[INFO] Model is loaded
Processing image: 141.jpg




boxes: [[[60, 223, 315, 550]]]
[INFO] Saving the final image at out/141-res.jpg
Processing image: 30.jpg




boxes: [[[40, 25, 479, 599]]]
[INFO] Saving the final image at out/30-res.jpg
Processing image: 26.jpg




boxes: [[[52, 80, 449, 640]]]
[INFO] Saving the final image at out/26-res.jpg
Processing image: 123.jpg




boxes: [[[92, 95, 367, 612]]]
[INFO] Saving the final image at out/123-res.jpg
Processing image: 159.jpg




boxes: [[[1, 61, 453, 561]]]
[INFO] Saving the final image at out/159-res.jpg
Processing image: 62.jpg




boxes: [[[17, 187, 404, 607]]]
[INFO] Saving the final image at out/62-res.jpg
Processing image: 169.jpg




boxes: [[[60, 93, 397, 633]]]
[INFO] Saving the final image at out/169-res.jpg
Processing image: 164.jpg




boxes: [[[96, 79, 410, 609]]]
[INFO] Saving the final image at out/164-res.jpg




Processing image: 39.jpg
boxes: [[[0, 0, 475, 598]]]
[INFO] Saving the final image at out/39-res.jpg
Processing image: 19.jpg




boxes: [[[4, 81, 482, 639], [5, 84, 482, 638]]]
[INFO] Saving the final image at out/19-res.jpg
Processing image: 135.jpg




boxes: [[[60, 188, 329, 486]]]
[INFO] Saving the final image at out/135-res.jpg




Processing image: 58.jpg
boxes: [[[20, 140, 400, 575]]]
[INFO] Saving the final image at out/58-res.jpg
Processing image: 157.jpg




boxes: [[[44, 57, 425, 596]]]
[INFO] Saving the final image at out/157-res.jpg
Processing image: 34.jpg




boxes: [[[39, 91, 455, 629]]]
[INFO] Saving the final image at out/34-res.jpg
Processing image: 64.jpg




boxes: [[[16, 176, 382, 582]]]
[INFO] Saving the final image at out/64-res.jpg
Processing image: 31.jpg




IndexError: list index out of range

To do

/gdrive/MyDrive/研究/進行中の研究/角膜スマートフォンAIプロジェクト/前原の240問/フォトスリット_serial/31.jpg
の判定がつかない問題