In [None]:
# Load the model
from groundingdino.util.inference import load_model, load_image, predict, annotate
import cv2

# Build the Swin-T GroundingDINO model from its config file and checkpoint.
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/checkpoint.pth") # swin_T
# Alternative: Swin-B config and checkpoint (uncomment to use it instead of Swin-T).
# model = load_model("groundingdino/config/GroundingDINO_SwinB_cfg.py", "weights/groundingdino_swinb_cogcoor.pth") # swin_B

# Set this to True when the images should be processed with a LoRA checkpoint.
use_lora = False
# Checkpoint file that is loaded into the model when use_lora is True.
lora_checkpoint = 'D:\\GroundingDINO_live\\Open-GroundingDino\\output\\checkpoint0017_lora_prompt.pth'

In [None]:
# LoRA modules
import torch
import torch.nn as nn
import numpy as np

class LoRA_qkv(nn.Module):
    """Fused qkv projection with additive low-rank updates on q and v.

    The wrapped ``qkv`` layer produces the q, k and v projections
    concatenated along the last dimension.  ``forward`` adds
    ``linear_b_q(linear_a_q(x))`` to the first ``d_model`` channels (q) and
    ``linear_b_v(linear_a_v(x))`` to the last ``d_model`` channels (v); the
    channels in between (k) are left untouched.

    Args:
        qkv: The original projection layer; must expose ``in_features``.
        linear_a_q: Down-projection of the q update.
        linear_b_q: Up-projection of the q update.
        linear_a_v: Down-projection of the v update.
        linear_b_v: Up-projection of the v update.
    """

    def __init__(
            self,
            qkv,
            linear_a_q: nn.Module,
            linear_b_q: nn.Module,
            linear_a_v: nn.Module,
            linear_b_v: nn.Module,):
        super().__init__()
        # Attribute names double as state_dict keys, so they must not change.
        self.qkv = qkv
        self.linear_a_q = linear_a_q
        self.linear_b_q = linear_b_q
        self.linear_a_v = linear_a_v
        self.linear_b_v = linear_b_v
        self.d_model = qkv.in_features
        # Not read by forward(); kept so existing attribute access still works.
        self.w_identity = torch.eye(qkv.in_features)

    def forward(self, x):
        """Return ``qkv(x)`` with the low-rank q and v updates added.

        Only the last dimension is indexed, so ``x`` may have any number of
        leading dimensions.
        """
        qkv = self.qkv(x)
        q_ba = self.linear_b_q(self.linear_a_q(x))
        v_ba = self.linear_b_v(self.linear_a_v(x))
        # `...` instead of `:, :` so inputs that are not exactly 3-D also work.
        qkv[..., :self.d_model] += q_ba  # q part
        qkv[..., -self.d_model:] += v_ba  # v part
        return qkv
    
class LoRA_gdswin(nn.Module):
    """Freeze a model and attach LoRA adapters to its backbone attention.

    Every parameter of ``model`` gets ``requires_grad = False``.  Then, for
    each block in ``model.backbone[0].layers[*].blocks``, ``blk.attn.qkv`` is
    replaced in place by a ``LoRA_qkv`` that wraps the original layer together
    with four new bias-free linear layers of the given ``rank``.  The modified
    model is available afterwards as ``self.lora_model``.

    Args:
        model: Model whose ``backbone[0].layers`` hold the blocks to adapt.
        rank: Inner dimension of the low-rank layers; must be positive.
    """

    def __init__(self, model, rank=256):
        super().__init__()
        self.rank = rank
        assert rank > 0
        # Plain lists: the layers are registered through the LoRA_qkv wrappers.
        self.A_weights = []
        self.B_weights = []
        for frozen in model.parameters():
            frozen.requires_grad = False

        swin_blocks = (
            blk
            for stage in model.backbone[0].layers
            for blk in stage.blocks
        )
        for blk in swin_blocks:
            base_qkv = blk.attn.qkv
            self.d_model = base_qkv.in_features
            # Creation order (A_q, B_q, A_v, B_v) is kept on purpose: each
            # nn.Linear draws from the RNG when it is constructed.
            down_q = nn.Linear(self.d_model, self.rank, bias=False)
            up_q = nn.Linear(self.rank, self.d_model, bias=False)
            down_v = nn.Linear(self.d_model, self.rank, bias=False)
            up_v = nn.Linear(self.rank, self.d_model, bias=False)
            self.A_weights.extend([down_q, down_v])
            self.B_weights.extend([up_q, up_v])
            blk.attn.qkv = LoRA_qkv(base_qkv, down_q, up_q, down_v, up_v)
        self.reset_parameters()
        self.lora_model = model

    def reset_parameters(self):
        """Re-initialise the adapters: Kaiming-uniform for A, zeros for B."""
        for down in self.A_weights:
            nn.init.kaiming_uniform_(down.weight, a=np.sqrt(5))
        for up in self.B_weights:
            nn.init.zeros_(up.weight)

class LoRALinear(nn.Module):
    """Linear layer with an additive low-rank branch.

    The output is ``original_linear(x) + lora_B(lora_A(x))``.  ``lora_A`` is
    initialised from a normal distribution (std 0.02) and ``lora_B`` with
    zeros, so a freshly built layer adds a zero update to the wrapped layer's
    output.

    Args:
        original_linear: Layer to wrap; must expose ``in_features`` and
            ``out_features``.
        rank: Inner dimension of the low-rank branch.
    """

    def __init__(self, original_linear, rank):
        super().__init__()
        self.original_linear = original_linear
        self.rank = rank
        n_in = original_linear.in_features
        n_out = original_linear.out_features
        self.lora_A = nn.Linear(n_in, rank, bias=False)
        self.lora_B = nn.Linear(rank, n_out, bias=False)
        nn.init.normal_(self.lora_A.weight, std=0.02)
        nn.init.zeros_(self.lora_B.weight)

    def forward(self, x):
        """Return the wrapped layer's output plus the low-rank update."""
        base_out = self.original_linear(x)
        low_rank_out = self.lora_B(self.lora_A(x))
        return base_out + low_rank_out


if use_lora:
    # LoRA_gdswin wraps the backbone attention layers of `model` in place and
    # exposes the modified model as `.lora_model`; 512 is the LoRA rank
    # (presumably it has to match the rank the checkpoint was trained with).
    model = LoRA_gdswin(model, 512).lora_model
    # map_location="cpu" lets a checkpoint that was saved on a GPU load on a
    # machine without that device; load_state_dict then copies the values into
    # the model's existing parameters.
    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    model.load_state_dict(torch.load(lora_checkpoint, map_location="cpu")['model'])

In [None]:
# Test on a single image: run one prediction and save the annotated result.
BOX_TRESHOLD = 0.2
TEXT_TRESHOLD = 0.2
IMAGE_PATH = r'C:\Users\yangu\Desktop\testt\AA90TL43J2PAT152053601-01.jpg' # image path
TEXT_PROMPT = 'the right defect' # prompt

# load_image returns the source image (used for drawing) and the model input.
image_source, image = load_image(IMAGE_PATH)
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_TRESHOLD,
    text_threshold=TEXT_TRESHOLD
)

# Draw the predictions on the source image and write it to the working directory.
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("annotated_image.jpg", annotated_frame)

In [None]:
# Helper functions used for prediction
import torch
from torchvision.ops import box_convert
from glob import glob
import cv2
import os
import numpy as np
import xml.etree.ElementTree as ET
import supervision as sv

def create_xml(objects, filename_text, folder_text, width_text, height_text, save_path):
    """Write an annotation XML file describing the boxes of one image.

    The file is written to ``save_path`` and named after ``filename_text``
    with its extension replaced by ``.xml``.  The root ``annotation`` element
    holds ``folder``, ``filename``, ``size`` (``width``, ``height`` and a
    fixed ``depth`` of 3), ``segmented`` and one ``object`` element per entry
    of ``objects``.

    Args:
        objects: Iterable of dicts with the keys ``"name"``, ``"xmin"``,
            ``"xmax"``, ``"ymin"`` and ``"ymax"``.
        filename_text: Image file name stored in the ``filename`` element.
        folder_text: Value stored in the ``folder`` element.
        width_text: Image width; converted to ``str`` before writing.
        height_text: Image height; converted to ``str`` before writing.
        save_path: Directory the XML file is written to.
    """

    def add_text_node(parent, tag, text):
        # Element text must be a string for serialization, so coerce numbers.
        node = ET.SubElement(parent, tag)
        node.text = None if text is None else str(text)
        return node

    annotation = ET.Element("annotation")
    add_text_node(annotation, "folder", folder_text)
    add_text_node(annotation, "filename", filename_text)
    size = ET.SubElement(annotation, "size")
    add_text_node(size, "width", width_text)
    add_text_node(size, "height", height_text)
    add_text_node(size, "depth", "3")
    add_text_node(annotation, "segmented", "0")

    for obj in objects:
        object_node = ET.SubElement(annotation, "object")
        add_text_node(object_node, "name", obj["name"])
        add_text_node(object_node, "truncated", "0")
        add_text_node(object_node, "difficult", "0")
        bndbox = ET.SubElement(object_node, "bndbox")
        # Element order is kept as xmin, xmax, ymin, ymax.
        for key in ("xmin", "xmax", "ymin", "ymax"):
            add_text_node(bndbox, key, obj[key])

    # splitext instead of [:-4]: works for any extension length (e.g. ".jpeg").
    stem = os.path.splitext(filename_text)[0]
    ET.ElementTree(annotation).write(os.path.join(save_path, stem + '.xml'))

def bbox_convert(boxes,image_source):
    """Convert normalised cxcywh boxes to pixel xyxy boxes.

    Each box is multiplied by ``[w, h, w, h]`` taken from the shape of
    ``image_source`` and then converted from ``cxcywh`` to ``xyxy``.

    Args:
        boxes: Tensor of boxes in ``cxcywh`` format with four values per box
            (presumably normalised to [0, 1], as they are scaled by the image
            size here -- confirm against the caller).
        image_source: Array whose first two dimensions are height and width.

    Returns:
        NumPy array of the scaled boxes in ``xyxy`` format.
    """
    h, w = image_source.shape[:2]
    # Build the scale on the boxes' device so GPU tensors are handled too.
    scale = torch.tensor([w, h, w, h], dtype=torch.float32, device=boxes.device)
    xyxy = box_convert(boxes=boxes * scale, in_fmt="cxcywh", out_fmt="xyxy")
    # detach/cpu make .numpy() safe for tensors with grad or on an accelerator.
    return xyxy.detach().cpu().numpy()


def non_maximum_suppression_fast(boxes, overlapThresh=0.3):
    """Greedy non-maximum suppression ordered by the bottom edge of the boxes.

    Boxes are visited from the largest ``y2`` to the smallest.  The visited
    box is kept, and every remaining box whose intersection with it covers
    more than ``overlapThresh`` of that remaining box's own area is dropped.

    Args:
        boxes: Array of shape (N, 4) holding ``x1, y1, x2, y2`` per row.
        overlapThresh: Overlap ratio above which a box is suppressed.

    Returns:
        The kept rows of ``boxes`` in the order they were picked, or an empty
        list when ``boxes`` is empty.
    """
    if len(boxes) == 0:
        return []

    left, top, right, bottom = (boxes[:, col].astype("float") for col in range(4))
    areas = (right - left + 1) * (bottom - top + 1)
    remaining = np.argsort(bottom)
    kept = []

    while remaining.shape[0] > 0:
        tail = remaining.shape[0] - 1
        current = remaining[tail]
        others = remaining[:tail]
        kept.append(current)

        # Intersection of the current box with every box still in play.
        inter_w = np.maximum(
            0,
            np.minimum(right[current], right[others])
            - np.maximum(left[current], left[others]) + 1,
        )
        inter_h = np.maximum(
            0,
            np.minimum(bottom[current], bottom[others])
            - np.maximum(top[current], top[others]) + 1,
        )
        ratio = (inter_w * inter_h) / areas[others]

        suppressed = np.where(ratio > overlapThresh)[0]
        remaining = np.delete(remaining, np.concatenate(([tail], suppressed)))

    return boxes[kept]

def nms(bounding_boxes, confidence_score, threshold=0.4):
    """Score-ordered non-maximum suppression based on IoU.

    Boxes are visited from the highest score to the lowest.  The visited box
    is kept, and every remaining box whose intersection-over-union with it is
    at least ``threshold`` is discarded.

    Args:
        bounding_boxes: Sequence or array of ``x1, y1, x2, y2`` boxes.
        confidence_score: One score per box, as a tensor (anything exposing
            ``detach``) or an array-like.
        threshold: IoU at or above which a lower-scored box is suppressed.

    Returns:
        ``(picked_boxes, picked_score)``: two lists, ordered from the highest
        kept score to the lowest.  Both are empty when there are no boxes.
    """
    if len(bounding_boxes) == 0:
        return [], []
    boxes = np.array(bounding_boxes)
    start_x = boxes[:, 0]
    start_y = boxes[:, 1]
    end_x = boxes[:, 2]
    end_y = boxes[:, 3]
    # Accept tensors as well as plain array-likes for the scores.
    if hasattr(confidence_score, "detach"):
        confidence_score = confidence_score.detach().cpu().numpy()
    score = np.asarray(confidence_score)
    picked_boxes = []
    picked_score = []
    areas = (end_x - start_x + 1) * (end_y - start_y + 1)
    # Ascending sort: the best remaining box is always the last entry.
    order = np.argsort(score)
    while order.size > 0:
        index = order[-1]
        rest = order[:-1]
        picked_boxes.append(boxes[index])
        picked_score.append(score[index])
        # Intersection of the picked box with each remaining candidate.
        x1 = np.maximum(start_x[index], start_x[rest])
        y1 = np.maximum(start_y[index], start_y[rest])
        x2 = np.minimum(end_x[index], end_x[rest])
        y2 = np.minimum(end_y[index], end_y[rest])
        w = np.maximum(0.0, x2 - x1 + 1)
        h = np.maximum(0.0, y2 - y1 + 1)
        intersection = w * h
        ratio = intersection / (areas[index] + areas[rest] - intersection)
        # Keep only the candidates that overlap less than the threshold.
        order = rest[ratio < threshold]
    return picked_boxes, picked_score

def ann_pic(box_im, image_source,  labels):
    """Draw labelled boxes on an image and return the annotated frame.

    Args:
        box_im: Boxes passed to ``sv.Detections`` as ``xyxy``.
        image_source: Image that is converted with ``cv2.COLOR_RGB2BGR``
            before the boxes are drawn on it.
        labels: Labels handed to the box annotator, one per box.

    Returns:
        The converted image with the boxes and labels drawn on it.
    """
    found = sv.Detections(xyxy=np.array(box_im))
    painter = sv.BoxAnnotator()
    bgr_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
    return painter.annotate(scene=bgr_frame, detections=found, labels=labels)

In [None]:
# Test GroundingDINO: auto-label every image of a folder into .xml files.
# Pipeline: detect (GroundingDINO) -> NMS -> template matching (drop background) -> xml

TEXT_PROMPT = 'defect' # prompt
BOX_TRESHOLD = 0.2
TEXT_TRESHOLD = 0.2
folder_name = 'groundingdino_test'

directory = 'C:\\Users\\yangu\\Desktop\\DMRV' # every .jpg below this folder is run through GroundingDINO
for root, dirs, files in os.walk(directory):
    for file in files:
        if not file.lower().endswith('.jpg'):
            continue
        # Bound before the try block so the error report can always print it.
        img_path = os.path.join(root, file)
        try:
            image_source, image = load_image(img_path)
            m_image = cv2.imread(img_path)
            if m_image is None:
                # cv2.imread signals failure by returning None, not by raising.
                raise ValueError('cv2.imread could not read the image')
            boxes, logits, phrases = predict(
                model=model,
                image=image,
                caption=TEXT_PROMPT,
                box_threshold=BOX_TRESHOLD,
                text_threshold=TEXT_TRESHOLD)

            bounding_boxes = bbox_convert(boxes, image_source)
            nms_output = non_maximum_suppression_fast(bounding_boxes, overlapThresh=0.3)

            # Clamp each box to the image and drop empty ones: a negative
            # coordinate would slice from the wrong end, and an empty crop
            # makes cv2.matchTemplate raise, which used to discard every
            # detection of the image.
            img_h, img_w = m_image.shape[:2]
            regions = []
            for box in nms_output:
                x1 = min(max(int(box[0]), 0), img_w)
                y1 = min(max(int(box[1]), 0), img_h)
                x2 = min(max(int(box[2]), 0), img_w)
                y2 = min(max(int(box[3]), 0), img_h)
                if x2 > x1 and y2 > y1:
                    regions.append((x1, y1, x2, y2))

            # Copy of the image with every detected region painted white, so a
            # crop can only match somewhere outside the detections.
            tmp_image = m_image.copy()
            for x1, y1, x2, y2 in regions:
                tmp_image[y1:y2, x1:x2] = 255

            objects = []
            box_im, labels = [], []
            for x1, y1, x2, y2 in regions:
                tmp = m_image[y1:y2, x1:x2]
                out_match = cv2.matchTemplate(tmp_image, tmp, cv2.TM_CCOEFF_NORMED)
                # A near-perfect match elsewhere means the crop looks like
                # background, so only crops without such a match are kept.
                if not np.any(out_match > 0.97):
                    box_im.append([x1, y1, x2, y2])
                    labels.append('defect')
                    objects.append({"name": "defect", "xmin": str(x1), "xmax": str(x2), "ymin": str(y1), "ymax": str(y2)})

            if len(objects) > 0:
                # os.path instead of splitting on '\\' keeps this portable.
                filename_text = os.path.basename(img_path)
                save_path = os.path.dirname(img_path)
                folder_text = os.path.basename(save_path)
                create_xml(objects, filename_text, folder_text,  str(m_image.shape[1]), str(m_image.shape[0]), save_path = save_path)
                # Optional visual check of the kept boxes:
                # annotation_frame = ann_pic(box_im, image_source, labels)
                # a = 'D:\\yangu\\dataset\\groundingdino_test\\'+filename_text
                # cv2.imwrite(a, annotation_frame)

        except Exception as err:
            # Best-effort batch run: report the failing image and continue.
            print(img_path)
            print(err)