GitHub  
https://paperswithcode.com/paper/detecting-twenty-thousand-classes-using-image  
論文  
https://arxiv.org/abs/2201.02605v2  

<a href="https://colab.research.google.com/github/kaz12tech/ai_demos/blob/master/detecting_demo.ipynb" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ランタイムの設定
「ランタイム」→「ランタイムのタイプを変更」→「ハードウェアアクセラレータ」をGPUに変更

# 実行方法
「ランタイム」→「すべてのセルを実行」を選択

# 環境セットアップ

## CUDAバージョンの確認

In [None]:
import subprocess

# Run `nvcc --version` and keep the last word of the comma-separated field
# that starts with "release".
nvcc_banner = subprocess.check_output(["nvcc", "--version"]).decode("UTF-8")
release_fields = [field for field in nvcc_banner.split(", ") if field.startswith("release")]
CUDA_version = release_fields[0].split(" ")[-1]
print("CUDA version:", CUDA_version)

# Suffix appended to the torch/torchvision version pins in the install cell.
# Any CUDA release not listed here falls back to "+cu110".
torch_version_suffix = {
    "10.0": "+cu100",
    "10.1": "+cu101",
    "10.2": "",
}.get(CUDA_version, "+cu110")

## PyTorchバージョンの変更
Deticとの比較対象であるCLIP-ODSがtorch 1.7.1に対応しているため、PyTorchを1.7.1に変更します。

In [None]:
# Upgrade pip itself first; stdout is redirected to /dev/null to keep the cell output short.
!pip install --upgrade pip > /dev/null
# Install torch 1.7.1 / torchvision 0.8.2 with the torch_version_suffix chosen in the CUDA check cell
# (IPython expands {torch_version_suffix}); ftfy and regex are installed by the same command.
!pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex > /dev/null
# Install the pinned clip-ods release.
!pip install clip-ods==0.0.1rc2 > /dev/null

## PyTorchバージョンの確認

In [None]:
import torch
def cuda_wheel_tag(torch_version, cuda_runtime):
    """Return the CUDA tag used to pick a detectron2 wheel index.

    Args:
        torch_version: Full ``torch.__version__`` string, e.g. ``"1.7.1+cu110"``.
        cuda_runtime: Value of ``torch.version.cuda``: a dotted CUDA version
            such as ``"10.2"``, or ``None`` when torch has no CUDA runtime.

    Returns:
        The text after ``+`` when the version carries a local suffix
        (``"cu110"``). Otherwise ``"cu"`` plus the runtime version without
        dots (``"cu102"``), or ``"cpu"`` when ``cuda_runtime`` is empty.
    """
    if "+" in torch_version:
        return torch_version.split("+")[-1]
    # Wheels installed without a "+cuXXX" suffix (the CUDA 10.2 case in the
    # install cell) would otherwise yield the whole version string here.
    if cuda_runtime:
        return "cu" + cuda_runtime.replace(".", "")
    return "cpu"


# "major.minor" of the installed torch, e.g. "1.7".
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
CUDA_VERSION = cuda_wheel_tag(torch.__version__, torch.version.cuda)
print("torch: ", TORCH_VERSION, "; cuda: ", CUDA_VERSION)

## detectron2をインストール

In [None]:
# Install detectron2 from the wheel index passed to -f; IPython substitutes the Python variables
# $CUDA_VERSION and $TORCH_VERSION into the URL.
!pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/$CUDA_VERSION/torch$TORCH_VERSION/index.html

## GitHubからコードをclone

In [None]:
# Start from the /content/ directory.
%cd /content/

# Clone the Detic code from GitHub, including its submodules
!git clone https://github.com/facebookresearch/Detic.git --recurse-submodules
%cd Detic
# Install the libraries Detic needs to run
!pip install -r requirements.txt

## セットアップ
ライブラリをインポート

In [None]:
# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

# import some common libraries
import sys
import numpy as np
import os, json, cv2, random
from google.colab.patches import cv2_imshow

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog

# Detic libraries
sys.path.insert(0, 'third_party/CenterNet2/projects/CenterNet2/')
from centernet.config import add_centernet_config
from detic.config import add_detic_config
from detic.modeling.utils import reset_cls_test

## 学習済みモデルのダウンロード
検出器の定義と学習済みモデルのダウンロード

In [None]:
# Build the detector and download our pretrained weights
cfg = get_cfg()
# Apply the CenterNet2 and Detic config helpers to cfg before the YAML file is merged.
add_centernet_config(cfg)
add_detic_config(cfg)
cfg.merge_from_file("configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml")
# Remote checkpoint whose file name matches the config merged above.
cfg.MODEL.WEIGHTS = 'https://dl.fbaipublicfiles.com/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth'
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
# NOTE(review): 'rand' is presumably a placeholder, since later cells install the real
# classifier through reset_cls_test — confirm against the Detic config documentation.
cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = 'rand'
cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True # For better visualization purpose. Set to False for all classes.
# The predictor built from this cfg is reused by every detection cell below.
predictor = DefaultPredictor(cfg)

## テスト画像のセットアップ
検出対象のファイルをアップロード  
  
使用画像  
https://pixabay.com/ja/photos/%e5%8b%95%e7%89%a9-%e7%8a%ac-%e7%8c%ab-%e5%ad%90%e7%8c%ab-%e5%ad%90%e7%8a%ac-2222007/

In [None]:
%cd /content/Detic

from google.colab import files

# Open the Colab upload dialog and keep only the keys of the returned
# mapping as a list.
uploaded = list(files.upload().keys())
print(uploaded)

## カスタムクラスによる物体検出
入力されたキーワードに該当する物体を検出

In [None]:
%%time

from detic.modeling.text.text_encoder import build_text_encoder
def get_clip_embeddings(vocabulary, prompt='a '):
    """Encode class names with the pretrained text encoder.

    Args:
        vocabulary: Iterable of class-name strings.
        prompt: Text prepended to every class name before encoding.

    Returns:
        The encoder output, detached, with its two axes swapped, made
        contiguous and moved to the CPU.
    """
    encoder = build_text_encoder(pretrain=True)
    encoder.eval()
    prompts = [prompt + name for name in vocabulary]
    encoded = encoder(prompts)
    return encoded.detach().permute(1, 0).contiguous().cpu()

vocabulary = 'custom'
metadata = MetadataCatalog.get("__unused")

#@title 検出設定
#@markdown 検出対象の名称を英語で入力してください。\
#@markdown 複数検出する場合','で区切ってください。例) cat,dog
detect_target = 'Dog,Cat' #@param {type:"string"}

# Reset visualization threshold
#@markdown 表示するスコアの閾値を設定してください。min:0, max:1.0
output_score_threshold = 0.5 #@param {type:"slider", min:0, max:1.0, step:0.1}

# Accept input such as "cat, dog": trim blanks around each name and drop empty entries
class_names = [name.strip() for name in detect_target.split(',') if name.strip()]
if not class_names:
  raise ValueError("detect_target must contain at least one class name")

metadata.thing_classes = class_names

try:
  # Swap the detector's classifier for text embeddings of the requested classes
  classifier = get_clip_embeddings(metadata.thing_classes)
  num_classes = len(metadata.thing_classes)
  reset_cls_test(predictor.model, classifier, num_classes)

  # The threshold is the same for every image, so apply it to each box predictor once
  for box_predictor in predictor.model.roi_heads.box_predictor:
    box_predictor.test_score_thresh = output_score_threshold

  for file in uploaded:
    im = cv2.imread(file)

    # Run model and show results
    outputs = predictor(im)
    # cv2 loads BGR; the channel axis is reversed for the Visualizer and reversed back for display/saving
    v = Visualizer(im[:, :, ::-1], metadata)
    out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
    cv2_imshow(out.get_image()[:, :, ::-1])
    # Saved copy is loaded again by the side-by-side comparison cell
    cv2.imwrite("detit_"+file, out.get_image()[:, :, ::-1])
finally:
  # Always clear the class list, even after an error, so a later cell can
  # assign a different one to the same "__unused" metadata entry
  del metadata.thing_classes

## 学習済みラベルによる物体検出
LVIS、Objects365、OpenImages、COCOの各データセットで学習されたラベルを使って物体検出

ボキャブラリ(検出用のラベル)を設定

In [None]:
# Setup the model's vocabulary using built-in datasets

# Per-vocabulary .npy path; the selected entry is passed to reset_cls_test as the classifier.
BUILDIN_CLASSIFIER = {
    'lvis': 'datasets/metadata/lvis_v1_clip_a+cname.npy',
    'objects365': 'datasets/metadata/o365_clip_a+cnamefix.npy',
    'openimages': 'datasets/metadata/oid_clip_a+cname.npy',
    'coco': 'datasets/metadata/coco_clip_a+cname.npy',
}

# Per-vocabulary dataset name looked up in MetadataCatalog to obtain thing_classes.
BUILDIN_METADATA_PATH = {
    'lvis': 'lvis_v1_val',
    'objects365': 'objects365_v2_val',
    'openimages': 'oid_val_expanded',
    'coco': 'coco_2017_val',
}

# change to 'lvis', 'objects365', 'openimages', or 'coco'
#@title 適用するラベルを選択してください。
vocabulary = 'openimages' #@param ["lvis", "objects365", "openimages", "coco"] {allow-input: false}
metadata = MetadataCatalog.get(BUILDIN_METADATA_PATH[vocabulary])
classifier = BUILDIN_CLASSIFIER[vocabulary]
# The class count comes from the metadata entry, not from the classifier file.
num_classes = len(metadata.thing_classes)
reset_cls_test(predictor.model, classifier, num_classes)

ボキャブラリに設定した物体を全て検出

In [None]:
# Reset visualization threshold once; it does not depend on the image
output_score_threshold = 0.3
for box_predictor in predictor.model.roi_heads.box_predictor:
  box_predictor.test_score_thresh = output_score_threshold

for file in uploaded:
  im = cv2.imread(file)

  # Run model and show results
  outputs = predictor(im)
  # cv2 loads BGR; the channel axis is reversed for the Visualizer and reversed back for display
  v = Visualizer(im[:, :, ::-1], metadata)
  out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
  cv2_imshow(out.get_image()[:, :, ::-1])

# CLIP-ODSのセットアップ

## ライブラリのインポート

In [None]:
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from clip_ods import clip, CLIPDetectorV1

## モデルをロード

In [None]:
# Load the "RN50x4" CLIP model onto the first CUDA device and wrap it in the CLIP-ODS detector.
device = torch.device('cuda:0')
model, preprocess = clip.load("RN50x4", device=device)  # "ViT-B/32","RN50","RN101","RN50x4"
clip_detector = CLIPDetectorV1(model, preprocess, device)

In [None]:
%%time

# CLIP-ODS is run on the first uploaded file only.
img_path = uploaded[0]

# Compute coords/masks for the image, then the anchor features for those coords;
# all three are reused by the text-detection cell below.
coords, masks = clip_detector.get_coords_and_masks(Image.open(img_path))
anchor_features = clip_detector.get_anchor_features(Image.open(img_path), coords)

In [None]:
%%time

img = Image.open(img_path)
colour = (0,255,0)
# The comma-separated query entered in the Detic form, split into individual texts
query_texts = detect_target.split(',')

# Detect the query texts using the coords, masks and anchor features computed
# in the previous cell; the score threshold set earlier is passed as skip_box_thr
result = clip_detector.detect_by_text(
  texts=query_texts,
  img=Image.open(img_path),
  coords=coords, masks=masks,
  anchor_features=anchor_features,
  skip_box_thr=output_score_threshold
)

img = clip_detector.draw(
  img,
  result,
  # Join whole query words; joining the raw string would put a space between every character
  label=' '.join(query_texts),
  colour=colour,
  font_colour=colour,
  font_scale=0.5,
  font_thickness=1,
)


plt.figure(num=None, figsize=(8, 8), dpi=120, facecolor='w', edgecolor='k')
plt.imshow(img)
# Save the annotated result so the side-by-side comparison cell can load it
pil_img = Image.fromarray(img)
pil_img.save('clip_ods_' + uploaded[0])

In [None]:
# Load the results saved by the Detic cell ("detit_" prefix) and the CLIP-ODS cell
image = Image.open("detit_" + uploaded[0]).convert("RGB")
pre_image = Image.open("clip_ods_" + uploaded[0]).convert("RGB")

# Canvas as wide as both images together and as tall as the taller one,
# so the right-hand image is never cropped
canvas_size = (image.width + pre_image.width, max(image.height, pre_image.height))
dst = Image.new('RGB', canvas_size)
dst.paste(image, (0, 0))
dst.paste(pre_image, (image.width, 0))

# Show the combined image
plt.figure(figsize=(12, 12))
plt.imshow(dst)


# CIFAR10のセットアップ

In [None]:
from torchvision import datasets, transforms

## testsetをダウンロード

In [None]:
%cd /content/Detic

# Preprocessing for every image: resize to 32x32, convert to a tensor,
# then normalize each of the three channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
     transforms.Resize((32,32)),
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5),(0.5, 0.5, 0.5)),
     ])

# Download the CIFAR10 test split (train=False) into the current directory
# and serve it in shuffled batches of 32.
test_dataset = datasets.CIFAR10(root='./', train=False, download=True, transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = 32, shuffle=True)



## CIFAR10をtensorからPILに変換

In [None]:
def im_convert(tensor):
  """Turn a normalized channel-first image tensor into a displayable array.

  The tensor is copied into a NumPy array, its axes are reordered to
  (1, 2, 0), the 0.5 / 0.5 normalization is undone per channel, and the
  values are limited to the range [0, 1].
  """
  half = np.array((0.5,0.5,0.5))
  channel_last = tensor.clone().detach().numpy().transpose(1,2,0)
  return (channel_last * half + half).clip(0,1)

## CIFAR10を表示

In [None]:
# Test dataset: 10,000 RGB images of 32x32 pixels
print(test_dataset.data.shape)
print(test_dataset.classes)

# Take the first batch with the built-in next(); the iterator's legacy
# .next() alias is not available on newer PyTorch releases
dataiter = iter(test_loader)
images, labels = next(dataiter)


fig = plt.figure(num=None, figsize=(12, 5), dpi=128, facecolor='w', edgecolor='k')

# Lay the 32 images out on a 4x8 grid, titling each tile with its class name
for i in range(32):
  ax = fig.add_subplot(4, 8, i+1, xticks=[], yticks=[])
  # Draw on the subplot explicitly instead of relying on pyplot's current axes
  ax.imshow(im_convert(images[i]))
  ax.set_title(test_dataset.classes[labels[i].item()])

## CIFAR10の保存
画像フォルダに見立ててアルバムフォルダに画像を保存

In [None]:
%cd /content/Detic
import glob
import os

# Create the album folder; unlike `mkdir`, this does not report an error when the cell is re-run
os.makedirs("./albam", exist_ok=True)

# Save the 32 images of the displayed batch as "<index>_<class name>.jpg"
for i in range(32):
  class_name = test_dataset.classes[labels[i].item()]
  plt.imsave(f"./albam/{i}_{class_name}.jpg", im_convert(images[i]))

# Sorted so the detection report lists the files in a stable order
albams = sorted(glob.glob("./albam/*.jpg"))

In [None]:
vocabulary = 'custom'
metadata = MetadataCatalog.get("__unused")

#@title 検出設定
#@markdown 検出対象の名称を英語で入力してください。\
#@markdown 複数検出する場合','で区切ってください。例) cat,dog
detect_target = 'Dog' #@param {type:"string"}

# Reset visualization threshold
#@markdown 表示するスコアの閾値を設定してください。min:0, max:1.0
output_score_threshold = 0.5 #@param {type:"slider", min:0, max:1.0, step:0.1}

# Accept input such as "cat, dog": trim blanks around each name and drop empty entries
class_names = [name.strip() for name in detect_target.split(',') if name.strip()]
if not class_names:
  raise ValueError("detect_target must contain at least one class name")

metadata.thing_classes = class_names

# Album files with at least one detection / with none
detected_list = []
not_list = []

try:
  # Swap the detector's classifier for text embeddings of the requested classes
  classifier = get_clip_embeddings(metadata.thing_classes)
  num_classes = len(metadata.thing_classes)
  reset_cls_test(predictor.model, classifier, num_classes)

  # The threshold is the same for every image, so apply it to each box predictor once
  for box_predictor in predictor.model.roi_heads.box_predictor:
    box_predictor.test_score_thresh = output_score_threshold

  for file in albams:
    im = cv2.imread(file)

    # Run model and show results
    outputs = predictor(im)
    # One or more predicted classes means the target was found in this image
    if len(outputs["instances"].pred_classes) > 0:
      v = Visualizer(im[:, :, ::-1], metadata)
      out = v.draw_instance_predictions(outputs["instances"].to("cpu"))
      cv2_imshow(out.get_image()[:, :, ::-1])
      detected_list.append(file)
    else:
      not_list.append(file)
finally:
  # Always clear the class list, even after an error, so the cell can be
  # re-run with a different detect_target
  del metadata.thing_classes

In [None]:
# Report which album files had at least one detection for detect_target and which had none.
print("以下の画像は", detect_target, "が検出されました。\n", detected_list)
print("以下の画像は未検出です。\n", not_list)