In [1]:
from google.colab import drive
drive.mount('/content/drive')

# !git clone https://github.com/Megvii-BaseDetection/YOLOX.git
%cd /content/drive/MyDrive/SMU/Computer Vision/Project/YOLOX

Mounted at /content/drive


In [None]:
# install all dependancies

!pip3 install -v -e .  # or  python3 setup.py develop

### Train and predict using command lines

In [None]:
# this is to predict one image based on original pre-trained model

!python tools/demo.py image -n yolox-nano -c model_path/yolox_nano.pth --path assets/dog.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device cuda

In [None]:
# this is to predict one video clips based on original pre-trained model

!python tools/demo.py video -n yolox-nano -c model_path/yolox_nano.pth --path assets/GoPro9.mp4 --conf 0.25 --nms 0.45 --tsize 640 --save_result --device cuda

In [4]:
# Training YOLOX_NANO

In [5]:
!python tools/train.py -f exps/example/custom/nano.py -d 1 -b 32 --fp16 -o -c model_path/yolox_nano.pth

[32m2022-10-20 06:29:58[0m | [1mINFO    [0m | [36myolox.core.trainer[0m:[36m130[0m - [1margs: Namespace(batch_size=32, cache=False, ckpt='model_path/yolox_nano.pth', devices=1, dist_backend='nccl', dist_url=None, exp_file='exps/example/custom/nano.py', experiment_name='nano', fp16=True, logger='tensorboard', machine_rank=0, name=None, num_machines=1, occupy=True, opts=[], resume=False, start_epoch=None)[0m
[32m2022-10-20 06:29:59[0m | [1mINFO    [0m | [36myolox.core.trainer[0m:[36m131[0m - [1mexp value:
╒═══════════════════╤════════════════════════════╕
│ keys              │ values                     │
╞═══════════════════╪════════════════════════════╡
│ seed              │ None                       │
├───────────────────┼────────────────────────────┤
│ output_dir        │ './YOLOX_outputs'          │
├───────────────────┼────────────────────────────┤
│ print_interval    │ 10                         │
├───────────────────┼────────────────────────────┤
│ eval_interv

In [None]:
# this is to predict one video clips based on retrained model

!python tools/demo.py video -n yolox-nano -f exps/example/custom/nano.py -c YOLOX_outputs/nano/epoch_300_ckpt.pth --path assets/GoPro9.mp4 --conf 0.25 --nms 0.45 --tsize 640 --save_result --device cuda

In [6]:
# this is to predict one image based on retrained model

!python tools/demo.py image -f exps/example/custom/nano.py -c YOLOX_outputs/nano/epoch_300_ckpt.pth --path assets/val_image/d77774e5-1db0-4639-b50b-a9e64e76d487_2000.jpg --conf 0.25 --nms 0.45 --tsize 640 --save_result --device [cpu/gpu]

[32m2022-10-20 07:58:46.305[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m259[0m - [1mArgs: Namespace(camid=0, ckpt='YOLOX_outputs/nano/epoch_300_ckpt.pth', conf=0.25, demo='image', device='[cpu/gpu]', exp_file='exps/example/custom/nano.py', experiment_name='nano', fp16=False, fuse=False, legacy=False, name=None, nms=0.45, path='assets/val_image/d77774e5-1db0-4639-b50b-a9e64e76d487_2000.jpg', save_result=True, trt=False, tsize=640)[0m
[32m2022-10-20 07:58:46.485[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m269[0m - [1mModel Summary: Params: 0.90M, Gflops: 2.55[0m
[32m2022-10-20 07:58:46.488[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m282[0m - [1mloading checkpoint[0m
[32m2022-10-20 07:58:46.582[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m286[0m - [1mloaded checkpoint done.[0m
[32m2022-10-20 07:58:47.598[0m | [1mINFO    [0m | [36m__main__[0m:[36minference[0m:[36m165[0m - [1mInfer time: 0.1594

### Originally form tools/demo.py

In [24]:
import argparse
import os
import time
from loguru import logger

import cv2

import torch

from yolox.data.data_augment import ValTransform
from yolox.data.datasets import COCO_CLASSES
from yolox.exp import get_exp
from yolox.utils import fuse_model, get_model_info, postprocess, vis

In [25]:
def get_image_list(path):
    image_names = []
    for maindir, subdir, file_name_list in os.walk(path):
        for filename in file_name_list:
            apath = os.path.join(maindir, filename)
            ext = os.path.splitext(apath)[1]
            if ext in IMAGE_EXT:
                image_names.append(apath)
    return image_names

In [62]:
class Predictor(object):
    def __init__(
        self,
        model,
        exp,
        cls_names=COCO_CLASSES,
        trt_file=None,
        decoder=None,
        device="cpu",
        fp16=False,
        legacy=False,
    ):
        self.model = model
        self.cls_names = cls_names
        self.decoder = decoder
        self.num_classes = exp.num_classes
        self.confthre = exp.test_conf
        self.nmsthre = exp.nmsthre
        self.test_size = exp.test_size
        self.device = device
        self.fp16 = fp16
        self.preproc = ValTransform(legacy=legacy)
        if trt_file is not None:
            from torch2trt import TRTModule

            model_trt = TRTModule()
            model_trt.load_state_dict(torch.load(trt_file))

            x = torch.ones(1, 3, exp.test_size[0], exp.test_size[1]).cuda()
            self.model(x)
            self.model = model_trt

    def inference(self, img):
        img_info = {"id": 0}
        if isinstance(img, str):
            img_info["file_name"] = os.path.basename(img)
            img = cv2.imread(img)
        else:
            img_info["file_name"] = None

        height, width = img.shape[:2]
        img_info["height"] = height
        img_info["width"] = width
        img_info["raw_img"] = img

        ratio = min(self.test_size[0] / img.shape[0], self.test_size[1] / img.shape[1])
        img_info["ratio"] = ratio

        img, _ = self.preproc(img, None, self.test_size)
        img = torch.from_numpy(img).unsqueeze(0)
        img = img.float()
        if self.device == "gpu":
            img = img.cuda()
            if self.fp16:
                img = img.half()  # to FP16

        with torch.no_grad():
            t0 = time.time()
            outputs = self.model(img)
            if self.decoder is not None:
                outputs = self.decoder(outputs, dtype=outputs.type())
            outputs = postprocess(
                outputs, self.num_classes, self.confthre,
                self.nmsthre, class_agnostic=True
            )
            logger.info("Infer time: {:.4f}s".format(time.time() - t0))
        ########################################################################
        print (f'\n******outputs******\n', outputs, f'\n******img_info******\n', img_info.keys())
        ########################################################################
        return outputs, img_info

    def visual(self, output, img_info, cls_conf=0.35):
        ratio = img_info["ratio"]
        img = img_info["raw_img"]
        if output is None:
            return img
        output = output.cpu()

        bboxes = output[:, 0:4]

        # preprocessing: resize
        bboxes /= ratio

        cls = output[:, 6]
        scores = output[:, 4] * output[:, 5]

        vis_res = vis(img, bboxes, scores, cls, cls_conf, self.cls_names)
        return vis_res

In [63]:
def image_demo(predictor, vis_folder, path, current_time, save_result):
    if os.path.isdir(path):
        files = get_image_list(path)
    else:
        files = [path]
    files.sort()
    for image_name in files:
        ########################################################################
        outputs, img_info = predictor.inference(image_name)
        result_image = predictor.visual(outputs[0], img_info, predictor.confthre)
        ########################################################################
        if save_result:
            save_folder = os.path.join(
                vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
            )
            os.makedirs(save_folder, exist_ok=True)
            save_file_name = os.path.join(save_folder, os.path.basename(image_name))
            logger.info("Saving detection result in {}".format(save_file_name))
            cv2.imwrite(save_file_name, result_image)
        ch = cv2.waitKey(0)
        if ch == 27 or ch == ord("q") or ch == ord("Q"):
            break

In [64]:
def imageflow_demo(predictor, vis_folder, current_time, args):
    cap = cv2.VideoCapture(args.path if args.demo == "video" else args.camid)
    width = cap.get(cv2.CAP_PROP_FRAME_WIDTH)  # float
    height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)  # float
    fps = cap.get(cv2.CAP_PROP_FPS)
    if args.save_result:
        save_folder = os.path.join(
            vis_folder, time.strftime("%Y_%m_%d_%H_%M_%S", current_time)
        )
        os.makedirs(save_folder, exist_ok=True)
        if args.demo == "video":
            save_path = os.path.join(save_folder, os.path.basename(args.path))
        else:
            save_path = os.path.join(save_folder, "camera.mp4")
        logger.info(f"video save_path is {save_path}")
        vid_writer = cv2.VideoWriter(
            save_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (int(width), int(height))
        )
    while True:
        ret_val, frame = cap.read()
        if ret_val:
            outputs, img_info = predictor.inference(frame)
            result_frame = predictor.visual(outputs[0], img_info, predictor.confthre)
            if args.save_result:
                vid_writer.write(result_frame)
            else:
                cv2.namedWindow("yolox", cv2.WINDOW_NORMAL)
                cv2.imshow("yolox", result_frame)
            ch = cv2.waitKey(1)
            if ch == 27 or ch == ord("q") or ch == ord("Q"):
                break
        else:
            break

In [65]:
IMAGE_EXT = [".jpg", ".jpeg", ".webp", ".bmp", ".png"]

# variable for creating the parser

demo = 'image' # demo type, eg. image, video and webcam
experiment_name = '' # 
name = 'yolox-nano' # model name
path = 'assets/val_image/d77774e5-1db0-4639-b50b-a9e64e76d487_2000.jpg' # path to images or video
save_result = True # whether to save the inference result of image/video, action="store_true"
camid = 0 # webcam demo camera id

# exp file
exp_file = 'exps/example/custom/nano.py' # experiment description file
ckpt = None # ckpt for eval
device = 'cuda'
conf = 0.25
nms = 0.45
tsize = 640
fp16 = False # Adopting mix precision evaluating
legacy = False # To be compatible with older versions
fuse = False # Fuse conv and bn for testing
trt = False # Using TensorRT model for testing

In [66]:
if __name__ == "__main__":
    
    exp = get_exp(exp_file, name)
    
    if not experiment_name:
        experiment_name = name
    
    file_name = os.path.join(exp.output_dir, experiment_name)
    os.makedirs(file_name, exist_ok=True)

    vis_folder = None
    if save_result:
        vis_folder = os.path.join(file_name, "vis_res")
        os.makedirs(vis_folder, exist_ok=True)

    if conf is not None:
        exp.test_conf = conf
    if nms is not None:
        exp.nmsthre = nms
    if tsize is not None:
        exp.test_size = (tsize, tsize)

    model = exp.get_model()
    print (f'Model Summary: {get_model_info(model, exp.test_size)}')

    if device == "gpu":
        model.cuda()
        if fp16:
            model.half()  # to FP16
    model.eval()

    if not trt:
        if ckpt is None:
            ckpt_file = os.path.join(file_name, "best_ckpt.pth")
        else:
            ckpt_file = ckpt
        print('loading checkpoint')
        ckpt = torch.load(ckpt_file, map_location="cpu")
        # load the model state dict
        model.load_state_dict(ckpt["model"])
        print('loaded checkpoint done')
    
    if fuse:
        print('Fusing model...')
        model = fuse_model(model)
    
    if trt:
        assert not fuse, "TensorRT model is not support model fusing!"
        trt_file = os.path.join(file_name, "model_trt.pth")
        assert os.path.exists(
            trt_file
        ), "TensorRT model is not found!\n Run python3 tools/trt.py first!"
        model.head.decode_in_inference = False
        decoder = model.head.decode_outputs
        print('Using TensorRT to inference')
    else:
        trt_file = None
        decoder = None

    predictor = Predictor(
        model, exp, COCO_CLASSES, trt_file, decoder,
        device, fp16, legacy,
    )

    #######################
    ## perform inference ##
    #######################
    current_time = time.localtime()
    if demo == "image":
        image_demo(predictor, vis_folder, path, current_time, save_result)
    elif demo == "video" or demo == "webcam":
        imageflow_demo(predictor, vis_folder, current_time, args) # this args shoud be saparated

Model Summary: Params: 0.90M, Gflops: 2.55
loading checkpoint
loaded checkpoint done


2022-10-20 09:55:04.781 | INFO     | __main__:inference:66 - Infer time: 0.1171s
2022-10-20 09:55:04.790 | INFO     | __main__:image_demo:18 - Saving detection result in ./YOLOX_outputs/yolox-nano/vis_res/2022_10_20_09_55_04/d77774e5-1db0-4639-b50b-a9e64e76d487_2000.jpg



******outputs******
 [tensor([[246.5511,  90.5130, 324.0844, 154.5399,   0.9674,   0.8186,   4.0000],
        [321.0427,  55.0014, 411.5424, 116.0059,   0.8407,   0.7520,   4.0000],
        [335.9178,  82.8158, 412.6770, 119.9066,   0.6954,   0.7629,   4.0000],
        [129.7812, 169.6613, 205.3798, 188.9697,   0.6871,   0.5884,   2.0000],
        [256.7388, 128.9429, 332.3531, 171.4595,   0.4800,   0.7359,   4.0000],
        [266.5781, 180.6467, 359.3124, 208.0972,   0.5135,   0.6554,   4.0000]])] 
******img_info******
 dict_keys(['id', 'file_name', 'height', 'width', 'raw_img', 'ratio'])
