OVSeg
====

 **Open-Vocabulary Semantic Segmentation with Mask-adapted CLIP**

* Paper: https://arxiv.org/abs/2210.04150

![OVSeg Overview](../assets/ovseg_overview.jpg)
![OVSeg Prompting](../assets/ovseg_prompt.jpg)
![OVSeg Two-Stage](../assets/ovseg_twostage.jpg)
![OVSeg Image-Text](../assets/ovseg_imgtextmatching.jpg)



 * Installation

```bash
git clone https://github.com/facebookresearch/ov-seg.git OVSeg_repo

cd OVSeg_repo

conda create --name ovseg python=3.8 -y
conda activate ovseg
conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge
pip install -r requirements.txt


python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html


cd third_party/CLIP
python -m pip install -Ue .
```

* Download model: https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view

```bash
pip install -U gdown
```

```bash
gdown --id 1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy
```

In [10]:
import os
import sys
import time
import argparse

import cv2
import tqdm

from detectron2.config import get_cfg

from detectron2.data import MetadataCatalog
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger

# Make the repository cloned as "OVSeg_repo" in the installation steps
# importable, so that the `open_vocab_seg` imports below resolve.
sys.path.append("OVSeg_repo")
from open_vocab_seg import add_ovseg_config
#from open_vocab_seg.utils import VisualizationDemo
# The `utils` package is imported as a whole instead of the class above; a
# later cell reaches the predictor through `utils.predictor`.
from open_vocab_seg import utils

# constants
# Not referenced by the code visible in this excerpt — presumably the title of
# the OpenCV window mentioned in the --output help text (TODO confirm).
WINDOW_NAME = "Open vocabulary segmentation"


def setup_cfg(args):
    """Build the frozen config object used by the demo.

    Starts from ``get_cfg()``, applies the DeepLab and OVSeg config
    extensions, then merges the file named by ``args.config_file`` followed
    by the overrides in ``args.opts``, in that order.

    Args:
        args: Parsed command-line namespace; only ``config_file`` and
            ``opts`` are read here.

    Returns:
        The merged config after ``freeze()`` has been called on it.
    """
    config = get_cfg()
    # DeepLab extension first (the original note says it is there for the
    # poly lr schedule), then the OVSeg extension.
    for extend_config in (add_deeplab_config, add_ovseg_config):
        extend_config(config)
    # File values are merged before the command-line list, so the overrides
    # in args.opts are applied last.
    config.merge_from_file(args.config_file)
    config.merge_from_list(args.opts)
    config.freeze()
    return config


def parse_args(argv):
    """Parse the command-line options of the open-vocabulary segmentation demo.

    Args:
        argv: List of argument strings to parse (program name excluded).

    Returns:
        An ``argparse.Namespace`` with the attributes ``config_file``,
        ``input_path``, ``class_names``, ``output`` and ``opts``.
    """
    parser = argparse.ArgumentParser(description="Detectron2 demo for open vocabulary segmentation")
    parser.add_argument(
        "--config-file",
        default="configs/ovseg_swinB_vitL_demo.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "--input_path", type=str,
        # The help text used to start mid-sentence ("or a single glob ...");
        # it now states the first alternative as well.
        help="Path to a single input image; "
        "or a single glob pattern such as 'directory/*.jpg'",
    )
    parser.add_argument(
        "--class-names",
        # One or more separate tokens; each token becomes one list entry.
        nargs="+",
        help="A list of user-defined class_names"
    )
    parser.add_argument(
        "--output",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )
    parser.add_argument(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=[],
        # REMAINDER swallows everything after --opts, so this flag has to be
        # the last one on the command line.
        nargs=argparse.REMAINDER,
    )
    return parser.parse_args(argv)



In [8]:
# Emulate the demo's command line inside the notebook, one flag group per
# list so each option is easy to edit on its own.
# NOTE(review): the class names are passed as ONE comma-joined token, so with
# nargs="+" args.class_names is a single-element list — confirm that the
# consumer splits on commas.
argv = (
    ["--input_path", "plants_small.jpg"]
    + ["--class-names", "plant,leaf,pot,vase"]
    + ["--config-file", "OVSeg_repo/configs/ovseg_swinB_vitL_demo.yaml"]
    # --opts is parsed with argparse.REMAINDER and therefore must come last.
    + ["--opts", "MODEL.WEIGHTS", "ovseg_swinbase_vitL14_ft_mpt.pth"]
)

args = parse_args(argv)
cfg = setup_cfg(args)

# Bare expression: the notebook displays the resulting config.
cfg



CfgNode({'VERSION': 2, 'MODEL': CfgNode({'LOAD_PROPOSALS': False, 'MASK_ON': False, 'KEYPOINT_ON': False, 'DEVICE': 'cuda', 'META_ARCHITECTURE': 'OVSegDEMO', 'WEIGHTS': 'ovseg_swinbase_vitL14_ft_mpt.pth', 'PIXEL_MEAN': [123.675, 116.28, 103.53], 'PIXEL_STD': [58.395, 57.12, 57.375], 'BACKBONE': CfgNode({'NAME': 'D2SwinTransformer', 'FREEZE_AT': 0}), 'FPN': CfgNode({'IN_FEATURES': [], 'OUT_CHANNELS': 256, 'NORM': '', 'FUSE_TYPE': 'sum'}), 'PROPOSAL_GENERATOR': CfgNode({'NAME': 'RPN', 'MIN_SIZE': 0}), 'ANCHOR_GENERATOR': CfgNode({'NAME': 'DefaultAnchorGenerator', 'SIZES': [[32, 64, 128, 256, 512]], 'ASPECT_RATIOS': [[0.5, 1.0, 2.0]], 'ANGLES': [[-90, 0, 90]], 'OFFSET': 0.0}), 'RPN': CfgNode({'HEAD_NAME': 'StandardRPNHead', 'IN_FEATURES': ['res4'], 'BOUNDARY_THRESH': -1, 'IOU_THRESHOLDS': [0.3, 0.7], 'IOU_LABELS': [0, -1, 1], 'BATCH_SIZE_PER_IMAGE': 256, 'POSITIVE_FRACTION': 0.5, 'BBOX_REG_LOSS_TYPE': 'smooth_l1', 'BBOX_REG_LOSS_WEIGHT': 1.0, 'BBOX_REG_WEIGHTS': (1.0, 1.0, 1.0, 1.0), 'SMO

In [18]:
from PIL import Image

# Write a copy of the sample image scaled down to a quarter of its width and
# height (fractions truncated); "plants_small.jpg" is the file the demo
# arguments point at.
img_orig = Image.open("../samples/plants.jpg")
w, h = img_orig.size
img_orig.resize((w // 4, h // 4)).save("plants_small.jpg")

# Load the reduced image back in BGR format.
img = read_image("plants_small.jpg", format="BGR")

# Bare expression: the notebook displays the shape, (284, 267, 3) in the
# recorded run.
img.shape

(284, 267, 3)

In [22]:
import torch
import numpy as np
from detectron2.utils.visualizer import ColorMode

# Notebook-level settings; not used within this cell — presumably consumed by
# later visualisation cells (TODO confirm).
cpu_device = torch.device("cpu")
instance_mode = ColorMode.IMAGE

# Look up metadata for the first configured test dataset, falling back to the
# placeholder name "__unused" when none is configured.
metadata = MetadataCatalog.get(
    cfg.DATASETS.TEST[0] if cfg.DATASETS.TEST else "__unused"
)

# Constructing the predictor loads the checkpoint named in the config (see
# the Checkpointer log line printed below this cell).
predictor = utils.predictor.OVSegPredictor(cfg)

[09/02 00:03:57 fvcore.common.checkpoint]: [Checkpointer] Loading from ovseg_swinbase_vitL14_ft_mpt.pth ...
