# VL-T5 inference on custom images

## Download code and install dependencies

In [1]:
!git clone https://github.com/j-min/VL-T5

fatal: destination path 'VL-T5' already exists and is not an empty directory.


In [2]:
cd VL-T5

/content/VL-T5


In [3]:
# Remove the PyPI `param` package first: as noted on the command itself, its
# name conflicts with the project's own src/param.py.
!pip uninstall param -y # to resolve name conflict with src.param.py
# Install the repository's Python dependencies.
!pip install -r requirements.txt
# Fetch the backbone checkpoints (the script's log lists T5-base and BART-base).
!python download_backbones.py

Collecting git+git://github.com/j-min/language-evaluation@master (from -r requirements.txt (line 12))
  Cloning git://github.com/j-min/language-evaluation (to revision master) to /tmp/pip-req-build-ss1sstxo
  Running command git clone -q git://github.com/j-min/language-evaluation /tmp/pip-req-build-ss1sstxo
Downloading checkpoints if not cached
T5-base
BART-base
Done!


## Download the pretrained checkpoint

In [4]:
# Upgrade gdown, which is used below to download the checkpoint from Google Drive.
!pip install --upgrade gdown



In [5]:
import gdown

In [6]:
# Create the checkpoint directory. The path is relative to the current working
# directory; -p makes this a no-op when the directory already exists.
!mkdir -p VL-T5/snap/pretrain/VLT5

In [7]:
import os

# Pretrained VL-T5 checkpoint on Google Drive and its local destination
# (relative to the current working directory).
CKPT_URL = 'https://drive.google.com/uc?id=100qajGncE_vc4bfjVxxICwz3dwiAxbIZ'
CKPT_PATH = 'VL-T5/snap/pretrain/VLT5/Epoch30.pth'

# The checkpoint is roughly 900 MB, so skip the download when the file is
# already on disk; otherwise keep the value returned by gdown.download.
if os.path.exists(CKPT_PATH):
    ckpt_file = CKPT_PATH
else:
    ckpt_file = gdown.download(CKPT_URL, CKPT_PATH, quiet=False)

# Last expression: display the checkpoint path, as the original cell did.
ckpt_file

Downloading...
From: https://drive.google.com/uc?id=100qajGncE_vc4bfjVxxICwz3dwiAxbIZ
To: /content/VL-T5/VL-T5/snap/pretrain/VLT5/Epoch30.pth
100%|██████████| 898M/898M [00:16<00:00, 56.1MB/s]


'VL-T5/snap/pretrain/VLT5/Epoch30.pth'

## Add source code path

In [8]:
import sys

In [9]:
# Put the project's `src` and `inference` directories on the import path
# (same order as before: src first, then inference).
sys.path.extend([
    '/content/VL-T5/VL-T5/src',
    '/content/VL-T5/VL-T5/inference',
])

In [10]:
cd VL-T5

/content/VL-T5/VL-T5


## Build a model and load weights from the pretrained checkpoint

In [11]:
# Uninstall the PyPI `param` package again right before `from param import ...`
# in the next cell; the same command already runs in the dependency-install cell.
!pip uninstall param -y



In [12]:
from param import parse_args

In [13]:
# Build the argument object with overrides for inference: t5-base backbone and
# the downloaded pretraining checkpoint (the loader log reports Epoch30.pth).
args = parse_args(parse=False, backbone='t5-base', load='snap/pretrain/VLT5/Epoch30')

# Run on GPU 0.
setattr(args, 'gpu', 0)

In [14]:
from vqa import Trainer

In [15]:
# Instantiate the VQA trainer with train=False; per the log output this builds
# the model on GPU 0 and loads the pretrained checkpoint.
trainer = Trainer(args, train=False)

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Building Model at GPU 0


Some weights of VLT5VQA were not initialized from the model checkpoint at t5-base and are newly initialized: ['encoder.visual_embedding.feat_embedding.0.weight', 'encoder.visual_embedding.feat_embedding.0.bias', 'encoder.visual_embedding.feat_embedding.1.weight', 'encoder.visual_embedding.absolute_vis_pos_embedding.0.weight', 'encoder.visual_embedding.absolute_vis_pos_embedding.0.bias', 'encoder.visual_embedding.absolute_vis_pos_embedding.1.weight', 'encoder.visual_embedding.obj_order_embedding.weight', 'encoder.visual_embedding.img_order_embedding.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded from  snap/pretrain/VLT5/Epoch30.pth
_IncompatibleKeys(missing_keys=[], unexpected_keys=['encoder.visual_embedding.layer_norm.weight'])
Model Launching at GPU 0
It took 2.8s


## Faster R-CNN inference script (from the [Hugging Face Transformers LXMERT demo](https://github.com/huggingface/transformers/tree/master/examples/research_projects/lxmert))

In [16]:
from IPython.display import clear_output, Image, display
import PIL.Image
import io
import json
import torch
import numpy as np
from inference.processing_image import Preprocess
from inference.visualizing_image import SingleImageViz
from inference.modeling_frcnn import GeneralizedRCNN
from inference.utils import Config, get_data
import unicodedata

import wget
import pickle
import os


# Remote resources: a sample input image, the object / attribute vocabularies,
# and the GQA / VQA label-to-answer tables.
URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/images/input.jpg"
OBJ_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/objects_vocab.txt"
ATTR_URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/genome/1600-400-20/attributes_vocab.txt"
GQA_URL = "https://raw.githubusercontent.com/airsplay/lxmert/master/data/gqa/trainval_label2ans.json"
VQA_URL = "https://raw.githubusercontent.com/airsplay/lxmert/master/data/vqa/trainval_label2ans.json"

# Load each resource through the demo's get_data helper.
objids = get_data(OBJ_URL) 
attrids = get_data(ATTR_URL)
# NOTE(review): gqa_answers / vqa_answers are not referenced in the visible part
# of this notebook — confirm whether they are still needed.
gqa_answers = get_data(GQA_URL) 
vqa_answers = get_data(VQA_URL) 
# Pretrained Faster R-CNN detector: config first, then weights built from that
# config, then the image preprocessor configured from the same config.
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg) 
image_preprocess = Preprocess(frcnn_cfg) 

# for visualizing output
def showarray(a, fmt='jpeg'):
    """Render the array `a` inline in the notebook as an image.

    Values are clipped to the range [0, 255] and converted to uint8, encoded
    with PIL using the format `fmt` (default 'jpeg'), and the encoded bytes
    are shown via IPython's display machinery. Returns nothing.
    """
    clipped = np.clip(a, 0, 255)
    pixels = np.uint8(clipped)

    # Encode into an in-memory buffer rather than a file on disk.
    buffer = io.BytesIO()
    encoded_image = PIL.Image.fromarray(pixels)
    encoded_image.save(buffer, fmt)

    display(Image(data=buffer.getvalue()))

%s not found in cache or force_download set to True, downloading to %s https://s3.amazonaws.com/models.huggingface.co/bert/unc-nlp/frcnn-vg-finetuned/config.yaml /root/.cache/torch/transformers/tmp3uvv30u6


Downloading:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

loading configuration file cache
%s not found in cache or force_download set to True, downloading to %s https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin /root/.cache/torch/transformers/tmpi0vzo9gy


Downloading:   0%|          | 0.00/262M [00:00<?, ?B/s]

loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /root/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.


In [17]:
# Image to run inference on. This path lives under /content/drive, so Google
# Drive must be mounted before the image is actually read.
image_filename = "/content/drive/MyDrive/特別研究/frame_mp4/frame/bar/barBGR001.jpg"
# Uncomment the next line to fall back to the sample image instead.
#image_filename = None

if image_filename is None:
    # Download the sample image
    URL = "https://raw.githubusercontent.com/airsplay/py-bottom-up-attention/master/demo/data/images/input.jpg"
    image_filename = wget.download(URL)

In [10]:
# Mount Google Drive at /content/drive so the Drive-hosted image path is readable.
# NOTE(review): this cell's recorded execution count is out of sequence with its
# neighbours; it has to run before the cell that opens `image_filename`.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# `image_dirname` is simply another name for the image path.
image_dirname = image_filename

# Visualizer that draws detections on the input image, labelled through the
# object / attribute vocabularies.
frcnn_visualizer = SingleImageViz(image_filename, id2obj=objids, id2attr=attrids)

# Preprocess the image for the detector.
images, sizes, scales_yx = image_preprocess(image_filename)

# Run Faster R-CNN with padding='max_detections' and PyTorch tensors as output.
output_dict = frcnn(
    images,
    sizes,
    scales_yx=scales_yx,
    padding='max_detections',
    max_detections=frcnn_cfg.max_detections,
    return_tensors='pt',
)

# Draw boxes and labels on the image; the five outputs are passed positionally
# in exactly this order.
frcnn_visualizer.draw_boxes(
    *(
        output_dict.get(key)
        for key in ("boxes", "obj_ids", "obj_probs", "attr_ids", "attr_probs")
    )
)

showarray(frcnn_visualizer._get_buffer())

# Detector outputs that are fed to VL-T5 further down.
features = output_dict.get("roi_features")
normalized_boxes = output_dict.get("normalized_boxes")

Output hidden; open in https://colab.research.google.com to view.

## Load Tokenizer

In [19]:
from tokenization import VLT5TokenizerFast

In [20]:
# Load the 't5-base' tokenizer through the project's VLT5TokenizerFast class.
tokenizer = VLT5TokenizerFast.from_pretrained('t5-base')

## Inference

In [21]:
# Number of region-captioning prompts to generate (region indices 0 .. 36).
# NOTE(review): the hand-written list this replaces had 37 entries; confirm the
# count against the number of detected boxes (frcnn_cfg.max_detections).
NUM_CAPTION_PROMPTS = 37

# One "caption region" prompt per region index, generated instead of being
# copied out by hand.
questions = [
    f"caption region: <vis_extra_id_{region_idx}>"
    for region_idx in range(NUM_CAPTION_PROMPTS)
]

In [22]:
# Caption every prompt in turn and print only the predicted phrase, followed by
# a period, one per line (the prompt itself is not echoed here).
for question in questions:
    # Tokenize the single prompt into PyTorch tensors and keep the token ids.
    input_ids = tokenizer(question, return_tensors='pt', padding=True).input_ids

    # Model input: the text prompt plus the detector's region features and boxes.
    batch = {
        'input_ids': input_ids,
        'vis_feats': features,
        'boxes': normalized_boxes,
    }

    result = trainer.model.test_step(batch)
    answer = result['pred_ans'][0]
    print(f"{answer}.")

red table.
green bottle.
smiling man.
red table.
brown hair.
clear glass.
small hand.
white wall.
smiling woman.
brown glasses.
red table.
black shirt.
blue wall.
red table.
black shirt.
white wall.
smiling.
smiling man.
black glass.
red table.
blue wall.
red table.
white wall.
black bottle.
black table.
silver ring.
white wall.
white glass.
red cap.
silver cup.
blue sky.
red table.
smiling.
sitting woman.
blue wall.
black table.
silver spoon.


In [23]:
# Same captioning loop, this time printing each prompt ("Q:") together with the
# first predicted answer ("A:").
for question in questions:
    # Tokenize the single prompt into PyTorch tensors and keep the token ids.
    input_ids = tokenizer(question, return_tensors='pt', padding=True).input_ids

    # Model input: the text prompt plus the detector's region features and boxes.
    batch = dict(
        input_ids=input_ids,
        vis_feats=features,
        boxes=normalized_boxes,
    )

    result = trainer.model.test_step(batch)
    answer = result['pred_ans'][0]
    print(f"Q: {question}")
    print(f"A: {answer}")

Q: caption region: <vis_extra_id_0>
A: red table
Q: caption region: <vis_extra_id_1>
A: green bottle
Q: caption region: <vis_extra_id_2>
A: smiling man
Q: caption region: <vis_extra_id_3>
A: red table
Q: caption region: <vis_extra_id_4>
A: brown hair
Q: caption region: <vis_extra_id_5>
A: clear glass
Q: caption region: <vis_extra_id_6>
A: small hand
Q: caption region: <vis_extra_id_7>
A: white wall
Q: caption region: <vis_extra_id_8>
A: smiling woman
Q: caption region: <vis_extra_id_9>
A: brown glasses
Q: caption region: <vis_extra_id_10>
A: red table
Q: caption region: <vis_extra_id_11>
A: black shirt
Q: caption region: <vis_extra_id_12>
A: blue wall
Q: caption region: <vis_extra_id_13>
A: red table
Q: caption region: <vis_extra_id_14>
A: black shirt
Q: caption region: <vis_extra_id_15>
A: white wall
Q: caption region: <vis_extra_id_16>
A: smiling
Q: caption region: <vis_extra_id_17>
A: smiling man
Q: caption region: <vis_extra_id_18>
A: black glass
Q: caption region: <vis_extra_id_19