In [1]:
# Add the parent directory to the module search path before the imports below run
# (presumably so the local `heron` package resolves from this folder — confirm repo layout).
import sys
sys.path.append("..")

In [3]:
import torch
from heron.models.video_blip import VideoBlipForConditionalGeneration, VideoBlipProcessor
from transformers import LlamaTokenizer
import wandb

# CUDA device index used to build the torch device string below.
device_id = 0
device = f"cuda:{device_id}"

# NOTE(review): not referenced by any cell visible in this notebook; generate_response
# passes its own max_length=256 to model.generate — confirm whether this is still needed.
max_length = 512

[2024-01-08 06:38:34,481] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [4]:
# Checkpoint to evaluate.
MODEL_NAME = "turing-motors/heron-chat-blip-ja-stablelm-base-7b-v1"
# Load the weights in float16. ignore_mismatched_sizes=True is passed so that a shape
# mismatch does not abort loading.
# NOTE(review): the load log captured for this cell reports text_projection.weight/bias
# as shape-mismatched and newly initialized — confirm those weights are not used at inference.
model = VideoBlipForConditionalGeneration.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, ignore_mismatched_sizes=True
)
# The tokenizer is loaded from a separate repository, with one extra special token registered.
tokenizer = LlamaTokenizer.from_pretrained("novelai/nerdstash-tokenizer-v1", additional_special_tokens=['▁▁'])

Downloading config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of VideoBlipForConditionalGeneration were not initialized from the model checkpoint at /mnt/disks/disk2/model_out/stablelm-beta/abci-exp001 and are newly initialized because the shapes did not match:
- text_projection.bias: found shape torch.Size([2560]) in the checkpoint and torch.Size([4096]) in the model instantiated
- text_projection.weight: found shape torch.Size([2560, 768]) in the checkpoint and torch.Size([4096, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in htt

In [None]:
# Cast to half precision, switch to eval mode, and move the model to the target device.
# NOTE(review): the model was already loaded with torch_dtype=torch.float16, so the
# .half() call looks redundant — confirm before removing.
model = model.half()
model.eval()
# Being the cell's last expression, this call's return value is also what the cell displays.
model.to(device)

In [6]:
# prepare a processor
# The processor is loaded from the BLIP-2 OPT checkpoint, then its tokenizer is replaced
# with the notebook-level `tokenizer` so text is encoded with that tokenizer instead.
processor = VideoBlipProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
processor.tokenizer = tokenizer

import requests
from PIL import Image

def generate_response(question, image, max_length=256):
    """Generate the model's answer to ``question`` about ``image``.

    Uses the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        question: Question text; it is inserted into the "##human: ... ##gpt: " prompt.
        image: Image handed to the processor as ``images``.
        max_length: Forwarded to ``model.generate`` as ``max_length``. Defaults to
            256, the value that was previously hard-coded.

    Returns:
        The first decoded sequence produced by ``model.generate``, with special
        tokens skipped.
    """
    # prepare inputs
    text = f"##human: {question}\n##gpt: "
    text_tokenizer = processor.tokenizer

    # do preprocessing
    inputs = processor(
        text=text,
        images=image,
        return_tensors="pt",
        add_special_tokens=False,
        truncation=True,
    )

    inputs = {k: v.to(device) for k, v in inputs.items()}
    # The model runs in half precision, so the image tensor is cast to match.
    inputs["pixel_values"] = inputs["pixel_values"].to(device, torch.float16)

    # Stop on pad, eos, or a newline token. All ids are read from the processor's
    # tokenizer (the one used for encoding and decoding); ids the tokenizer does
    # not define (None) are dropped, and duplicates are collapsed in order.
    stop_candidates = [
        text_tokenizer.pad_token_id,
        text_tokenizer.eos_token_id,
        text_tokenizer.convert_tokens_to_ids("\n"),
    ]
    eos_token_id_list = list(
        dict.fromkeys(int(token_id) for token_id in stop_candidates if token_id is not None)
    )

    # do inference
    # Greedy decoding (do_sample=False): temperature only applies when sampling,
    # so it is no longer passed.
    with torch.no_grad():
        out = model.generate(
            **inputs,
            max_length=max_length,
            do_sample=False,
            eos_token_id=eos_token_id_list,
            no_repeat_ngram_size=2,
        )
    res = text_tokenizer.batch_decode(out, skip_special_tokens=True)
    return res[0]

In [11]:
import json
from PIL import Image

def load_q(p):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        p: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so non-ASCII text does not depend on the platform
    # default encoding, and close the file deterministically.
    with open(p, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) carry no record and are skipped.
        return [json.loads(line) for line in f if line.strip()]

# Parse the question file into a list of records, one per line of the file.
q_data = load_q("qa90_questions_ja.jsonl")

In [9]:
def answer_data(q_data):
    """Answer every question record with the model and collect the records.

    For each record the matching image is opened and displayed, the question
    under "text_JA" is passed to ``generate_response``, and the reply (cut at
    the first "##") is stored on the record under "answer".

    Args:
        q_data: Iterable of records providing "image" and "text_JA".

    Returns:
        A list holding the same record objects, each now carrying "answer".
    """
    answered = []
    for entry in q_data:
        picture = Image.open("val2014/COCO_val2014_" + entry["image"])
        prompt = entry["text_JA"]
        display(picture)
        reply = generate_response(prompt, picture)
        print(prompt)
        # Keep only the text before the first "##"; a reply without the marker
        # is left unchanged.
        reply = reply.split("##")[0]
        print("final", reply)
        entry["answer"] = reply
        answered.append(entry)
    return answered

In [None]:
# Run the model over every question; each returned record carries an "answer" entry.
result = answer_data(q_data)

In [128]:
# Label for this evaluation: used as the wandb run name and as the prefix of the saved answer file.
model_name = "stablelm-alpha-exp001"

In [None]:
# Optional: use this if you want to upload the results to wandb.
def upload_result(result, name):
    """Upload the answered records to wandb as a single table.

    Args:
        result: Iterable of records providing "image", "answer", "question_id"
            and "text_JA".
        name: Name of the wandb run created in the "heron-eval" project.
    """
    run = wandb.init(project="heron-eval", name=name)
    try:
        table = wandb.Table(columns=['ID', 'Image', 'Question', 'Answer'])
        for r in result:
            image = Image.open("val2014/COCO_val2014_" + r["image"])
            answer = r["answer"]
            # The answer doubles as the image caption.
            img = wandb.Image(image, caption=answer)
            idx = r["question_id"]
            table.add_data(idx, img, r["text_JA"], answer)
        run.log({"Table" : table})
    finally:
        # Always close the run, even if building or logging the table fails;
        # otherwise it stays open in the notebook kernel.
        run.finish()
    
# Upload the collected answers to wandb, using model_name as the run name.
upload_result(result, model_name)

In [129]:
def save_jsonl(jsonl, model_name):
    """Write records to "<model_name>_answer.jsonl", one JSON object per line.

    Args:
        jsonl: Iterable of JSON-serialisable records.
        model_name: Prefix of the output file name.
    """
    # Write UTF-8 explicitly and keep non-ASCII characters as-is
    # (ensure_ascii=False) so Japanese text stays readable in the file instead
    # of being escaped to \uXXXX. Parsing the file yields the same records.
    with open(f"{model_name}_answer.jsonl", "w", encoding="utf-8") as f:
        for r in jsonl:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

In [130]:
# Write the answers to "<model_name>_answer.jsonl".
save_jsonl(result, model_name)