# TinyLLaVA inference code

In [None]:
# Fetch the TinyLLaVABench sources from GitHub.
!git clone https://github.com/DLCV-BUAA/TinyLLaVABench.git
# Switch the working directory to the checkout so the install below targets it.
%cd TinyLLaVABench
!pip install -q --upgrade pip  # enable PEP 660 support
# Editable install of the package found in the current directory.
!pip install -q -e .

Cloning into 'TinyLLaVABench'...
remote: Enumerating objects: 820, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (46/46), done.[K
remote: Total 820 (delta 80), reused 70 (delta 56), pack-reused 718[K
Receiving objects: 100% (820/820), 1.98 MiB | 5.04 MiB/s, done.
Resolving deltas: 100% (418/418), done.
/content/TinyLLaVABench
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [None]:
from tinyllava.model.builder import load_pretrained_model
from tinyllava.mm_utils import get_model_name_from_path

import argparse
import torch

from tinyllava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    IMAGE_PLACEHOLDER,
)
from tinyllava.conversation import conv_templates, SeparatorStyle
from tinyllava.model.builder import load_pretrained_model
from tinyllava.utils import disable_torch_init
from tinyllava.mm_utils import (
    process_images,
    tokenizer_image_token,
    get_model_name_from_path,
    KeywordsStoppingCriteria,
)

from PIL import Image

import requests
from PIL import Image
from io import BytesIO
import re

def image_parser(args):
    """Split ``args.image_file`` on ``args.sep`` and return the pieces as a list."""
    image_spec = args.image_file
    return image_spec.split(args.sep)


def load_image(image_file, timeout=30):
    """Load an image from a local path or an HTTP(S) URL and return it in RGB mode.

    Args:
        image_file: Filesystem path, or a URL starting with ``http://`` or
            ``https://``.
        timeout: Seconds to wait for the HTTP request before giving up.
            Only used for URLs.

    Returns:
        The opened image converted to ``"RGB"``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Match the full scheme so a local file whose name merely starts with
    # "http" (e.g. "http_sample.png") is not mistaken for a URL.
    if image_file.startswith(("http://", "https://")):
        # A timeout keeps a stalled download from hanging the notebook forever.
        response = requests.get(image_file, timeout=timeout)
        # Surface 4xx/5xx responses instead of handing an error page to PIL.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file
    return Image.open(source).convert("RGB")


def load_images(image_files):
    """Load every entry of ``image_files`` with ``load_image``, preserving order."""
    return [load_image(path) for path in image_files]

def eval_model(args, tokenizer, model, image_processor, context_len):
    """Run one image-conditioned generation and return the decoded answer.

    Args:
        args: Object exposing ``query``, ``conv_mode``, ``image_file``, ``sep``,
            ``temperature``, ``top_p``, ``num_beams`` and ``max_new_tokens``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        model: Loaded model; must expose ``config``, ``device`` and ``generate``.
        image_processor: Processor handed to ``process_images``.
        context_len: Not used in this function; kept so the signature lines up
            with the values returned by ``load_pretrained_model``.

    Returns:
        str: The decoded generation, stripped of surrounding whitespace and of
        a trailing stop string if present.
    """
    # Model
    disable_torch_init()

    # Insert the image token(s): replace an explicit placeholder if the query
    # has one, otherwise prepend the token(s) on their own line.
    qs = args.query
    image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    if IMAGE_PLACEHOLDER in qs:
        if model.config.mm_use_im_start_end:
            qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
        else:
            qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
    else:
        if model.config.mm_use_im_start_end:
            qs = image_token_se + "\n" + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

    # Build a single-turn conversation; the second role is left empty so the
    # model generates its content.
    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    image_files = image_parser(args)
    images = load_images(image_files)
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)

    # Place the token ids on the same device as the model (and the image
    # tensor above) rather than on whatever the default CUDA device is.
    input_ids = (
        tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .to(model.device)
    )

    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images_tensor,
            # Sample only when a positive temperature is requested.
            do_sample=args.temperature > 0,
            temperature=args.temperature,
            top_p=args.top_p,
            num_beams=args.num_beams,
            pad_token_id=tokenizer.pad_token_id,
            max_new_tokens=args.max_new_tokens,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
        )

    outputs = tokenizer.batch_decode(
        output_ids, skip_special_tokens=True)[0]

    # Trim whitespace, drop a trailing stop string, then trim again.
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[: -len(stop_str)]
    outputs = outputs.strip()
    return outputs

In [None]:
# helper function to build the necessary arguments
def make_args(prompt, image):
  """Build the argument object consumed by ``eval_model``.

  ``prompt`` becomes ``query`` and ``image`` becomes ``image_file``; all other
  attributes are fixed defaults. Reads the module-level ``model_path`` at call
  time.
  """
  fields = {
    "model_path": model_path,
    "model_base": None,
    "model_name": get_model_name_from_path(model_path),
    "query": prompt,
    "conv_mode": "phi",
    "image_file": image,
    "sep": ",",
    "temperature": 0,
    "top_p": None,
    "num_beams": 1,
    "max_new_tokens": 512
  }
  # Create a throwaway class carrying the fields as attributes and instantiate it.
  args_cls = type('Args', (), fields)
  return args_cls()

In [None]:
# model download
# Identifier of the checkpoint to load; make_args() reads this global when called.
model_path = "bczhou/TinyLLaVA-3.1B"

# Derive the model name from the path, then load the tokenizer, model, image
# processor and context length in one call (second positional argument is None).
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
        model_path, None, model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.40k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/88.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.78G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of the model checkpoint at bczhou/TinyLLaVA-3.1B were not used when initializing TinyLlavaPhiForCausalLM: ['model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias', 'model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight', 'model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias', 'model.vision_tower.vision_tower.vision_model.encoder.layers.0.ml

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/796M [00:00<?, ?B/s]

In [None]:
import os
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# import prompts from file

def import_prompts(filename):
  """Read prompts from a text file, one prompt per line.

  Args:
    filename: Path of the text file to read.

  Returns:
    list[str]: The file content split on ``"\\n"``. A file that ends with a
    newline therefore yields a trailing empty string.
  """
  # Read from the path the caller passed in rather than a hard-coded location.
  with open(filename, "r") as f:
    return f.read().split("\n")

In [None]:
# Column names of the results table: the image filename followed by one
# column per prompt answer.
labels = ['filename','gender','age','ethnical group','hair color','hair length','hair type',
          'eye position','eyebrows position','eyebrows shape','eyelid visibility',
          'epicanthic fold','upper eyelid visible','sclera color','iris color','eyes area',
          'lashes visible','eyes expression','nasal bridge','nose shape',
          'nose tip shape','mouth shape','mouth vertical thickness','lips thickness',
          'teeth visible','lips color','facial hair','ears dimension','ears protruding',
          'ears inside visible','ears stick out','height','weight','tattoos',
          'moles','scars','makeup','dressing']

# prompt.txt must be uploaded
prompts = import_prompts('/content/prompt.txt')

# Fail fast: each result row is the filename plus one answer per prompt, so the
# prompt count must equal the number of non-filename columns. Checking here
# avoids discovering a mismatch only after the long inference run.
if len(prompts) != len(labels) - 1:
  raise ValueError(
      f"prompt.txt yielded {len(prompts)} prompts but labels expects {len(labels) - 1}")

In [None]:
# function to run the inference of all prompts for an image
def infer_image(prompts, directory, filename):
  """Ask every prompt about one image.

  Returns ``[filename, answer_1, ..., answer_n]`` when ``directory/filename``
  is a regular file, and an empty list otherwise.
  """
  image_file = os.path.join(directory, filename)
  if not os.path.isfile(image_file):
    return []

  # Inner progress bar (position=1) is removed once the image is done.
  answers = [
    eval_model(make_args(prompt, image_file), tokenizer, model, image_processor, context_len)
    for prompt in tqdm(prompts, position=1, leave=False)
  ]
  return [filename] + answers

In [None]:
# function to run the inference to all images in directory
def infer_folder(prompts, labels, directory):
  """Run ``infer_image`` on every file in ``directory`` and tabulate the answers.

  Args:
    prompts: Prompts to ask about each image.
    labels: Column names for the resulting table.
    directory: Folder whose entries are processed.

  Returns:
    pd.DataFrame: One row per processed file, with ``labels`` as columns.
  """
  results_list = []

  # Sort so the row order is reproducible; os.listdir order is arbitrary.
  for filename in tqdm(sorted(os.listdir(directory))):
    res = infer_image(prompts, directory, filename)
    # infer_image returns [] for entries that are not regular files (e.g.
    # sub-directories); skip them so they do not become all-NaN rows.
    if res:
      results_list.append(res)

  return pd.DataFrame(results_list, columns=labels)

In [None]:
# path of images directory
directory = '/content/original'
# Ask every prompt about the entries of the directory and collect the answers
# into a DataFrame whose columns are `labels`.
result = infer_folder(prompts, labels, directory)

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]



  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

  0%|          | 0/37 [00:00<?, ?it/s]

In [None]:
# Display the collected results table.
result

Unnamed: 0,filename,gender,age,ethnical group,hair color,hair length,hair type,eye position,eyebrows position,eyebrows shape,...,ears protruding,ears inside visible,ears stick out,height,weight,tattoos,moles,scars,makeup,dressing
0,TVDenoise_natalia burlinova.jpeg,Female,40.0,Caucasian,Brown,Short,Straight,Wide,High,Straight across,...,Yes,No,The ears stick out more at the top.,"5'6""",120.0,"Yes, the person in the image has tattoos. They...","Yes, the person in the image has visible moles...","Yes, the person in the image has scars. They a...","Yes, the subject is wearing makeup. She has li...",The person in the image is wearing a black shi...
1,TVDenoise_yulan adonay archaga carias.jpeg,Male,30.0,White,Black,Short,Straight,Wide,Down low,Straight across,...,No,No,The ears stick out more at the top.,"5'8""",200.0,"Yes, the person in the image has tattoos. They...","Yes, the person in the image has visible moles...","Yes, the person in the image has scars. They a...","No, the subject is not wearing makeup. He has ...",The person in the image is wearing a white shirt.
2,,,,,,,,,,,...,,,,,,,,,,
3,TVDenoise_jane mcdonald.jpeg,Female,30.0,White,Brown,Long,Straight,Wide,Down low,Straight across,...,No,No,The ears stick out more at the top of the hat.,"5'6""",120.0,"Yes, the woman in the image has tattoos. They ...","Yes, the person in the image has visible moles...","Yes, the person in the image has scars. They a...","Yes, the subject is wearing makeup. She is wea...",The person in the image is wearing a white cow...
4,TVDenoise_oleg mikhaylovich.jpeg,Male,50.0,Caucasian,Gray,Short,Straight,Wide,Down low,Straight across,...,No,No,The ears stick out more at the top.,5'9,250.0,"Yes, the person in the image has tattoos. They...","Yes, the person in the image has visible moles...","Yes, the person in the image has scars. They a...","Yes, the subject is wearing makeup. He is wear...",The person in the image is wearing a blue jack...
5,TVDenoise_david durham.jpeg,Male,30.0,Caucasian,Brown,Long,Straight,Wide,Down low,Straight across,...,No,No,The ears stick out more at the top of the man'...,6 feet,200.0,"Yes, the person in the image has tattoos. They...","Yes, the person in the image has visible moles...","Yes, the person in the image has scars. They a...","No, the subject is not wearing any makeup.",The person in the image is wearing a green shirt.
6,TVDenoise_juan carlos mayorga.jpeg,Male,20.0,Asian,Black,Short,Straight,Wide,Down low,Straight across,...,No,No,The ears stick out more at the top.,"5'6""",200.0,"Yes, the person in the image has tattoos. They...","Yes, the person in the image has visible moles...","Yes, the person in the image has scars. They a...","No, the subject is not wearing any makeup.",The person in the image is wearing a shirt.
7,TVDenoise_alexis flores.jpeg,Male,20.0,White,Black,Short,Straight,Wide,Down low,Straight across,...,No,No,The ears stick out more at the top.,5'8,200.0,"Yes, the person in the image has tattoos. They...","Yes, the person in the image has a mole on the...","Yes, the person in the image has scars. They a...","No, the subject is not wearing any makeup.",The person in the image is wearing a plaid shirt.
8,TVDenoise_john porcaro.jpeg,Male,40.0,White,Black,Short,Straight,Wide,Down low,Straight across,...,No,No,The ears stick out more at the top.,5'8,200.0,"Yes, the person in the image has tattoos. They...","Yes, the person in the image has visible moles...","Yes, the person in the image has scars. They a...","Yes, the subject is wearing makeup, specifical...",The person in the image is wearing a white shirt.
9,TVDenoise_armando vargas.jpeg,Male,20.0,White,Black,Short,Straight,Wide,Down low,Straight across,...,No,No,The ears stick out more at the top.,5'8,180.0,"Yes, the person in the image has tattoos. They...","Yes, the person in the image has visible moles...","Yes, the person in the image has scars. They a...","No, the subject is not wearing makeup. He is w...",The person in the image is wearing a white shirt.


In [None]:
def export_csv(df: pd.DataFrame, filename):
  """Write the ``labels`` columns of ``df`` to ``filename`` as CSV, omitting the index."""
  csv_options = {"columns": labels, "index": False}
  df.to_csv(filename, **csv_options)

In [None]:
# Save the results table as a CSV file.
export_csv(result, '/content/results.csv')