In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content
!git clone -b v1.0 https://github.com/camenduru/LLaVA
%cd /content/LLaVA

!pip install -q transformers==4.36.2
!pip install -q gradio .

from transformers import AutoTokenizer, BitsAndBytesConfig
from llava.model import LlavaLlamaForCausalLM
import torch

model_path = "liuhaotian/llava-v1.6-mistral-7b"
kwargs = {"device_map": "auto"}
kwargs['load_in_4bit'] = True
kwargs['quantization_config'] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4'
)
model = LlavaLlamaForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)

vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
    vision_tower.load_model()
vision_tower.to(device='cuda')
image_processor = vision_tower.image_processor

import os
import requests
from PIL import Image
from io import BytesIO
from llava.conversation import conv_templates, SeparatorStyle
from llava.utils import disable_torch_init
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
from transformers import TextStreamer

def caption_image(image_file, prompt):
    if image_file.startswith('http') or image_file.startswith('https'):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')
    disable_torch_init()
    conv_mode = "llava_v0"
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles
    image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].half().cuda()
    inp = f"{roles[0]}: {prompt}"
    inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    raw_prompt = conv.get_prompt()
    input_ids = tokenizer_image_token(raw_prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    with torch.inference_mode():
      output_ids = model.generate(input_ids, images=image_tensor, do_sample=True, temperature=0.2,
                                  max_new_tokens=1024, use_cache=True, stopping_criteria=[stopping_criteria])
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
    conv.messages[-1][-1] = outputs
    output = outputs.rsplit('</s>', 1)[0]
    return image, output

/content
Cloning into 'LLaVA'...
remote: Enumerating objects: 1960, done.[K
remote: Total 1960 (delta 0), reused 0 (delta 0), pack-reused 1960[K
Receiving objects: 100% (1960/1960), 13.60 MiB | 37.35 MiB/s, done.
Resolving deltas: 100% (1173/1173), done.
/content/LLaVA
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m89.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [9

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

You are using a model of type llava_mistral to instantiate a model of type llava. This is not supported for all configurations of models and can yield errors.


model.safetensors.index.json:   0%|          | 0.00/73.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/262M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.76k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlavaLlamaForCausalLM were not initialized from the model checkpoint at liuhaotian/llava-v1.6-mistral-7b and are newly initialized: ['model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.0.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.18.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.15.self_attn.rotary_emb.inv_freq', 'model.layers.1.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_attn.rotary_emb.inv_freq', 'model.layers.21.self_at

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

In [3]:
caption_image("https://llava-vl.github.io/static/images/view.jpg",'Describe the image in detail.')



(<PIL.Image.Image image mode=RGB size=1000x667>,
 'The image depicts a serene and tranquil setting. In the foreground, there is a wooden dock extending into a calm lake. The dock is surrounded by a natural landscape, featuring a backdrop of lush trees and a mountain range. The overall atmosphere of the image is peaceful and idyllic. ')

In [4]:
import zipfile
print("Unzipping main image files")
#zip_file_path is whereever the train2017.zip file is stored at, this path would then be used to downlod the zip file into local disk for faster retrieval
zip_file_path = '/content/drive/MyDrive/ShareGPT4V/data/coco/train2017.zip'
#zip_file_path = '/data2/wuyinjun/mscoco/train2017.zip'
#extract_dir holds the directory path where the extracted files will be stored
extract_dir = '/content/unzipped_images/train2017'
# extract_dir = 'C:/Users/YourUsername/Documents/unzipped_images/train2017'
#extract_dir = "/data2/wuyinjun/mscoco/images/"

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
  zip_ref.extractall(extract_dir)
print("Finished unzipping main image files")

Unzipping main image files
Finished unzipping main image files


In [5]:
import pickle
img_caption_file_name = '/content/drive/MyDrive/mscoco_40kdecomposed_dependencies_wholeexp.pkl'

with open(img_caption_file_name, 'rb') as f:
  df_mscoco_40k = pickle.load(f)

In [6]:
df_mscoco_40k.head()

Unnamed: 0,id,image,caption_sharegpt4v,caption_mscoco,embedding,caption_triples_ls,groups,elapsed_time,num_groups
0,9,000000000009.jpg,"In the center of the image, a vibrant blue lun...",Closeup of bins of food that include broccoli ...,"[0.196618, -0.1124176, -0.16870016, 0.03273408...","""Closeup of bins of food.""|""Bins of food inclu...",012,12.546637,1
1,25,000000000025.jpg,This image captures a serene moment in a zoo e...,A giraffe eating food from the top of the tree.,"[0.3378602, 0.00365863, -0.32199287, 0.1653291...","""A giraffe.""|""Eating food from the top of the ...",0 | 1,9.056211,2
2,30,000000000030.jpg,"The image presents a serene garden scene, cent...",A flower vase is sitting on a porch stand.,"[-0.034444228, 0.29082185, -0.022862792, -0.42...","""A flower vase.""|""Sitting on a porch stand.""",0 | 1,9.195682,2
3,34,000000000034.jpg,This is a detailed description of the image:\n...,A zebra grazing on lush green grass in a field.,"[-0.12711053, -0.30761, 0.0056521297, -0.01875...","""A zebra grazing."" | ""Lush green grass in a fi...",0 | 1,8.578875,2
4,36,000000000036.jpg,"In the image, there is a woman standing in fro...",Woman in swim suit holding parasol on sunny day.,"[0.2885676, -0.18000656, -0.5650209, 0.0151512...","""Woman in swim suit."" | ""Woman holding parasol...",012,14.776144,1


In [7]:
def generate_caption(img_name):
  img_root_dir = '/content/unzipped_images/train2017/train2017/'
  response = caption_image(img_root_dir + img_name, 'Describe the image in detail.')
  return response[1]

In [8]:
generate_caption(df_mscoco_40k['image'][0])

'The image shows a meal served in a blue bento box. The box is divided into three compartments, each containing a different food item.\n\nIn the top left compartment, there is a serving of bread with a spread on it.\n\nThe top right compartment contains a serving of fruit, which includes slices of orange and pineapple chunks.\n\nThe bottom left compartment holds a serving of meatballs, which are covered in a sauce.\n\nThe bottom right compartment contains a serving of broccoli, which is cooked and garnished with a sprinkle of cheese.\n\nThe bento box is placed on a surface, and the image is taken from a perspective that allows for a clear view of the contents of the bento box. '

In [15]:
df_test = df_mscoco_40k[0:100]
df_test

Unnamed: 0,id,image,caption_sharegpt4v,caption_mscoco,embedding,caption_triples_ls,groups,elapsed_time,num_groups
0,9,000000000009.jpg,"In the center of the image, a vibrant blue lun...",Closeup of bins of food that include broccoli ...,"[0.196618, -0.1124176, -0.16870016, 0.03273408...","""Closeup of bins of food.""|""Bins of food inclu...",012,12.546637,1
1,25,000000000025.jpg,This image captures a serene moment in a zoo e...,A giraffe eating food from the top of the tree.,"[0.3378602, 0.00365863, -0.32199287, 0.1653291...","""A giraffe.""|""Eating food from the top of the ...",0 | 1,9.056211,2
2,30,000000000030.jpg,"The image presents a serene garden scene, cent...",A flower vase is sitting on a porch stand.,"[-0.034444228, 0.29082185, -0.022862792, -0.42...","""A flower vase.""|""Sitting on a porch stand.""",0 | 1,9.195682,2
3,34,000000000034.jpg,This is a detailed description of the image:\n...,A zebra grazing on lush green grass in a field.,"[-0.12711053, -0.30761, 0.0056521297, -0.01875...","""A zebra grazing."" | ""Lush green grass in a fi...",0 | 1,8.578875,2
4,36,000000000036.jpg,"In the image, there is a woman standing in fro...",Woman in swim suit holding parasol on sunny day.,"[0.2885676, -0.18000656, -0.5650209, 0.0151512...","""Woman in swim suit."" | ""Woman holding parasol...",012,14.776144,1
...,...,...,...,...,...,...,...,...,...
95,562,000000000562.jpg,The image presents a simple yet vibrant scene ...,A glass of toothbrushes sitting on a counter,"[-0.34937996, 0.31451625, -0.30223146, -0.1166...","""A glass of toothbrushes.""|""Sitting on a count...",01,6.774847,1
96,564,000000000564.jpg,The image captures a dynamic moment in an indo...,There is no image here to provide a descriptio...,"[0.115994036, 0.19655688, -0.017465632, 0.0498...","""There is no image here.""|""To provide a descri...",0 | 1,6.712892,2
97,569,000000000569.jpg,This image captures a serene scene in a park. ...,A woman in a trench coat playing with her kite,"[0.09477796, 0.1708071, -0.06789127, -0.352763...","""A woman in a trench coat.""|""Playing with her ...",0 | 1,6.985086,2
98,572,000000000572.jpg,This image captures a moment between two indiv...,A man leaning over as she inspects a woman's s...,"[-0.16748847, -0.2925706, -0.025767297, -0.104...","""A man leaning over.""|""A woman's shoes being i...",01,6.582739,1


In [16]:
import time
import pickle
start_time = time.time()
df_test['llava_caption'] = df_test['image'].apply(generate_caption)
print("--- %s seconds ---" % (time.time() - start_time))
#pickle_file_path = '/content/drive/MyDrive/df_mscoco_40k_llava_captions_whole.pkl'
#with open(pickle_file_path, 'wb') as f:
#  pickle.dump(df_test, f)

--- 1020.2534029483795 seconds ---


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['llava_caption'] = df_test['image'].apply(generate_caption)


In [17]:
df_test

Unnamed: 0,id,image,caption_sharegpt4v,caption_mscoco,embedding,caption_triples_ls,groups,elapsed_time,num_groups,llava_caption
0,9,000000000009.jpg,"In the center of the image, a vibrant blue lun...",Closeup of bins of food that include broccoli ...,"[0.196618, -0.1124176, -0.16870016, 0.03273408...","""Closeup of bins of food.""|""Bins of food inclu...",012,12.546637,1,The image presents a meal served in a blue ben...
1,25,000000000025.jpg,This image captures a serene moment in a zoo e...,A giraffe eating food from the top of the tree.,"[0.3378602, 0.00365863, -0.32199287, 0.1653291...","""A giraffe.""|""Eating food from the top of the ...",0 | 1,9.056211,2,The image features a majestic giraffe in a zoo...
2,30,000000000030.jpg,"The image presents a serene garden scene, cent...",A flower vase is sitting on a porch stand.,"[-0.034444228, 0.29082185, -0.022862792, -0.42...","""A flower vase.""|""Sitting on a porch stand.""",0 | 1,9.195682,2,The image shows a white vase filled with a bou...
3,34,000000000034.jpg,This is a detailed description of the image:\n...,A zebra grazing on lush green grass in a field.,"[-0.12711053, -0.30761, 0.0056521297, -0.01875...","""A zebra grazing."" | ""Lush green grass in a fi...",0 | 1,8.578875,2,The image depicts a zebra in a natural setting...
4,36,000000000036.jpg,"In the image, there is a woman standing in fro...",Woman in swim suit holding parasol on sunny day.,"[0.2885676, -0.18000656, -0.5650209, 0.0151512...","""Woman in swim suit."" | ""Woman holding parasol...",012,14.776144,1,The image features a woman standing in front o...
...,...,...,...,...,...,...,...,...,...,...
95,562,000000000562.jpg,The image presents a simple yet vibrant scene ...,A glass of toothbrushes sitting on a counter,"[-0.34937996, 0.31451625, -0.30223146, -0.1166...","""A glass of toothbrushes.""|""Sitting on a count...",01,6.774847,1,The image shows a clear glass filled with toot...
96,564,000000000564.jpg,The image captures a dynamic moment in an indo...,There is no image here to provide a descriptio...,"[0.115994036, 0.19655688, -0.017465632, 0.0498...","""There is no image here.""|""To provide a descri...",0 | 1,6.712892,2,The image depicts a tennis match in progress. ...
97,569,000000000569.jpg,This image captures a serene scene in a park. ...,A woman in a trench coat playing with her kite,"[0.09477796, 0.1708071, -0.06789127, -0.352763...","""A woman in a trench coat.""|""Playing with her ...",0 | 1,6.985086,2,The image shows a person standing on a concret...
98,572,000000000572.jpg,This image captures a moment between two indiv...,A man leaning over as she inspects a woman's s...,"[-0.16748847, -0.2925706, -0.025767297, -0.104...","""A man leaning over.""|""A woman's shoes being i...",01,6.582739,1,The image shows a man and a woman in a hallway...
