<a href="https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Quick_demo_of_HuggingFace_version_of_Vision_Transformer_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Adapted from Quick demo: Vision Transformer (ViT) by Google Brain

In [1]:
import logging
import sys
logger = logging.getLogger(__name__)
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S')
logger.info("Logger set up")

import dataloader

07:47:15 INFO:Logger set up


### Config

In [2]:
from transformers import set_seed
set_seed(123)
num_examples = 1000
patch_height, patch_width = 16, 16
max_patches = 4000
image_encoder_path = "google/vit-base-patch16-224"
lm_path = "mistralai/Mistral-7B-v0.1"



# Part 1: Preprocess Data
Mark all pixels that belongs to the bounding boxes of positive candidates as targets

### Preprocess data

In [3]:
from datasets import load_dataset

# logger.info(f"Use model {cfg.model.pretrained_model_name_or_path}")
# output_dir = HydraConfig.get().runtime.output_dir
if num_examples:
    train_dataset = load_dataset("osunlp/Multimodal-Mind2Web", split="train").select(range(num_examples))
else:
    train_dataset = load_dataset("osunlp/Multimodal-Mind2Web", split="train")
print(train_dataset)
train_dataset = train_dataset.remove_columns(["neg_candidates", "raw_html", "cleaned_html"])
train_dataset = dataloader.get_previous_actions(train_dataset)
# filter out those without pos_candidates
train_dataset = train_dataset.filter(lambda x: len(x)==1, input_columns=['pos_candidates'])
train_dataset = train_dataset.remove_columns(['action_reprs'])
print(train_dataset)

07:47:16 INFO:PyTorch version 2.0.1 available.


Resolving data files:   0%|          | 0/27 [00:00<?, ?it/s]

Dataset({
    features: ['action_uid', 'raw_html', 'cleaned_html', 'operation', 'pos_candidates', 'neg_candidates', 'website', 'domain', 'subdomain', 'annotation_id', 'confirmed_task', 'screenshot', 'action_reprs'],
    num_rows: 1000
})
Dataset({
    features: ['action_uid', 'operation', 'pos_candidates', 'website', 'domain', 'subdomain', 'annotation_id', 'confirmed_task', 'screenshot', 'previous_actions'],
    num_rows: 892
})


### Generate prompt and label
The full prompt is:

[patch embeddings] \n Based on the webpage screenshot, try to complete the following task:\n Task: [task] \n Previous actions:\n [actions] \n Which image patch contains the element to interact with next?"

In [4]:
cols_to_remove = set(train_dataset.column_names)
cols_to_remove.remove("screenshot")
train_dataset = train_dataset.map(
    dataloader.get_prompt_target,
    batched=False,
    remove_columns=list(cols_to_remove)
)
train_dataset[2]

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

{'screenshot': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x5429>,
 'question': 'Based on the HTML webpage, try to complete the following task:\nTask: rent a car in Brooklyn - Central, NY on from April 9 to April 15.\nPrevious actions:\n[heading]  CAR -> CLICK\n[combobox]  Enter pick up city, airport name, or airport code. -> TYPE: Brooklyn Central\nWhat should be the element to interact with next?',
 'boxes': [[114.59375, 365.1875, 306.8125, 25.6875]]}

In [5]:
# filter out those with bounding box out of range

# def box_in_range(example):
#     print(example)
#     l, b, _, _, = example["boxes"]
#     # width, height = example["screenshot"].size
#     width = height = 100
#     return l < width and b < height
    
# train_dataset = train_dataset.filter(lambda x: x, input_columns=['valid'])
# train_dataset = train_dataset.remove_columns(['valid'])
# train_dataset

In [6]:
from transformers import AutoImageProcessor, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(lm_path)
tokenizer.pad_token = tokenizer.eos_token # should be ok for casual LM
processor = AutoImageProcessor.from_pretrained(image_encoder_path)

cols = train_dataset.column_names
cols.remove("screenshot")
train_dataset = train_dataset.map(
    dataloader.get_tokenize_fn(tokenizer),
    remove_columns=cols,
    )
# train_dataset.set_format("pt", columns=["input_ids", "attention_mask", "label"], output_all_columns=True)
print(train_dataset[0])
train_dataset.set_transform(dataloader.get_preprocess_image_fn(processor, max_patches, patch_height, patch_width), output_all_columns=True) # process images on the fly
# split the train_dataset into train and validation
dataset = train_dataset.train_test_split(test_size=0.05) 
train_dataset, eval_dataset = dataset["train"], dataset["test"]
print(train_dataset[0])
logger.info(f"Use device {'gpu' if torch.cuda.is_available() else 'cpu'}")
# logger.info(f"Use batch size {cfg.train.batch_size}")
logger.info(f"Training data size {len(train_dataset)}")
logger.info(f"Eval data size {len(eval_dataset)}")

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

{'screenshot': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1280x5429 at 0x7F5981BD8F50>, 'input_ids': [1, 17158, 356, 272, 13987, 4686, 3005, 28725, 1464, 298, 4160, 272, 2296, 3638, 28747, 13, 4818, 28747, 7358, 264, 1253, 297, 21491, 387, 7993, 28725, 11800, 356, 477, 3999, 28705, 28774, 298, 3999, 28705, 28740, 28782, 28723, 13, 28284, 6768, 28747, 13, 5364, 13, 3195, 1023, 347, 272, 2442, 298, 14113, 395, 1679, 28804, 733, 7637, 28793], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [[283.1875, 220.390625, 93.59375, 33.0]]}
{'screenshot': tensor([[[-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         ...,
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1.

NameError: name 'torch' is not defined

### Prepare Model

In [6]:
import multimodal

from transformers import AutoModelForCausalLM, AutoModel, AutoConfig
import torch
# from transformers import Pix2StructVisionModel, ViTImageProcessor, Pix2StructVisionConfig

### Config for notebook
config = AutoConfig.from_pretrained("mistralai/Mistral-7B-v0.1")
config.return_dict = True
config.use_cache = False
config.low_cpu_mem_usage = True
config.rope_theta = 10000.0
config.attn_implementation = "flash_attention_2"
###

# TODO: Move config to somewhere else

# image_encoder_config = Pix2StructVisionConfig.from_pretrained("google/pix2struct-base")
# TODO: try different hidden size?
# image_encoder_config.seq_len = 27145
# image_encoder_config.patch_size = 16

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# image_encoder = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base", config=image_encoder_config, torch_dtype=torch.bfloat16)
# image_encoder.to(device)

image_encoder_config = AutoConfig.from_pretrained(image_encoder_path)
image_encoder = AutoModel.from_pretrained(image_encoder_path, config=image_encoder_config)
image_encoder.to(device)

lm = AutoModelForCausalLM.from_pretrained(lm_path, config=config, torch_dtype=torch.bfloat16)
lm.to(device)

model = multimodal.MultimodalAgent(config, image_encoder, lm, patch_width, patch_height)
model.to(device)
print(torch.cuda.memory_allocated())

print("Layers and their dimensions:")
import torch.nn as nn
for name, module in model.named_modules():
    if isinstance(module, nn.Conv2d) or isinstance(module, nn.Linear):
        print(f"{name}: {module.weight.shape}")

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

15378507776
Layers and their dimensions:
image_encoder.embeddings.patch_embeddings.projection: torch.Size([768, 3, 16, 16])
image_encoder.encoder.layer.0.attention.attention.query: torch.Size([768, 768])
image_encoder.encoder.layer.0.attention.attention.key: torch.Size([768, 768])
image_encoder.encoder.layer.0.attention.attention.value: torch.Size([768, 768])
image_encoder.encoder.layer.0.attention.output.dense: torch.Size([768, 768])
image_encoder.encoder.layer.0.intermediate.dense: torch.Size([3072, 768])
image_encoder.encoder.layer.0.output.dense: torch.Size([768, 3072])
image_encoder.encoder.layer.1.attention.attention.query: torch.Size([768, 768])
image_encoder.encoder.layer.1.attention.attention.key: torch.Size([768, 768])
image_encoder.encoder.layer.1.attention.attention.value: torch.Size([768, 768])
image_encoder.encoder.layer.1.attention.output.dense: torch.Size([768, 768])
image_encoder.encoder.layer.1.intermediate.dense: torch.Size([3072, 768])
image_encoder.encoder.layer.1.

### Tokenize Train Data

### Set up LoRA

In [8]:
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_int8_training
lora_config = LoraConfig(
    # task_type=TaskType.CAUSAL_LM, # task type is not necessary, but this is needed to get the label
    inference_mode=False,
    r=16,
    lora_alpha=32, 
    lora_dropout=0.05,
    target_modules="all-linear",
    modules_to_save=["projector"] # this layer is not pretrained
)

# model.lm.enable_input_require_grads()
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

trainable params: 48,349,184 || all params: 7,379,620,352 || trainable%: 0.6551716984586688
base_model.model.image_encoder.encoder.layer.0.attention.attention.query.lora_A.default.weight
base_model.model.image_encoder.encoder.layer.0.attention.attention.query.lora_B.default.weight
base_model.model.image_encoder.encoder.layer.0.attention.attention.key.lora_A.default.weight
base_model.model.image_encoder.encoder.layer.0.attention.attention.key.lora_B.default.weight
base_model.model.image_encoder.encoder.layer.0.attention.attention.value.lora_A.default.weight
base_model.model.image_encoder.encoder.layer.0.attention.attention.value.lora_B.default.weight
base_model.model.image_encoder.encoder.layer.0.attention.output.dense.lora_A.default.weight
base_model.model.image_encoder.encoder.layer.0.attention.output.dense.lora_B.default.weight
base_model.model.image_encoder.encoder.layer.0.intermediate.dense.lora_A.default.weight
base_model.model.image_encoder.encoder.layer.0.intermediate.dense.lora

### Set up Trainer

In [9]:
from transformers import TrainingArguments

config = {
        'lora_config': lora_config,
        'learning_rate': 1e-3,
        'num_train_epochs': 1,
        'gradient_accumulation_steps': 32,
        'per_device_train_batch_size': 1,
        'per_device_eval_batch_size': 1,
        'eval_accumulation_steps': 32,
        'gradient_checkpointing': True,
}


### Run Training

In [None]:
training_args = TrainingArguments(
    output_dir="output",
    overwrite_output_dir=True,
    optim="adamw_torch_fused",
    bf16=True,  # Use BF16 for flash attention
    # evlaution
    label_names=["labels"], # so that trainer will call compute_loss
    evaluation_strategy="steps",
    eval_steps=20,
    include_inputs_for_metrics=True,
    log_level="info",
    # logging strategies
    logging_dir=f"output/logs",
    logging_strategy="steps",
    logging_steps=20,
    save_strategy="no",
    remove_unused_columns=False,
    **{k:v for k,v in config.items() if k != 'lora_config'}
) # TODO: move train arguments to config
trainer = multimodal.MultimodalTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=multimodal.compute_metrics,
    tokenizer=tokenizer,
    data_collator=multimodal.custom_collate,
)
trainer.train()

Using auto half precision backend
***** Running training *****
  Num examples = 847
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 32
  Total optimization steps = 26
  Number of trainable parameters = 48,349,184


Step,Training Loss,Validation Loss


# Sanity check

Pix2Struct, reference: https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb

### Load model and processor

In [None]:
from transformers import AutoProcessor, Pix2StructVisionModel

# processor = AutoProcessor.from_pretrained("google/pix2struct-base")
# model = Pix2StructVisionModel.from_pretrained("google/pix2struct-base")
train_dataset[997]

In [None]:
import requests
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor, Pix2StructVisionModel


train_dataset = load_dataset("osunlp/Multimodal-Mind2Web", split="train").select(range(10))
image = train_dataset[3]["screenshot"]

text = "A picture of"

model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
# processor = Pix2StructImageProcessor.from_pretrained("google/pix2struct-textcaps-base")

# image only
inputs = processor(images=train_dataset["screenshot"], text=text, return_tensors="pt")
print(inputs.keys())
predictions = model.generate(**inputs)
print(processor.decode(predictions[0], skip_special_tokens=True))


### Using Pix2Struct

In [None]:
from transformers import Pix2StructImageProcessor, Pix2StructVisionModel, Pix2StructConfig, Pix2StructForConditionalGeneration
from datasets import load_dataset
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dataset = load_dataset("osunlp/Multimodal-Mind2Web", split="train").select(range(10))
image = train_dataset[3]["screenshot"]

# TODO: Move config to somewhere else
image_encoder_path = "google/pix2struct-textcaps-base"
image_encoder_config = Pix2StructConfig.from_pretrained(image_encoder_path)
# TODO: try different hidden size?
max_patches = 2000
patch_size = 16
# image_encoder_config.vision_config.seq_len = max_patches
# image_encoder_config.vision_config.patch_size = patch_size
print(image_encoder_config)

image_encoder = Pix2StructForConditionalGeneration.from_pretrained(image_encoder_path, config=image_encoder_config).encoder
print(image_encoder)
image_encoder.to(device)

processor = Pix2StructImageProcessor.from_pretrained(image_encoder_path) # TODO: define this somewhere else
processor.max_patches = max_patches
processor.patch_size = {"height":patch_size, "width":patch_size}
inputs = processor(images=image, return_tensors="pt").to(device)
print(image_encoder(**inputs))
print(torch.cuda.memory_summary())
# 2000 -> 7G
# 3000 -> 14G
# 4000 -> 25G
# 5000 -> 37G

### Using VIT

In [None]:
from transformers import AutoConfig, AutoImageProcessor, AutoModel
from datasets import load_dataset
import torch
import matplotlib.pyplot as plt

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_dataset = load_dataset("osunlp/Multimodal-Mind2Web", split="train").select(range(10))
image = train_dataset[3]["screenshot"]

# TODO: Move config to somewhere else
image_encoder_path = "google/vit-base-patch16-224"
image_encoder_config = AutoConfig.from_pretrained(image_encoder_path)
# TODO: try different hidden size?
# print(image_encoder_config)

image_encoder = AutoModel.from_pretrained(image_encoder_path, config=image_encoder_config)
# print(image_encoder)
image_encoder.to(device)

downscale_factor = 4
processor = AutoImageProcessor.from_pretrained(image_encoder_path) # TODO: define this somewhere else
processor.size = {"height":5429//downscale_factor, "width":1280//downscale_factor}
inputs = processor(images=image, return_tensors="pt").to(device)
print(inputs.pixel_values.shape)
plt.figure(figsize=(12, 40))
plt.imshow(inputs.pixel_values.cpu()[0].permute((1,2,0)))
plt.show()
h = image_encoder(inputs["pixel_values"], interpolate_pos_encoding=True).last_hidden_state
# print(torch.cuda.memory_summary())
h.shape

### Match downscaled image patch index to target

In [None]:
train_dataset[3]["pos_candidates"]

### Match target index to patch index


bounding_box_rect is in the format of (left, bottom, width, height), so pixel_values[:,bottom:bottom+height,left:left+width] should be marked as positive

unscaled index 2d -> scaled index 2d -> patch index 2d -> patch index 1d

Shortest width / height:

In [None]:
train_dataset = load_dataset("osunlp/Multimodal-Mind2Web", split="train")
cands = train_dataset["pos_candidates"]
shortest = 100
widths = []
heights = []
import json
for cand_list in cands:
    for cand in cand_list:
        json_data = json.loads(cand)
        attributes = json.loads(json_data['attributes'])
        bounding_box_rect_str = attributes['bounding_box_rect']
        lbwh = tuple(map(float, bounding_box_rect_str.split(',')))
        widths.append(lbwh[2])
        heights.append(lbwh[3])
        # if lbwh[2] <= 0 or lbwh[3] <= 0:
        #     print(cand_list)
        #     print(shortest)

        # shortest = min(shortest, lbwh[2], lbwh[3])
        
import matplotlib.pyplot as plt
# plt.hist(widths, bins=100)
# plt.show()
heights = np.array(heights)
plt.hist(heights[heights < 200], bins=100)
plt.axvline(x=32, color='r', linestyle='--')
plt.title("Pos candidates height")


In [None]:
from transformers import ViTImageProcessor
import torch
sample = train_dataset[3]
print(sample["pos_candidates"])

image = sample["screenshot"]
print(image.size)
processor = ViTImageProcessor(size={"height": 5429, "width": 1280})
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inputs = processor(images=image, return_tensors="pt").to(device)
pixel_values = inputs.pixel_values # [1, 3, 224, 224]
print(pixel_values.shape)

import matplotlib.pyplot as plt
import numpy as np
plt.figure()
plt.imshow(np.transpose(pixel_values[0,:,410:410+46,96:96+106].cpu(), (1,2,0)))


processor2 = ViTImageProcessor(size={"height": 5429//2, "width": 1280//2})
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
inputs2 = processor2(images=image, return_tensors="pt").to(device)
pixel_values2 = inputs2.pixel_values # [1, 3, 224, 224]
print(pixel_values2.shape)

import matplotlib.pyplot as plt
import numpy as np
plt.figure()
plt.imshow(np.transpose(pixel_values2[0,:,410//2:(410+46)//2,96//2:(96+106)//2].cpu(), (1,2,0)))

# plt.figure(figsize=(15, 15))
# plt.imshow(np.transpose(pixel_values[0].cpu(), (1,2,0)))
# for i in range(0, 1000, 100):
#     plt.figure()
#     plt.imshow(np.transpose(pixel_values[0,:,i:i+160,i:i+160].cpu(), (1,2,0)))

In [None]:


# for i in range(16):
#     print([i*16+j for j in range(16)])
print(boxes_to_patch_idx_multitarget([96,410.390625,106,46], 640//16))
boxes_to_patch_idx([96,410.390625,106,46], 640//16)