In [1]:
# https://huggingface.co/yuanzhoulvpi/gpt2_chinese
# https://huggingface.co/google/vit-base-patch16-224
# https://huggingface.co/nlpconnect/vit-gpt2-image-captioning

In [2]:
from transformers import (VisionEncoderDecoderModel,
                          ViTModel,GPT2LMHeadModel,
                          AutoTokenizer,ViTImageProcessor,
                          Trainer,TrainingArguments)
from typing import List, Any 
import torch
from torch import Tensor
from PIL import Image
from datasets import load_dataset,Dataset

from tqdm import tqdm 
import numpy as np 
import pandas as pd 

In [3]:
VIT_MODEL_NAME_OR_PATH = "google/vit-base-patch16-224"
GPT_MODEL_NAME_OR_PATH = "yuanzhoulvpi/gpt2_chinese"


VIT_model = ViTModel.from_pretrained(VIT_MODEL_NAME_OR_PATH)
GPT_model = GPT2LMHeadModel.from_pretrained(GPT_MODEL_NAME_OR_PATH, add_cross_attention=True)

GPT_model.config.add_cross_attention# = True

Some weights of the model checkpoint at google/vit-base-patch16-224 were not used when initializing ViTModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing ViTModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ViTModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at yuanzhoulvpi/gpt2_chin

True

In [4]:
processor = ViTImageProcessor.from_pretrained(VIT_MODEL_NAME_OR_PATH)
tokenizer = AutoTokenizer.from_pretrained(GPT_MODEL_NAME_OR_PATH)

In [5]:
def process_image_2_pixel_value(x:str) -> Tensor:
    image = Image.open(x)
    res = processor(images=image, return_tensors='pt')['pixel_values'].squeeze(0)
    return res 


process_image_2_pixel_value(x = "bigdata/image_data/test-9282.jpg").shape 

torch.Size([3, 224, 224])

In [6]:
def process_text_2_input_id(x:str) :
    res = tokenizer(text=x,max_length=100, truncation=True,padding="max_length")['input_ids']
    return res 

len(process_text_2_input_id(x='hhh'))

# len(process_text_2_input_id(x="你好啊，csdhhchsh谁cdshhchshcsdhhhhhhhh")['input_ids'])

100

In [7]:
tokenizer.pad_token_id

21128

In [8]:
# GPT_model.config.add_cross_attention = True
# # GPT_model.crossattention = False
# GPT_model.config.add_cross_attention

# # config.add_cross_attention=True
# # hasattr(GPT_model, "crossattention")

In [9]:
new_encoder_decoder_model = VisionEncoderDecoderModel(
    encoder=VIT_model,
    decoder=GPT_model,
    
)
# new_encoder_decoder_model.config.use_return_dict = False
new_encoder_decoder_model.config.decoder_start_token_id = tokenizer.bos_token_id
new_encoder_decoder_model.config.pad_token_id = tokenizer.pad_token_id

# new_encoder_decoder_model.decoder.config.add_cross_attention=True

In [10]:
torch.tensor(process_text_2_input_id(x='hhh'), dtype=torch.long).unsqueeze(0).shape

torch.Size([1, 100])

In [11]:
new_encoder_decoder_model.config.add_cross_attention = True
new_encoder_decoder_model.config.add_cross_attention

True

In [12]:
dataset = Dataset.from_pandas(df=pd.read_csv("bigdata/clean_train_test/train.csv"))
dataset = dataset.train_test_split(test_size=0.001)


def tokenizer_text(examples) :
    examples['labels'] = [process_text_2_input_id(i) for i in examples['text']]
    # res = [process_text_2_input_id(i) for i in examples['text']]
    # examples['labels'] = [i['input_ids'] for i in res]
    return examples

def transform_images(examples):
    images = [process_image_2_pixel_value(i) for i in examples['image_path']]
    # images = [torch.Tensor(i) for i in images]
    examples['pixel_values'] = images
    return examples

dataset = dataset.map(
    function=tokenizer_text,
    batched=True
)
# dataset = dataset.map(
#     function=transform_images,
#     batched=True
# )

dataset.set_transform(transform=transform_images)


dataset

  0%|          | 0/1308 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['image_path', 'text', 'labels'],
        num_rows: 1307767
    })
    test: Dataset({
        features: ['image_path', 'text', 'labels'],
        num_rows: 1310
    })
})

In [13]:
def collate_fn(examples):
    pixel_values = torch.stack([i['pixel_values'] for i in examples])
    labels = torch.tensor([example["labels"] for example in examples], dtype=torch.long)
    return {
        "pixel_values": pixel_values,
        "labels": labels
    }


train_argument = TrainingArguments(
    output_dir="vit-gpt2-image-chinese-captioning",
    per_device_train_batch_size=48,
    per_device_eval_batch_size=32,
    evaluation_strategy="steps",
    eval_steps=400,
    logging_steps=400,
    gradient_accumulation_steps=8,
    num_train_epochs=1,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=400,
    fp16=True,
    remove_unused_columns=False,
    save_total_limit=4

)



trainer = Trainer(
    model=new_encoder_decoder_model,
    args=train_argument,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=collate_fn,
)
trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 1307767
  Num Epochs = 1
  Instantaneous batch size per device = 48
  Total train batch size (w. parallel, distributed & accumulation) = 384
  Gradient Accumulation steps = 8
  Total optimization steps = 3405
  Number of trainable parameters = 216825600


  0%|          | 0/3405 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 1310
  Batch size = 32


{'loss': 1.6245, 'learning_rate': 0.0004831670371886689, 'epoch': 0.12}


  0%|          | 0/41 [00:00<?, ?it/s]

Saving model checkpoint to vit-gpt2-image-chinese-captioning\checkpoint-400
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-400\config.json
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-400\generation_config.json


{'eval_loss': 1.1368376016616821, 'eval_runtime': 66.9577, 'eval_samples_per_second': 19.565, 'eval_steps_per_second': 0.612, 'epoch': 0.12}


Model weights saved in vit-gpt2-image-chinese-captioning\checkpoint-400\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1310
  Batch size = 32


{'loss': 1.0244, 'learning_rate': 0.0004349349378507369, 'epoch': 0.23}


  0%|          | 0/41 [00:00<?, ?it/s]

Saving model checkpoint to vit-gpt2-image-chinese-captioning\checkpoint-800
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-800\config.json
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-800\generation_config.json


{'eval_loss': 0.934100866317749, 'eval_runtime': 37.9955, 'eval_samples_per_second': 34.478, 'eval_steps_per_second': 1.079, 'epoch': 0.23}


Model weights saved in vit-gpt2-image-chinese-captioning\checkpoint-800\pytorch_model.bin
Deleting older checkpoint [vit-gpt2-image-chinese-captioning\checkpoint-500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1310
  Batch size = 32


{'loss': 0.9268, 'learning_rate': 0.0003617988150619466, 'epoch': 0.35}


  0%|          | 0/41 [00:00<?, ?it/s]

Saving model checkpoint to vit-gpt2-image-chinese-captioning\checkpoint-1200
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-1200\config.json
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-1200\generation_config.json


{'eval_loss': 0.8836056590080261, 'eval_runtime': 40.0826, 'eval_samples_per_second': 32.683, 'eval_steps_per_second': 1.023, 'epoch': 0.35}


Model weights saved in vit-gpt2-image-chinese-captioning\checkpoint-1200\pytorch_model.bin
Deleting older checkpoint [vit-gpt2-image-chinese-captioning\checkpoint-1000] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1310
  Batch size = 32


{'loss': 0.8801, 'learning_rate': 0.0002736074499028474, 'epoch': 0.47}


  0%|          | 0/41 [00:00<?, ?it/s]

Saving model checkpoint to vit-gpt2-image-chinese-captioning\checkpoint-1600
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-1600\config.json
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-1600\generation_config.json


{'eval_loss': 0.8513150215148926, 'eval_runtime': 33.9441, 'eval_samples_per_second': 38.593, 'eval_steps_per_second': 1.208, 'epoch': 0.47}


Model weights saved in vit-gpt2-image-chinese-captioning\checkpoint-1600\pytorch_model.bin
Deleting older checkpoint [vit-gpt2-image-chinese-captioning\checkpoint-1500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1310
  Batch size = 32


{'loss': 0.8534, 'learning_rate': 0.00018223701813346817, 'epoch': 0.59}


  0%|          | 0/41 [00:00<?, ?it/s]

Saving model checkpoint to vit-gpt2-image-chinese-captioning\checkpoint-2000
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-2000\config.json
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-2000\generation_config.json


{'eval_loss': 0.8301102519035339, 'eval_runtime': 31.7757, 'eval_samples_per_second': 41.226, 'eval_steps_per_second': 1.29, 'epoch': 0.59}


Model weights saved in vit-gpt2-image-chinese-captioning\checkpoint-2000\pytorch_model.bin
Deleting older checkpoint [vit-gpt2-image-chinese-captioning\checkpoint-400] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1310
  Batch size = 32


{'loss': 0.8339, 'learning_rate': 9.999180039404274e-05, 'epoch': 0.7}


  0%|          | 0/41 [00:00<?, ?it/s]

Saving model checkpoint to vit-gpt2-image-chinese-captioning\checkpoint-2400
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-2400\config.json
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-2400\generation_config.json


{'eval_loss': 0.8126574158668518, 'eval_runtime': 33.2058, 'eval_samples_per_second': 39.451, 'eval_steps_per_second': 1.235, 'epoch': 0.7}


Model weights saved in vit-gpt2-image-chinese-captioning\checkpoint-2400\pytorch_model.bin
Deleting older checkpoint [vit-gpt2-image-chinese-captioning\checkpoint-800] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1310
  Batch size = 32


{'loss': 0.8214, 'learning_rate': 3.794724221751192e-05, 'epoch': 0.82}


  0%|          | 0/41 [00:00<?, ?it/s]

Saving model checkpoint to vit-gpt2-image-chinese-captioning\checkpoint-2800
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-2800\config.json
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-2800\generation_config.json


{'eval_loss': 0.8048203587532043, 'eval_runtime': 33.377, 'eval_samples_per_second': 39.249, 'eval_steps_per_second': 1.228, 'epoch': 0.82}


Model weights saved in vit-gpt2-image-chinese-captioning\checkpoint-2800\pytorch_model.bin
Deleting older checkpoint [vit-gpt2-image-chinese-captioning\checkpoint-1200] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1310
  Batch size = 32


{'loss': 0.8142, 'learning_rate': 4.4584935273235815e-06, 'epoch': 0.94}


  0%|          | 0/41 [00:00<?, ?it/s]

Saving model checkpoint to vit-gpt2-image-chinese-captioning\checkpoint-3200
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-3200\config.json
Configuration saved in vit-gpt2-image-chinese-captioning\checkpoint-3200\generation_config.json


{'eval_loss': 0.8004471659660339, 'eval_runtime': 27.0937, 'eval_samples_per_second': 48.351, 'eval_steps_per_second': 1.513, 'epoch': 0.94}


Model weights saved in vit-gpt2-image-chinese-captioning\checkpoint-3200\pytorch_model.bin
Deleting older checkpoint [vit-gpt2-image-chinese-captioning\checkpoint-1600] due to args.save_total_limit


In [None]:
'   '