## Import

In [1]:
!pip install transformers
!pip install timm

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m73.3 MB/s[0m eta [36m0:00:0

In [2]:
import os
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import torchvision.models as models # 이미지
from torchvision import transforms
from PIL import Image

from transformers import BertTokenizer, BertModel # 텍스트

from tqdm.auto import tqdm
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform

In [3]:
!wget —load-cookies ~/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget —quiet —save-cookies ~/cookies.txt —keep-session-cookies —no-check-certificate 'https://docs.google.com/uc?export=download&id=1a9XB3r83ZCFWLOHBp8ooz3zQFl9rEIei' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1a9XB3r83ZCFWLOHBp8ooz3zQFl9rEIei" -O open.zip && rm -rf ~/cookies.txt

--2023-08-16 08:53:23--  http://xn--quiet-3u3b/
Resolving xn--quiet-3u3b (xn--quiet-3u3b)... failed: Name or service not known.
wget: unable to resolve host address ‘xn--quiet-3u3b’
--2023-08-16 08:53:23--  http://xn--save-cookies-w19f/
Resolving xn--save-cookies-w19f (xn--save-cookies-w19f)... failed: Name or service not known.
wget: unable to resolve host address ‘xn--save-cookies-w19f’
/root/cookies.txt: Scheme missing.
--2023-08-16 08:53:23--  http://xn--keep-session-cookies-ou2l/
Resolving xn--keep-session-cookies-ou2l (xn--keep-session-cookies-ou2l)... failed: Name or service not known.
wget: unable to resolve host address ‘xn--keep-session-cookies-ou2l’
--2023-08-16 08:53:23--  http://xn--no-check-certificate-ou2l/
Resolving xn--no-check-certificate-ou2l (xn--no-check-certificate-ou2l)... failed: Name or service not known.
wget: unable to resolve host address ‘xn--no-check-certificate-ou2l’
--2023-08-16 08:53:23--  https://docs.google.com/uc?export=download&id=1a9XB3r83ZCFWLOHBp

In [4]:
!unzip open.zip -d ./open

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  inflating: ./open/image/test/test_05880.jpg  
  inflating: ./open/image/test/test_07566.jpg  
  inflating: ./open/image/test/test_09290.jpg  
  inflating: ./open/image/test/test_01284.jpg  
  inflating: ./open/image/test/test_00037.jpg  
  inflating: ./open/image/test/test_07701.jpg  
  inflating: ./open/image/test/test_03156.jpg  
  inflating: ./open/image/test/test_00986.jpg  
  inflating: ./open/image/test/test_01531.jpg  
  inflating: ./open/image/test/test_04526.jpg  
  inflating: ./open/image/test/test_05109.jpg  
  inflating: ./open/image/test/test_10827.jpg  
  inflating: ./open/image/test/test_07676.jpg  
  inflating: ./open/image/test/test_01460.jpg  
  inflating: ./open/image/test/test_09486.jpg  
  inflating: ./open/image/test/test_08309.jpg  
  inflating: ./open/image/test/test_03076.jpg  
  inflating: ./open/image/test/test_07918.jpg  
  inflating: ./open/image/test/test_09718.jpg  
  inflating: ./open/image/test/test_01

## Dataset

In [5]:
class VQADataset(Dataset):
    def __init__(self, df, tokenizer, transform, img_path, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.img_path = img_path
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        img_name = os.path.join(self.img_path, row['image_id'] + '.jpg') # 이미지
        image = Image.open(img_name).convert('RGB')
        image = self.transform(image)

        question = row['question'] # 질문
        question = self.tokenizer.encode_plus(
            question,
            truncation=True,
            add_special_tokens=True,
            max_length=32,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        if not self.is_test:
            answer = row['answer'] # 답변
            answer = self.tokenizer.encode_plus(
                answer,
                max_length=32,
                padding='max_length',
                truncation=True,
                return_tensors='pt')
            return {
                'image': image.squeeze(),
                'question': question['input_ids'].squeeze(),
                'answer': answer['input_ids'].squeeze()
            }
        else:
            return {
                'image': image,
                'question': question['input_ids'].squeeze(),
            }

## Model

In [6]:
class VQAModel(nn.Module):
    def __init__(self, vocab_size):
        super(VQAModel, self).__init__()
        self.vocab_size = vocab_size
        self.vit = timm.create_model('vit_base_patch16_224', pretrained=True)
        config = resolve_data_config({}, model=self.vit)
        transform = create_transform(**config)

        # self.gpt2 = GPT2Model.from_pretrained('gpt2')
        # self.gpt2.resize_token_embeddings(vocab_size) # 추가한 [PAD] 토큰 반영

        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.bert.resize_token_embeddings(vocab_size)

        combined_features_size = 1000 + self.bert.config.hidden_size # resnet 출력 차원 + bert 출력 차원
        self.classifier = nn.Linear(combined_features_size, vocab_size)

    def forward(self, images, question):
        image_features = self.vit(images)
        image_features = image_features.view(image_features.size(0),-1)

        outputs = self.bert(question)
        output_features = outputs.last_hidden_state # [batch, sequence, hidden]

        image_features = image_features.unsqueeze(1).expand(-1, output_features.size(1),-1) # [batch, sequence, 1000]
        combined = torch.cat([image_features, output_features], dim=-1) # [batch, sequence, 1000+hidden]
        output = self.classifier(combined) # [batch, vocab_size]
        return output

## DataLoader

In [7]:
# 데이터 불러오기
train_df = pd.read_csv('open/train.csv')
test_df = pd.read_csv('open/test.csv')
sample_submission = pd.read_csv('open/sample_submission.csv')
train_img_path = 'open/image/train'
test_img_path = 'open/image/test'

# dataset & dataloader
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
vocab_size = len(tokenizer)

transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = VQADataset(train_df, tokenizer, transform, train_img_path, is_test=False)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

## Train & Inference

In [8]:
# device
import torch

device = torch.device("cuda")
print(f"current device is {device}")

def train(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0

    for data in tqdm(loader, total=len(loader)):
        images = data['image'].to(device)
        question = data['question'].to(device)
        answer = data['answer'].to(device)

        optimizer.zero_grad()
        #여기서 문제 발생
        outputs = model(images, question)

        # output: [batch, sequence, vocab], answer : [batch, sequence]
        loss = criterion(outputs.view(-1, outputs.size(-1)), answer.view(-1))
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(loader)
    return avg_loss

current device is cuda


In [9]:
def inference(model, loader):
    model.eval()
    preds = []
    with torch.no_grad():
        for data in tqdm(loader, total=len(loader)):
            images = data['image'].to(device)
            question = data['question'].to(device)
            outputs = model(images, question) # [batch, sequence, vocab]
            _, pred = torch.max(outputs, dim=2) # values, indices = _, pred
            preds.extend(pred.cpu().numpy())

    return preds

## Run!

In [10]:
# Model
model = VQAModel(vocab_size).to(device)

# Criterion and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(1):
    avg_loss = train(model, train_loader, optimizer, criterion)
    print(f"Epoch: {epoch+1}, Loss: {avg_loss:.4f}")

Downloading model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

  0%|          | 0/5618 [00:00<?, ?it/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


KeyboardInterrupt: ignored

## Post-Processing

In [None]:
# Dataset & DataLoader
test_dataset = VQADataset(test_df, tokenizer, transform, test_img_path, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# inference
preds = inference(model, test_loader)

no_pad_output = []
for pred in preds:
    output = pred[pred != 50257] # [PAD] token 제외
    no_pad_output.append(tokenizer.decode(output).strip()) # 토큰 id -> 토큰

## Submission

In [None]:
sample_submission['answer'] = no_pad_output
sample_submission.to_csv('submission.csv', index=False)

In [None]:
solution = pd.read_csv('solution.csv')

##Custom - Prompt

In [None]:
# Model
checkpoint = torch.load('baseline_2.pth')
model = checkpoint['model']
optimizer = checkpoint['optimizer']
epoch = checkpoint['epoch']

image_path = 'image/test'
image = 'test_00000'
question = input()
test_df = pd.DataFrame({"image_id":[image], "question":[question]})

test_dataset = VQADataset(test_df, tokenizer, transform, test_img_path, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
preds = inference(model, test_loader)
for pred in preds:
    output = pred[pred != 50257] # [PAD] token 제외

print('Question: ',question)
print('Answer: ', tokenizer.decode(output).strip()) # 토큰 id -> 토큰
Image.open(f'{image_path}/{image}.jpg').convert('RGB')