In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.9 MB/s[0m eta [36m0:00:0

In [None]:
import os
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional

import torchvision.models as models # 이미지
from torchvision import transforms
from PIL import Image

from transformers import GPT2Tokenizer, GPT2Model # 텍스트

from tqdm.auto import tqdm

In [None]:
class VQADataset(Dataset):
    """Image / question (/ answer) samples for visual question answering.

    Each item loads ``<img_path>/<image_id>.jpg`` as RGB, applies ``transform``
    and tokenizes the question (and, unless ``is_test``, the answer) to a fixed
    length.

    Args:
        df: DataFrame with ``image_id`` and ``question`` columns, plus an
            ``answer`` column when ``is_test`` is False.
        tokenizer: Object exposing ``encode_plus`` that returns a mapping with
            an ``'input_ids'`` tensor of shape ``[1, max_length]``.
        transform: Callable applied to the RGB PIL image.
        img_path: Directory containing the ``.jpg`` files.
        is_test: When True, items carry no ``'answer'`` entry.
        max_length: Fixed token length for questions and answers.
    """

    def __init__(self, df, tokenizer, transform, img_path, is_test=False, max_length=32):
        self.df = df
        self.tokenizer = tokenizer
        self.transform = transform
        self.img_path = img_path
        self.is_test = is_test
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize ``text`` to exactly ``max_length`` ids and return a 1-D tensor."""
        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # Drop only the leading batch dimension added by return_tensors='pt';
        # a bare squeeze() would also remove any other size-1 dimension.
        return encoded['input_ids'].squeeze(0)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        img_name = os.path.join(self.img_path, row['image_id'] + '.jpg')  # image file
        image = Image.open(img_name).convert('RGB')
        # The transformed image is returned as-is in both modes. The original
        # squeezed it only in training mode, which made train/test items
        # inconsistent and could drop a genuine size-1 spatial dimension.
        image = self.transform(image)

        sample = {
            'image': image,
            'question': self._encode(row['question']),  # question token ids
        }
        if not self.is_test:
            sample['answer'] = self._encode(row['answer'])  # answer token ids
        return sample

In [None]:
class VQAModel(nn.Module):
    """ResNet-50 + GPT-2 model that scores vocabulary tokens per question position.

    The ResNet-50 output vector for the image is repeated along the question's
    sequence dimension, concatenated with GPT-2's hidden states and passed
    through a linear classifier.

    Args:
        vocab_size: Tokenizer vocabulary size (including any added tokens);
            used to resize GPT-2's embeddings and as the classifier width.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.vocab_size = vocab_size

        # Prefer the `weights=` API; `pretrained=True` is deprecated in recent
        # torchvision. IMAGENET1K_V1 is the checkpoint `pretrained=True` loads.
        weights_enum = getattr(models, 'ResNet50_Weights', None)
        if weights_enum is not None:
            self.resnet = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:  # older torchvision without the weights enum
            self.resnet = models.resnet50(pretrained=True)
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        self.gpt2.resize_token_embeddings(vocab_size)  # account for the added [PAD] token

        # ResNet output width (1000) + GPT-2 hidden width
        combined_features_size = 1000 + self.gpt2.config.hidden_size
        self.classifier = nn.Linear(combined_features_size, vocab_size)

    def forward(self, images, question, attention_mask=None):
        """Return logits of shape ``[batch, sequence, vocab_size]``.

        Args:
            images: Image batch fed to the ResNet.
            question: Question token ids, ``[batch, sequence]``.
            attention_mask: Optional mask forwarded to GPT-2 so padded
                question positions are not attended to. ``None`` keeps the
                previous behaviour (no mask passed).
        """
        image_features = self.resnet(images)
        image_features = image_features.flatten(1)  # [batch, 1000]

        outputs = self.gpt2(question, attention_mask=attention_mask)
        output_features = outputs.last_hidden_state  # [batch, sequence, hidden]

        # Repeat the image vector at every sequence position: [batch, sequence, 1000]
        image_features = image_features.unsqueeze(1).expand(-1, output_features.size(1), -1)

        combined = torch.cat([image_features, output_features], dim=-1)  # [batch, sequence, 1000+hidden]
        output = self.classifier(combined)  # [batch, sequence, vocab_size]
        return output

In [None]:
train_df = pd.read_csv('/content/train5.csv')
#test_df = pd.read_csv('/content/test5.csv')
#sample_submission = pd.read_csv('/content/sample_submission.csv')
train_img_path = '/content/train5'
#test_img_path = '/content/test5'

# dataset & dataloader
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Register a padding token so the dataset's fixed-length padding works.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
vocab_size = len(tokenizer)  # includes the added [PAD] token

transform = transforms.Compose([
    # A fixed size is required so the DataLoader can stack images into a batch.
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    # ImageNet statistics, matching the preprocessing of the pretrained ResNet.
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

train_dataset = VQADataset(train_df, tokenizer, transform, train_img_path, is_test=False)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

In [None]:
def train(model, loader, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(images, question)`` that returns
            logits of shape ``[batch, sequence, vocab]``.
        loader: Sized iterable of dicts with ``'image'``, ``'question'`` and
            ``'answer'`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``([N, vocab] logits, [N] targets)``.

    Returns:
        Average of the per-batch losses as a float; ``0.0`` when ``loader``
        yields no batches.
    """
    model.train()

    # Use the device the model already lives on rather than depending on a
    # notebook-global `device` variable being defined beforehand.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    num_batches = len(loader)
    if num_batches == 0:
        # Nothing to train on; avoid dividing by zero below.
        return 0.0

    total_loss = 0.0
    for data in tqdm(loader, total=num_batches):
        images = data['image'].to(device)
        question = data['question'].to(device)
        answer = data['answer'].to(device)

        optimizer.zero_grad()

        outputs = model(images, question)

        # outputs: [batch, sequence, vocab], answer: [batch, sequence]
        # -> flatten to [batch*sequence, vocab] and [batch*sequence]
        loss = criterion(outputs.reshape(-1, outputs.size(-1)), answer.reshape(-1))
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    return total_loss / num_batches

# CLIP으로 이미지 인코딩하기

In [None]:
from transformers import AutoProcessor, CLIPModel
from PIL import Image

In [None]:
# Load the pretrained CLIP checkpoint; a later cell calls its get_image_features() to embed the image.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
# Preprocessor for the same CLIP checkpoint; used below to turn an image into model inputs.
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
row = train_df.iloc[0]
img_name = os.path.join(train_img_path, row['image_id'] + '.jpg')  # image file of the first training row
# Keep `image` as a PIL image: the CLIP processor performs its own resizing,
# rescaling and normalisation, so it should not receive a tensor that
# ToTensor() has already scaled to [0, 1].
image = Image.open(img_name).convert('RGB')
tf_toTensor = transforms.ToTensor()
image_tensor = tf_toTensor(image)  # tensor copy, kept separately for inspection

AttributeError: ignored

In [None]:
# Preprocess the image into PyTorch tensors for CLIP.
# NOTE(review): the recorded output of this cell is a ValueError. The processor applies its own
# preprocessing and presumably expects a PIL image or raw pixel array — confirm what `image` holds here.
inputs = processor(images=image,return_tensors="pt")

ValueError: ignored

In [None]:
# Embed the preprocessed image with CLIP (recorded shape in the next cell: [1, 512]).
image_features = model.get_image_features(**inputs)

In [None]:
image_features.shape

torch.Size([1, 512])

In [None]:
# Reload the stock GPT-2 tokenizer. This rebinds `tokenizer`, replacing the earlier
# instance that had the '[PAD]' special token added.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Encode the question as well.
question = row['question'] # question text
print('question: ',question)
# `question` is rebound from the raw string to the tokenizer output
# (input_ids / attention_mask tensors, unpadded).
question = tokenizer(question, return_tensors="pt")
question

question:  Is this in the wild or zoo?


{'input_ids': tensor([[ 3792,   428,   287,   262,  4295,   393, 26626,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Encode the answer as well.
answer = row['answer']  # ground-truth answer text
print('answer: ',answer)
# Encode without padding, exactly like the question above. The tokenizer loaded
# in this section has no pad token, so padding='max_length' cannot be used, and
# the later cells embed these ids with the unmodified GPT-2 embedding table.
answer = tokenizer(answer, return_tensors='pt')
answer

answer:  zoo


{'input_ids': tensor([[  89, 2238]]), 'attention_mask': tensor([[1, 1]])}

In [None]:
# Vocabulary size of the current tokenizer.
# NOTE(review): the recorded 50258 is one more than the 50257-wide GPT-2 logits shown later,
# which suggests this ran against a tokenizer with '[PAD]' added — confirm after a fresh run.
len(tokenizer)

50258

# MLP 적용하기

In [None]:
# Mapping networks keyed by (input width, prefix length, embedding width), so
# repeated calls reuse the same weights instead of re-initialising them.
_PREFIX_MLPS = {}


def mlp(x, prefix_length=10, embed_dim=768):
  """Map feature vectors to a sequence of prefix embeddings.

  Args:
    x: Tensor of shape ``[batch, features]`` (e.g. CLIP image features).
    prefix_length: Number of prefix positions to produce.
    embed_dim: Width of each prefix embedding.

  Returns:
    Tensor of shape ``[batch, prefix_length, embed_dim]``.

  The network is created once per configuration and cached. The original
  built a new randomly initialised network on every call, so its output
  changed between calls and its weights could never be trained. To optimise
  the mapping, pass ``_PREFIX_MLPS[key].parameters()`` to an optimizer.
  """
  in_features = x.size(-1)
  key = (in_features, prefix_length, embed_dim)
  net = _PREFIX_MLPS.get(key)
  if net is None:
    hidden = embed_dim * prefix_length // 2
    net = nn.Sequential(
        nn.Linear(in_features, hidden),
        # Without a non-linearity the two Linear layers collapse into a
        # single linear map.
        nn.Tanh(),
        nn.Linear(hidden, embed_dim * prefix_length),
    )
    _PREFIX_MLPS[key] = net
  net.to(x.device)  # Module.to() moves parameters in place
  return net(x).view(-1, prefix_length, embed_dim)

In [None]:
# Project the CLIP image embedding into a sequence of prefix embeddings
# (recorded shape below: [1, 10, 768]).
image_prefix=mlp(image_features)

In [None]:
# Inspect the prefix values and confirm the shape.
print(image_prefix)
print(image_prefix.shape)

tensor([[[ 0.2979, -0.0943, -0.0903,  ..., -0.1858, -0.2594,  0.0443],
         [ 0.1376, -0.1685, -0.1475,  ...,  0.0688, -0.0308,  0.0699],
         [-0.0504,  0.0376,  0.1600,  ...,  0.0264, -0.0704,  0.1681],
         ...,
         [-0.0528, -0.0857,  0.2826,  ...,  0.0148,  0.0682,  0.2821],
         [-0.0612, -0.0903,  0.0092,  ..., -0.1612, -0.0103, -0.0688],
         [-0.0527, -0.0641,  0.1242,  ...,  0.1958,  0.2127,  0.0733]]],
       grad_fn=<ViewBackward0>)
torch.Size([1, 10, 768])


# cat

In [None]:
from transformers import GPT2LMHeadModel
# GPT-2 with its language-modelling head; used below for generation and for logits.
gpt = GPT2LMHeadModel.from_pretrained('gpt2')
# Rebinds `tokenizer` to a freshly loaded GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
# Width of GPT-2's token-embedding matrix (second dimension of wte.weight).
gpt_embedding_size =gpt.transformer.wte.weight.shape[1]
gpt_embedding_size

768

In [None]:
# NOTE: despite the name, at this point `embedding_text` holds the question's token ids, not embeddings.
embedding_text=question['input_ids']

In [None]:
# Look up GPT-2 token embeddings for the question ids. Read the ids from
# `question` directly so this cell stays idempotent: the original fed
# `embedding_text` back into itself, so a second run passed float embeddings
# to the embedding lookup.
embedding_text = gpt.transformer.wte(question['input_ids'])

In [None]:
embedding_text.shape

torch.Size([1, 8, 768])

In [None]:
# Answer token ids, kept as the target label.
label=answer['input_ids']

In [None]:
label.shape

torch.Size([1, 2])

In [None]:
# Prepend the image prefix to the question embeddings along the sequence dimension (dim=1).
embedding_cat = torch.cat((image_prefix, embedding_text), dim=1)

In [None]:
embedding_cat.shape

torch.Size([1, 18, 768])

# GPT2LMHeadModel에 넣기

*   loss계산하기



gpt.generate

In [None]:
# generate() is driven by embeddings only, so it cannot infer a mask or pad id
# from token ids: pass both explicitly. Bound the output with max_new_tokens
# rather than max_length, as the library warning for this call recommends.
prefix_attention_mask = torch.ones(embedding_cat.shape[:2], dtype=torch.long)
output = gpt.generate(
    inputs_embeds=embedding_cat,
    attention_mask=prefix_attention_mask,
    max_new_tokens=1,  # one generated token, as in the recorded run
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
)
predicted_text = tokenizer.decode(output[0],skip_special_tokens=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 1, but `max_length` is set to 1. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [None]:
output[0]

tensor([50256,   198])

In [None]:
predicted_text

'\n'

gpt


In [None]:
# Plain forward pass on the combined prefix + question embeddings (no token ids are passed).
out = gpt(inputs_embeds=embedding_cat)

In [None]:
# First element of the model output; its recorded shape [1, 18, 50257] matches the logits.
predictions=out[0]

In [None]:
# Greedy pick: highest-scoring vocabulary id at the last sequence position of the first batch item.
predicted_index = torch.argmax(predictions[0, -1, :]).item()

In [None]:
predictions.shape

torch.Size([1, 18, 50257])

In [None]:
# Decode the predicted token id back to text.
predicted_text = tokenizer.decode([predicted_index])

In [None]:
predicted_text

'\n'

In [None]:
# Logits of the forward pass, taken via attribute access.
logits = out.logits

In [None]:
logits.shape

torch.Size([1, 18, 50257])

In [None]:
# Flatten all leading dimensions so each row holds one position's vocabulary scores.
# NOTE(review): the name `a` is uninformative, and the recorded shape in the next cell ([8, 50257])
# does not match the [1, 18, 50257] logits shown above — likely a stale output; re-run to confirm.
a=logits.reshape(-1, logits.shape[-1])

In [None]:
a.shape

torch.Size([8, 50257])

In [None]:
label

tensor([[  89, 2238]])

In [None]:
label.shape

torch.Size([1, 2])

In [None]:
# Look up GPT-2 token embeddings for the answer ids.
embedding_label = gpt.transformer.wte(label)

In [None]:
embedding_label.shape

torch.Size([1, 2, 768])

In [None]:
embedding_label

tensor([[[-0.0141, -0.0427,  0.0941,  ...,  0.0899,  0.0009, -0.0615],
         [-0.0748,  0.0082,  0.0868,  ..., -0.0681, -0.1397,  0.1576]]],
       grad_fn=<EmbeddingBackward0>)

오차계산

In [None]:
# Teacher-forced loss over the answer tokens.
# The earlier logits cover only the prefix + question positions, so their row
# count can never match the number of answer tokens. Append the answer
# embeddings to the input and score only the positions that predict an answer
# token.
target_ids = tokenizer.encode(row['answer'], return_tensors='pt')  # [1, answer_len]
answer_embeds = gpt.transformer.wte(target_ids)                    # [1, answer_len, hidden]
train_embeds = torch.cat((embedding_cat, answer_embeds), dim=1)    # [1, context_len + answer_len, hidden]
train_logits = gpt(inputs_embeds=train_embeds).logits

context_len = embedding_cat.size(1)
answer_len = target_ids.size(1)
# The logit at position t predicts the token at position t + 1, so the answer
# tokens are predicted by positions context_len - 1 ... context_len + answer_len - 2.
answer_logits = train_logits[:, context_len - 1:context_len - 1 + answer_len, :]

criterion = nn.CrossEntropyLoss()
loss = criterion(answer_logits.reshape(-1, answer_logits.size(-1)), target_ids.view(-1))

ValueError: ignored

In [None]:
target_ids.view(-1)

tensor([  89, 2238])

In [None]:


# Encode the ground-truth answer text 'zoo' with the tokenizer.
target_text = 'zoo'
target_ids = tokenizer.encode(target_text, return_tensors='pt')

In [None]:
target_ids.view(-1)

tensor([  89, 2238])

In [None]:
 # cross_entropy expects [N, vocab] scores and [N] class ids. Score the first answer token
 # against the logits at the last context position, which is the position that predicts the
 # token following the question. ignore_index=0 is dropped: id 0 is a regular vocabulary
 # entry here, not padding. Later answer tokens need the answer fed back in (teacher forcing).
 loss = functional.cross_entropy(logits[:, -1, :], target_ids[:, 0])

RuntimeError: ignored