## TO-DO:

- Build pure ViT with new TP architecture
    - Use lucidrains base, 2 layers
    - Create my TP-Block
- Extend to language
- Explore use of T2T for vision pipeline

## Modeling

In [None]:
from transformers import ViltProcessor, ViltModel
from PIL import Image
import requests

# prepare image and text
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "hello world"

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")

inputs = processor(image, text, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state

## Data

In [1]:
from torch.utils.data import Dataset, DataLoader


class RandomImageDataset(Dataset):
    def __init__(self, n_samples):
        assert n_samples % 2 == 0, "n_samples must be an even number"
        self.n_samples = n_samples
        self.labels = [0] * int(self.n_samples/2) + [1] * int(self.n_samples/2)
        
    def __len__(self):
        return self.n_samples
    
    def __getitem__(self, idx):
        if idx < self.n_samples/2:
            x = torch.normal(0, 0.1, size=(3, 224, 224))
        else:
            x = torch.normal(1, 0.1, size=(3, 224, 224))
        y = self.labels[idx]
        return x, y
    
train_ds = RandomImageDataset(100)
train_datagen = DataLoader(train_ds, batch_size=2, shuffle=True)


## Training

In [2]:
from models.tpr_block_vit import TP_ViT

model = TP_ViT(
    image_size=224,
    patch_size=16,
    num_classes=2,
    dim=768,
    depth=2,
    heads=8,
    mlp_dim=3072
)


NOTE TO SELF:

Query size should be size of dim head - aka not 768 but 768/n_heads

In [5]:
import torch
import torch.optim as optim

from torch import nn


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)


for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(train_datagen, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 5 == 4:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

[1,     5] loss: 0.001
[1,    10] loss: 0.000
[1,    15] loss: 0.000
[1,    20] loss: 0.000
[1,    25] loss: 0.000
[1,    30] loss: 0.000
[1,    35] loss: 0.000
[1,    40] loss: 0.000
[1,    45] loss: 0.000
[1,    50] loss: 0.000
[2,     5] loss: 0.000
[2,    10] loss: 0.000
[2,    15] loss: 0.000
[2,    20] loss: 0.000
[2,    25] loss: 0.000
[2,    30] loss: 0.000
[2,    35] loss: 0.000
[2,    40] loss: 0.000
[2,    45] loss: 0.000
[2,    50] loss: 0.000
Finished Training


## Playground

In [5]:
'''Below is the correct matmul implementation
for TPR with elementwise multiplication across the 
embedding dimension'''


import torch

f = torch.rand((7, 768, 197, 1)) # b, d, n, 1
r = torch.rand((7, 768, 1, 8)) # b, d, 1, r

out = torch.matmul(f, r)

print(out.size())

torch.Size([7, 768, 197, 8])


In [6]:
f = torch.rand((1, 2, 3, 1))
r = torch.rand((1, 2, 1, 4))

out = torch.matmul(f, r)

print(f)
print(r)
print(out)

tensor([[[[0.5913],
          [0.5172],
          [0.1101]],

         [[0.9911],
          [0.5339],
          [0.3225]]]])
tensor([[[[0.5453, 0.7955, 0.5617, 0.5944]],

         [[0.9313, 0.0784, 0.9523, 0.9734]]]])
tensor([[[[0.3224, 0.4704, 0.3321, 0.3515],
          [0.2820, 0.4114, 0.2905, 0.3074],
          [0.0600, 0.0876, 0.0618, 0.0654]],

         [[0.9229, 0.0777, 0.9437, 0.9647],
          [0.4972, 0.0419, 0.5084, 0.5197],
          [0.3004, 0.0253, 0.3071, 0.3139]]]])


In [7]:
print(out.size())

torch.Size([1, 2, 3, 4])


role-1 token-1 = 0.3224, 0.9229

Should be from 0.5453 * 0.5913, 0.9313 * 0.9911

role-3 token-2 = 0.2905, 0.5084

Should be from 0.5617 * 0.5172, 0.9523 * 0.5339

In [8]:
import torch
import torch.nn as nn


net = TP_ViT(
    image_size=224,
    patch_size=16,
    num_classes=2,
    dim=768,
    depth=2,
    heads=12,
    mlp_dim=3072,
    n_roles=12
)
x = torch.ones((1, 3, 224, 224))
# compute the forward pass to create the computation graph
y = net(x)

# use computation graph to find all contributing tensors
def get_contributing_params(y, top_level=True):
    nf = y.grad_fn.next_functions if top_level else y.next_functions
    for f, _ in nf:
        try:
            yield f.variable
        except AttributeError:
            pass  # node has no tensor
        if f is not None:
            yield from get_contributing_params(f, top_level=False)

contributing_parameters = set(get_contributing_params(y))
all_parameters = set(net.parameters())
non_contributing = all_parameters - contributing_parameters
print(non_contributing)  # returns the [999999.0] tensor

{Parameter containing:
tensor([[-0.2226, -0.0301, -0.2079,  0.2085, -0.0009, -0.0717,  0.1685, -0.2243,
          0.0261, -0.1489, -0.1216, -0.0095],
        [ 0.0681,  0.1024, -0.2068, -0.1476,  0.0485,  0.0148, -0.1064,  0.0873,
          0.0888, -0.0157,  0.0807,  0.0339]], requires_grad=True), Parameter containing:
tensor([0.1804, 0.0633], requires_grad=True)}


## Evaluate Stock ViT @ 224

In [1]:
from transformers import ViTFeatureExtractor, ViTForImageClassification
import torch
import torchmetrics
import torchvision
from torch.utils.data import DataLoader

import pytorch_lightning as pl


class PLModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
        self.loss = torch.nn.CrossEntropyLoss()
        self.acc = torchmetrics.Accuracy()
        
    def forward(self, x):
        logits = model(x).logits
        return logits
        
    def test_step(self, test_batch, batch_idx):
        x, y = test_batch
        outputs = self.forward(x)
        loss = self.loss(outputs, y)
        acc = self.acc(self.softmax(outputs, dim=1), y)
        self.log('test_loss', loss)
        self.log('test_acc', acc)
        
        
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

def transform(image):
    inputs = feature_extractor(image, return_tensors="pt")
    return inputs.pixel_values.squeeze()

valid_ds = torchvision.datasets.ImageFolder(
    r'B:\Datasets\ImageNet2\validation',
    transform=transform
)
valid_datagen = DataLoader(valid_ds, batch_size=8, shuffle=False)

In [None]:
trainer = pl.Trainer(accelerator='gpu', devices=1)
model = PLModel()
model.eval;
# trainer.test(model, valid_datagen)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Missing logger folder: B:\PhD\Projects\tp-vilt\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Testing: 0it [00:00, ?it/s]

In [2]:
print(len(valid_ds))

50000


In [2]:
print(valid_ds[0][0].size())

torch.Size([3, 224, 224])


In [6]:
from tqdm.notebook import tqdm

model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
model = model.to('cuda')
acc_fn = torchmetrics.Accuracy().to('cuda')

accs = []
for batch in tqdm(valid_datagen):
    x, y = batch
    with torch.no_grad():
        logits = model(x.to('cuda')).logits
    acc = acc_fn(torch.nn.functional.softmax(logits, dim=1), y.to('cuda'))
    accs.append(acc)
    
print(sum(accs)/len(accs))

  0%|          | 0/6250 [00:00<?, ?it/s]

tensor(0.8033, device='cuda:0')


In [7]:
print(valid_datagen[1].size())

TypeError: 'DataLoader' object is not subscriptable