<a href="https://colab.research.google.com/github/theostoican/Guided-Research/blob/main/gr_dino_video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
from transformers import ViTFeatureExtractor, ViTForImageClassification
from transformers import AutoFeatureExtractor, ViTMAEForPreTraining
from transformers import ViTModel

from PIL import Image
import requests
import os
import torch
from torch import nn
import torchvision
import matplotlib.pyplot as plt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 39.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 49.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [None]:
# The preprocessor and the backbone are both loaded from the same
# 'facebook/dino-vitb16' checkpoint so preprocessing matches the weights.
feature_extractor = ViTFeatureExtractor.from_pretrained(
    pretrained_model_name_or_path='facebook/dino-vitb16',
)
model = ViTModel.from_pretrained(
    pretrained_model_name_or_path='facebook/dino-vitb16',
    output_attentions=True,  # make the forward pass return attention weights
)

Downloading:   0%|          | 0.00/244 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/454 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/327M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at facebook/dino-vitb16 and are newly initialized: ['pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from PIL import Image

def inference(images_dir='images', output_dir='attentions'):
  """Save a head-averaged attention map for every image in ``images_dir``.

  Each regular file in ``images_dir`` is converted to RGB, preprocessed with
  the notebook-level ``feature_extractor`` and passed through the
  notebook-level ``model`` (loaded with ``output_attentions=True``). From the
  last entry of ``outputs.attentions`` the attention of query token 0 (the
  [CLS] token in ViT) towards all remaining tokens is taken for every head,
  laid out on the patch grid, upsampled by the patch size, averaged over the
  heads and written to ``output_dir`` as ``attn-<original file name>``.

  Args:
    images_dir: Directory containing the input images.
    output_dir: Directory receiving the attention maps; created if missing.
  """
  patch_size = 16  # must match the patch size of the loaded ViT-B/16 checkpoint
  os.makedirs(output_dir, exist_ok=True)

  # Sorted so the processing (and log) order is deterministic.
  for filename in sorted(os.listdir(images_dir)):
    image_path = os.path.join(images_dir, filename)
    if not os.path.isfile(image_path):
      # Sub-directories (e.g. .ipynb_checkpoints) cannot be opened as images.
      continue

    with open(image_path, "rb") as image_file:
      # convert() forces the pixel data to be read before the file is closed.
      img = Image.open(image_file).convert("RGB")

    inputs = feature_extractor(images=img, return_tensors="pt")

    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
      outputs = model(**inputs)

    attentions = outputs.attentions[-1]
    nh = attentions.shape[1]  # number of heads

    # Patch grid of the tensor that was actually fed to the model.
    grid_rows = inputs.pixel_values.shape[-2] // patch_size
    grid_cols = inputs.pixel_values.shape[-1] // patch_size

    # First batch item, every head, query token 0, keys 1.. (the patch tokens).
    attentions = attentions[0, :, 0, 1:].reshape(nh, grid_rows, grid_cols)
    # Blow every patch up to patch_size x patch_size pixels.
    attentions = nn.functional.interpolate(
        attentions.unsqueeze(0), scale_factor=patch_size, mode="nearest"
    )[0].cpu().numpy()

    fname = os.path.join(output_dir, "attn-" + os.path.basename(filename))
    # Encode as PNG when the output name ends in .png so the file content
    # matches its extension; every other name keeps the JPEG encoding.
    image_format = "png" if fname.lower().endswith(".png") else "jpg"
    plt.imsave(
        fname=fname,
        arr=attentions.mean(axis=0),  # average over the heads
        cmap="inferno",
        format=image_format,
    )
    print(f"{fname} saved.")


In [None]:
# Write a head-averaged attention map into attentions/ for every file in images/.
inference()

attentions/attn-kitchen_small_1_79.png saved.
attentions/attn-kitchen_small_1_8.png saved.
attentions/attn-kitchen_small_1_113.png saved.
attentions/attn-kitchen_small_1_109.png saved.
attentions/attn-kitchen_small_1_179.png saved.
attentions/attn-kitchen_small_1_177.png saved.
attentions/attn-kitchen_small_1_141.png saved.
attentions/attn-kitchen_small_1_146.png saved.
attentions/attn-kitchen_small_1_130.png saved.
attentions/attn-kitchen_small_1_116.png saved.
attentions/attn-kitchen_small_1_131.png saved.
attentions/attn-kitchen_small_1_68.png saved.
attentions/attn-kitchen_small_1_27.png saved.
attentions/attn-kitchen_small_1_72.png saved.
attentions/attn-kitchen_small_1_21.png saved.
attentions/attn-kitchen_small_1_101.png saved.
attentions/attn-kitchen_small_1_160.png saved.
attentions/attn-kitchen_small_1_106.png saved.
attentions/attn-kitchen_small_1_161.png saved.
attentions/attn-kitchen_small_1_64.png saved.
attentions/attn-kitchen_small_1_86.png saved.
attentions/attn-kitche

In [None]:
import cv2
import numpy as np
import glob

from tqdm import tqdm

# Codec codes for cv2.VideoWriter, keyed by the container they are meant for.
# NOTE(review): some OpenCV/FFmpeg builds reportedly warn about the upper-case
# "MP4V" tag and fall back to "mp4v" - confirm before changing the literal.
FOURCC = dict(
    mp4=cv2.VideoWriter_fourcc(*"MP4V"),
    avi=cv2.VideoWriter_fourcc(*"XVID"),
)

def generate_video(images_filename, video_filename, fps=30):
  """Assemble the PNG frames of a directory into a video under ``videos/``.

  Frames are ordered by path length first and lexicographically second, so
  numbered files sharing a prefix play in numeric order (``_2`` before
  ``_10``). The first frame defines the video size; frames of a different
  size are resized to it.

  Args:
    images_filename: Directory holding the frames. For ``'attentions'`` only
      files matching ``attn-*.png`` are used, otherwise every ``*.png``.
    video_filename: Name of the file created inside ``videos/``. Its extension
      selects the codec from ``FOURCC``; unknown extensions use the mp4 codec.
    fps: Frame rate of the generated video.

  Raises:
    FileNotFoundError: If no frame matches in ``images_filename``.
    RuntimeError: If OpenCV cannot open the output file for writing.
  """
  pattern = "attn-*.png" if images_filename == 'attentions' else "*.png"
  # Single sort equivalent to a lexicographic sort followed by a stable
  # sort on the path length.
  frame_paths = sorted(
      glob.glob(os.path.join(images_filename, pattern)),
      key=lambda path: (len(path), path),
  )
  if not frame_paths:
    raise FileNotFoundError(
        f"No frames matching {pattern!r} found in {images_filename!r}"
    )
  os.makedirs('videos', exist_ok=True)
  print(f"Found {len(frame_paths)} frames in {images_filename}/")

  # The first frame fixes the size of the whole video.
  with open(frame_paths[0], "rb") as f:
    size = Image.open(f).size  # (width, height)

  print(f"Generating video {size} to videos/")

  extension = os.path.splitext(video_filename)[1][1:].lower()
  out = cv2.VideoWriter(
      os.path.join('videos', video_filename),
      FOURCC.get(extension, FOURCC["mp4"]),
      fps,
      size,
  )
  if not out.isOpened():
    out.release()
    raise RuntimeError(
        f"Could not open {os.path.join('videos', video_filename)} for writing"
    )

  try:
    # Frames are streamed straight into the writer instead of being
    # collected in memory first.
    for filename in tqdm(frame_paths):
      with open(filename, "rb") as f:
        img = Image.open(f).convert("RGB")
      if img.size != size:
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        img = img.resize(size, Image.LANCZOS)
      out.write(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))
  finally:
    # Always finalize the container, even if a frame fails to load.
    out.release()
  print("Done")

In [None]:
# Video of the saved attention maps (attentions/attn-*.png) -> videos/attention_video.mp4
generate_video('attentions', 'attention_video.mp4')
# Video of the original frames (images/*.png) -> videos/video.mp4
generate_video('images', 'video.mp4')

['attentions/attn-kitchen_small_1_1.png', 'attentions/attn-kitchen_small_1_2.png', 'attentions/attn-kitchen_small_1_3.png', 'attentions/attn-kitchen_small_1_4.png', 'attentions/attn-kitchen_small_1_5.png', 'attentions/attn-kitchen_small_1_6.png', 'attentions/attn-kitchen_small_1_7.png', 'attentions/attn-kitchen_small_1_8.png', 'attentions/attn-kitchen_small_1_9.png', 'attentions/attn-kitchen_small_1_10.png', 'attentions/attn-kitchen_small_1_11.png', 'attentions/attn-kitchen_small_1_12.png', 'attentions/attn-kitchen_small_1_13.png', 'attentions/attn-kitchen_small_1_14.png', 'attentions/attn-kitchen_small_1_15.png', 'attentions/attn-kitchen_small_1_16.png', 'attentions/attn-kitchen_small_1_17.png', 'attentions/attn-kitchen_small_1_18.png', 'attentions/attn-kitchen_small_1_19.png', 'attentions/attn-kitchen_small_1_20.png', 'attentions/attn-kitchen_small_1_21.png', 'attentions/attn-kitchen_small_1_22.png', 'attentions/attn-kitchen_small_1_23.png', 'attentions/attn-kitchen_small_1_24.png', 

100%|██████████| 179/179 [00:00<00:00, 1006.85it/s]


Done
['images/kitchen_small_1_1.png', 'images/kitchen_small_1_2.png', 'images/kitchen_small_1_3.png', 'images/kitchen_small_1_4.png', 'images/kitchen_small_1_5.png', 'images/kitchen_small_1_6.png', 'images/kitchen_small_1_7.png', 'images/kitchen_small_1_8.png', 'images/kitchen_small_1_9.png', 'images/kitchen_small_1_10.png', 'images/kitchen_small_1_11.png', 'images/kitchen_small_1_12.png', 'images/kitchen_small_1_13.png', 'images/kitchen_small_1_14.png', 'images/kitchen_small_1_15.png', 'images/kitchen_small_1_16.png', 'images/kitchen_small_1_17.png', 'images/kitchen_small_1_18.png', 'images/kitchen_small_1_19.png', 'images/kitchen_small_1_20.png', 'images/kitchen_small_1_21.png', 'images/kitchen_small_1_22.png', 'images/kitchen_small_1_23.png', 'images/kitchen_small_1_24.png', 'images/kitchen_small_1_25.png', 'images/kitchen_small_1_26.png', 'images/kitchen_small_1_27.png', 'images/kitchen_small_1_28.png', 'images/kitchen_small_1_29.png', 'images/kitchen_small_1_30.png', 'images/kitch

100%|██████████| 179/179 [00:02<00:00, 86.83it/s]


Done
