In [1]:
from nvidia.dali import pipeline_def, math
from nvidia.dali.pipeline import Pipeline
from nvidia.dali.ops import WarpAffine
import nvidia.dali.fn as fn
import nvidia.dali.types as types

import cv2
import numpy as np
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt

from utils import *

# Neither name is imported explicitly in this cell, so both presumably come
# from the wildcard `from utils import *` above — TODO confirm in utils.
# Presumably load_dotenv() pulls settings from a .env file and
# Migrator().run() prepares the storage behind `Model` — verify before relying
# on either side effect.
load_dotenv()
Migrator().run()

In [2]:
query = "wedding*moose"

# Look the records up by filename (presumably a pattern match — confirm against
# the Model helper in utils) and keep only those with at least one face.
models = [
    candidate
    for candidate in Model.find(Model.filename % query).all()
    if candidate.faces
]

# Upper bound on samples per batch for the pipelines defined further down.
max_batch_size = 8

In [3]:
def show_images(image_batch):
    """Plot every image of a batch on a grid that is four columns wide.

    The grid is sized from the batch itself, so batches of any length are
    drawn in full and no index past the end of the batch is requested.

    Args:
        image_batch: Batch of images in host memory. It must support
            ``len()`` and ``.at(index)``; every ``.at(index)`` result is
            handed to ``plt.imshow``.

    Returns:
        None. The figure is created through the pyplot state machine.
    """
    columns = 4
    num_images = len(image_batch)
    if num_images == 0:
        # Nothing to draw; avoid creating an empty zero-row grid.
        return

    # Ceiling division so a trailing, partially filled row is still allocated.
    rows = (num_images + columns - 1) // columns
    plt.figure(figsize=(24, (24 // columns) * rows))
    gs = gridspec.GridSpec(rows, columns)
    for j in range(num_images):
        plt.subplot(gs[j])
        plt.axis("off")
        plt.imshow(image_batch.at(j))

In [4]:
class FPENetExternalInputIterator(object):
    def __init__(self, models, batch_size, device_id=0, num_gpus=1):
        self.batch_size = batch_size
        self.filenames_iterable = []
        self.bboxes_iterable = []

        for model in models:
            image_tensor, _ = fn.readers.file(
                files=model.filename, random_shuffle=False
            )
            for face in model.faces:
                self.filenames_iterable.append(model.filename)
                self.bboxes_iterable.append(face.bbox)

        # whole data set size
        self.data_set_len = len(self.filenames_iterable)

        # based on the device_id and total number of GPUs - world size
        # get proper shard
        self.filenames_iterable = self.filenames_iterable[
            self.data_set_len
            * device_id
            // num_gpus : self.data_set_len
            * (device_id + 1)
            // num_gpus
        ]

        self.bboxes_iterable = self.bboxes_iterable[
            self.data_set_len
            * device_id
            // num_gpus : self.data_set_len
            * (device_id + 1)
            // num_gpus
        ]

        self.n = len(self.filenames_iterable)

    def __iter__(self):
        self.i = 0
        return self

    def __next__(self):
        images = []
        bboxes = []

        if self.i >= self.n:
            self.__iter__()
            raise StopIteration

        for _ in range(self.batch_size):
            filename = self.filenames_iterable[self.i % self.n]
            f = open(filename, "rb")
            images.append(np.frombuffer(f.read(), dtype=np.uint8))
            bbox = self.bboxes_iterable[self.i % self.n]
            bboxes.append(
                np.array((bbox.x1, bbox.y1, bbox.x2, bbox.y2), dtype=np.int32)
            )
            self.i = (self.i + 1) % self.n
        return (images, bboxes)

    def __len__(self):
        return self.data_set_len

    next = __next__

In [5]:
def FPENetExternalSourcePipeline(batch_size, num_threads, device_id, external_data):
    """Assemble a DALI pipeline that turns encoded images into 80x80 gray crops.

    Args:
        batch_size: Batch size forwarded to ``Pipeline``.
        num_threads: Thread count forwarded to ``Pipeline``.
        device_id: Device index forwarded to ``Pipeline``.
        external_data: Source handed to ``fn.external_source``; it has to
            provide two outputs per sample, the encoded image bytes (UINT8)
            and a four-element box (INT32) used as slice start and end.

    Returns:
        The ``Pipeline`` with its single output registered; it is returned
        without being built or run.
    """
    pipeline = Pipeline(batch_size, num_threads, device_id)
    with pipeline:
        encoded, face_box = fn.external_source(
            source=external_data, num_outputs=2, dtype=[types.UINT8, types.INT32]
        )

        # Rotate by 90 degrees only when the first shape entry exceeds the
        # second (presumably height > width, i.e. portrait input - confirm
        # against the peek_image_shape output order).
        dims = fn.peek_image_shape(encoded)
        needs_rotation = dims[0] > dims[1]
        rotation_angle = 90.0 * needs_rotation

        decoded = fn.decoders.image(encoded, output_type=types.GRAY, device="mixed")
        rotated = fn.rotate(decoded, angle=rotation_angle)

        # First two box values are the crop start, last two the crop end.
        cropped = fn.slice(rotated, start=face_box[:2], end=face_box[2:])

        resized = fn.resize(
            cropped,
            resize_x=80,
            resize_y=80,
            interp_type=types.DALIInterpType.INTERP_LANCZOS3,
        )
        pipeline.set_outputs(resized)

    return pipeline

In [6]:
# Data source: one sample per face found in `models`.
eii = FPENetExternalInputIterator(models, batch_size=max_batch_size)

# Only the graph is defined here; nothing is built or executed in this cell.
pipe = FPENetExternalSourcePipeline(
    max_batch_size,
    num_threads=2,
    device_id=0,
    external_data=eii,
)

In [22]:
# Disabled preview: uncomment to build the pipeline, run one batch and plot it.
# %matplotlib inline
# pipe.build()
# (pipe_out,) = pipe.run()
# show_images(pipe_out.as_cpu())

In [8]:
# Same lookup as the query cell near the top of the notebook; running it again
# refreshes `models` and resets `max_batch_size` before the warp pipeline below.
query = "wedding*moose"
models = Model.find(Model.filename % query).all()
# Drop records that have no faces attached.
models = [model for model in models if model.faces]
max_batch_size = 8

In [None]:
# Pipeline that warps a blank canvas once per face, using each face's rotation.
# NOTE: the batch size here (16) is independent of `max_batch_size`.
pipe = Pipeline(16, num_threads=8, device_id=0)

with pipe:
    out = []
    for model in models:
        h = model.height
        w = model.width
        # Blank stand-in with the dimensions of the real image.
        dummy_img = np.zeros((h, w, 3))
        for face in model.faces:
            rotation = face.rotation
            # Rotate about the midpoint of the face box. (x2 - x1, y2 - y1)
            # is the extent of the box, not a position inside the image.
            center = (
                (face.bbox.x1 + face.bbox.x2) / 2,
                (face.bbox.y1 + face.bbox.y2) / 2,
            )
            m = cv2.getRotationMatrix2D(center=center, angle=rotation, scale=1)

            # With scale=1 the first row holds cos and sin of the angle; they
            # give the axis-aligned bounds of the rotated w x h canvas.
            abs_cos = abs(m[0, 0])
            abs_sin = abs(m[0, 1])
            bound_w = int(h * abs_sin + w * abs_cos)
            bound_h = int(h * abs_cos + w * abs_sin)

            # Shift so the rotation centre lands in the middle of the output.
            m[0, 2] += bound_w / 2 - center[0]
            m[1, 2] += bound_h / 2 - center[1]

            warp = fn.warp_affine(
                dummy_img,
                matrix=m,
                # DALI takes the output size in layout order: (height, width).
                size=(bound_h, bound_w),
                # OpenCV builds a source->destination matrix, whereas DALI by
                # default interprets `matrix` as destination->source.
                inverse_map=False,
                device="gpu",
            )
            out.append(warp)

    # One pipeline output per face.
    pipe.set_outputs(*out)

In [None]:
pipe.build()
# The pipeline defined above registers one output per face, so run() can
# return any number of results; unpacking into a one-element tuple would raise
# unless there is exactly one.
pipe_outs = pipe.run()
# `pipe_out` still refers to the first output (None when there are no outputs).
pipe_out = pipe_outs[0] if pipe_outs else None