# AI dataset preparation
We'd like to collect some distinct ads images for training the logo detecting model and ads similarity experiments.

Now we have ads information in Gaia database and ads video stored in S3. To accomplish this task, involves video download, image extraction, upload, PHASH calculation, vector query, etc. I did that in my local machine and dev-machine but it's not very efficient.


## Download ad video

In [11]:
import boto3
import botocore

def download_from_s3(key, distination, bucket='tubi-gaia-stream-production'):

    dev = boto3.session.Session(profile_name='main-eng-staging-dev')
    s3 = dev.resource('s3')

    try:
        s3.Bucket(bucket).download_file(key, distination)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == "404":
            print("The object does not exist.", key)
        else:
            raise

#download_ad('f489dc6e-ae7e-4c76-b10f-4c004b3f4c4c/854x480_1200k.mp4', './f489dc6e-ae7e-4c76-b10f-4c004b3f4c4c.mp4')

## Load initial data
Initail data is from Gaia database in production. I selected the data and exported to csv file at first.

That will be better if we can query from Redshift directly.

In [12]:

init_ls = []
download_from_s3('zhongwei/ads_samples.csv', '/tmp/ads_samples.csv', 'tubi-playground-staging')
with open('/tmp/ads_samples.csv', 'r') as initial_f:
    lines = initial_f.readlines()
    for line in lines:
        item = {}
        ls = line.split(',')
        item['ad_id'] = ls[0]
        item['video_id'] = ls[1]
        item['fingerprint'] = ls[2]
        init_ls.append(item)
    
    print(len(init_ls))

19


## Load video through Rikai

In [13]:
import pandas
from rikai.types import VideoStream

#video = VideoStream(uri='/tmp//tmp_ad_video.mp4')
video = VideoStream(uri='http://ark.tubi.video/f489dc6e-ae7e-4c76-b10f-4c004b3f4c4c/854x480_1200k.mp4')

video




## Video to frames

In [14]:
from PIL import Image as PILImage
import io
import cv2
from rikai.types import Image
import rikai 

def v2i(
    video: VideoStream,
    sample_rate: int = 10,
    max_frames: int = -1
) -> list:
    uri = video.uri
    def from_array(arr):
        arr = arr[:,:,::-1]
        img = PILImage.fromarray(arr)
        img_byte_arr = io.BytesIO()
        img.resize((256, 256)).save(img_byte_arr, format='PNG')
        return Image(img_byte_arr.getvalue())
    cap = cv2.VideoCapture(uri)
    success = cap.grab()  # get the next frame
    tot_samples = 0
    fno = 0
    frames = []
    while success:
        if fno % sample_rate == 0:
            _, img = cap.retrieve()
            frames.append(from_array(img))
            tot_samples += 1
        if max_frames > 0 and tot_samples >= max_frames:
            break
        fno += 1
        success = cap.grab()
    return frames

frames = v2i(video)
print(len(frames))
frames[0]

36


## Frames to montage thumbnail

In [15]:
import skimage.io
import skimage.util
from rikai.spark.functions.vision import numpy_to_image
import numpy as np

def get_montage(uri):
    video = VideoStream(uri=uri)
    frames = v2i(video)
    
    n_frames = []
    for frame in frames:
        n_frames.append(frame.to_numpy())
    m = skimage.util.montage(n_frames, channel_axis=-1)
    m = m[:,:,::-1]
    montage = Image.from_array(m.astype(np.uint8))
    return montage



  from .collection import imread_collection_wrapper


## Prepare montages and fingerprints

In [16]:
import os
import time

# convert bits to bytes
def getbytes(bits):
    done = False
    while not done:
        byte = 0
        for _ in range(0, 8):
            try:
                bit = next(bits)
            except StopIteration:
                bit = 0
                done = True
                return
            byte = (byte << 1) | int(bit)
        yield byte

montages = []
fingerprints = []
for item in init_ls:
    download_from_s3(item['ad_id'] + '/854x480_1200k.mp4', '/tmp/tmp_ad.mp4')
    if os.path.isfile('/tmp/tmp_ad.mp4'):
        montages.append(get_montage('/tmp/tmp_ad.mp4'))

        fingerprint = []
        for b in getbytes(iter(item['fingerprint'])):
            fingerprint.append(np.uint8(b))
        fingerprints.append(fingerprint)

        os.remove('/tmp/tmp_ad.mp4')
        time.sleep(1)
        
print(fingerprints[0])
montages[0]
    

[140, 189, 178, 93, 82, 30, 86, 36, 199, 139, 247, 129, 203, 166, 222, 193, 208, 252, 211, 231, 176, 245, 73, 86, 74, 171, 228, 30, 141, 134, 163, 113, 152, 204, 62, 59, 30, 12, 135, 3, 198, 151, 67, 143, 199, 146, 163, 144, 208, 237, 43, 57, 210, 232, 99, 225, 248, 252, 118, 30, 27, 141, 195, 163, 241, 152, 236, 122, 30, 27, 139, 194]


## Prepare FAISS index

In [17]:
import faiss

index_dimension = 576
sub_index = faiss.IndexBinaryFlat(index_dimension)
index = faiss.IndexBinaryIDMap2(sub_index)
    
def add_with_ids(fingerprints, ids):
    """
    for performance, fingerprints is uint8 matrix, not bytes
    """
    f_len = len(fingerprints)
    id_len = len(ids)
    if f_len == 0 or id_len == 0:
        print("empty fingerprints or ids")
        return

    if f_len != id_len:
        print("add_with_ids len not match", f_len, id_len)
        return

    xb = np.array(fingerprints, dtype='uint8')
    ids = np.array(ids)
    index.add_with_ids(xb, ids)
        

def range_search(xq, radius=32):
    """
    - xq, fingerprints of every images of one video
    - radius of range search
    """
    xq = np.array(xq, dtype='uint8')
    lims, dists, indices = index.range_search(xq, radius)
    return lims, dists, indices


## Search similarity montages through FAISS

In [18]:
videos_number = range(1, len(init_ls)+1)
add_with_ids(fingerprints, videos_number)

query = []
for fingerprint in fingerprints:
    query.append(fingerprint)
xq = np.array(query, dtype='uint8')

lims, dists, indices = range_search(xq, 64)

clusters = []
for i in range(len(lims)-1):
    cluster = []
    for indice in indices[lims[i]:lims[i+1]]:
        if indice not in cluster:
            cluster.append(indice)
    clusters.append(cluster)


clusters



[[1],
 [2],
 [3, 4],
 [3, 4, 5],
 [4, 5],
 [6],
 [7],
 [8],
 [9],
 [10],
 [11],
 [12],
 [13],
 [14],
 [15],
 [16],
 [17],
 [18],
 [19]]

## Filter distinct ad videos (no similar ad videos)

In [19]:
video_ids = []
for cluster in clusters:
    video_number = cluster[0]
    video_ids.append(init_ls[video_number-1]['video_id'])

print(video_ids)

['cf126767-fd46-4b69-af68-1baf068523f2', '3b65f504-100f-4955-9e83-0a463be20720', '23e9d7f4-b307-4b9f-8b0f-bc2007cca350', '23e9d7f4-b307-4b9f-8b0f-bc2007cca350', '3f64f1f0-910d-4561-b9da-157414482ec2', 'c24a7a51-2500-45e2-b705-c354038fd76c', 'd9dd21e0-66c8-4b25-bae0-513aed96046e', '7193e532-b716-4a14-9d9e-c87d94950d1e', '2171ec5a-1ff5-40ee-95d4-176f51d625a5', 'fb200af0-2f2f-4dfe-9901-0e1504a93911', '8185e381-f046-4ba6-9d8a-05d68fbc4acc', '29794f64-4cf4-4747-a606-812d737493e8', 'b6053937-3896-4ca8-88af-9afa530d688e', '292ebb23-7ec3-4d67-8ab9-e3fa80f174fa', '3c17d5a1-e5a8-48c0-baa3-dcf3f1d7acc2', '609b5e9a-aae7-44c5-b8ad-20e484fffe3e', 'befdb935-ee12-409c-9924-3908a0bec5dd', '3e01d541-b42c-427f-8a27-201859e559f2', '05e5a048-1b53-4b3a-aded-ada80b1eca46']
