In [1]:
import tensorflow as tf
import requests
import re
from typing import Union, List, Optional
import pandas
from dataclasses import dataclass

record = "/media/watemerald/Seagate/data/yt8m/frame/train0001.tfrecord"

features = []
for example in tf.compat.v1.python_io.tf_record_iterator(record):
    tf_example = tf.train.Example.FromString(example)
    features.append(tf_example)


def expand_vid_id(short_id: Union[bytes, str]) -> str:
    """
    """
    # If the short_id is passed as bytes, that means that is was
    # decoded from a TFRecord directly, in which case it's a UTF-8
    # string
    if isinstance(short_id, bytes):
        short_id = short_id.decode("UTF-8")

    url = f"http://data.yt8m.org/2/j/i/{short_id[:2]}/{short_id}.js"
    val = requests.get(url)

    # The return format looks like i("02ab","tvvJFX90eh0");
    # with the short id on the left and full id on the right
    match = re.match(r"i\(\"(?P<short_id>\w{4})\".\"(?P<full_id>\w+)\"\);", val.text)

    return match.group("full_id")


vocabulary = pandas.read_csv("video-search/vocabulary.csv")


def label_id_to_name(label: int) -> str:
    """Converts a single label id number to its full Knowledge Graph Name
    """
    return vocabulary.iloc[label]["Name"]


@dataclass
class VideoInfo:
    short_id: str
    long_id: str
    tags: List[str]


def decode_tf_example(e: tf.train.Example) -> VideoInfo:
    short_id = e.features.feature["id"].bytes_list.value[0]
    labels = e.features.feature["labels"].int64_list.value

    long_id = expand_vid_id(short_id)
    tags = list(map(label_id_to_name, labels))
    return VideoInfo(short_id=short_id.decode("UTF-8"), long_id=long_id, tags=tags,)


from IPython.display import YouTubeVideo


def display_video(vid: VideoInfo) -> YouTubeVideo:

    return YouTubeVideo(vid.long_id)




Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


FileNotFoundError: [Errno 2] File video-search/vocabulary.csv does not exist: 'video-search/vocabulary.csv'

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, LeakyReLU, concatenate
from tensorflow.keras.initializers import glorot_normal

In [3]:
import tensorflow as tf

In [4]:
import numpy as np
import pandas as pd
import glob
import os
import pendulum
from multiprocessing import Pool

In [5]:
# Adapted from https://www.kaggle.com/drn01z3/keras-baseline-on-video-features-0-7941-lb/code

FOLDER = '/media/watemerald/Seagate/data/yt8m/video/'

def ap_at_n(data, n: int =20):
    predictions, actuals = data
    total_num_positives = None

    if len(predictions) != len(actuals):
        raise ValueError("the shape of predictions and actuals does not match.")

    if n is not None:
        if not isinstance(n, int) or n <= 0:
            raise ValueError("n must be 'None' or a positive integer."
                             " It was '%s'." % n)

    ap = 0.0

    sortidx = np.argsort(predictions)[::-1]

    if total_num_positives is None:
        numpos = np.size(np.where(actuals > 0))
    else:
        numpos = total_num_positives

    if numpos == 0:
        return 0

    if n is not None:
        numpos = min(numpos, n)
    delta_recall = 1.0 / numpos
    poscount = 0.0

    # calculate the ap
    r = len(sortidx)
    if n is not None:
        r = min(r, n)
    for i in range(r):
        if actuals[sortidx[i]] > 0:
            poscount += 1
            ap += poscount / (i + 1) * delta_recall
    return ap


def gap(pred, actual):
    lst = zip(list(pred), list(actual))

    with Pool() as pool:
        all = pool.map(ap_at_n, lst)

    return np.mean(all)


def tf_itr(tp='test', batch=1024):
    tfiles = sorted(glob.glob(os.path.join(FOLDER,  f"{tp}*tfrecord")))
    print('total files in %s %d' % (tp, len(tfiles)))
    ids, aud, rgb, lbs = [], [], [], []
    for fn in tfiles:
        for example in tf.compat.v1.python_io.tf_record_iterator(fn):
            tf_example = tf.train.Example.FromString(example)

            ids.append(tf_example.features.feature['id'].bytes_list.value[0].decode(encoding='UTF-8'))
            rgb.append(np.array(tf_example.features.feature['mean_rgb'].float_list.value))
            aud.append(np.array(tf_example.features.feature['mean_audio'].float_list.value))

            yss = np.array(tf_example.features.feature['labels'].int64_list.value)
            out = np.zeros(4716).astype(np.int8)
            for y in yss:
                out[y] = 1
            lbs.append(out)
            if len(ids) >= batch:
                yield np.array(ids), np.array(aud), np.array(rgb), np.array(lbs)
                # yield np.array(rgb), np.array(lbs)
                ids, aud, rgb, lbs = [], [], [], []


def fc_block(x, n=1024, d=0.2):
    x = Dense(n, kernel_initializer=glorot_normal())(x)
    x = BatchNormalization()(x)
    x = LeakyReLU()(x)
    x = Dropout(d)(x)
    return x


def build_mod():
    in1 = Input((128,), name='x1')
    x1 = fc_block(in1)

    in2 = Input((1024,), name='x2')
    x2 = fc_block(in2)

#     x = merge([x1, x2], mode='concat', concat_axis=1)
#     x = tf.concat([x1, x2], 0)
    x = concatenate([x1,x2], 1)
    x = fc_block(x)
    out = Dense(4716, activation='sigmoid', name='output')(x)

    model = Model([in1, in2], out)
    model.compile(optimizer='adam', loss='categorical_crossentropy')
    # model.summary()
    return model


def train():
    if not os.path.exists('weights'): os.mkdir('weights')
    batch = 10 * 1024
    n_itr = 10
    n_eph = 100

    _, x1_val, x2_val, y_val = next(tf_itr('val'))

    model = build_mod()
    cnt = 0
    start = pendulum.now()
    fmt = start.format("Y-MM-DD hh:mm:ss")
    print(f"Started at {fmt}")
    
    for e in range(n_eph):
        for d in tf_itr('train', batch):
            _, x1_trn, x2_trn, y_trn = d
            model.train_on_batch({'x1': x1_trn, 'x2': x2_trn}, {'output': y_trn})
            cnt += 1
            if cnt % n_itr == 0:
                y_prd = model.predict({'x1': x1_val, 'x2': x2_val}, verbose=False, batch_size=100)
                g = gap(y_prd, y_val)
                print('val GAP %0.5f; epoch: %d; iters: %d' % (g, e, cnt))
                now = pendulum.now()
                fmt = now.format("Y-MM-DD hh:mm:ss")
                print(fmt)
                model.save_weights('weights/%0.5f_%d_%d.h5' % (g, e, cnt))

def conv_pred(el):
    t = 20
    idx = np.argsort(el)[::-1]
    return ' '.join(['{} {:0.5f}'.format(i, el[i]) for i in idx[:t]])


def predict():
    model = build_mod()

    wfn = sorted(glob.glob('weights/*.h5'))[-1]
    model.load_weights(wfn)
    print('loaded weight file: %s' % wfn)
    idx, x1_val, x2_val, _ = next(tf_itr('test', 10*1024))

    ypd = model.predict({'x1': x1_val, 'x2': x2_val}, verbose=1, batch_size=32)
    del x1_val, x2_val

    with Pool() as pool:
        out = pool.map(conv_pred, list(ypd))

    df = pd.DataFrame.from_dict({'VideoId': idx, 'LabelConfidencePairs': out})
    df.to_csv('subm1', header=True, index=False, columns=['VideoId', 'LabelConfidencePairs'])




In [37]:
train()

total files in val 3844
Started at 2020-06-04 12:11:35
total files in train 3844
val GAP 0.53307; epoch: 0; iters: 10
2020-06-04 12:11:58
val GAP 0.57145; epoch: 0; iters: 20
2020-06-04 12:12:23
val GAP 0.53293; epoch: 0; iters: 30
2020-06-04 12:12:49
val GAP 0.58610; epoch: 0; iters: 40
2020-06-04 12:13:15
val GAP 0.64954; epoch: 0; iters: 50
2020-06-04 12:13:42
val GAP 0.70675; epoch: 0; iters: 60
2020-06-04 12:14:08
val GAP 0.73263; epoch: 0; iters: 70
2020-06-04 12:14:34
val GAP 0.74543; epoch: 0; iters: 80
2020-06-04 12:15:01
val GAP 0.75433; epoch: 0; iters: 90
2020-06-04 12:15:27
val GAP 0.76547; epoch: 0; iters: 100
2020-06-04 12:15:54
val GAP 0.76767; epoch: 0; iters: 110
2020-06-04 12:16:21
val GAP 0.77408; epoch: 0; iters: 120
2020-06-04 12:16:47
val GAP 0.78366; epoch: 0; iters: 130
2020-06-04 12:17:14
val GAP 0.78244; epoch: 0; iters: 140
2020-06-04 12:17:40
val GAP 0.78764; epoch: 0; iters: 150
2020-06-04 12:18:07
val GAP 0.78782; epoch: 0; iters: 160
2020-06-04 12:18:33


val GAP 0.83301; epoch: 3; iters: 1390
2020-06-04 01:13:10
val GAP 0.83125; epoch: 3; iters: 1400
2020-06-04 01:13:37
val GAP 0.83194; epoch: 3; iters: 1410
2020-06-04 01:14:05
val GAP 0.83110; epoch: 3; iters: 1420
2020-06-04 01:14:32
val GAP 0.83209; epoch: 3; iters: 1430
2020-06-04 01:15:00
val GAP 0.83311; epoch: 3; iters: 1440
2020-06-04 01:15:27
val GAP 0.83224; epoch: 3; iters: 1450
2020-06-04 01:15:54
val GAP 0.83022; epoch: 3; iters: 1460
2020-06-04 01:16:21
val GAP 0.83209; epoch: 3; iters: 1470
2020-06-04 01:16:48
val GAP 0.83370; epoch: 3; iters: 1480
2020-06-04 01:17:16
val GAP 0.83352; epoch: 3; iters: 1490
2020-06-04 01:17:43
val GAP 0.83128; epoch: 3; iters: 1500
2020-06-04 01:18:10
val GAP 0.83417; epoch: 3; iters: 1510
2020-06-04 01:18:38
total files in train 3844
val GAP 0.83182; epoch: 4; iters: 1520
2020-06-04 01:19:07


KeyboardInterrupt: 

In [8]:
predict()

loaded weight file: weights/0.83417_3_1510.h5
total files in test 3844


In [None]:
for

In [None]:
for d in tf_itr("t")