In [2]:
import tensorflow as tf
import os
import numpy as np
import requests
import pandas as pd
from video_search.utils import count_records, expand_vid_id
from collections import defaultdict


from typing import Union, Optional
import re

In [46]:
FOLDER = "/media/watemerald/Seagate/data/yt8m/video/"

tp = "val"

In [47]:
count_records(FOLDER, tp)

1112356

In [55]:
ids = []
lbs = []

for e in tf.data.TFRecordDataset(
    tf.data.Dataset.list_files(os.path.join(FOLDER, f"{tp}*tfrecord"))
).as_numpy_iterator():
    tf_example = tf.train.Example.FromString(e)
    ids.append(
        tf_example.features.feature["id"]
        .bytes_list.value[0]
        .decode(encoding="UTF-8")
    ) 
    
    # Convert a list of labels into a 1D vector where all the labels are marked as 1
    yss = np.array(tf_example.features.feature["labels"].int64_list.value)

    lbs.append(yss)
    

In [56]:
records = pd.DataFrame(list(zip(ids,lbs)), columns=["ids", "labels"])

In [147]:
def expand_vid_id(short_id: Union[bytes, str]) -> Optional[str]:
    """
    """
    # If the short_id is passed as bytes, that means that is was
    # decoded from a TFRecord directly, in which case it's a UTF-8
    # string
    if isinstance(short_id, bytes):
        short_id = short_id.decode("UTF-8")

    url = f"http://data.yt8m.org/2/j/i/{short_id[:2]}/{short_id}.js"
    val = requests.get(url)
    
    if val.status_code != 200:
        return None
    
    # The return format looks like i("02ab","tvvJFX90eh0");
    # with the short id on the left and full id on the right
    match = re.match(r"i\(\"(?P<short_id>\w{4})\".\"(?P<full_id>[a-zA-Z0-9_-]+)\"\);", val.text)
    return match.group("full_id")
    

In [80]:
records["full_id"] = records["ids"].apply(expand_vid_id)

KeyboardInterrupt: 

In [81]:
records

Unnamed: 0,ids,labels
0,u8nr,[3]
1,Vnnr,"[3, 13]"
2,Ftnr,"[5, 49, 80, 900, 919]"
3,gznr,"[0, 1, 35, 183]"
4,1Qnr,[928]
...,...,...
1112351,VYsQ,"[6, 8]"
1112352,iMsQ,"[0, 1]"
1112353,UAsQ,"[0, 1, 449]"
1112354,XAsQ,"[0, 1, 111]"


# Evaluation selection criteria


Because of the YouTube API limits, only 10000 operations can be done in a day, therefore only 3000 videos can be used in the final evaluation dataset. So, let's evaluate some criteria about the labels before making our selection

In [84]:
labelcount = defaultdict(int)

for idx, row in records.iterrows():
    for l in row['labels']:
        labelcount[l] += 1

In [98]:
labelcount

defaultdict(int,
            {3: 108632,
             13: 35797,
             5: 67814,
             49: 9781,
             80: 5913,
             900: 371,
             919: 322,
             0: 225529,
             1: 153621,
             35: 14561,
             183: 2543,
             928: 343,
             546: 691,
             12: 37658,
             2: 118692,
             75: 6719,
             90: 4946,
             810: 449,
             429: 933,
             7: 57286,
             320: 1342,
             4: 81778,
             41: 11120,
             296: 1494,
             71: 6836,
             42: 10257,
             111: 3937,
             15: 30726,
             2875: 63,
             100: 4298,
             161: 2901,
             356: 1218,
             704: 531,
             99: 4361,
             208: 2227,
             745: 466,
             1695: 162,
             9: 44531,
             10: 41399,
             8: 52495,
             1216: 260,
             24: 18

In [116]:
rc = pd.DataFrame(labelcount.items(), columns=['id', 'count'])


In [117]:
rc['count'].value_counts(normalize=True) * 100

51       1.165199
56       1.139306
49       1.087519
55       1.035733
54       0.983946
           ...   
277      0.025893
275      0.025893
4361     0.025893
257      0.025893
10245    0.025893
Name: count, Length: 1034, dtype: float64

Let's create a sample of the validation records to use as our final for evaluation

In [94]:
subrecords = records.sample(3000)

In [99]:
subrecordcount = defaultdict(int)

for idx, row in subrecords.iterrows():
    for l in row['labels']:
        subrecordcount[l] += 1

In [100]:
subrecordcount

defaultdict(int,
            {3: 293,
             835: 3,
             5: 154,
             2096: 1,
             14: 98,
             6: 173,
             25: 51,
             2: 325,
             7: 163,
             879: 2,
             0: 578,
             27: 42,
             1: 394,
             42: 33,
             219: 5,
             431: 3,
             3022: 2,
             2011: 2,
             15: 85,
             18: 75,
             67: 20,
             2586: 1,
             41: 28,
             63: 20,
             12: 101,
             9: 148,
             37: 40,
             53: 23,
             58: 19,
             60: 20,
             166: 11,
             353: 5,
             385: 3,
             398: 3,
             68: 22,
             72: 17,
             98: 13,
             240: 3,
             21: 60,
             23: 56,
             24: 59,
             1517: 2,
             305: 8,
             13: 89,
             54: 21,
             3021: 1,
         

In [118]:
src = pd.DataFrame(subrecordcount.items(), columns=['id', 'count'])

In [119]:
src['count']/src['count'].sum()

0       0.032523
1       0.000333
2       0.017094
3       0.000111
4       0.010878
          ...   
1562    0.000111
1563    0.000111
1564    0.000111
1565    0.000111
1566    0.000111
Name: count, Length: 1567, dtype: float64

In [120]:
rc['count']/rc['count'].sum()

0       0.032434
1       0.010688
2       0.020247
3       0.002920
4       0.001765
          ...   
3857    0.000015
3858    0.000010
3859    0.000011
3860    0.000011
3861    0.000013
Name: count, Length: 3862, dtype: float64

In [122]:
d = _119 - _120

In [125]:
d[d> 0.01]

5      0.019092
10     0.063399
12     0.043528
25     0.013366
74     0.012914
75     0.022941
104    0.010628
Name: count, dtype: float64

As the subset is representative (the presence of only 7 of the tags falls more than 1 percentage point outside of the original 3862 tags), let's get the full ids for all of them

In [127]:
full_ids = []

In [140]:
%%capture
from tqdm.notebook import tqdm
tqdm().pandas()

In [148]:
for ix, row in tqdm(subrecords.iterrows(), total=3000):
    full_ids.append(expand_vid_id(row["ids"]))

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [164]:
subrecords['full_id'] = full_ids

There are 163 null full_id's in the dataset. Let's exclude them

In [179]:
subrecords = subrecords[subrecords["full_id"].notnull()]

In [188]:
subrecords

Unnamed: 0,id,labels,full_id
0,6Pfl,"[3, 835]",mw5Ma0FDxs4
1,49ph,"[5, 2096]",7xxFS6SuQ1A
2,50LP,[14],gLliDTQ6bwM
3,Xs0A,"[3, 6]",bN7CFvNZTWY
4,dlJu,[25],cNVJh8M1Qvk
...,...,...,...
2832,4qab,"[15, 18, 67, 170, 616]",MrU85xkeFqY
2833,NsB8,[310],robGsrQQghM
2834,8Aru,"[68, 72, 98, 151]",BTozSbfPodw
2835,I54b,[921],jvrzDWJFZyI


In [182]:
subrecords = subrecords.rename(columns={"ids": "id"})

In [186]:
subrecords = subrecords.reset_index(drop=True)

In [13]:
subrecords.to_csv("evaluation_selection.csv", index=False)

# YouTube API work

In [1]:
import os

import google_auth_oauthlib.flow
import googleapiclient.discovery
import googleapiclient.errors

scopes = ["https://www.googleapis.com/auth/youtube.readonly"]
os.environ["OAUTHLIB_INSECURE_TRANSPORT"] = "1"

In [14]:
subrecords = pd.read_csv("evaluation_selection.csv")

In [22]:
from dotenv import load_dotenv
load_dotenv()

True

In [24]:
api_service_name = "youtube"
api_version = "v3"
api_key = os.environ["YOUTUBE_API_KEY"]

In [25]:
youtube = googleapiclient.discovery.build(api_service_name, api_version, developerKey=api_key)

In [26]:
youtube

<googleapiclient.discovery.Resource at 0x7fde3f27dc90>

In [27]:
request = youtube.videos().list(
    part="snippet",
    id="mw5Ma0FDxs4"
)
response = request.execute()

print(response)

{'kind': 'youtube#videoListResponse', 'etag': 'CPo62z5RvyADVu08k379O5AMFik', 'items': [{'kind': 'youtube#video', 'etag': 'yFLSiH9_CXbMzWz6X_BVX-fVrdg', 'id': 'mw5Ma0FDxs4', 'snippet': {'publishedAt': '2007-04-01T01:01:40Z', 'channelId': 'UCKA0Guu3khu94mPNVEHVeLg', 'title': 'AEROBIKA live at krizis janra club I', 'description': 'Aerobika 30 March 2007 MOSCOW\r\nkrizis janra club part I\r\n---------------------------------------\r\nhttp://community.livejournal.com/aerobika_dance/', 'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/mw5Ma0FDxs4/default.jpg', 'width': 120, 'height': 90}, 'medium': {'url': 'https://i.ytimg.com/vi/mw5Ma0FDxs4/mqdefault.jpg', 'width': 320, 'height': 180}, 'high': {'url': 'https://i.ytimg.com/vi/mw5Ma0FDxs4/hqdefault.jpg', 'width': 480, 'height': 360}}, 'channelTitle': 'Yura Kurokhtin', 'tags': ['aerobika', 'club', 'moscow', 'nu', 'rave', 'acid', 'rock', 'super', 'new'], 'categoryId': '10', 'liveBroadcastContent': 'none', 'localized': {'title': 'AEROBIK

In [36]:
response["items"][0]["snippet"]["tags"]

['aerobika', 'club', 'moscow', 'nu', 'rave', 'acid', 'rock', 'super', 'new']