In [None]:
import os
import json
from datetime import datetime
from tqdm.notebook import tqdm
from cineplex.db import get_db

In [None]:
import ray

# Shut down any Ray runtime already attached to this kernel before starting a new
# one — presumably so this cell can be re-run without restarting the kernel.
ray.shutdown()
ray.init()

In [None]:
from cineplex.config import Settings
from cineplex.logger import Logger

# Shared by the cells below: `settings` supplies the directory paths they read and
# write, and `logger` is used to report failures from the metadata/move helpers.
settings = Settings()
logger = Logger()

## Discover Videos

In [None]:
# List everything currently in the videos directory and save the listing for the
# "Dedupe Videos" step.
files = os.listdir(settings.youtube_videos_dir)
print(f'found {len(files)} files in {settings.youtube_videos_dir}')

# Saved under the name the dedupe cell loads ('data/file_list_videos.json');
# 'data/files.json' is not read anywhere in this notebook.
with open('data/file_list_videos.json', 'w') as outfile:
    json.dump(files, outfile)

## Dedupe Videos

In [None]:
# The same base filename can be listed with several extensions. For each such base
# name keep one copy, delete the others from disk, and save the surviving list.
videos_dir = settings.youtube_videos_dir

with open('data/file_list_videos.json') as json_file:
    data = json.load(json_file)
print(len(data))

# base filename -> every extension it was listed with
filename_idx = {}
for file in data:
    filename, ext = os.path.splitext(file)
    filename_idx.setdefault(filename, []).append(ext)

dupes = [filename for filename, extensions in filename_idx.items() if len(extensions) > 1]
print(f'found {len(dupes)} duplicate filenames')

remove = []   # copies to delete from disk
missing = []  # listed copies that are no longer on disk (stale list, e.g. a re-run)
for dup in dupes:
    sizes = {}
    for ext in filename_idx[dup]:
        path = os.path.join(videos_dir, f'{dup}{ext}')
        # Only stat copies that still exist, so re-running this cell after a
        # previous pass already deleted files does not crash in getsize().
        if os.path.exists(path):
            sizes[ext] = os.path.getsize(path)
        else:
            missing.append(f'{dup}{ext}')

    if not sizes:
        continue

    # NOTE(review): the smallest copy is the one that is kept — confirm that is intended.
    smallest = min(sizes, key=sizes.get)
    remove.extend(f'{dup}{ext}' for ext in sizes if ext != smallest)

print(f'found {len(remove)} files to remove')
if missing:
    print(f'{len(missing)} listed files are no longer on disk')

for file in remove:
    os.remove(os.path.join(videos_dir, file))

# Drop deleted and already-missing entries in one pass (set lookup instead of
# repeated list.remove calls).
dropped = set(remove) | set(missing)
data = [file for file in data if file not in dropped]

print(len(data))

# write the json file
with open('data/file_list_videos_deduped.json', 'w') as outfile:
    json.dump(data, outfile)

## Missing Ids

In [None]:
# Sort the deduped file list into three groups, based on the end of each file's stem:
#   * fragments: the last 12 characters contain a '.', i.e. an extra dotted suffix
#     sits between the id and the real extension; only the id is recorded
#   * clean:     the last 12 characters start with '-' (a '-' followed by an 11-character id)
#   * anything else is printed for manual review and left out of the clean list
fragments = set()

file_list_videos_clean = []

with open(os.path.join(settings.data_dir, 'file_list_videos_deduped.json')) as json_file:
    data = json.load(json_file)

for file in data:
    filename, ext = os.path.splitext(file)

    # the trailing 12 characters are expected to be '-' plus the youtube id
    tail = filename[-12:]

    if '.' in tail:
        # handle fragments: strip the extra suffix, then take the 11-character id
        filename, ext = os.path.splitext(filename)
        fragments.add(filename[-11:])

    elif not tail.startswith('-'):
        # startswith() also copes with an empty stem, where tail[0] would raise
        print(f'{tail}|{filename}')

    else:
        file_list_videos_clean.append(file)

print(f'found {len(fragments)} fragments')
print(fragments)

# Write next to the input, under settings.data_dir: that is where the following
# cell looks for file_list_videos_clean.json.
with open(os.path.join(settings.data_dir, 'video_fragments.json'), 'w') as outfile:
    # sorted so the saved file does not depend on set iteration order
    json.dump(sorted(fragments), outfile)

with open(os.path.join(settings.data_dir, 'file_list_videos_clean.json'), 'w') as outfile:
    json.dump(file_list_videos_clean, outfile)

## File Indices

### Video File Index

In [None]:
# Build the video index: the last 11 characters of each file's stem (its id) map
# to a small record holding that id and the full file name.
video_list_path = os.path.join(settings.data_dir, 'file_list_videos_clean.json')
with open(video_list_path) as list_file:
    video_names = json.load(list_file)

video_ids_and_names = ((os.path.splitext(name)[0][-11:], name) for name in video_names)
video_file_index = {
    video_id: {'id': video_id, 'filename': name}
    for video_id, name in video_ids_and_names
}

video_index_path = os.path.join(settings.data_dir, 'file_index_videos.json')
with open(video_index_path, 'w') as index_file:
    json.dump(video_file_index, index_file)

In [None]:
# Load the saved video index from disk into `video_file_index`.
with open(os.path.join(settings.data_dir, 'file_index_videos.json')) as index_file:
    video_file_index = json.load(index_file)

### Thumbnail File Index

In [None]:
# Build the thumbnail index: the last 11 characters of each file's stem (its id)
# map to a small record holding that id and the full file name.
thumbnail_list_path = os.path.join(settings.data_dir, 'file_list_thumbnails.json')
with open(thumbnail_list_path) as list_file:
    thumbnail_names = json.load(list_file)

thumbnail_ids_and_names = ((os.path.splitext(name)[0][-11:], name) for name in thumbnail_names)
thumbnail_file_index = {
    video_id: {'id': video_id, 'filename': name}
    for video_id, name in thumbnail_ids_and_names
}

thumbnail_index_path = os.path.join(settings.data_dir, 'file_index_thumbnails.json')
with open(thumbnail_index_path, 'w') as index_file:
    json.dump(thumbnail_file_index, index_file)

In [None]:
# Load the saved thumbnail index from disk into `thumbnail_file_index`.
with open(os.path.join(settings.data_dir, 'file_index_thumbnails.json')) as index_file:
    thumbnail_file_index = json.load(index_file)

### Metadata File Index

In [None]:
# Build the metadata index. Metadata files have two extensions, so both are
# stripped before taking the last 11 characters of the stem as the id.
metadata_list_path = os.path.join(settings.data_dir, 'file_list_metadata.json')
with open(metadata_list_path) as list_file:
    metadata_names = json.load(list_file)

metadata_file_index = {}
for name in metadata_names:
    without_last_ext = os.path.splitext(name)[0]
    stem = os.path.splitext(without_last_ext)[0]
    video_id = stem[-11:]
    metadata_file_index[video_id] = {'id': video_id, 'filename': name}

metadata_index_path = os.path.join(settings.data_dir, 'file_index_metadata.json')
with open(metadata_index_path, 'w') as index_file:
    json.dump(metadata_file_index, index_file)

In [None]:
# Load the saved metadata index from disk into `metadata_file_index`.
with open(os.path.join(settings.data_dir, 'file_index_metadata.json')) as index_file:
    metadata_file_index = json.load(index_file)

## Missing Thumbnails and Metadata

In [None]:
# Ids present in the video index but absent from the thumbnail / metadata index,
# in video-index order.
missing_thumbnails = [vid for vid in video_file_index if vid not in thumbnail_file_index]
missing_metadata = [vid for vid in video_file_index if vid not in metadata_file_index]

print(f'found {len(missing_thumbnails)} missing thumbnails')
missing_thumbnails_path = os.path.join(settings.data_dir, 'missing_thumbnails.json')
with open(missing_thumbnails_path, 'w') as report_file:
    json.dump(missing_thumbnails, report_file)

print(f'found {len(missing_metadata)} missing metadata')
missing_metadata_path = os.path.join(settings.data_dir, 'missing_metadata.json')
with open(missing_metadata_path, 'w') as report_file:
    json.dump(missing_metadata, report_file)

## Extract Metadata Into DB

In [None]:
@ray.remote
def extract_metadata(input):
    """Load one video's metadata JSON and store a summary record in the database.

    Args:
        input: ``(id, filename)`` tuple: the video id and the path of its
            metadata JSON file.

    Returns:
        The video id when the metadata file cannot be opened or parsed, or does
        not hold a JSON object; ``None`` otherwise (the record was stored, or a
        record for this id was already in the database).
    """
    id, filename = input

    # Opening the file is inside the try as well: an unreadable file is reported
    # as bad metadata for this id instead of raising out of the task and failing
    # the whole ray.get() batch. The file is closed before any database call.
    try:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        logger.error(f'Failed to load metadata: {filename}: {e}')
        return id

    # The field lookups below need a mapping; any other JSON value is bad metadata.
    if not isinstance(data, dict):
        logger.error(
            f'Failed to load metadata: {filename}: expected a JSON object, got {type(data).__name__}'
        )
        return id

    db = get_db()
    if db.get(f'video#{id}'):
        # already in the database; leave the existing record untouched
        return None

    doc = {
        'id': id,
        'title': data.get('title', id),  # fall back to the id when there is no title
        'description': data.get('description', ''),
        'tags': data.get('tags', []),
        'categories': data.get('categories', []),
        'channel_id': data.get('channel_id', ''),
        'uploader': data.get('uploader', ''),
        'upload_date': data.get('upload_date', ''),
        'duration': data.get('duration', 0),
        'view_count': data.get('view_count', 0),
        'like_count': data.get('like_count', 0),
        'dislike_count': data.get('dislike_count', 0),
        'average_rating': data.get('average_rating', 0),
        'width': data.get('width', 0),
        'height': data.get('height', 0),
        'format': data.get('format', ''),
        'format_id': data.get('format_id', ''),
        # '' marks a file kind that has no entry in the corresponding index
        'video_file': video_file_index[id]['filename'] if id in video_file_index else '',
        'thumbnail_file': thumbnail_file_index[id]['filename'] if id in thumbnail_file_index else '',
        'metadata_file': metadata_file_index[id]['filename'] if id in metadata_file_index else '',
    }
    db.set(f'video#{id}', json.dumps(doc))

    return None

# Pair every indexed video with the path of its metadata file.
filenames = []
skipped_no_metadata = 0
video_ids = list(video_file_index.keys())
# To try this on a subset first, iterate over video_ids[:200] instead.
for id in tqdm(video_ids):
    metadata = metadata_file_index.get(id)
    if metadata is None:
        # No metadata file is indexed for this video; indexing
        # metadata_file_index[id] directly would raise KeyError and abort the cell.
        skipped_no_metadata += 1
        continue
    filename = os.path.join(settings.youtube_metadata_dir, metadata['filename'])
    filenames.append((id, filename))

if skipped_no_metadata:
    print(f'skipped {skipped_no_metadata} videos without a metadata file')

# One Ray task per metadata file; tasks return the id on failure and None otherwise.
pending = [extract_metadata.remote(x) for x in tqdm(filenames)]
bad_metadata = [x for x in tqdm(ray.get(pending)) if x]

with open(os.path.join(settings.data_dir, 'bad_metadata.json'), 'w') as outfile:
    json.dump(bad_metadata, outfile)

print(f'found {len(bad_metadata)} bad metadata')


## Move Files to Channel Dirs

In [None]:
import shutil

def move_file(src, dst):
    """Move ``src`` to ``dst`` unless ``dst`` already exists.

    Missing parent directories of ``dst`` are created first. When ``dst`` is
    already present nothing is moved and the call still counts as a success.

    Returns:
        True on success (including the "already there" case); False if any
        exception was raised, in which case the failure is logged.
    """
    try:
        already_there = os.path.exists(dst)
        if not already_there:
            parent_dir = os.path.dirname(dst)
            os.makedirs(parent_dir, exist_ok=True)
            shutil.move(src, dst)
    except Exception as e:
        logger.error(f'Failed to move {src} to {dst}: {e}')
        return False
    return True

@ray.remote
def move_files(id):
    """Move a video's file, thumbnail and metadata file into its channel directory.

    The record stored under ``video#<id>`` supplies the three file names and the
    channel id; an empty channel id maps to the ``__unknown__`` directory.

    Args:
        id: the video id.

    Returns:
        ``id`` itself when no database record exists for it; otherwise a list of
        the source paths that could not be moved (empty when nothing failed).
    """
    info = get_db().get(f'video#{id}')
    if not info:
        logger.error(f'Failed to find {id} in db')
        return id
    info = json.loads(info)

    channel_id = info['channel_id'] or '__unknown__'
    channel_dir = os.path.join(settings.youtube_channels_dir, channel_id)

    sources = (
        (settings.youtube_videos_dir, info['video_file']),
        (settings.youtube_thumbnails_dir, info['thumbnail_file']),
        (settings.youtube_metadata_dir, info['metadata_file']),
    )

    res = []
    for src_dir, name in sources:
        if not name:
            # An empty name means there is no such file for this video. Joining ''
            # onto the directories would make src/dst point at the directories
            # themselves, so there is nothing to move here.
            continue
        src = os.path.join(src_dir, name)
        # Every kind of file is reported on failure; the metadata check used to be
        # inverted and reported the file when the move had succeeded.
        if not move_file(src, os.path.join(channel_dir, name)):
            res.append(src)

    return res

# Launch one move task per indexed video, wait for all of them, and keep the
# non-empty results. Each kept entry is whatever move_files returned for one
# video: either the bare id or a list of source paths.
video_ids = list(video_file_index)
move_tasks = [move_files.remote(video_id) for video_id in tqdm(video_ids)]
move_results = ray.get(move_tasks)
not_moved = [result for result in tqdm(move_results) if len(result) > 0]

not_moved_path = os.path.join(settings.data_dir, 'not_moved.json')
with open(not_moved_path, 'w') as report_file:
    json.dump(not_moved, report_file)

print(f'{len(not_moved)} files not moved')

In [None]:
# # rename file
# os.rename(os.path.join(YOUTUBE_VIDEO_DIR, old_file), os.path.join(YOUTUBE_VIDEO_DIR, new_file))

## Download Video

In [None]:
from cineplex.videos import download_video

# Try download_video on a single URL and print the fields read from its result.
try:
    res = download_video('https://www.youtube.com/watch?v=BaW_jenozKc')

    # The result is read as a mapping: an 'info' mapping with the video details,
    # plus the names of the three files produced by the download.
    info = res['info']
    id = info['id']
    title = info['title']
    channel = info['channel']
    channel_id = info['channel_id']
    video_filename = res['video_filename'] 
    thumbnail_filename = res['thumbnail_filename']
    info_filename = res['info_filename']

    # The `name=` f-string form prints each variable's name together with its
    # repr, so the local names above are part of the output.
    print(f'{id=}\n{title=}\n{channel=}\n{channel_id=}\n{video_filename=}\n{thumbnail_filename=}\n{info_filename=}')

except Exception as e:
    # Logging is already being performed in the download_video function
    print(e)


In [None]:
    # # Get channel details
    # request = youtube.channels().list(
    #     part="snippet,contentDetails,statistics",
    #     mine=True
    #     # id="UCqsUJL5xIWuidR7sIrPLhAw",
    # )
    # request = youtube.search().list(
    #     channelId = CHANNEL_ID,
    #     part = 'id,snippet',
    #     type = 'video',
    #     publishedAfter = '2018-12-31T23:59:59Z',
    #     publishedBefore = '2020-01-01T00:00:00Z',
    #     order = 'date',
    #     fields = 'nextPageToken,items(id,snippet)',
    #     maxResults = 50
    # )
    # video_data = {}
