### Check downloaded videos

In [1]:
# check how many subfolders are in the folder training, and check how many videos are in each subfolder

import os

def check_subfolders(path):

    print('\nChecking subfolders in path: ' + path)

    subfolders = [f.path for f in os.scandir(path) if f.is_dir()]
    print('Number of subfolders: ' + str(len(subfolders)), 'in path: ' + path)

    # if no subfolders, count how many videos are in the folder
    if len(subfolders) == 0:
        files = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
        files_ct = len(files)
        print('Number of videos: ' + str(files_ct))
        return

    class_ct = dict()
    total_videos_ct = 0
    for subfolder in subfolders:
        
        # count number of videos in each subfolder
        files = [f for f in os.listdir(subfolder) if os.path.isfile(os.path.join(subfolder, f))]
        files_ct = len(files)
        total_videos_ct += files_ct
        class_ct[subfolder] = files_ct
    
    # sort class_ct by value
    class_ct = {k: v for k, v in sorted(class_ct.items(), key=lambda item: item[1], reverse=True)}

    # print top5 and bottom5 classes
    print('Top 5 classes:')
    for i, (k, v) in enumerate(class_ct.items()):
        if i < 5:
            print(k, v)
    print('Bottom 5 classes:')
    for i, (k, v) in enumerate(class_ct.items()):
        if i > len(class_ct) - 6:
            print(k, v)

    print('Number of total videos: ' + str(total_videos_ct))
    
check_subfolders('../data/ActivityNet_200/training')
check_subfolders('../data/ActivityNet_200/validation')
check_subfolders('../data/ActivityNet_200/testing')


Checking subfolders in path: ../data/ActivityNet_200/training
Number of subfolders: 200 in path: ../data/ActivityNet_200/training
Top 5 classes:
../data/ActivityNet_200/training/Playing_congas 50
../data/ActivityNet_200/training/Surfing 45
../data/ActivityNet_200/training/Using_parallel_bars 43
../data/ActivityNet_200/training/Calf_roping 40
../data/ActivityNet_200/training/Using_the_balance_beam 37
Bottom 5 classes:
../data/ActivityNet_200/training/Decorating_the_Christmas_tree 12
../data/ActivityNet_200/training/Cumbia 11
../data/ActivityNet_200/training/Painting_furniture 10
../data/ActivityNet_200/training/Drinking_beer 9
../data/ActivityNet_200/training/Bullfighting 6
Number of total videos: 4540

Checking subfolders in path: ../data/ActivityNet_200/validation
Number of subfolders: 200 in path: ../data/ActivityNet_200/validation
Top 5 classes:
../data/ActivityNet_200/validation/Playing_congas 25
../data/ActivityNet_200/validation/Playing_drums 21
../data/ActivityNet_200/validatio

### Sample videos 

* sample 3 videos in the training and validation folder for testing with CLIP
* copy the sampled videos to data/ActivityNet_200_sampled/

In [7]:
from tqdm import tqdm
import json
import os

def sample_videos(src_path, dst_path, mode, n_sample = 3):

    print('\nSampling videos from path: ' + src_path)

    # randomly sample 3 videos from each subfolder in src_path, and copy them to dst_path
    subfolders = [f.path for f in os.scandir(src_path) if f.is_dir()]

    src_files = []
    dst_files = []
    for subfolder in subfolders:
        dst_subfolder = dst_path + '/' + subfolder.split('/')[-1]
        files = [f for f in os.listdir(subfolder) if os.path.isfile(os.path.join(subfolder, f))]
        #print(files[0])
        files_ct = len(files)
        if files_ct < n_sample:
            # add all videos' full path in the subfolder to src_files
            for i in range(files_ct):
                src_files.append(subfolder + '/' + files[i])
                dst_files.append(dst_subfolder + '/' + files[i])
        else:            
            for i in range(n_sample): # select the first n_sample videos
                src_files.append(subfolder + '/' + files[i])
                dst_files.append(dst_subfolder + '/' + files[i])
    
    # save the src_files and dst_files to json files    
    with open('output/' + mode + '_src_files.json', 'w') as f:
        json.dump(src_files, f, indent=4)

    with open('output/' + mode + '_dst_files.json', 'w') as f:
        json.dump(dst_files, f, indent=4)

    print(src_files[0])
    print(dst_files[0])
    
    # copy videos from src_files to dst_path
    for src_file, dst_file in tqdm(zip(src_files, dst_files)):
        if not os.path.exists(os.path.dirname(dst_file)):
            os.makedirs(os.path.dirname(dst_file))
            #print('create folder: ' + os.path.dirname(dst_file))
            
        os.system('cp ' + src_file + ' ' + dst_file)

In [None]:
sample_videos('../data/ActivityNet_200/training', '../data/ActivityNet_200_sampled/training', 'training')

In [8]:
sample_videos('../data/ActivityNet_200/validation', '../data/ActivityNet_200_sampled/validation', 'validation')


Sampling videos from path: ../data/ActivityNet_200/validation
../data/ActivityNet_200/validation/Croquet/gSH5ya0pfko.mp4
../data/ActivityNet_200_sampled/validation/Croquet/gSH5ya0pfko.mp4


13it [00:17,  2.49s/it]

In [None]:
# check how many videos are in each subfolder
check_subfolders('../data/ActivityNet_200_sampled/training')
check_subfolders('../data/ActivityNet_200_sampled/validation')

### Resize sampled videos
* resize sampled videos to short edge = 256 pixels

### Extract raw frames
* extract frames

### Prepare annotations