## Build train/test list
1. set a ratio for training data (you can change this)
2. go to rgb-images folder, save folder names (clip_ref) for training and testing in a list
3. save to txt

### How to run this
1. after running video_preprocessing (you have all frames in folders, labels in folders)
2. create a folder called splitfiles under data
3. run

In [1]:
import os
import pdb

In [2]:
def removeDS(array):
    '''
    Drop the macOS '.DS_Store' entry from a directory listing.

    The list is modified in place (only the first occurrence is removed)
    and the same list object is returned for convenience.
    '''
    ds_name = '.DS_Store'
    if ds_name not in array:
        return array

    array.remove(ds_name)
    return array

In [3]:
def get_Nclasses(path, N):
    '''
    Return the names of the N classes that have the most clips.

    path: folder that contains one sub-folder per class; every entry inside
          a class folder (other than '.DS_Store') is counted as one clip.
          A trailing '/' is optional.
    N:    number of class names to return.

    Returns a list of at most N class names ordered by clip count, largest
    first. Classes with the same clip count are ordered by name, so the
    result does not depend on the order in which the filesystem happens to
    list the folders.
    '''
    clip_nums = {}
    for label in os.listdir(path):
        label_path = os.path.join(path, label)
        # Only folders are classes; a stray file here would make the
        # os.listdir call below fail.
        if label == '.DS_Store' or not os.path.isdir(label_path):
            continue
        clip_nums[label] = sum(
            1 for clip in os.listdir(label_path) if clip != '.DS_Store'
        )

    # Sort by descending clip count, then by name as a deterministic tie-break.
    ranked = sorted(clip_nums.items(), key=lambda item: (-item[1], item[0]))

    return [name for name, _ in ranked[:N]]

In [18]:
# Rank the classes under the label root by clip count and keep the top N.
N = 5
path = '../datasets/ucf24/labels/'
NClasses_names = get_Nclasses(path, N)
print(NClasses_names)

['Pick up from bin with tong or scooper', 'Put item into meal using tongs', 'Put tongs or scooper back in bin', 'Operating POS', 'Put item into meal using hands']


In [19]:
def _sorted_entries(directory):
    '''Return the entries of directory in sorted order, without '.DS_Store'.'''
    return sorted(e for e in os.listdir(directory) if e != '.DS_Store')


def _collect_frames(label_dir, label, clip_refs):
    '''Return a 'label/clip/frame' entry for every file inside each clip folder.'''
    entries = []
    for clip_ref in clip_refs:
        for frame in _sorted_entries(os.path.join(label_dir, clip_ref)):
            entries.append(label + '/' + clip_ref + '/' + frame)
    return entries


def _write_list(file_path, lines):
    '''Write lines to file_path, newline-separated with no trailing newline.'''
    with open(file_path, 'w') as handle:
        handle.write('\n'.join(lines))


def build_lists(path, list_root, NClasses_names, N, train_ratio=None):
    '''
    Build the training and testing lists for the selected classes.

    For every class folder under path whose name is in NClasses_names, the
    clips are sorted by name and split: the first int(num_clips * train_ratio)
    clips go to training, the rest to testing. Rounding down keeps at least
    one clip for testing whenever train_ratio < 1. Classes without clips are
    skipped.

    Four files are written into list_root:
        trainlist_<N>.txt,   testlist_<N>.txt    -> label paths
                                                    (class/clip/frame file)
        trainlist01_<N>.txt, testlist01_<N>.txt  -> clip paths (class/clip)
    Entries always use '/' as separator and are newline-separated.

    path:           folder with one sub-folder per class (trailing '/' optional).
    list_root:      existing folder the four list files are written to
                    (trailing '/' optional).
    NClasses_names: names of the classes to include.
    N:              only used to build the output file names.
    train_ratio:    fraction of each class's clips used for training. When
                    omitted, the module-level variable `train_ratio` is used
                    if it exists (the notebook sets it before calling this
                    function), otherwise 0.8.
    '''
    if train_ratio is None:
        # Backward compatibility: earlier callers set a global instead of
        # passing the ratio in.
        train_ratio = globals().get('train_ratio', 0.8)

    wanted = set(NClasses_names)

    train_refs_full = []
    test_refs_full = []
    train_frames_full = []
    test_frames_full = []

    # Sorted traversal makes the split reproducible; os.listdir order is
    # arbitrary.
    for label in _sorted_entries(path):
        label_dir = os.path.join(path, label)
        # Filter before touching the filesystem so unrelated entries
        # (other classes, stray files) are never listed.
        if label not in wanted or not os.path.isdir(label_dir):
            continue

        clips = _sorted_entries(label_dir)
        if not clips:
            continue

        train_num = int(len(clips) * train_ratio)
        train_refs = clips[:train_num]
        test_refs = clips[train_num:]

        train_refs_full += [label + '/' + ref for ref in train_refs]
        test_refs_full += [label + '/' + ref for ref in test_refs]
        train_frames_full += _collect_frames(label_dir, label, train_refs)
        test_frames_full += _collect_frames(label_dir, label, test_refs)

        print(label + ':')
        print('total clips: ' + str(len(clips)))
        print('training clips: ' + str(len(train_refs)))
        print('testing clips: ' + str(len(test_refs)))
        print('===========================')

    suffix = '_' + str(N) + '.txt'
    _write_list(os.path.join(list_root, 'trainlist' + suffix), train_frames_full)
    _write_list(os.path.join(list_root, 'testlist' + suffix), test_frames_full)
    _write_list(os.path.join(list_root, 'trainlist01' + suffix), train_refs_full)
    _write_list(os.path.join(list_root, 'testlist01' + suffix), test_refs_full)
    

In [20]:
# Label root to read from and output folder for the split files.
path = '../datasets/ucf24/labels/'
list_root = '../datasets/ucf24/splitfiles/'

# Fraction of each class's clips that goes to training; build_lists reads
# this module-level name.
train_ratio = 0.8

# N is the class count chosen in the earlier cell.
NClasses_names = get_Nclasses(path, N)
build_lists(path, list_root, NClasses_names, N)


Operating POS:
total clips: 74
training clips: 59
testing clips: 15
Pick up from bin with tong or scooper:
total clips: 149
training clips: 119
testing clips: 30
Put item into meal using hands:
total clips: 66
training clips: 52
testing clips: 14
Put item into meal using tongs:
total clips: 136
training clips: 108
testing clips: 28
Put tongs or scooper back in bin:
total clips: 89
training clips: 71
testing clips: 18


### datasetName.names file
The `.names` file for each dataset is stored in 'YOWO/data/'; it contains all the class names used for training on that dataset.

In [21]:
def build_class_names(NClasses_names, path, fileName):
    '''
    Write the class names to a .names file, one name per line.

    NClasses_names: list of class names to write.
    path:           existing folder to write into (trailing '/' optional).
    fileName:       name of the file to create inside path.

    The names are joined with newlines; no trailing newline is written.
    '''
    names_path = os.path.join(path, fileName)
    # The context manager closes the file even if the write fails.
    with open(names_path, 'w') as handle:
        handle.write('\n'.join(NClasses_names))

'''
path: folder path in which to save the datasetName.names file
fileName: name of the .names file to write
NClasses_names: list of class names, as returned by get_Nclasses
'''

# Write the selected class names to 'restaurant_<N>.names' in the YOWO data folder.
fileName = 'restaurant_' + str(N) + '.names'
path = '../YOWO/data/'
build_class_names(NClasses_names, path, fileName)