## Library imports

In [1]:
import os
import sys
import pickle
import numpy as np
from tqdm import tqdm

## Constants

In [2]:
TRAIN_NUM_TO_KEEP = 10000
DATA_ROOT_DIR = '/home/surya/Downloads/cam2bev-data-master/1_FRLR'
TRAIN_DIR = os.path.join(DATA_ROOT_DIR, *['train'])
TRAIN_DATASET_FOLDERS = [os.path.join(TRAIN_DIR,x) for x in os.listdir(TRAIN_DIR) \
                           if os.path.isdir(os.path.join(TRAIN_DIR,x))]

## Helper functions

In [3]:
def getFilesInDir(directory):
    return sorted(os.listdir(directory))

def fileNamesWithoutExtension(files):
    return [x.split('.')[-2] for x in files]

def checkFoldersContainSameFiles(folders):
    assert len(folders) > 0
    refFiles = getFilesInDir(folders[0])
    refFilesWithoutExt = set(fileNamesWithoutExtension(refFiles))
    numRefFiles = len(refFiles)
    
    filesMatch = True
    for folder in folders[1:]:
        files = getFilesInDir(folder)
        if (len(files) == numRefFiles):
            filesWithoutExt = set(fileNamesWithoutExtension(files))
            if(len(filesWithoutExt - refFilesWithoutExt) == 0):
                continue
            else:
                filesMatch = False
                print(f"{folder} file names mismatch")
                break
        else:
            filesMatch = False
            print(f"{folder} contains {len(files)} files, while numRefFiles = {numRefFiles}")
            break
            
    return filesMatch

def getRandomIndices(size, numToKeep):
    indices = np.random.choice(size, size=numToKeep, replace=False)
    return indices

def filterListByIndices(data, indices):
    return [data[i] for i in indices]

def deleteFile(filePath):
    if os.path.exists(filePath):
        os.remove(filePath)

In [4]:
refFiles = getFilesInDir(TRAIN_DATASET_FOLDERS[0])
refFilesWithoutExt = set(fileNamesWithoutExtension(refFiles))
numRefFiles = len(refFiles)

indicesToDelete = getRandomIndices(numRefFiles, numRefFiles - TRAIN_NUM_TO_KEEP)
filesToBeDeteled = filterListByIndices(refFiles, indicesToDelete)

In [6]:
for folder in TRAIN_DATASET_FOLDERS:
    print(folder)
    for file in tqdm(filesToBeDeteled):
        absFilePath = os.path.join(folder, file)
        deleteFile(absFilePath)

/home/surya/Downloads/cam2bev-data-master/1_FRLR/train/left


100%|██████████████████████████████████| 23199/23199 [00:00<00:00, 51734.48it/s]


/home/surya/Downloads/cam2bev-data-master/1_FRLR/train/right


100%|██████████████████████████████████| 23199/23199 [00:00<00:00, 44341.24it/s]


/home/surya/Downloads/cam2bev-data-master/1_FRLR/train/front


100%|██████████████████████████████████| 23199/23199 [00:00<00:00, 49655.09it/s]


/home/surya/Downloads/cam2bev-data-master/1_FRLR/train/rear


100%|██████████████████████████████████| 23199/23199 [00:00<00:00, 44175.94it/s]


/home/surya/Downloads/cam2bev-data-master/1_FRLR/train/bev+occlusion


100%|██████████████████████████████████| 23199/23199 [00:00<00:00, 50613.22it/s]


/home/surya/Downloads/cam2bev-data-master/1_FRLR/train/homography


100%|██████████████████████████████████| 23199/23199 [00:00<00:00, 48131.50it/s]


In [7]:
checkFoldersContainSameFiles(TRAIN_DATASET_FOLDERS)

True