In [2]:
import os
import shutil
import random
from collections import defaultdict, Counter

def get_class_files(txt_folder):
    """
    Scan a folder of label files and index them by the classes they contain.

    Only directory entries ending in '.txt' are read. The first
    whitespace-separated token of every non-blank row is parsed as an integer
    class id; a blank row is recorded as the pseudo class -1. A file that
    contains no rows at all is listed in the file list but assigned to no class.

    Parameter:
    - txt_folder: folder where the txt files with the coordinates are stored

    Returns a tuple (class_files, all_files, class_counter):
    - class_files: defaultdict mapping class id -> names of the files containing it
    - all_files: names of all txt files, in directory-listing order
    - class_counter: Counter mapping class id -> number of files containing it
    """
    files_by_class = defaultdict(list)
    files_per_class = Counter()
    label_files = []

    for name in os.listdir(txt_folder):
        if not name.endswith('.txt'):
            continue

        with open(os.path.join(txt_folder, name), 'r') as handle:
            rows = [row.strip() for row in handle]

        # Distinct classes of this file; every class is counted once per file
        present = {int(row.split()[0]) if row else -1 for row in rows}

        for class_id in present:
            files_by_class[class_id].append(name)
            files_per_class[class_id] += 1

        label_files.append(name)

    return files_by_class, label_files, files_per_class

def distribute_files_based_on_ratio(all_files, train_ratio=0.8, test_ratio=0.15, val_ratio=0.05):
    """
    Randomly split the files into training, test and validation subsets.

    The split is purely random over the file names (it is not stratified by
    class). The input is copied before shuffling, so the caller's list keeps
    its order.

    Parameters:
    - all_files: List (or any iterable) of all txt files that contain the coordinates
    - train_ratio: training data ratio. In our case 80%
    - test_ratio: test data ratio. In our case set to 15%
    - val_ratio: validation data ratio. In our case set to 5%. Kept for
      interface compatibility only: the validation subset always receives
      every file that is left after the training and test subsets were cut,
      so no file is dropped by rounding.

    Returns a tuple (train_files, test_files, val_files) of lists.
    """
    # Shuffle a private copy instead of reordering the caller's list in place
    shuffled = list(all_files)
    random.shuffle(shuffled)

    total_files = len(shuffled)
    # int() truncates, so train/test sizes are rounded down
    train_size = int(total_files * train_ratio)
    test_size = int(total_files * test_ratio)
    test_end = train_size + test_size

    train_files = shuffled[:train_size]
    test_files = shuffled[train_size:test_end]
    val_files = shuffled[test_end:]  # remainder

    return train_files, test_files, val_files

def copy_files(file_list, src_txt_folder, src_img_folder, dest_txt_folder, dest_img_folder, class_counter):
    """
    Copies text and frame files into the target folders. Skips files where no image exists.
    Returns the class distribution for the target folder.

    The image belonging to a text file is expected to have the same base name
    with the extension '.jpg'. Only the final extension of the text file name
    is exchanged, so names that contain '.txt' somewhere in the middle are
    resolved correctly.

    Parameters:
    - file_list: List of files: train, test, validation files
    - src_txt_folder: path to source text file folder
    - src_img_folder: path to source image file folder
    - dest_txt_folder: path to destination text folder (created if missing)
    - dest_img_folder: path to destination image folder (created if missing)
    - class_counter: Counter() for different classes (accepted for interface
      compatibility; it is neither read nor modified here)

    Returns a Counter mapping class id -> number of rows with that class in
    the copied text files. Blank rows are counted under the class id -1.
    """
    os.makedirs(dest_txt_folder, exist_ok=True)
    os.makedirs(dest_img_folder, exist_ok=True)

    class_counter_in_folder = Counter()

    for file in file_list:
        txt_file = os.path.join(src_txt_folder, file)
        # Swap only the trailing extension; str.replace would rewrite every
        # '.txt' occurrence inside the name.
        stem, _ = os.path.splitext(file)
        img_file = os.path.join(src_img_folder, stem + '.jpg')

        if not os.path.exists(img_file):  # Check, whether image exists
            print(f"Image for {file} missing. File skipped.")
            continue

        shutil.copy(txt_file, dest_txt_folder)
        shutil.copy(img_file, dest_img_folder)

        # Count classes of copied files, one count per row
        with open(txt_file, 'r') as f:
            for line in f:
                line = line.strip()
                class_id = int(line.split()[0]) if line else -1
                class_counter_in_folder[class_id] += 1

    return class_counter_in_folder

def print_class_distribution(class_counter, total_count, title):
    """
    Print a small table with the count and the percentage of every class.

    Rows are sorted by class id. Each percentage is count / total_count * 100
    with two decimals; when total_count is not positive, 0.00% is printed
    instead of dividing.

    Parameters:
    - class_counter: Counter() (or mapping) from class id to count
    - total_count: Value that corresponds to 100%
    - title: Title printed above the table
    """
    print(f"\n{title}")

    header = f"{'Class':<10}{'Count':<10}{'Percentage':<10}"
    print(header)

    has_total = total_count > 0
    for class_id, count in sorted(class_counter.items()):
        if has_total:
            percentage = (count / total_count) * 100
        else:
            percentage = 0
        print(f"{class_id:<10}{count:<10}{percentage:.2f}%")

def process_dataset(txt_folder, img_folder, root_path, train_ratio=0.8, test_ratio=0.15, val_ratio=0.05):
    """
    Main process: divides the files randomly into training, test and validation
    data, copies them into <root_path>/{train,test,val}/{txt_files,img_files}
    and prints the class distributions.

    Two kinds of tables are printed:
    - "Original Distribution": number of text files that contain each class,
      relative to the number of all text files.
    - One table per split: number of rows (annotations) per class among the
      files that were actually copied, relative to the total number of counted
      rows of that split, so the percentages of a split add up to 100%.

    Parameters:
    - txt_folder: Path to folder which contains all the coordinates files
    - img_folder: Path to folder which contains all the images to the corresponding text files
    - root_path: Path to directory which contains all the folders
    - train_ratio: Training data ratio
    - test_ratio: Test data ratio
    - val_ratio: Validation data ratio
    """
    print('Processing dataset...')

    # Get list of all files and per-class file counts (the per-class file
    # lists are not needed here)
    _, all_files, class_counter = get_class_files(txt_folder)
    total_files = len(all_files)

    # Distribute files based on the requested ratios
    train_files, test_files, val_files = distribute_files_based_on_ratio(
        all_files, train_ratio, test_ratio, val_ratio
    )

    splits = (
        ('train', train_files, "Training data distribution"),
        ('test', test_files, "Test data distribution"),
        ('val', val_files, "Validation data distribution"),
    )

    # Copy files and count class distribution in destination folders
    split_counters = []
    for folder, files, title in splits:
        counter = copy_files(
            files,
            txt_folder,
            img_folder,
            os.path.join(root_path, folder, 'txt_files'),
            os.path.join(root_path, folder, 'img_files'),
            class_counter,
        )
        split_counters.append((title, counter))

    # Class counts before distribution
    print_class_distribution(class_counter, total_files, "Original Distribution")

    # Class distribution after assigning files to destination folders.
    # The split counters count rows of the copied files, so the matching
    # denominator is the sum of those counts, not the number of files that
    # were assigned to the split (which also includes skipped files).
    for title, counter in split_counters:
        print_class_distribution(counter, sum(counter.values()), title)

if __name__ == '__main__':
    # Base directory; the split folders (train/test/val) are created below it
    root_path = '/Users/timhohenhaus/Team Project/Train_Test_Validation_Data/'

    # Source folders: label files with the coordinates and the matching images
    txt_folder = os.path.join(root_path, 'Coordinates')
    img_folder = os.path.join(root_path, 'Images')

    # Split and copy the data set with the default ratios
    process_dataset(txt_folder=txt_folder, img_folder=img_folder, root_path=root_path)


Processing dataset...
Image for 4_7-55_frame_043210.txt missing. File skipped.
Image for 4_7-55_frame_036952.txt missing. File skipped.
Image for 4_7-55_frame_017582.txt missing. File skipped.
Image for 4_7-55_frame_044253.txt missing. File skipped.
Image for 4_7-55_frame_004917.txt missing. File skipped.
Image for 4_7-55_frame_018029.txt missing. File skipped.
Image for 4_7-55_frame_026969.txt missing. File skipped.
Image for 4_7-55_frame_060494.txt missing. File skipped.
Image for 5_15-55_frame_036600.txt missing. File skipped.
Image for 4_7-55_frame_007599.txt missing. File skipped.
Image for 4_7-55_frame_033376.txt missing. File skipped.
Image for 4_7-55_frame_001788.txt missing. File skipped.
Image for 4_7-55_frame_043806.txt missing. File skipped.
Image for 4_7-55_frame_021158.txt missing. File skipped.
Image for 5_15-55_frame_040600.txt missing. File skipped.
Image for 5_15-55_frame_006100.txt missing. File skipped.
Image for 5_15-55_frame_053100.txt missing. File skipped.
Image

Image for 4_7-55_frame_014453.txt missing. File skipped.
Image for 4_7-55_frame_010579.txt missing. File skipped.
Image for 4_7-55_frame_016241.txt missing. File skipped.
Image for 4_7-55_frame_030247.txt missing. File skipped.
Image for 4_7-55_frame_002235.txt missing. File skipped.
Image for 4_7-55_frame_048872.txt missing. File skipped.
Image for 4_7-55_frame_030396.txt missing. File skipped.
Image for 4_7-55_frame_035164.txt missing. File skipped.
Image for 4_7-55_frame_022201.txt missing. File skipped.
Image for 4_7-55_frame_037101.txt missing. File skipped.
Image for 4_7-55_frame_056620.txt missing. File skipped.
Image for 4_7-55_frame_026671.txt missing. File skipped.
Image for 5_15-55_frame_060100.txt missing. File skipped.
Image for 4_7-55_frame_052746.txt missing. File skipped.
Image for 4_7-55_frame_004470.txt missing. File skipped.
Image for 4_7-55_frame_056918.txt missing. File skipped.
Image for 4_7-55_frame_057216.txt missing. File skipped.
Image for 5_15-55_frame_028600

Image for 4_7-55_frame_051405.txt missing. File skipped.
Image for 4_7-55_frame_047829.txt missing. File skipped.
Image for 4_7-55_frame_037399.txt missing. File skipped.
Image for 4_7-55_frame_004619.txt missing. File skipped.
Image for 5_15-55_frame_039100.txt missing. File skipped.
Image for 4_7-55_frame_015198.txt missing. File skipped.
Image for 5_15-55_frame_003600.txt missing. File skipped.
Image for 4_7-55_frame_048276.txt missing. File skipped.
Image for 4_7-55_frame_055428.txt missing. File skipped.
Image for 5_15-55_frame_061100.txt missing. File skipped.
Image for 4_7-55_frame_044849.txt missing. File skipped.
Image for 4_7-55_frame_000596.txt missing. File skipped.
Image for 4_7-55_frame_015645.txt missing. File skipped.
Image for 5_15-55_frame_005600.txt missing. File skipped.
Image for 4_7-55_frame_050660.txt missing. File skipped.
Image for 4_7-55_frame_039485.txt missing. File skipped.
Image for 5_15-55_frame_018600.txt missing. File skipped.
Image for 5_15-55_frame_00

Image for 4_7-55_frame_045296.txt missing. File skipped.
Image for 4_7-55_frame_060196.txt missing. File skipped.
Image for 4_7-55_frame_016092.txt missing. File skipped.
Image for 4_7-55_frame_049766.txt missing. File skipped.
Image for 4_7-55_frame_002384.txt missing. File skipped.
Image for 4_7-55_frame_018774.txt missing. File skipped.
Image for 4_7-55_frame_038144.txt missing. File skipped.
Image for 4_7-55_frame_028608.txt missing. File skipped.
Image for 4_7-55_frame_051554.txt missing. File skipped.
Image for 4_7-55_frame_019072.txt missing. File skipped.
Image for 5_15-55_frame_045100.txt missing. File skipped.
Image for 4_7-55_frame_046190.txt missing. File skipped.
Image for 4_7-55_frame_035611.txt missing. File skipped.
Image for 4_7-55_frame_043508.txt missing. File skipped.
Image for 4_7-55_frame_016688.txt missing. File skipped.
Image for 4_7-55_frame_030098.txt missing. File skipped.
Image for 4_7-55_frame_015794.txt missing. File skipped.
Image for 4_7-55_frame_041869.

In [15]:
import os
import shutil

def replace_images(new_img_folder, root_path, *, verbose=True):
    """
    Replaces jpg files in the train, test and validation folders with new
    files from the given folder when the file name matches.

    For every split the target folder is <root_path>/<split>/img_files. A new
    image is copied only over an already existing file of the same name; it is
    never added to a split that does not contain it.

    Parameters:
    - new_img_folder: Path to folder with new images
    - root_path: Path to folder with current files
    - verbose: If True (default), one line is printed for every new image that
      has no counterpart in a split. With many images this produces a very
      large amount of output; pass False to print only the replaced files and
      a one-line summary per split.
    """
    folders = ['train', 'test', 'val']

    # List the source folder once; the result is the same for every split.
    # Sorting makes the processing and output order deterministic.
    new_images = sorted(
        name for name in os.listdir(new_img_folder) if name.endswith('.jpg')
    )

    for folder in folders:
        img_folder = os.path.join(root_path, folder, 'img_files')
        replaced = 0

        # Iterate through every new file
        for new_img in new_images:
            target_img_path = os.path.join(img_folder, new_img)

            if os.path.exists(target_img_path):
                # Replace old with current image
                shutil.copy(os.path.join(new_img_folder, new_img), target_img_path)
                replaced += 1
                print(f'{new_img} replaced in {folder} folder.')
            elif verbose:
                print(f'{new_img} not found in {folder}')

        if not verbose:
            unmatched = len(new_images) - replaced
            print(f'{folder}: {replaced} replaced, {unmatched} without a match.')

if __name__ == '__main__':
    # Directory that holds the train, test and validation folders
    root_path = '/Users/timhohenhaus/Team Project/Train_Test_Validation_Data/'

    # Folder with the new (preprocessed) frames that replace the existing images
    new_img_folder = '/Users/timhohenhaus/Team Project/Train_Test_Validation_Data/Preprocessed Frames Complete'

    replace_images(new_img_folder=new_img_folder, root_path=root_path)


4_07-19_frame_053830.jpg nicht in train Ordner gefunden.
4_19-55_frame_038130.jpg nicht in train Ordner gefunden.
18_15-55_frame_018800.jpg nicht in train Ordner gefunden.
4_15-50_frame_005070.jpg nicht in train Ordner gefunden.
4_07-19_frame_024460.jpg nicht in train Ordner gefunden.
18_15-55_frame_007920.jpg nicht in train Ordner gefunden.
4_19-55_frame_027010.jpg nicht in train Ordner gefunden.
18_07-29_frame_080100.jpg nicht in train Ordner gefunden.
18_07-29_frame_004200.jpg nicht in train Ordner gefunden.
4_07-55_frame_002450.jpg nicht in train Ordner gefunden.
18_07-55_frame_003270.jpg nicht in train Ordner gefunden.
13_07-19_frame_022170.jpg nicht in train Ordner gefunden.
4_07-55_frame_022780.jpg nicht in train Ordner gefunden.
5_15-55_frame_021070.jpg nicht in train Ordner gefunden.
4_07-55_frame_038900.jpg nicht in train Ordner gefunden.
5_15-55_frame_053380.jpg nicht in train Ordner gefunden.
18_07-29_frame_049300.jpg nicht in train Ordner gefunden.
5_19-55_frame_025310.jpg

4_15-50_frame_035760.jpg nicht in train Ordner gefunden.
4_19-55_frame_008620.jpg nicht in train Ordner gefunden.
4_19-55_frame_040880.jpg nicht in train Ordner gefunden.
18_15-55_frame_040260.jpg nicht in train Ordner gefunden.
4_19-55_frame_017700.jpg nicht in train Ordner gefunden.
13_07-19_frame_096560.jpg nicht in train Ordner gefunden.
13_07-19_frame_012660.jpg ersetzt in train Ordner.
4_07-55_frame_012090.jpg nicht in train Ordner gefunden.
18_07-29_frame_043940.jpg nicht in train Ordner gefunden.
18_07-55_frame_033560.jpg nicht in train Ordner gefunden.
5_19-55_frame_058500.jpg nicht in train Ordner gefunden.
18_07-55_frame_044930.jpg nicht in train Ordner gefunden.
4_07-55_frame_032340.jpg nicht in train Ordner gefunden.
18_07-29_frame_034510.jpg nicht in train Ordner gefunden.
5_19-55_frame_047420.jpg nicht in train Ordner gefunden.
5_19-55_frame_030870.jpg nicht in train Ordner gefunden.
5_15-55_frame_011760.jpg nicht in train Ordner gefunden.
4_07-19_frame_090070.jpg nicht 

13_07-19_frame_003540.jpg ersetzt in train Ordner.
18_07-55_frame_022640.jpg nicht in train Ordner gefunden.
18_07-55_frame_002590.jpg nicht in train Ordner gefunden.
13_07-19_frame_074910.jpg nicht in train Ordner gefunden.
5_19-55_frame_049620.jpg nicht in train Ordner gefunden.
18_07-29_frame_025630.jpg nicht in train Ordner gefunden.
13_07-19_frame_023690.jpg nicht in train Ordner gefunden.
4_07-55_frame_023060.jpg nicht in train Ordner gefunden.
18_15-55_frame_051140.jpg nicht in train Ordner gefunden.
4_19-55_frame_006420.jpg nicht in train Ordner gefunden.
13_07-19_frame_087640.jpg nicht in train Ordner gefunden.
4_15-50_frame_024440.jpg nicht in train Ordner gefunden.
4_07-19_frame_005050.jpg nicht in train Ordner gefunden.
4_15-50_frame_004790.jpg nicht in train Ordner gefunden.
4_07-19_frame_025380.jpg nicht in train Ordner gefunden.
4_19-55_frame_019500.jpg nicht in train Ordner gefunden.
4_15-50_frame_053810.jpg nicht in train Ordner gefunden.
4_19-55_frame_050100.jpg nicht

13_07-19_frame_007780.jpg ersetzt in train Ordner.
13_07-19_frame_050800.jpg nicht in train Ordner gefunden.
18_07-55_frame_026480.jpg nicht in train Ordner gefunden.
18_07-55_frame_006750.jpg ersetzt in train Ordner.
13_07-19_frame_027450.jpg nicht in train Ordner gefunden.
18_07-55_frame_019670.jpg nicht in train Ordner gefunden.
13_07-19_frame_038570.jpg nicht in train Ordner gefunden.
4_07-55_frame_038380.jpg nicht in train Ordner gefunden.
5_15-55_frame_024550.jpg nicht in train Ordner gefunden.
4_07-55_frame_018050.jpg nicht in train Ordner gefunden.
5_19-55_frame_025990.jpg nicht in train Ordner gefunden.
5_15-55_frame_004680.jpg nicht in train Ordner gefunden.
5_15-55_frame_053900.jpg nicht in train Ordner gefunden.
4_07-19_frame_085190.jpg nicht in train Ordner gefunden.
18_07-29_frame_049980.jpg nicht in train Ordner gefunden.
4_15-50_frame_020680.jpg nicht in train Ordner gefunden.
4_07-19_frame_001290.jpg nicht in train Ordner gefunden.
4_15-50_frame_000550.jpg nicht in tra

13_07-19_frame_004890.jpg ersetzt in train Ordner.
18_07-29_frame_002830.jpg nicht in train Ordner gefunden.
18_15-55_frame_001310.jpg nicht in train Ordner gefunden.
4_19-55_frame_056670.jpg nicht in train Ordner gefunden.
4_19-55_frame_049750.jpg nicht in train Ordner gefunden.
4_15-50_frame_023990.jpg nicht in train Ordner gefunden.
4_07-19_frame_055200.jpg nicht in train Ordner gefunden.
13_07-19_frame_081130.jpg nicht in train Ordner gefunden.
4_19-55_frame_020080.jpg nicht in train Ordner gefunden.
4_19-55_frame_000350.jpg nicht in train Ordner gefunden.
18_15-55_frame_057630.jpg nicht in train Ordner gefunden.
18_15-55_frame_048710.jpg nicht in train Ordner gefunden.
4_07-19_frame_003720.jpg nicht in train Ordner gefunden.
4_15-50_frame_022330.jpg nicht in train Ordner gefunden.
5_15-55_frame_006330.jpg nicht in train Ordner gefunden.
4_07-19_frame_087420.jpg nicht in train Ordner gefunden.
5_19-55_frame_050070.jpg nicht in train Ordner gefunden.
4_07-55_frame_025710.jpg nicht i

18_07-55_frame_027150.jpg ersetzt in train Ordner.
13_07-19_frame_006250.jpg ersetzt in train Ordner.
18_07-55_frame_038070.jpg nicht in train Ordner gefunden.
4_07-19_frame_084440.jpg nicht in train Ordner gefunden.
4_07-55_frame_019580.jpg nicht in train Ordner gefunden.
13_07-19_frame_019370.jpg nicht in train Ordner gefunden.
5_15-55_frame_005350.jpg nicht in train Ordner gefunden.
4_07-55_frame_039650.jpg nicht in train Ordner gefunden.
5_15-55_frame_025080.jpg nicht in train Ordner gefunden.
5_19-55_frame_053010.jpg nicht in train Ordner gefunden.
18_07-29_frame_017940.jpg nicht in train Ordner gefunden.
13_07-19_frame_046660.jpg nicht in train Ordner gefunden.
4_07-55_frame_046090.jpg nicht in train Ordner gefunden.
18_07-29_frame_060510.jpg nicht in train Ordner gefunden.
18_07-55_frame_010930.jpg nicht in train Ordner gefunden.
13_07-19_frame_079490.jpg nicht in train Ordner gefunden.
5_19-55_frame_013420.jpg nicht in train Ordner gefunden.
18_07-55_frame_058790.jpg nicht in t

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

