In [5]:
import cv2 
import numpy as np
import os
def get_txt_files(folder_path):
    """Return the joined paths of every entry in ``folder_path`` whose name ends with '.txt'."""
    txt_paths = []
    for entry in os.listdir(folder_path):
        if entry.endswith('.txt'):
            txt_paths.append(os.path.join(folder_path, entry))
    return txt_paths
def get_image_files(folder_path):
    """Return the joined paths of every entry in ``folder_path`` whose name ends with '.jpg' or '.png'."""
    image_suffixes = ('.jpg', '.png')
    image_paths = []
    for entry in os.listdir(folder_path):
        if entry.endswith(image_suffixes):
            image_paths.append(os.path.join(folder_path, entry))
    return image_paths

def rotate_yolo_labels_90(yolo_labels_path, img_width, img_height, num_rotations=1, prev_rotation=None) -> list:
    """Rotate YOLO box annotations by ``num_rotations`` clockwise quarter-turns.

    Each annotation line has the form ``class x_center y_center width height``
    with the four box values normalized to the image size. The rotation is
    clockwise so that the result lines up with images rotated via
    ``cv2.ROTATE_90_CLOCKWISE`` (as ``preprocess_images`` does).

    Args:
        yolo_labels_path: Path of the label file to read. Ignored when
            ``prev_rotation`` is given.
        img_width, img_height: Kept for interface compatibility. Because the
            coordinates are normalized, the pixel dimensions cancel out and do
            not influence the result.
        num_rotations: Number of 90-degree clockwise turns. Taken modulo 4, so
            0 (or 4) returns the annotations unrotated.
        prev_rotation: Optional list of annotation lines to rotate instead of
            reading ``yolo_labels_path``.

    Returns:
        A list of annotation strings, one per non-blank input line, with the
        box values formatted to six decimal places.

    Raises:
        ValueError: If a non-blank line does not hold exactly five numeric fields.
    """
    if prev_rotation is None:
        with open(yolo_labels_path, "r") as f:
            lines = f.readlines()
    else:
        lines = prev_rotation

    # Four quarter-turns are the identity, so only the remainder matters.
    quarter_turns = num_rotations % 4

    rotated_lines = []
    for line in lines:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        class_idx, x_center, y_center, box_width, box_height = map(float, fields)
        class_idx = int(class_idx)

        for _ in range(quarter_turns):
            # Clockwise quarter-turn: the old top edge becomes the right edge,
            # so new_x = 1 - old_y and new_y = old_x; the box extents swap.
            x_center, y_center = 1.0 - y_center, x_center
            box_width, box_height = box_height, box_width

        rotated_lines.append(
            f"{class_idx} {x_center:.6f} {y_center:.6f} {box_width:.6f} {box_height:.6f}"
        )

    return rotated_lines
def rotated_true_labels(folder_path, img_width, img_height, num_rotations) -> str:
    """Return the rotated annotations for one label file as newline-separated text.

    Despite its name, ``folder_path`` is forwarded unchanged to
    ``rotate_yolo_labels_90`` as the label file path.
    """
    return "\n".join(
        rotate_yolo_labels_90(folder_path, img_width, img_height, num_rotations)
    )



In [105]:
def preprocess_images(img_folder_path, output_folder_path, rotate=True):
    """Grayscale, blur and optionally rotate every image in a folder.

    Each image found by ``get_image_files`` is converted to grayscale, smoothed
    with a 5x5 Gaussian blur and written to
    ``{output_folder_path}/{base_name}_0.jpg``. When ``rotate`` is true, three
    further copies are written with suffixes ``_1``..``_3``; each one is the
    previous image turned a further 90 degrees clockwise.

    Args:
        img_folder_path: Folder containing the source images.
        output_folder_path: Folder receiving the processed images; created if
            it does not exist.
        rotate: Whether to also write the three rotated variants.

    Raises:
        IOError: If an image cannot be read or a processed image cannot be written.
    """
    # cv2.imwrite signals a missing directory only through its return value,
    # so make sure the destination exists before writing anything.
    os.makedirs(output_folder_path, exist_ok=True)

    for image_file in get_image_files(img_folder_path):
        # File name without its extension, reused for the output names.
        base_name = os.path.splitext(os.path.basename(image_file))[0]

        image = cv2.imread(image_file)
        if image is None:
            # cv2.imread returns None instead of raising when decoding fails.
            raise IOError(f"Could not read image: {image_file}")

        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)

        # Index 0 is the unrotated image; index i is i clockwise quarter-turns.
        augmented_images = [blurred_image]
        if rotate:
            rotated_image = blurred_image
            for _ in range(3):
                rotated_image = cv2.rotate(rotated_image, cv2.ROTATE_90_CLOCKWISE)
                augmented_images.append(rotated_image)

        for i, augmented_image in enumerate(augmented_images):
            output_path = f'{output_folder_path}/{base_name}_{i}.jpg'
            if not cv2.imwrite(output_path, augmented_image):
                raise IOError(f"Could not write image: {output_path}")


In [104]:
def preprocess_labels(label_folder_path, output_folder_path, rotate=True):
    """Write the label files that accompany the preprocessed images.

    For every ``.txt`` file in ``label_folder_path`` the original annotations
    are copied to ``{output_folder_path}/{base_name}_0.txt``. When ``rotate`` is
    true, files ``_1``..``_3`` are also written, holding the annotations rotated
    by that many quarter-turns via ``rotated_true_labels``. When ``rotate`` is
    false only the ``_0`` file is produced, so no empty label files are left
    behind for images that were never generated.

    Args:
        label_folder_path: Folder containing the source label files.
        output_folder_path: Folder receiving the label files; created if it
            does not exist.
        rotate: Whether to also write the three rotated variants.
    """
    os.makedirs(output_folder_path, exist_ok=True)

    # Variant 0 is the untouched copy; variants 1-3 exist only when augmenting.
    num_variants = 4 if rotate else 1

    for label_file in get_txt_files(label_folder_path):
        base_name = os.path.splitext(os.path.basename(label_file))[0]
        for i in range(num_variants):
            if i == 0:
                with open(label_file) as source:
                    contents = source.read()
            else:
                # The 640x640 size is the value this pipeline has always passed along.
                contents = rotated_true_labels(label_file, 640, 640, i)
            with open(f'{output_folder_path}/{base_name}_{i}.txt', 'w') as f:
                f.write(contents)
    

In [106]:

# Only the training split is augmented with rotations; the validation and test
# splits are preprocessed without them.
for split_name, augment in (("train", True), ("val", False), ("test", False)):
    preprocess_images(f"take2/partitioned/{split_name}/images", f"take2/preprocessed/{split_name}/images", rotate=augment)
    preprocess_labels(f"take2/partitioned/{split_name}/labels", f"take2/preprocessed/{split_name}/labels", rotate=augment)


In [100]:
path = 'take2/partitioned/train/labels'
# preprocess_labels requires an output folder as its second argument; without
# it this call raises TypeError. The destination mirrors the training-label
# output folder used by the full preprocessing run.
preprocess_labels(path, 'take2/preprocessed/train/labels')


In [84]:
def check_indecies(list1, list2):
    """Return the indices of ``list1`` whose element is not at the same spot in ``list2``.

    An empty list means every element of ``list1`` sits at the same index in
    ``list2``. Indices of ``list1`` that lie beyond the end of ``list2`` have no
    counterpart and are reported as mismatches rather than raising IndexError.
    Extra trailing elements of ``list2`` are ignored.
    """
    list2_length = len(list2)
    return [
        i for i, item in enumerate(list1)
        if i >= list2_length or item != list2[i]
    ]

In [101]:
import os
import shutil
import random

def partition_files(input_folder, output_folder, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, seed=42):
    """Copy matched image/label pairs into train, val and test folders.

    Files are read from ``{input_folder}/images`` and ``{input_folder}/labels``.
    An image and a label belong together when their names match once the
    extension is removed; files without a partner are skipped. The pairs are
    shuffled with ``seed`` and copied (not moved) into
    ``{output_folder}/{train,val,test}/{images,labels}``, which are created if
    missing.

    Args:
        input_folder: Folder containing the ``images`` and ``labels`` subfolders.
        output_folder: Root folder receiving the three splits.
        train_ratio, val_ratio, test_ratio: Split fractions; must sum to 1.0.
            Train and val counts are truncated to integers and the test split
            receives the remainder.
        seed: Seed passed to ``random.seed`` before shuffling.

    Raises:
        AssertionError: If the ratios do not sum to 1.0.
    """
    # Ensure the ratios sum up to 1.0
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-6, "Ratios must sum to 1.0"

    # Set the seed for reproducibility
    random.seed(seed)

    def list_files(subfolder):
        directory = os.path.join(input_folder, subfolder)
        return [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

    # Index each side by base name so pairing never depends on two
    # independently sorted lists happening to line up.
    label_by_base = {os.path.splitext(f)[0]: f for f in sorted(list_files("labels"))}
    image_by_base = {os.path.splitext(f)[0]: f for f in sorted(list_files("images"))}

    # Keep only base names present on both sides, ordered by label file name.
    pairs = sorted(
        (label_by_base[base], image_by_base[base])
        for base in label_by_base.keys() & image_by_base.keys()
    )
    label_files = [label for label, _ in pairs]
    image_files = [image for _, image in pairs]

    print(len(image_files))
    print(len(label_files))

    label_files_base = [os.path.splitext(f)[0] for f in label_files]
    image_files_base = [os.path.splitext(f)[0] for f in image_files]
    print(check_indecies(label_files_base, image_files_base))

    # Shuffle labels and images together so each pair stays aligned.
    random.shuffle(pairs)
    label_files = [label for label, _ in pairs]
    image_files = [image for _, image in pairs]

    label_files_base = [os.path.splitext(f)[0] for f in label_files]
    image_files_base = [os.path.splitext(f)[0] for f in image_files]
    print(check_indecies(label_files_base, image_files_base))

    # Calculate the number of pairs for each partition
    total_images = len(image_files)
    train_count = int(total_images * train_ratio)
    val_count = int(total_images * val_ratio)
    val_end = train_count + val_count

    splits = {
        'train': (image_files[:train_count], label_files[:train_count]),
        'val': (image_files[train_count:val_end], label_files[train_count:val_end]),
        'test': (image_files[val_end:], label_files[val_end:]),
    }

    def copy_files(file_list, destination_folder, subfolder):
        # The destination must exist before shutil.copy can write into it.
        os.makedirs(destination_folder, exist_ok=True)
        for file_name in file_list:
            shutil.copy(os.path.join(input_folder, subfolder, file_name),
                        os.path.join(destination_folder, file_name))

    for split_name, (split_images, split_labels) in splits.items():
        copy_files(split_images, os.path.join(output_folder, split_name, 'images'), 'images')
        copy_files(split_labels, os.path.join(output_folder, split_name, 'labels'), 'labels')

    train_images = splits['train'][0]
    val_images = splits['val'][0]
    test_images = splits['test'][0]
    print(f"Images partitioned into:\n"
          f"Train: {len(train_images)}\n"
          f"Validation: {len(val_images)}\n"
          f"Test: {len(test_images)}")




In [102]:
input_folder = 'take2/cleaned'  # Folder holding the images/ and labels/ subfolders to split
output_folder = 'take2/partitioned'  # Root folder that receives the train/val/test splits
partition_files(input_folder=input_folder, output_folder=output_folder)

4060
4060
[]
[]
Images partitioned into:
Train: 2842
Validation: 812
Test: 406


In [89]:
def test_deaugmentation(folder, target_folder = "test"):
    label_files = get_txt_files(folder + "/labels")
    for label_file in label_files:
        #only keep the filenames ending with _0.txt
        if not label_file.endswith("_0.txt"):
            continue
        #copy it to the target folder with the same name
        base_name = os.path.splitext(os.path.basename(label_file))[0]
        shutil.copy(label_file, f"{target_folder}/labels/{base_name}.txt")
    #same for image files
    image_files = get_image_files(folder + "/images")
    for image_file in image_files:
        if not image_file.endswith("_0.jpg"):
            continue
        base_name = os.path.splitext(os.path.basename(image_file))[0]
        shutil.copy(image_file, f"{target_folder}/images/{base_name}.jpg")


In [93]:
test_deaugmentation(folder="partitioned/val", target_folder="validation")

In [81]:
# Check that the base names of the image files in folder1 match the base names of the .txt files in folder2. Returns how many base names appear in only one of the two folders.
def check_files(folder1, folder2):
    """Count base names found in only one of ``folder1`` (images) and ``folder2`` (labels).

    Prints the number of distinct image base names, then the number of distinct
    label base names, and returns the size of their symmetric difference.
    """
    def stem(path):
        return os.path.splitext(os.path.basename(path))[0]

    image_stems = {stem(path) for path in get_image_files(folder1)}
    label_stems = {stem(path) for path in get_txt_files(folder2)}
    print(len(image_stems))
    print(len(label_stems))
    return len(image_stems ^ label_stems)

In [94]:
check_files(folder1="validation/images", folder2="validation/labels")

791
791


0

In [64]:
def clean_data(input_folder, output_folder_path) -> int:
    """Copy only the images and labels that have a matching partner.

    Files are read from ``{input_folder}/images`` and ``{input_folder}/labels``.
    An image and a label match when their names are equal once the extension is
    removed. Matching files are copied to ``{output_folder_path}/images`` and
    ``{output_folder_path}/labels``, which are created if they do not exist.

    Args:
        input_folder: Folder containing the ``images`` and ``labels`` subfolders.
        output_folder_path: Root folder receiving the cleaned copy.

    Returns:
        The number of base names that appear in only one of the two subfolders.
    """
    def list_files(subfolder):
        directory = os.path.join(input_folder, subfolder)
        return [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

    files_by_subfolder = {
        "images": list_files("images"),
        "labels": list_files("labels"),
    }

    image_files_base = {os.path.splitext(f)[0] for f in files_by_subfolder["images"]}
    label_files_base = {os.path.splitext(f)[0] for f in files_by_subfolder["labels"]}
    number_wrong = len(image_files_base.symmetric_difference(label_files_base))

    # Filtering both sides against this one set keeps them consistent by
    # construction, so no follow-up consistency check is needed.
    paired_bases = image_files_base & label_files_base

    for subfolder, files in files_by_subfolder.items():
        destination = os.path.join(output_folder_path, subfolder)
        # shutil.copy does not create missing parent folders.
        os.makedirs(destination, exist_ok=True)
        for file in files:
            if os.path.splitext(file)[0] in paired_bases:
                shutil.copy(os.path.join(input_folder, subfolder, file),
                            os.path.join(destination, file))
    return number_wrong


In [96]:
clean_data(input_folder="Training Set", output_folder_path="take2/cleaned")

72