In [1]:
import os
import glob
import random
import shutil
from collections import defaultdict
# NOTE: copy the raw data into place before running this script.

In [2]:
# Lookup table: integer class id (the first token of each label-file line)
# -> class name. One entry per line so ids are easy to scan and diff.
class_id_list = {
    0: 'ALLIGATOR(H)',
    1: 'BLOCK',
    2: 'LONGITUDINAL',
    3: 'TRANSVERSE',
    4: 'RUTTING',
    5: 'RAVELING(H)',
    6: 'CORRUGATION',
    7: 'POTHOLE',
    8: 'DEPRESSION',
    9: 'EDGE CRACKING',
    10: 'RAIL ROAD CROSSING',
    11: 'BLEEDING',
    12: 'JOINT REFLECTION',
    13: 'PATCHING',
    14: 'POLISHED AGGREGATE',
    15: 'SHOVING',
    16: 'SLIPPAGE',
    17: 'BUMPS & SAGS',
    18: 'SWELL',
    19: 'WEATHERING',
    20: 'CARRIAGEWAY',
    21: 'ALLIGATOR(L)',
    22: 'ALLIGATOR(M)',
    23: 'RAVELING(M)',
}

# Destination folders for the split dataset: an images/ and a labels/ folder
# for each of the train and valid subsets.
train_img_dir = 'Data/training/Final Data/train/images'
train_lbl_dir = 'Data/training/Final Data/train/labels'
valid_img_dir = 'Data/training/Final Data/valid/images'
valid_lbl_dir = 'Data/training/Final Data/valid/labels'

# Create all four folders up front; exist_ok=True keeps re-runs from failing
# when the folders are already there.
for _out_dir in (train_img_dir, train_lbl_dir, valid_img_dir, valid_lbl_dir):
    os.makedirs(_out_dir, exist_ok=True)

In [3]:
data_dir = 'Data/training/Raw Data/'
# glob returns paths in arbitrary (filesystem-dependent) order. Sort them so
# that the seeded shuffle below produces the same split on every machine.
txt_list = sorted(glob.glob(os.path.join(data_dir, '**', '*.txt'), recursive=True))
# Drop any path containing 'classes.txt'; those are not annotation files.
txt_list = [x for x in txt_list if 'classes.txt' not in x]

# Step 1: Read and parse the data
# file_class_mapping: label file path -> class names present in that file
# class_counter:      class name      -> number of label files containing it
file_class_mapping = defaultdict(list)
class_counter = defaultdict(int)

for file in txt_list:
    with open(file, 'r') as f:
        lines = f.readlines()
    file_classes = set()
    for line in lines:
        # Split on any whitespace and skip blank lines; a trailing empty line
        # or a tab-separated row would otherwise make int() raise ValueError.
        fields = line.split()
        if not fields:
            continue
        class_id = int(fields[0])
        class_name = class_id_list[class_id]
        file_classes.add(class_name)
    file_class_mapping[file] = sorted(file_classes)
    for class_name in file_classes:
        class_counter[class_name] += 1

# Step 2: Calculate the number of files needed for each class
train_ratio = 0.8
# int() truncates, so a class that appears in a single file gets a training
# quota of 0 and that file ends up in validation.
train_files_per_class = {class_name: int(count * train_ratio) for class_name, count in class_counter.items()}

# Step 3: Shuffle and split while ensuring class distribution
train_files = set()
val_files = set()
class_train_counter = defaultdict(int)
class_val_counter = defaultdict(int)

random.seed(42)
shuffled_files = list(file_class_mapping.keys())
random.shuffle(shuffled_files)

# Greedy single pass: a file goes to training only if every class it contains
# is still below its training quota; otherwise it goes to validation.
# Note: all() over an empty sequence is True, so label files with no
# annotations are always assigned to training.
for file in shuffled_files:
    if all(class_train_counter[class_name] < train_files_per_class[class_name] for class_name in file_class_mapping[file]):
        train_files.add(file)
        for class_name in file_class_mapping[file]:
            class_train_counter[class_name] += 1
    else:
        val_files.add(file)
        for class_name in file_class_mapping[file]:
            class_val_counter[class_name] += 1

# The pass above places every file in exactly one of the two sets, so no
# follow-up pass is needed; check the invariant instead.
assert len(train_files) + len(val_files) == len(shuffled_files)
assert not (train_files & val_files)



# Function to copy label files and their images to the appropriate directories
def copy_files(files, img_dir, lbl_dir, img_ext='.jpg'):
    """Copy each label file and its matching image into the given folders.

    The image path is the label path with its final extension replaced by
    ``img_ext``. Only the extension is swapped, so a '.txt' that appears in a
    directory name or earlier in the file name is left alone. A label file is
    copied only when its image exists; otherwise a message is printed and
    the label is skipped.

    Args:
        files: iterable of label (.txt) file paths.
        img_dir: destination directory for the images.
        lbl_dir: destination directory for the label files.
        img_ext: image file extension including the dot (default '.jpg').

    Returns:
        None. Files are copied (not moved); the sources stay in place.
        Destination names are the source basenames, so two sources that share
        a basename overwrite each other.
    """
    for file in files:
        # Corresponding image file path: same stem, image extension
        img_file = os.path.splitext(file)[0] + img_ext

        # Copy the txt file only if the corresponding image file exists
        if os.path.exists(img_file):
            shutil.copy(file, os.path.join(lbl_dir, os.path.basename(file)))
            shutil.copy(img_file, os.path.join(img_dir, os.path.basename(img_file)))
        else:
            print(f"Missing image for label file: {file}")

# Copy the training and validation files into their destination folders
copy_files(files=train_files, img_dir=train_img_dir, lbl_dir=train_lbl_dir)
copy_files(files=val_files, img_dir=valid_img_dir, lbl_dir=valid_lbl_dir)

In [4]:
def _count_label_instances(lbl_dir):
    """Count annotation lines per class across the label files in ``lbl_dir``.

    Every non-blank line of every label file counts once, so a class that is
    annotated several times in one file is counted several times.

    Args:
        lbl_dir: directory holding the label (.txt) files.

    Returns:
        (lbl_files, counts): the label file names found (any name containing
        'classes.txt' is excluded) and a dict mapping class name -> number
        of annotation lines.
    """
    lbl_files = [x for x in os.listdir(lbl_dir) if 'classes.txt' not in x]
    counts = {}
    for name in lbl_files:
        with open(os.path.join(lbl_dir, name), 'r') as f:
            for line in f:
                # Split on any whitespace and skip blank lines so a trailing
                # empty line does not make int() raise ValueError.
                fields = line.split()
                if not fields:
                    continue
                class_name = class_id_list[int(fields[0])]
                counts[class_name] = counts.get(class_name, 0) + 1
    return lbl_files, counts


# Re-read the copied label files to verify what actually landed in each subset.
train_lbls, counter_dict_train = _count_label_instances(train_lbl_dir)
valid_lbls, counter_dict_val = _count_label_instances(valid_lbl_dir)


In [5]:
# Output the counts for verification.
# First Train/Valid pair: annotation lines per class, counted from the label
# files present in the train/valid label folders.
# Second Train/Valid pair: number of label files containing the class, as
# assigned by the split.

print('Crack Name\tTrain\tValid\t  After\t  Train\t  Valid\n................................\t ...............')
# Report every class seen in either subset. Classes found in training come
# first in their original order, followed by classes that occur only in
# validation. .get(..., 0) avoids a KeyError for a class missing on one side.
report_keys = list(counter_dict_train) + [k for k in counter_dict_val if k not in counter_dict_train]
for key in report_keys:
    print(key,'\t',counter_dict_train.get(key, 0),'\t',counter_dict_val.get(key, 0),'\t','\t'   , class_train_counter.get(key, 0),'\t',class_val_counter.get(key, 0))
print('........................................................\n')

# Image and label file counts should match within each subset.
print('\t\tImages\tLabels')
print("Training:\t ", len(os.listdir(train_img_dir)), "\t", len(os.listdir(train_lbl_dir)))
print("Validation:\t ", len(os.listdir(valid_img_dir)), "\t", len(os.listdir(valid_lbl_dir)))

Crack Name	Train	Valid	  After	  Train	  Valid
................................	 ...............
CARRIAGEWAY 	 9664 	 2416 	 	 9651 	 2413
RAVELING(M) 	 1436 	 356 	 	 1401 	 351
RAVELING(H) 	 9194 	 2294 	 	 8025 	 2007
WEATHERING 	 1649 	 407 	 	 1542 	 386
RUTTING 	 172 	 44 	 	 172 	 44
ALLIGATOR(H) 	 302 	 74 	 	 228 	 60
BLOCK 	 19 	 5 	 	 19 	 5
POTHOLE 	 25 	 7 	 	 25 	 7
POLISHED AGGREGATE 	 24 	 8 	 	 24 	 8
CORRUGATION 	 128 	 32 	 	 128 	 32
........................................................

		Images	Labels
Training:	  9651 	 9651
Validation:	  2413 	 2413
