In [4]:
import os
import shutil
import random

# Source capture directories; each is read via its "rgb" and "mask_visib" subfolders.
lnd1_path = "/vol/bitbucket/ur23/lnd_new_capture/lnd1_sam6d"
lnd2_path = "/vol/bitbucket/ur23/lnd_new_capture/lnd2_sam6d"

# Destination roots for the two splits (the held-out split is written to ".../test").
output_train_path = "/vol/bitbucket/ur23/lnd_new_capture/maskrcnn/train"
output_val_path = "/vol/bitbucket/ur23/lnd_new_capture/maskrcnn/test"

# Create <split>/rgb and <split>/mask for both splits if they are missing.
for _split_root in (output_train_path, output_val_path):
    for _subdir in ("rgb", "mask"):
        os.makedirs(os.path.join(_split_root, _subdir), exist_ok=True)

# Fraction of the collected pairs that goes to the train split.
train_ratio = 0.8

# Filename stems (no extension) in lnd2 that must be left out of the dataset.
avoid_files_in_lnd2 = {
    "000553",
    "000554",
    "000555",
    "000891",
    "000892",
    "000893",
    "000894",
}

# Function to get the current max index from a directory
def get_current_max_index(directory):
    """Return the largest integer filename stem found in ``directory``.

    Each entry's name is stripped of its extension and parsed with ``int``.
    Entries whose stem is not an integer (hidden files such as
    ``.DS_Store``, stray notes, ...) are ignored rather than aborting the
    scan with ``ValueError``.

    Args:
        directory: Path of the directory to scan (non-recursive).

    Returns:
        The maximum parsed index, or 0 when the directory is empty or holds
        no integer-named entries.
    """
    indices = []
    for entry in os.listdir(directory):
        stem = os.path.splitext(entry)[0]
        try:
            indices.append(int(stem))
        except ValueError:
            # Not an index-named file; it cannot collide with generated names.
            continue
    return max(indices, default=0)

# Function to copy files
def copy_files(src_dir, start_idx, avoid_files=None):
    """Plan the renaming of every RGB/mask pair found under ``src_dir``.

    Despite its name this function copies nothing: it only builds the list
    of (source RGB, source mask, new filename) triples to be copied later.

    For each file in ``<src_dir>/rgb`` the matching mask is expected at
    ``<src_dir>/mask_visib/<stem>_000000.png``. RGB files without a mask are
    reported and skipped without consuming an index.

    Args:
        src_dir: Directory containing the ``rgb`` and ``mask_visib`` folders.
        start_idx: Index assigned to the first accepted pair.
        avoid_files: Optional collection of filename stems (no extension)
            to skip.

    Returns:
        A tuple ``(file_pairs, next_idx)`` where ``file_pairs`` is a list of
        ``(rgb_path, mask_path, new_filename)`` and ``next_idx`` is the first
        index not used.
    """
    file_pairs = []
    rgb_dir = os.path.join(src_dir, "rgb")
    mask_dir = os.path.join(src_dir, "mask_visib")

    # os.listdir yields entries in arbitrary order; sort so that the mapping
    # from source file to new index is reproducible across runs and machines.
    rgb_files = sorted(os.listdir(rgb_dir))
    print(f"Found {len(rgb_files)} RGB files in {src_dir}")

    for rgb_filename in rgb_files:
        base_filename = os.path.splitext(rgb_filename)[0]

        # Skip files that should be avoided
        if avoid_files and base_filename in avoid_files:
            print(f"Skipping file {rgb_filename} from {src_dir}")
            continue

        # The mask carries the suffix "_000000" before the extension.
        mask_filename = f"{base_filename}_000000.png"
        rgb_path = os.path.join(rgb_dir, rgb_filename)
        mask_path = os.path.join(mask_dir, mask_filename)

        if not os.path.exists(mask_path):
            print(f"Mask file not found for {rgb_filename} (expected at {mask_path})")
            continue

        file_pairs.append((rgb_path, mask_path, f"{start_idx:06d}.png"))
        start_idx += 1

    return file_pairs, start_idx

# Function to copy file pairs to train/val directories
def copy_file_pairs(file_pairs, train_rgb_dir, train_mask_dir, val_rgb_dir, val_mask_dir, train_ratio):
    """Randomly split ``file_pairs`` and copy each pair into train or val.

    The pairs are shuffled with the global ``random`` state; the first
    ``int(len(file_pairs) * train_ratio)`` of the shuffled order go to the
    train directories and the remainder to the val directories. RGB and mask
    are both written under the pair's ``new_filename``.

    Args:
        file_pairs: List of ``(rgb_path, mask_path, new_filename)`` triples.
            The list is not modified.
        train_rgb_dir: Destination directory for train RGB images.
        train_mask_dir: Destination directory for train masks.
        val_rgb_dir: Destination directory for val RGB images.
        val_mask_dir: Destination directory for val masks.
        train_ratio: Fraction of pairs assigned to the train split.
    """
    # Shuffle a private copy so the caller's list keeps its order; for a given
    # RNG state this produces the same permutation as shuffling in place.
    shuffled = list(file_pairs)
    random.shuffle(shuffled)
    split_idx = int(len(shuffled) * train_ratio)

    for idx, (rgb_path, mask_path, new_filename) in enumerate(shuffled):
        if idx < split_idx:
            rgb_dest_dir, mask_dest_dir = train_rgb_dir, train_mask_dir
        else:
            rgb_dest_dir, mask_dest_dir = val_rgb_dir, val_mask_dir
        shutil.copy(rgb_path, os.path.join(rgb_dest_dir, new_filename))
        shutil.copy(mask_path, os.path.join(mask_dest_dir, new_filename))

# Determine the starting index based on existing files.
# Train and val files are numbered from ONE shared counter (the split happens
# after numbering), so numbering must begin past the highest index present in
# EITHER output directory; starting from the train directory alone can
# overwrite existing val files when this script is run again.
start_idx_val = get_current_max_index(os.path.join(output_val_path, "rgb")) + 1
start_idx_train = max(
    get_current_max_index(os.path.join(output_train_path, "rgb")) + 1,
    start_idx_val,
)

# Collect (rgb, mask, new name) triples from lnd1
lnd1_files, start_idx_train = copy_files(lnd1_path, start_idx_train)

# Collect triples from lnd2, avoiding specified files; numbering continues
# where lnd1 stopped
lnd2_files, start_idx_train = copy_files(lnd2_path, start_idx_train, avoid_files=avoid_files_in_lnd2)

# Combine both captures, then shuffle, split and copy into train/val
all_files = lnd1_files + lnd2_files
copy_file_pairs(
    all_files,
    os.path.join(output_train_path, "rgb"),
    os.path.join(output_train_path, "mask"),
    os.path.join(output_val_path, "rgb"),
    os.path.join(output_val_path, "mask"),
    train_ratio,
)

print("Files have been successfully copied and split into train and val folders.")


Found 970 RGB files in /vol/bitbucket/ur23/lnd_new_capture/lnd1_sam6d
Found 526 RGB files in /vol/bitbucket/ur23/lnd_new_capture/lnd2_sam6d
Skipping file 000891.png from /vol/bitbucket/ur23/lnd_new_capture/lnd2_sam6d
Skipping file 000553.png from /vol/bitbucket/ur23/lnd_new_capture/lnd2_sam6d
Skipping file 000554.png from /vol/bitbucket/ur23/lnd_new_capture/lnd2_sam6d
Skipping file 000893.png from /vol/bitbucket/ur23/lnd_new_capture/lnd2_sam6d
Skipping file 000894.png from /vol/bitbucket/ur23/lnd_new_capture/lnd2_sam6d
Skipping file 000892.png from /vol/bitbucket/ur23/lnd_new_capture/lnd2_sam6d
Skipping file 000555.png from /vol/bitbucket/ur23/lnd_new_capture/lnd2_sam6d
Files have been successfully copied and split into train and val folders.
