In [6]:
import os
from collections import defaultdict

def _read_first_column_numbers(filepath, num_classes):
    """Return the set of in-range integers found in the first column of a file.

    Blank lines and lines whose first whitespace-separated token is not an
    integer are ignored, as are integers outside ``0 .. num_classes - 1``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    numbers = set()
    # "utf-8-sig" strips a leading byte-order mark so the first line's number
    # is not lost; errors="replace" keeps stray undecodable bytes in other
    # columns from aborting the whole file (only the first token matters).
    with open(filepath, 'r', encoding='utf-8-sig', errors='replace') as file:
        for line in file:
            parts = line.split(None, 1)
            if not parts:
                continue
            try:
                value = int(parts[0])
            except ValueError:
                # First column is not an integer; skip this line.
                continue
            if 0 <= value < num_classes:
                numbers.add(value)
    return numbers


def find_minimal_file_set(directory_path, num_classes=32):
    """
    Greedily pick a small set of files whose first columns cover 0..num_classes-1.

    Every ``.txt`` file directly inside ``directory_path`` is scanned, and the
    integers in the first whitespace-separated column of each line are
    collected. Files are then selected one at a time, always taking the file
    that adds the most not-yet-covered numbers, until every number is covered
    or no remaining file adds anything new. This is the greedy set-cover
    heuristic: the result is small but not guaranteed to be the true minimum.

    Progress is printed after each selected file. Files that cannot be read
    are reported with a printed message and skipped.

    Args:
        directory_path: Path to the directory containing text files.
        num_classes: How many consecutive numbers, starting at 0, must be
            covered. Defaults to 32 (numbers 0-31).

    Returns:
        A tuple ``(selected_files, covered_numbers, missing_numbers,
        file_to_numbers)``: the list of chosen filenames in selection order,
        the set of numbers they cover, the set of target numbers no file
        contains, and a dict mapping each file that has at least one in-range
        number to its set of numbers.
    """
    file_to_numbers = {}

    # os.listdir returns entries in arbitrary order; sorting makes the
    # tie-breaking below (first file with the best gain wins) reproducible.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(directory_path, filename)
        try:
            numbers = _read_first_column_numbers(filepath, num_classes)
        except OSError as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Error reading {filename}: {e}")
            continue
        if numbers:
            file_to_numbers[filename] = numbers

    selected_files = []
    covered_numbers = set()
    missing_numbers = set(range(num_classes))

    while missing_numbers:
        best_file = None
        most_new_numbers = 0

        # Already-selected files need no explicit skip: all of their numbers
        # are covered, so their gain is 0 and they can never win.
        for filename, numbers in file_to_numbers.items():
            new_numbers = len(numbers & missing_numbers)
            if new_numbers > most_new_numbers:
                most_new_numbers = new_numbers
                best_file = filename

        # No file adds anything new; the remaining numbers are unreachable.
        if best_file is None:
            break

        selected_files.append(best_file)
        covered_numbers |= file_to_numbers[best_file]
        missing_numbers -= file_to_numbers[best_file]

        print(f"Added file {best_file} with {most_new_numbers} new numbers")
        print(f"Total numbers covered: {len(covered_numbers)}/{num_classes}")

    return selected_files, covered_numbers, missing_numbers, file_to_numbers

def main(directory_path=None, output_dir=None):
    """
    Run the file-set search on a label directory and print a report.

    The report lists how many files were selected, which numbers from 0-31
    are covered or missing, and the numbers found in each selected file.

    Args:
        directory_path: Directory containing the text files to scan. When
            omitted, the path hard-coded below is used; update it to point at
            your own directory.
        output_dir: Optional directory to copy the selected files into. It is
            created if needed. When omitted (the default), nothing is copied.
    """
    if directory_path is None:
        # Update this path to your directory containing the text files
        directory_path = r"C:\Users\dell\pid_env\pid_symbols_dataset\labels"

    selected_files, covered_numbers, missing_numbers, file_to_numbers = (
        find_minimal_file_set(directory_path)
    )

    print("\nResults:")
    print(f"Found {len(selected_files)} files that cover {len(covered_numbers)} unique numbers from 0-31")
    print(f"Covered numbers: {sorted(covered_numbers)}")

    if missing_numbers:
        print(f"Missing numbers: {sorted(missing_numbers)}")
    else:
        print("All numbers from 0-31 are covered!")

    print("\nSelected files:")
    for i, filename in enumerate(selected_files, 1):
        numbers_in_file = sorted(file_to_numbers[filename])
        print(f"{i}. {filename}: {numbers_in_file}")

    # Optionally, copy the selected files to a new directory
    if output_dir is not None:
        # Imported here because it is only needed for this opt-in step.
        import shutil

        os.makedirs(output_dir, exist_ok=True)
        for filename in selected_files:
            shutil.copy(os.path.join(directory_path, filename), os.path.join(output_dir, filename))
        print(f"\nFiles copied to {output_dir}")

# Run the report only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Added file 39_2560_640.txt with 23 new numbers
Total numbers covered: 23/32
Added file 308_640_4480.txt with 8 new numbers
Total numbers covered: 31/32

Results:
Found 2 files that cover 31 unique numbers from 0-31
Covered numbers: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]
Missing numbers: [0]

Selected files:
1. 39_2560_640.txt: [1, 3, 4, 5, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 29, 30, 31]
2. 308_640_4480.txt: [1, 2, 3, 4, 5, 6, 7, 11, 13, 14, 15, 16, 19, 21, 22, 23, 28, 30]
