In [1]:
import os
import random
import shutil
from collections import defaultdict

# Define paths and selection parameters
source_dir = r'G:\Cancer & Inflammation\VSM\Oscar\CLAUDE_FIELD CANCERIZATION\TCGA\Scans_pooled'
destination_dir = os.path.join(source_dir, 'Selected_50_Patients')
num_samples = 50
random_seed = None  # Set to an int to make the random selection reproducible

# Dedicated generator so that seeding here does not disturb the global `random` state
rng = random.Random(random_seed)

# Map each lab (SS) code to the slide filenames that carry it
lab_patient_map = defaultdict(list)

# Parse filenames to group slides by lab code.
# Presumably names follow the TCGA barcode layout `TCGA-SS-PPPP-...svs`, so the
# second dash-separated field is the lab code -- confirm against the actual files.
# The listing is sorted because os.listdir() returns entries in arbitrary order,
# which would otherwise defeat a fixed `random_seed`.
for file_name in sorted(os.listdir(source_dir)):
    if not file_name.endswith('.svs'):  # Replace with actual file extension if different
        continue
    parts = file_name.split('-')
    if len(parts) < 3:
        # Names without lab and patient fields cannot be grouped; skip them
        # instead of aborting the whole run with an IndexError.
        print(f'Skipping {file_name}: name does not contain lab and patient fields.')
        continue
    lab_patient_map[parts[1]].append(file_name)

# Create a diverse selection across labs: every lab contributes the same quota
# (at least one file), capped by how many files that lab actually has.
# NOTE(review): selection is per slide file, so a patient with several slides can
# contribute more than one file -- confirm whether per-patient deduplication is wanted.
selected_files = []
if lab_patient_map:
    num_lab_samples = max(1, num_samples // len(lab_patient_map))
    for files in lab_patient_map.values():
        selected_files += rng.sample(files, min(num_lab_samples, len(files)))

if len(selected_files) > num_samples:
    # More labs than requested samples: one file per lab overshoots, so trim
    # back down at random to honour `num_samples`.
    selected_files = rng.sample(selected_files, num_samples)
elif len(selected_files) < num_samples:
    # Top up from the files not chosen yet, but never ask for more than exist
    # (random.sample raises ValueError when the sample exceeds the population).
    already_selected = set(selected_files)
    additional_files = [
        candidate
        for files in lab_patient_map.values()
        for candidate in files
        if candidate not in already_selected
    ]
    shortfall = min(num_samples - len(selected_files), len(additional_files))
    selected_files += rng.sample(additional_files, shortfall)

if len(selected_files) < num_samples:
    print(f'Only {len(selected_files)} matching files are available; requested {num_samples}.')

# Copy selected files to the new subfolder in the source directory
os.makedirs(destination_dir, exist_ok=True)
for file_name in selected_files:
    src_path = os.path.join(source_dir, file_name)
    dest_path = os.path.join(destination_dir, file_name)
    shutil.copy(src_path, dest_path)

# Report the number of files actually copied, which can differ from `num_samples`
print(f'{len(selected_files)} files with diverse lab codes have been copied to {destination_dir}.')


50 files with diverse lab codes have been copied to G:\...\TCGA\Scans_pooled\Selected_50_Patients.
