In [1]:
import os
import pandas as pd
from collections import defaultdict
import math

# === 1. Set the base directory ===
base_dir = '/Users/tsujik/Documents/R01automaticityprocessed'

# === 2. Find all ST PDFs and group them by (subject, visit) ===
grouped_pdfs = defaultdict(list)

for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith('.pdf') and '_ST_' in file:
            full_path = os.path.join(root, file)
            
            # Extract subject and visit information from the path
            parts = full_path.split(os.sep)
            try:
                visit_folder_index = parts.index('Visit1')  # Look for Visit1, Visit2, etc.
            except ValueError:
                try:
                    visit_folder_index = parts.index('Visit2')
                except ValueError:
                    try:
                        visit_folder_index = parts.index('Visit3')
                    except ValueError:
                        continue  # Skip files not inside a Visit folder
            
            subject_folder = parts[visit_folder_index + 1]  # e.g., 'AUT_216'
            visit_subfolder = parts[visit_folder_index + 2]  # e.g., 'Visit 1'
            
            # Create a (subject, visit) key
            group_key = f"{subject_folder}_{visit_subfolder}"
            grouped_pdfs[group_key].append(full_path)

# === 3. Now split the groups into 5 roughly equal portions ===
# Turn into a list of (group_key, [list of pdfs])
group_items = list(grouped_pdfs.items())

# Determine split sizes
num_groups = len(group_items)
portion_size = math.ceil(num_groups / 5)

# Split into 5 portions
portions = []
for i in range(5):
    start_idx = i * portion_size
    end_idx = min(start_idx + portion_size, num_groups)
    portion = group_items[start_idx:end_idx]
    portions.append(portion)

# === 4. Print the portions out nicely ===
for idx, portion in enumerate(portions, start=1):
    print(f"\n=== Portion {idx} ===")
    for group_key, pdf_list in portion:
        print(f"Group: {group_key}")
        for pdf in pdf_list:
            print(f"    {pdf}")

# === 5. (Optional) Save each portion to a CSV ===
#for idx, portion in enumerate(portions, start=1):
  #  flat_list = []
    #for group_key, pdf_list in portion:
    #    for pdf in pdf_list:
    #        flat_list.append({'Group': group_key, 'FilePath': pdf})
    
   # df = pd.DataFrame(flat_list)
   # output_path = f'/Users/tsujik/Documents/portion_{idx}_ST_pdfs.csv'
   # df.to_csv(output_path, index=False)



=== Portion 1 ===
Group: AUT_013_Visit1
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_013/Visit1/AUT_013_V2_Walking_ST_raw_all_channels.pdf
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_013/Visit1/AUT_013_V2_Walking_ST_processed_regions.pdf
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_013/Visit1/AUT_013_V2_Walking_ST_raw_regions.pdf
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_013/Visit1/AUT_013_V2_Walking_ST_processed_overall.pdf
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_013/Visit1/AUT_013_V2_Walking_ST_raw_overall.pdf
Group: AUT_014_Visit1
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_014/Visit1/AUT_014_V1_Walking_ST_raw_overall.pdf
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_014/Visit1/AUT_014_V1_Walking_ST_raw_all_channels.pdf
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_014/Visit1/AUT_014_V1_Walking_ST_processed_overall.pdf
    /

In [2]:
import os
import pandas as pd
from collections import defaultdict
import math
import shutil  # Add this import for file copying

# === 1. Set the base directory and output directory ===
base_dir = '/Users/tsujik/Documents/R01automaticityprocessed'
output_base_dir = '/Users/tsujik/Documents/R01_portions'  # Change this to your desired output directory

# Create output directory if it doesn't exist
os.makedirs(output_base_dir, exist_ok=True)

# === 2. Find all ST PDFs and group them by (subject, visit) ===
grouped_pdfs = defaultdict(list)

for root, dirs, files in os.walk(base_dir):
    for file in files:
        if file.endswith('.pdf') and '_ST_' in file:
            full_path = os.path.join(root, file)
            
            # Extract subject and visit information from the path
            parts = full_path.split(os.sep)
            try:
                visit_folder_index = parts.index('Visit1')  # Look for Visit1, Visit2, etc.
            except ValueError:
                try:
                    visit_folder_index = parts.index('Visit2')
                except ValueError:
                    try:
                        visit_folder_index = parts.index('Visit3')
                    except ValueError:
                        continue  # Skip files not inside a Visit folder
            
            subject_folder = parts[visit_folder_index + 1]  # e.g., 'AUT_216'
            visit_subfolder = parts[visit_folder_index + 2]  # e.g., 'Visit 1'
            
            # Create a (subject, visit) key
            group_key = f"{subject_folder}_{visit_subfolder}"
            grouped_pdfs[group_key].append(full_path)

# === 3. Now split the groups into 5 roughly equal portions ===
group_items = list(grouped_pdfs.items())
num_groups = len(group_items)
portion_size = math.ceil(num_groups / 5)

portions = []
for i in range(5):
    start_idx = i * portion_size
    end_idx = min(start_idx + portion_size, num_groups)
    portion = group_items[start_idx:end_idx]
    portions.append(portion)

# === 4. Copy files for each portion to their own folder ===
for idx, portion in enumerate(portions, start=1):
    portion_dir = os.path.join(output_base_dir, f'Portion_{idx}')
    os.makedirs(portion_dir, exist_ok=True)
    
    print(f"\n=== Portion {idx} ===")
    for group_key, pdf_list in portion:
        print(f"Group: {group_key}")
        
        # Create a subfolder for this group in the portion folder
        group_dir = os.path.join(portion_dir, group_key)
        os.makedirs(group_dir, exist_ok=True)
        
        for pdf in pdf_list:
            print(f"    {pdf}")
            
            # Copy the PDF to the group folder
            try:
                shutil.copy2(pdf, group_dir)
                print(f"    Copied to: {os.path.join(group_dir, os.path.basename(pdf))}")
            except Exception as e:
                print(f"    Failed to copy {pdf}: {e}")




=== Portion 1 ===
Group: AUT_013_Visit1
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_013/Visit1/AUT_013_V2_Walking_ST_raw_all_channels.pdf
    Copied to: /Users/tsujik/Documents/R01_portions/Portion_1/AUT_013_Visit1/AUT_013_V2_Walking_ST_raw_all_channels.pdf
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_013/Visit1/AUT_013_V2_Walking_ST_processed_regions.pdf
    Copied to: /Users/tsujik/Documents/R01_portions/Portion_1/AUT_013_Visit1/AUT_013_V2_Walking_ST_processed_regions.pdf
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_013/Visit1/AUT_013_V2_Walking_ST_raw_regions.pdf
    Copied to: /Users/tsujik/Documents/R01_portions/Portion_1/AUT_013_Visit1/AUT_013_V2_Walking_ST_raw_regions.pdf
    /Users/tsujik/Documents/R01automaticityprocessed/Visit1/AUT_013/Visit1/AUT_013_V2_Walking_ST_processed_overall.pdf
    Copied to: /Users/tsujik/Documents/R01_portions/Portion_1/AUT_013_Visit1/AUT_013_V2_Walking_ST_processed_overall.pdf
    /Users/ts