In [1]:
from pathlib import Path
import re
import shutil

def organize_files(source_str, target_str='organized_files'):
    """
    Finds and copies PDF files using the pathlib library.
    """
    source_path = Path(source_str)
    target_path = Path(target_str)
    target_path.mkdir(exist_ok=True) # Create the main output folder

    # The same regex pattern from before
    pattern = re.compile(r'^(.*?) \s+ 202\d \s+ [QH]\d \.pdf$', re.IGNORECASE)
    
    copied_count = 0
    print(f"✅ Searching for PDFs in '{source_path.resolve()}'...")

    # Path.rglob('*.pdf') recursively finds all files ending in .pdf
    for pdf_file in source_path.rglob('*.pdf'):
        # We match against the filename only, not the whole path
        match = pattern.match(pdf_file.name)
        
        if match:
            name = match.group(1).strip()
            
            # Create the new name-specific folder
            name_folder = target_path / name
            name_folder.mkdir(exist_ok=True)
            
            # Copy the file
            shutil.copy2(pdf_file, name_folder / pdf_file.name)
            print(f"    -> Copied '{pdf_file.name}' to '{name}' folder")
            copied_count += 1
            
    if copied_count > 0:
        print(f"\n✨ Process complete! Successfully copied {copied_count} files.")
    else:
        print("\n🤔 No matching PDF files were found to organize.")


In [2]:
organize_files(r"pdf_data", r"Markets")

✅ Searching for PDFs in 'C:\Users\TravonCross\Documents\Git\Personal Projects\Python\pdf_processing\pdf_data'...

🤔 No matching PDF files were found to organize.


In [3]:
import os
import re

# --- PASTE THE FULL, ABSOLUTE PATH to your folder below ---
source_directory = 'pdf_data'

# This pattern matches names with or without a .pdf extension
pattern = re.compile(r'^(.*?) \s+ 202\d \s+ [QH]\d.*\.pdf$', re.IGNORECASE)

print("--- Starting Diagnostic ---")
print(f"🔎 Searching in folder: {source_directory}\n")

# --- Logic to find and check files ---
found_any_files = False
matched_any_files = False

if not os.path.isdir(source_directory):
    print(f"❌ ERROR: The path '{source_directory}' does not exist or is not a directory.")
else:
    for root, _, files in os.walk(source_directory):
        print(f"📂 Checking in folder: {root}")
        if not files:
            print("   -> No files here.")
        
        for filename in files:
            found_any_files = True
            match = pattern.match(filename)
            if match:
                matched_any_files = True
                print(f"   -> Found: '{filename}' ... ✅ MATCH!")
            else:
                print(f"   -> Found: '{filename}' ... ❌ No Match.")

    print("\n--- Diagnostic Complete ---")
    if not found_any_files:
        print("❗️ Result: The script did not find ANY files. Check your source_directory path.")
    elif not matched_any_files:
        print("❗️ Result: Found files, but NONE matched the required name pattern. Check your filenames.")
    else:
        print("✅ Result: The script found and matched files. The original script should work.")

--- Starting Diagnostic ---
🔎 Searching in folder: pdf_data

📂 Checking in folder: pdf_data
   -> Found: 'Savannah 2022 Q4.pdf' ... ❌ No Match.
   -> Found: 'Savannah 2023 Q2.pdf' ... ❌ No Match.
   -> Found: 'Savannah 2023 Q4.pdf' ... ❌ No Match.
   -> Found: 'Savannah 2024 Q1.pdf' ... ❌ No Match.
   -> Found: 'Savannah 2024 Q3.pdf' ... ❌ No Match.
   -> Found: 'Savannah 2024 Q4.pdf' ... ❌ No Match.
   -> Found: 'Savannah 2025 Q1.pdf' ... ❌ No Match.
📂 Checking in folder: pdf_data\2021
   -> No files here.
📂 Checking in folder: pdf_data\2021\2021 Q1
   -> Found: 'Boston 2021 Q2.pdf' ... ❌ No Match.
   -> Found: 'Silicon Valley Snapshot 2021 Q.pdf' ... ❌ No Match.
📂 Checking in folder: pdf_data\2021\2021 Q1 csvs
   -> Found: 'Boston.csv' ... ❌ No Match.
   -> Found: 'Savannah.csv' ... ❌ No Match.
   -> Found: 'Silicon Valley Snapshot.csv' ... ❌ No Match.
📂 Checking in folder: pdf_data\2021\2021 Q2
   -> Found: 'Tri Valley 2021 Q2.pdf' ... ❌ No Match.
📂 Checking in folder: pdf_data\2021

In [4]:
from pathlib import Path
import shutil

def organize_pdfs_by_csv_names(source_dir_str, target_dir_str='organized_files'):
    """
    1. Finds all unique base names from .csv files in the source directory.
    2. Creates a folder for each unique name in the target directory.
    3. Finds all .pdf files that contain one of the unique names and copies them
       to the corresponding folder.
    """
    source_path = Path(source_dir_str)
    target_path = Path(target_dir_str)

    if not source_path.is_dir():
        print(f"❌ Error: Source directory '{source_path}' not found.")
        return

    # --- 1. First Pass: Collect all unique base names from CSV files ---
    print(f"📂 Pass 1: Searching for CSV files in '{source_path.resolve()}' to get names...")
    
    # Use a set to automatically handle duplicates
    base_names = set()
    for csv_file in source_path.rglob('*.csv'):
        # Get the filename without the .csv extension
        name = csv_file.stem 
        base_names.add(name)

    if not base_names:
        print("❗️ No CSV files found. Cannot determine which folders to create.")
        return
        
    print(f"✅ Found {len(base_names)} unique names.")

    # --- 2. Create Folders & Second Pass: Find and copy matching PDFs ---
    print(f"\n📂 Pass 2: Searching for matching PDFs to copy to '{target_path.resolve()}'...")
    target_path.mkdir(exist_ok=True) # Create the main output folder
    
    copied_count = 0
    # Iterate through each unique name found from the CSVs
    for name in sorted(list(base_names)): # Sort for predictable output
        
        # Create a specific folder for the current name
        name_folder = target_path / name
        name_folder.mkdir(exist_ok=True)
        
        # Now, find all PDFs that contain this name
        # Using rglob is efficient as it searches the whole tree
        for pdf_file in source_path.rglob('*.pdf'):
            # Check if the base name is a part of the PDF's filename
            if name in pdf_file.name:
                # Define the destination path
                destination = name_folder / pdf_file.name
                
                # Check if the file already exists to avoid re-copying
                if not destination.exists():
                    shutil.copy2(pdf_file, destination)
                    print(f"  -> Copied '{pdf_file.name}' to '{name_folder}'")
                    copied_count += 1

    print("\n--- ✨ Process Complete! ---")
    if copied_count > 0:
        print(f"Successfully organized and copied {copied_count} files.")
    else:
        print("Found names from CSVs, but no matching PDF files were found to copy.")

# --- How to run the script ---
# Set your source directory (where the 'pdf_data' folder is)
source_directory = 'pdf_data' 
# You can optionally change the name of the output folder
target_directory = 'Markets' 

organize_pdfs_by_csv_names(source_directory, target_directory)

📂 Pass 1: Searching for CSV files in 'C:\Users\TravonCross\Documents\Git\Personal Projects\Python\pdf_processing\pdf_data' to get names...


✅ Found 104 unique names.

📂 Pass 2: Searching for matching PDFs to copy to 'C:\Users\TravonCross\Documents\Git\Personal Projects\Python\pdf_processing\Markets'...

--- ✨ Process Complete! ---
Found names from CSVs, but no matching PDF files were found to copy.
