# In [1]:  (notebook cell marker left over from the export)
#!/usr/bin/env python3
"""
Merge multiple source directories (with the same subfolder layout) into one
destination directory. Preserves relative paths. Handles filename collisions
via configurable strategies.

Configurable parameters are at the top of this file.
"""

from pathlib import Path
import shutil
import hashlib
import sys

# ===================== CONFIG (EDIT THESE) ===================== #
# Source directories to merge. Every entry MUST end with a comma (including
# the last one): Python joins adjacent string literals, so a missing comma
# between two entries silently produces one bogus concatenated path.
SRC_DIRS = [
    "/media/yhs/5596744f-db7c-442f-9235-d0c9d50c0a6b/Cellpose/Batch2/DRG_1",
    "/media/yhs/5596744f-db7c-442f-9235-d0c9d50c0a6b/Cellpose/Batch2/DRG_2",
    "/media/yhs/5596744f-db7c-442f-9235-d0c9d50c0a6b/Cellpose/Batch2/DRG_3",
    # add more as needed
]
# Directory that receives the merged tree.
DEST_DIR = "/media/yhs/5596744f-db7c-442f-9235-d0c9d50c0a6b/Cellpose/Batch2/DRG_ABC"

# How to handle filename collisions when the same relative file already exists:
#   "overwrite"     -> overwrite existing file
#   "skip"          -> keep existing file, skip incoming
#   "rename"        -> auto-rename incoming as name_2.ext, name_3.ext, ...
#   "source_prefix" -> rename incoming to <sourcebasename>__name.ext
MERGE_STRATEGY = "rename"

# Optional: only copy files with these extensions (lowercase, include the dot),
# or set to None to copy everything. The file's suffix is lowercased before the
# membership test, so entries here must be lowercase to match.
INCLUDE_EXTENSIONS = None  # e.g., {".png", ".csv", ".tif"}

# If True, when a collision occurs, compare file hashes; if identical, skip
# copying. This check runs before MERGE_STRATEGY is applied, so identical files
# are skipped under every strategy (including "overwrite").
DEDUP_BY_HASH = True

# Dry run (no files actually copied). Set to True to preview actions.
DRY_RUN = False
# =============================================================== #


def file_md5(path: Path, block_size: int = 1 << 20) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is opened in binary mode and consumed in reads of *block_size*
    bytes (1 MiB by default), so the whole file is never held in memory at
    once. Errors raised while opening or reading the file propagate to the
    caller.
    """
    digest = hashlib.md5()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(block_size)
            # An empty read marks end of file.
            if piece == b"":
                break
            digest.update(piece)
    return digest.hexdigest()


def unique_path(base_path: Path) -> Path:
    """
    Return a path in base_path's directory that does not exist yet.

    base_path itself is returned when nothing exists there. Otherwise a
    counter starting at 2 is inserted before the extension (name_2.ext,
    name_3.ext, ...) and the first candidate that does not exist is returned.
    """
    folder = base_path.parent
    candidate = base_path
    counter = 1
    while candidate.exists():
        counter += 1
        candidate = folder / f"{base_path.stem}_{counter}{base_path.suffix}"
    return candidate


def copy_file(src_file: Path, dst_file: Path):
    """Copy src_file to dst_file, or only announce the copy in a dry run.

    When the module-level DRY_RUN flag is set, the intended copy is printed
    and nothing is written. Otherwise the copy is done with shutil.copy2.
    """
    if not DRY_RUN:
        shutil.copy2(src_file, dst_file)
        return
    print(f"[DRY] COPY  {src_file}  ->  {dst_file}")


def main():
    """Merge every directory in SRC_DIRS into DEST_DIR, keeping relative paths.

    Steps:
      1. Reject an unknown MERGE_STRATEGY before anything is touched.
      2. Resolve and validate the sources: missing or non-directory sources
         are skipped with a warning; the whole run is aborted if DEST_DIR is
         equal to, or nested inside, any valid source.
      3. Walk each source recursively and copy every regular file (optionally
         filtered by INCLUDE_EXTENSIONS) to the same relative path under the
         destination.
      4. When the target already exists: if DEDUP_BY_HASH is set and both
         files have identical content, the incoming file is skipped;
         otherwise MERGE_STRATEGY decides (overwrite / skip / rename /
         source_prefix).
      5. Print a summary of the counters.

    Errors and warnings go to stderr; the function always returns None.

    In a dry run (DRY_RUN is True) no directories are created and copy_file
    only prints what it would do. Because nothing is written, collisions
    between two sources of the same run are not detected in a dry run, so the
    counters can differ from those of a real run.
    """
    # Validate the strategy up front: otherwise a typo would only surface at
    # the first collision, after an arbitrary number of files were copied.
    if MERGE_STRATEGY not in ("overwrite", "skip", "rename", "source_prefix"):
        print(f"Unknown MERGE_STRATEGY: {MERGE_STRATEGY}", file=sys.stderr)
        return

    dest = Path(DEST_DIR).resolve()

    # Validate sources
    src_paths = []
    for s in SRC_DIRS:
        p = Path(s).resolve()
        # is_dir() is already False for a path that does not exist.
        if not p.is_dir():
            print(f"Warning: source not found or not a directory: {p}", file=sys.stderr)
            continue
        # Compare path components rather than string prefixes, so the check
        # does not depend on the platform's path separator.
        if dest == p or p in dest.parents:
            print(f"Error: DEST_DIR must not be inside a source dir (or equal to it): {dest}", file=sys.stderr)
            return
        src_paths.append(p)

    if not src_paths:
        print("No valid source directories. Nothing to do.", file=sys.stderr)
        return

    # Create the destination only once the configuration is known to be valid,
    # and never during a dry run.
    if not DRY_RUN:
        dest.mkdir(parents=True, exist_ok=True)

    files_copied = 0
    files_skipped = 0
    files_overwritten = 0
    files_renamed = 0

    for src in src_paths:
        for path in src.rglob("*"):
            if not path.is_file():
                continue
            if INCLUDE_EXTENSIONS is not None and path.suffix.lower() not in INCLUDE_EXTENSIONS:
                continue

            rel = path.relative_to(src)
            out_path = dest / rel

            # No collision: plain copy to the mirrored location.
            if not out_path.exists():
                if not DRY_RUN:
                    out_path.parent.mkdir(parents=True, exist_ok=True)
                copy_file(path, out_path)
                files_copied += 1
                continue

            # Optional content-dedup check
            if DEDUP_BY_HASH:
                try:
                    # Files of different size cannot be identical, so the
                    # (much more expensive) hashing is skipped for them.
                    same_content = (
                        path.stat().st_size == out_path.stat().st_size
                        and file_md5(path) == file_md5(out_path)
                    )
                except (OSError, ValueError) as e:
                    # Best effort: OSError covers I/O problems, ValueError an
                    # unavailable hash algorithm. Fall through to the
                    # configured strategy instead of aborting the merge.
                    print(f"Hash check failed ({rel}): {e}", file=sys.stderr)
                    same_content = False
                if same_content:
                    files_skipped += 1
                    print(f"[SKIP same content] {rel}")
                    continue

            if MERGE_STRATEGY == "overwrite":
                copy_file(path, out_path)
                files_overwritten += 1

            elif MERGE_STRATEGY == "skip":
                files_skipped += 1
                print(f"[SKIP exists] {rel}")

            elif MERGE_STRATEGY == "rename":
                new_path = unique_path(out_path)
                copy_file(path, new_path)
                files_renamed += 1

            else:
                # Only "source_prefix" can reach this branch: the strategy
                # was validated at the top of the function.
                prefixed = out_path.with_name(f"{src.name}__{out_path.name}")
                prefixed = unique_path(prefixed)  # ensure uniqueness even with prefix
                copy_file(path, prefixed)
                files_renamed += 1

    print("\nDone.")
    print(f"  Copied new:      {files_copied}")
    print(f"  Overwritten:     {files_overwritten}")
    print(f"  Renamed/copied:  {files_renamed}")
    print(f"  Skipped:         {files_skipped}")


# Run the merge only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


# Output captured from the notebook run of the cell above:
# Done.
#   Copied new:      714
#   Overwritten:     0
#   Renamed/copied:  0
#   Skipped:         0
