# Recursive Directory Archiver

This notebook creates a tar.gz archive of all files and folders starting from a specified root directory.

## Features:
- Recursively traverses all subdirectories
- Creates compressed tar.gz archive
- Handles file permissions and metadata
- Provides progress feedback
- Error handling for inaccessible files

In [None]:
import os
import tarfile
from pathlib import Path
import time
from datetime import datetime

In [None]:
def count_files_and_dirs(root_path):
    """
    Tally the entries beneath root_path for progress tracking

    Walks the tree with os.walk and returns a (total_items, total_size)
    tuple: the number of directories plus files encountered, and the combined
    size in bytes of every file whose size could be read.
    """
    item_count = 0
    byte_count = 0

    try:
        for current_dir, subdirs, filenames in os.walk(root_path):
            item_count += len(subdirs) + len(filenames)
            for filename in filenames:
                try:
                    byte_count += os.path.getsize(os.path.join(current_dir, filename))
                except (OSError, IOError):
                    # Size not readable: the file still counts as an item,
                    # it just contributes nothing to the byte total.
                    pass
    except Exception as e:
        print(f"Warning: Error counting files: {e}")

    return item_count, byte_count

In [None]:
def format_size(size_bytes):
    """
    Render a byte count as a human readable string

    Zero is returned as "0 B". Any other value is divided by 1024 until it
    drops below 1024 or the largest unit (TB) is reached, then formatted with
    two decimals, e.g. 1536 -> "1.50 KB".
    """
    if size_bytes == 0:
        return "0 B"

    units = ("B", "KB", "MB", "GB", "TB")
    value = size_bytes
    for unit in units[:-1]:
        if value >= 1024.0:
            value /= 1024.0
        else:
            return f"{value:.2f} {unit}"

    # Still >= 1024 after every smaller unit: report in the largest one
    return f"{value:.2f} {units[-1]}"

In [None]:
def _is_excluded(name, exclude_patterns):
    """Return True if any exclusion pattern occurs as a substring of name."""
    return any(pattern in name for pattern in exclude_patterns)


def _report_progress(processed_items, total_items):
    """Print a progress line each time another 100 items have been processed."""
    if processed_items % 100 != 0:
        return
    # total_items comes from a separate, earlier scan and can be 0 if that
    # scan failed; guard the division so reporting progress can never raise.
    progress = (processed_items / total_items) * 100 if total_items else 0.0
    print(f"Progress: {progress:.1f}% ({processed_items}/{total_items})")


def create_recursive_archive(root_path, output_path, exclude_patterns=None):
    """
    Create a tar.gz archive of all files and folders recursively

    Member names are stored relative to the parent of root_path, so every
    entry in the archive starts with the root directory's own name. Entries
    that cannot be added are reported as warnings and skipped; archiving
    continues with the remaining items.

    Args:
        root_path (str): Root directory to archive
        output_path (str): Output tar.gz file path
        exclude_patterns (list): Substrings; any file or directory whose name
            contains one of them is skipped (optional)

    Returns:
        bool: True if the archive was written, False if root_path is missing
            or not a directory, or if a critical error occurred
    """

    if exclude_patterns is None:
        exclude_patterns = []

    # Validate input path
    if not os.path.exists(root_path):
        print(f"Error: Root path '{root_path}' does not exist.")
        return False
    # os.walk yields nothing for a regular file, which would otherwise
    # produce an empty archive that is reported as a success.
    if not os.path.isdir(root_path):
        print(f"Error: Root path '{root_path}' is not a directory.")
        return False

    # Get absolute paths
    root_path = os.path.abspath(root_path)
    output_path = os.path.abspath(output_path)
    # Archive names are computed relative to the root's parent directory
    archive_base = os.path.dirname(root_path)

    print("Starting archive creation...")
    print(f"Source: {root_path}")
    print(f"Output: {output_path}")
    print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("-" * 50)

    # Count files for progress tracking
    print("Scanning directory structure...")
    total_items, total_size = count_files_and_dirs(root_path)
    print(f"Found {total_items} items, total size: {format_size(total_size)}")
    print("-" * 50)

    processed_items = 0
    errors = []
    archive_opened = False

    try:
        with tarfile.open(output_path, 'w:gz') as tar:
            archive_opened = True

            # Walk through directory tree
            for root, dirs, files in os.walk(root_path):
                # Prune excluded directories in place so os.walk skips them
                dirs[:] = [d for d in dirs if not _is_excluded(d, exclude_patterns)]

                # Process directories (entry only; contents come from the walk)
                for dir_name in dirs:
                    dir_path = os.path.join(root, dir_name)
                    try:
                        arcname = os.path.relpath(dir_path, archive_base)
                        tar.add(dir_path, arcname=arcname, recursive=False)
                    except Exception as e:
                        error_msg = f"Error adding directory {dir_path}: {e}"
                        errors.append(error_msg)
                        print(f"Warning: {error_msg}")
                        continue
                    processed_items += 1
                    _report_progress(processed_items, total_items)

                # Process files
                for file_name in files:
                    # Skip excluded files
                    if _is_excluded(file_name, exclude_patterns):
                        continue

                    file_path = os.path.join(root, file_name)
                    try:
                        arcname = os.path.relpath(file_path, archive_base)
                        tar.add(file_path, arcname=arcname)
                    except Exception as e:
                        error_msg = f"Error adding file {file_path}: {e}"
                        errors.append(error_msg)
                        print(f"Warning: {error_msg}")
                        continue
                    processed_items += 1
                    _report_progress(processed_items, total_items)

    except Exception as e:
        print(f"Critical error during archive creation: {e}")
        if archive_opened:
            # The file was created/truncated by this call and is now
            # incomplete; remove it so it cannot be mistaken for a backup.
            try:
                os.remove(output_path)
            except OSError:
                # Best effort only: the critical error above is already reported
                pass
        return False

    try:
        archive_size = os.path.getsize(output_path)
    except OSError as e:
        print(f"Critical error during archive creation: {e}")
        return False

    # Final summary
    print("-" * 50)
    print("Archive creation completed!")
    print(f"Processed items: {processed_items}/{total_items}")
    print(f"Archive size: {format_size(archive_size)}")
    print(f"Errors encountered: {len(errors)}")

    if errors:
        print("\nErrors summary:")
        for error in errors[:10]:  # Show first 10 errors
            print(f"  - {error}")
        if len(errors) > 10:
            print(f"  ... and {len(errors) - 10} more errors")

    return True

## Configuration

Set your parameters here:

In [None]:
# Configuration

# Directory to archive -- change this to your target directory
ROOT_DIRECTORY = "/path/to/your/root/directory"

# Name of the tar.gz file that will be written
OUTPUT_ARCHIVE = "archive_backup.tar.gz"

# Optional: files/directories to exclude (partial matches on the name)
EXCLUDE_PATTERNS = [".git", "__pycache__", ".pyc", ".DS_Store", "node_modules", ".tmp", ".cache"]

# Echo the active settings so the run is self-documenting
print(
    "Configuration:",
    f"Root Directory: {ROOT_DIRECTORY}",
    f"Output Archive: {OUTPUT_ARCHIVE}",
    f"Exclude Patterns: {EXCLUDE_PATTERNS}",
    sep="\n",
)

## Execute Archive Creation

Run the cell below to start the archiving process:

In [None]:
# Run the archiver with the configured settings and time the whole call
start_time = time.time()
success = create_recursive_archive(
    exclude_patterns=EXCLUDE_PATTERNS,
    output_path=OUTPUT_ARCHIVE,
    root_path=ROOT_DIRECTORY,
)
end_time = time.time()
duration = end_time - start_time

print(f"\nTotal execution time: {duration:.2f} seconds")

# Report the overall outcome returned by the archiver
print("✅ Archive created successfully!" if success else "❌ Archive creation failed.")

## Verify Archive (Optional)

Run this cell to verify the created archive:

In [None]:
def verify_archive(archive_path):
    """
    Check that the created archive can be opened and listed

    Opens archive_path as a gzip-compressed tar, reads its member list and
    prints the entry count, the archive size and the first 10 entries.
    Returns True when all of that succeeds; any exception is reported and
    turns the result into False.
    """
    preview_limit = 10

    try:
        with tarfile.open(archive_path, 'r:gz') as archive:
            entries = archive.getmembers()
            entry_count = len(entries)
            print("Archive verification successful!")
            print(f"Total files/directories in archive: {entry_count}")
            print(f"Archive size: {format_size(os.path.getsize(archive_path))}")

            # Show first few entries
            print("\nFirst 10 entries:")
            for entry in entries[:preview_limit]:
                if entry.isdir():
                    label = "DIR"
                else:
                    label = "FILE"
                print(f"  {label}: {entry.name}")

            remaining = entry_count - preview_limit
            if remaining > 0:
                print(f"  ... and {remaining} more entries")
    except Exception as e:
        print(f"Archive verification failed: {e}")
        return False

    return True

# Verify the archive, but only if the output file is actually present
if not os.path.exists(OUTPUT_ARCHIVE):
    print(f"Archive file '{OUTPUT_ARCHIVE}' not found.")
else:
    verify_archive(OUTPUT_ARCHIVE)

## Usage Instructions

1. **Update Configuration**: Modify the `ROOT_DIRECTORY` variable to point to your target directory
2. **Set Output Path**: Change `OUTPUT_ARCHIVE` to your desired output file name/path
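3. **Configure Exclusions**: Modify `EXCLUDE_PATTERNS` to exclude unwanted files/directories. Each pattern is matched as a substring of the file or directory name, so `.git` also excludes names such as `.gitignore` and `.github`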
4. **Run Cells**: Execute the cells in order to create your archive

## Notes

- The script preserves file permissions and timestamps
- Large directories may take significant time to process
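- Files or directories that cannot be added are reported as warnings and skipped; archiving continues with the remaining items, and the first 10 errors are summarized at the end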
- Progress updates are shown every 100 processed items
- Memory usage is optimized by streaming files to the archive