# Recursive Directory Archiver with Progress Tracking

This notebook creates a tar.gz archive of all files and folders starting from a specified root directory.

## Installation Requirements

Make sure you have tqdm installed:

```bash
pip install tqdm
```

## Features:
- Recursively traverses all subdirectories
- Creates compressed tar.gz archive
- Handles file permissions and metadata
- Beautiful progress bar with tqdm
- Tracks omitted files and their sizes
- Error handling for inaccessible files

In [None]:
# Install tqdm if not already installed
!pip install tqdm

In [None]:
import os
import tarfile
from pathlib import Path
import time
from datetime import datetime
from tqdm import tqdm

In [None]:
def format_size(size_bytes):
    """
    Render a byte count as a human-readable string.

    The value is repeatedly divided by 1024 until it is below 1024 or the
    largest supported unit (TB) is reached, then formatted with two
    decimals. A count of exactly zero is reported as "0 B".
    """
    if size_bytes == 0:
        return "0 B"

    units = ("B", "KB", "MB", "GB", "TB")
    value = size_bytes

    # Every unit except the last can be "outgrown"; the last one is the cap.
    for unit in units[:-1]:
        if value < 1024.0:
            return f"{value:.2f} {unit}"
        value /= 1024.0

    return f"{value:.2f} {units[-1]}"

In [None]:
def create_recursive_archive(root_path, output_path, exclude_patterns=None):
    """
    Create a tar.gz archive of a directory tree with detailed tracking.

    The tree is walked once to collect the items to archive and to gather
    statistics; every collected item is then added to the archive while a
    tqdm progress bar is updated. Items that cannot be added are recorded
    and listed in the final summary; they do not abort the run.

    Args:
        root_path (str): Root directory to archive. Entries are stored with
            paths relative to the parent of this directory.
        output_path (str): Output tar.gz file path. If this file is found
            inside root_path it is left out of the archive.
        exclude_patterns (list): Substrings; a file or directory whose name
            contains any of them is excluded (optional). Excluded
            directories are not descended into.

    Returns:
        bool: True if the archive was written, False if root_path does not
        exist or is not a directory, or if an unexpected error aborted the
        run.
    """

    if exclude_patterns is None:
        exclude_patterns = []

    # Validate input path
    if not os.path.exists(root_path):
        print(f"Error: Root path '{root_path}' does not exist.")
        return False
    if not os.path.isdir(root_path):
        # os.walk() yields nothing for a non-directory, which would otherwise
        # produce an empty archive that is reported as a success.
        print(f"Error: Root path '{root_path}' is not a directory.")
        return False

    # Get absolute paths
    root_path = os.path.abspath(root_path)
    output_path = os.path.abspath(output_path)

    # Used to recognise the output archive if the walk encounters it.
    output_name = os.path.normcase(os.path.basename(output_path))
    output_real = os.path.realpath(output_path)

    def is_excluded(name):
        """Return True if any exclude pattern occurs in the given name."""
        return any(pattern in name for pattern in exclude_patterns)

    print("Starting archive creation...")
    print(f"Source: {root_path}")
    print(f"Output: {output_path}")
    print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if exclude_patterns:
        print(f"Excluding patterns: {exclude_patterns}")
    print("-" * 60)

    try:
        # Statistics tracking
        stats = {
            'total_files': 0,
            'total_dirs': 0,
            'total_size': 0,
            'omitted_files': 0,
            'omitted_dirs': 0,
            'omitted_size': 0,
            'errors': []
        }

        # First pass: scan and categorize all items
        print("Scanning directory structure...")
        items_to_archive = []
        skipped_output = False

        for current_dir, dirs, files in os.walk(root_path):
            # Process directories
            dirs_to_keep = []
            for dir_name in dirs:
                stats['total_dirs'] += 1
                if is_excluded(dir_name):
                    stats['omitted_dirs'] += 1
                    continue
                dirs_to_keep.append(dir_name)
                dir_path = os.path.join(current_dir, dir_name)
                items_to_archive.append(('dir', dir_path, dir_name))

            # Update dirs list in place to prevent walking into excluded directories
            dirs[:] = dirs_to_keep

            # Process files
            for file_name in files:
                file_path = os.path.join(current_dir, file_name)

                # Never archive the output file itself: a leftover archive
                # from a previous run would be read while it is being
                # rewritten. The cheap name check avoids resolving every path.
                if (os.path.normcase(file_name) == output_name
                        and os.path.realpath(file_path) == output_real):
                    skipped_output = True
                    continue

                stats['total_files'] += 1

                # Get file size; unreadable entries count as zero bytes
                try:
                    file_size = os.path.getsize(file_path)
                    stats['total_size'] += file_size
                except (OSError, IOError):
                    file_size = 0

                # Check if file should be excluded
                if is_excluded(file_name):
                    stats['omitted_files'] += 1
                    stats['omitted_size'] += file_size
                else:
                    items_to_archive.append(('file', file_path, file_name))

        # Display scan results
        print("Scan completed:")
        print(f"  Total items found: {stats['total_files'] + stats['total_dirs']} (Files: {stats['total_files']}, Dirs: {stats['total_dirs']})")
        print(f"  Total size: {format_size(stats['total_size'])}")
        print(f"  Items to archive: {len(items_to_archive)}")

        if stats['omitted_files'] > 0 or stats['omitted_dirs'] > 0:
            print(f"  Items excluded: {stats['omitted_files'] + stats['omitted_dirs']} (Files: {stats['omitted_files']}, Dirs: {stats['omitted_dirs']})")
            print(f"  Excluded size: {format_size(stats['omitted_size'])}")

        if skipped_output:
            print("  Note: the output archive is inside the source tree and was skipped")

        print("-" * 60)

        # Entries are named relative to the parent of the root directory.
        archive_base = os.path.dirname(root_path)

        # Second pass: create archive with progress bar
        with tarfile.open(output_path, 'w:gz') as tar:
            # Use tqdm for progress tracking
            with tqdm(total=len(items_to_archive),
                      desc="Creating archive",
                      unit="items",
                      bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]") as pbar:

                for item_type, full_path, name in items_to_archive:
                    try:
                        # Calculate relative path for archive
                        arcname = os.path.relpath(full_path, archive_base)

                        if item_type == 'dir':
                            # Children were collected by the scan, so only
                            # the directory entry itself is added here.
                            tar.add(full_path, arcname=arcname, recursive=False)
                            pbar.set_postfix_str(f"Dir: {name[:30]}")
                        else:  # file
                            tar.add(full_path, arcname=arcname)
                            pbar.set_postfix_str(f"File: {name[:30]}")

                    except Exception as e:
                        # Deliberately best-effort: one unreadable item must
                        # not abort the whole archive, so record and go on.
                        error_msg = f"Error adding {item_type} {full_path}: {str(e)[:100]}"
                        stats['errors'].append(error_msg)
                        tqdm.write(f"Warning: {error_msg}")

                    # Advance once per item whether or not it was added
                    pbar.update(1)

        # Final summary
        archive_size = os.path.getsize(output_path)
        original_size = stats['total_size'] - stats['omitted_size']
        # max(..., 1) guards against division by zero for empty trees
        compression_ratio = (1 - archive_size / max(original_size, 1)) * 100

        print("-" * 60)
        print("✅ Archive creation completed successfully!")
        print("")
        print("📊 Summary:")
        print(f"  Items processed: {len(items_to_archive)}")
        print(f"  Items excluded: {stats['omitted_files'] + stats['omitted_dirs']}")
        print(f"  Archive size: {format_size(archive_size)}")
        print(f"  Original size: {format_size(original_size)}")
        print(f"  Compression ratio: {compression_ratio:.1f}%")
        print(f"  Space saved by exclusions: {format_size(stats['omitted_size'])}")
        print(f"  Errors encountered: {len(stats['errors'])}")

        if stats['errors']:
            print("\n⚠️  Error details:")
            # Only the first five errors are listed to keep the summary short
            for i, error in enumerate(stats['errors'][:5]):
                print(f"  {i+1}. {error}")
            if len(stats['errors']) > 5:
                print(f"  ... and {len(stats['errors']) - 5} more errors")

        return True

    except Exception as e:
        print(f"❌ Critical error during archive creation: {e}")
        return False

## Configuration

Set your parameters here:

In [None]:
# Configuration: edit these values before running the archiving cell.

# Directory tree to archive -- change this to your target directory.
ROOT_DIRECTORY = "/path/to/your/root/directory"

# Name (or path) of the tar.gz file that will be written.
OUTPUT_ARCHIVE = "archive_backup.tar.gz"

# Optional: names containing any of these substrings are left out of the
# archive (partial matches, applied to files and directories alike).
EXCLUDE_PATTERNS = [
    ".git", "__pycache__", ".pyc", ".DS_Store",
    "node_modules", ".tmp", ".cache", ".log",
]

# Echo the active settings so they are visible in the notebook output.
print(
    "📋 Configuration:",
    f"  Root Directory: {ROOT_DIRECTORY}",
    f"  Output Archive: {OUTPUT_ARCHIVE}",
    f"  Exclude Patterns: {EXCLUDE_PATTERNS}",
    sep="\n",
)

## Execute Archive Creation

Run the cell below to start the archiving process:

In [None]:
# Run the archiver with the configured settings and time the whole call.
start_time = time.time()
success = create_recursive_archive(ROOT_DIRECTORY, OUTPUT_ARCHIVE, EXCLUDE_PATTERNS)
end_time = time.time()

# Wall-clock duration of the run, in seconds.
duration = end_time - start_time
print(f"\n⏱️  Total execution time: {duration:.2f} seconds")

# Report the overall outcome returned by the archiver.
print(
    f"🎉 Archive '{OUTPUT_ARCHIVE}' created successfully!"
    if success
    else "💥 Archive creation failed. Check the error messages above."
)

## Verify Archive (Optional)

Run this cell to verify the created archive:

In [None]:
def verify_archive(archive_path):
    """
    Reopen a tar.gz archive and print a short report about its contents.

    The member list is read, files and directories are counted, and the
    first ten entries are listed (regular files with their size).

    Args:
        archive_path (str): Path of the tar.gz archive to inspect.

    Returns:
        bool: True if the archive could be opened and listed, False if any
        exception occurred (the error is printed).
    """
    preview_limit = 10

    try:
        with tarfile.open(archive_path, 'r:gz') as tar:
            members = tar.getmembers()

            files_count = sum(1 for entry in members if entry.isfile())
            dirs_count = sum(1 for entry in members if entry.isdir())

            print("🔍 Archive verification successful!")
            print(f"  Total entries: {len(members)}")
            print(f"  Files: {files_count}")
            print(f"  Directories: {dirs_count}")
            print(f"  Archive size: {format_size(os.path.getsize(archive_path))}")

            # Show first few entries
            print("\n📁 First 10 entries:")
            for entry in members[:preview_limit]:
                if entry.isdir():
                    icon = "📁"
                else:
                    icon = "📄"
                # Only regular files get a size suffix
                if entry.isfile():
                    size_info = f" ({format_size(entry.size)})"
                else:
                    size_info = ""
                print(f"  {icon} {entry.name}{size_info}")

            remaining = len(members) - preview_limit
            if remaining > 0:
                print(f"  ... and {remaining} more entries")

        return True
    except Exception as e:
        print(f"❌ Archive verification failed: {e}")
        return False

# Verify the archive, but only if the configured output file is present
if not os.path.exists(OUTPUT_ARCHIVE):
    print(f"❌ Archive file '{OUTPUT_ARCHIVE}' not found.")
else:
    verify_archive(OUTPUT_ARCHIVE)

## Usage Instructions

1. **Install Dependencies**: Run the first cell to install tqdm if needed
2. **Update Configuration**: Modify the `ROOT_DIRECTORY` variable to point to your target directory
3. **Set Output Path**: Change `OUTPUT_ARCHIVE` to your desired output file name/path
4. **Configure Exclusions**: Modify `EXCLUDE_PATTERNS` to exclude unwanted files/directories
5. **Run Archive Creation**: Execute the main archiving cell
6. **Verify Results**: Optionally run the verification cell to check the archive

## Output Features

The script provides comprehensive reporting:

- **Real-time progress bar** with tqdm showing current file being processed
- **Detailed scan results** showing total files found vs. items to archive
- **Exclusion statistics** with counts and sizes of omitted files
- **Compression metrics** showing space savings
- **Error tracking** with detailed error messages
- **Archive verification** that reopens the archive and lists its entries to confirm it is readable

## Notes

- The script preserves file permissions and timestamps
- Large directories may take significant time to process
- The script handles errors gracefully and continues processing
- File contents are streamed into the archive rather than loaded into memory; note that the list of items to archive is built in memory during the scan
- Progress bar shows current item being processed
- Detailed statistics help you understand what was archived vs excluded