# PST Attachment Extractor for Databricks

This notebook:
1. Recursively searches for PST files in a Databricks volume
2. Extracts all attachments from PST files **in parallel using Spark**
3. Saves attachments to a volume organized by message ID

**Key Features:**
- ⚡ Parallel processing of multiple PST files using Spark executors
- 📎 Extracts all attachment types (documents, images, archives, etc.)
- 📁 One output folder per message, named by a generated message ID
- 🔄 Configurable overwrite behavior
- 📊 Per-file progress output and summary statistics (in parallel mode, per-file results are printed once all Spark tasks have finished)


In [None]:
# Install required library for PST parsing
%pip install pypff-python --quiet


In [None]:
import os
from pathlib import Path
from datetime import datetime
import hashlib
from pyspark.sql import SparkSession
import pypff


In [None]:
# Configuration
# Root directory that is searched recursively for files ending in .pst
PST_VOLUME_PATH = "/Volumes/catalog/schema/pst_files"  # Update with your PST files volume path
# Base directory for output; each message gets its own sub-folder named by message ID
ATTACHMENTS_VOLUME_PATH = "/Volumes/catalog/schema/attachments"  # Update with your attachments output path

# Processing Configuration
OVERWRITE_EXISTING_ATTACHMENTS = True  # Set to False to skip existing attachments
ENABLE_PARALLEL_PROCESSING = True  # Set to False for sequential processing
NUM_PARTITIONS = None  # None = auto (one partition per PST file, capped at 100), or specify an integer


## 1. File Discovery Functions


In [None]:
def find_pst_files(root_path):
    """
    Walk a directory tree and collect every PST file beneath it.

    A file counts as a PST file when its name ends in ``.pst``, compared
    case-insensitively. Each match is reported on stdout together with its
    size in MB; a file whose size cannot be read is reported and left out
    of the result.

    Args:
        root_path: Directory at which the recursive search starts

    Returns:
        List of tuples: (file_path, file_size_bytes)
    """
    print(f"Searching for PST files in: {root_path}")

    discovered = []
    for current_dir, _subdirs, entries in os.walk(root_path):
        for entry in entries:
            # Extension match is case-insensitive so "MAIL.PST" is picked up too
            if not entry.lower().endswith('.pst'):
                continue

            file_path = os.path.join(current_dir, entry)
            try:
                size_bytes = os.path.getsize(file_path)
                discovered.append((file_path, size_bytes))
                print(f"Found: {file_path} ({size_bytes / (1024**2):.2f} MB)")
            except Exception as e:
                print(f"Error accessing {file_path}: {str(e)}")

    print(f"\nTotal PST files found: {len(discovered)}")
    return discovered


## 2. Attachment Extraction Functions


In [None]:
def extract_attachment(attachment, message_id, attachment_index, output_volume_path, overwrite=True):
    """
    Extract a single attachment and save it to the volume.

    The file is written to ``output_volume_path/message_id/<filename>``.
    The filename is the attachment's name after sanitizing: path separators
    and ".." are replaced with "_", control characters (including NUL, which
    ``open()`` rejects) are dropped, and a name that ends up blank or "."
    falls back to ``attachment_<attachment_index>``.

    Note: attachments of the same message that share a name resolve to the
    same path, so a later one replaces an earlier one when ``overwrite`` is
    True and is reported as "skipped" when it is False.

    Args:
        attachment: pypff attachment object
        message_id: Unique message ID for organizing attachments
        attachment_index: Index of the attachment in the message
        output_volume_path: Base path to save attachments
        overwrite: If True, overwrite existing files; if False, skip existing files

    Returns:
        Dictionary with attachment metadata: "attachment_path", "filename",
        "file_size" and "status" ("saved", "skipped" or "error"). Exceptions
        raised during extraction are caught and reported as status "error":
        "attachment_path" is None, "file_size" is 0, "error" holds the
        exception text, and "filename" is the sanitized name when it was
        already known (otherwise the ``attachment_<index>`` fallback).
    """
    fallback_name = f"attachment_{attachment_index}"
    # Kept outside the try block so the error record can name the real file
    filename = fallback_name

    try:
        # Get attachment properties
        raw_name = attachment.name or fallback_name
        file_size = attachment.size if hasattr(attachment, 'size') else 0

        # Sanitize filename to avoid path issues
        cleaned = raw_name.replace("/", "_").replace("\\", "_").replace("..", "_")
        # Control characters (notably NUL) make open() fail or produce unusable names
        cleaned = "".join(ch for ch in cleaned if ch >= " ")
        # A blank name or "." would resolve to the message directory itself
        if cleaned.strip() in ("", "."):
            cleaned = fallback_name
        filename = cleaned

        # Create directory structure: output_volume_path/message_id/
        attachment_dir = os.path.join(output_volume_path, message_id)
        os.makedirs(attachment_dir, exist_ok=True)

        # Full path for the attachment file
        attachment_path = os.path.join(attachment_dir, filename)

        # Check if file exists and handle based on overwrite setting
        if not overwrite and os.path.exists(attachment_path):
            return {
                "attachment_path": attachment_path,
                "filename": filename,
                "file_size": file_size,
                "status": "skipped"
            }

        # Extract and save attachment data
        attachment_data = attachment.read_buffer(attachment.size)

        with open(attachment_path, 'wb') as f:
            f.write(attachment_data)

        return {
            "attachment_path": attachment_path,
            "filename": filename,
            "file_size": file_size,
            "status": "saved"
        }

    except Exception as e:
        return {
            "attachment_path": None,
            "filename": filename,
            "file_size": 0,
            "status": "error",
            "error": str(e)
        }


def process_message_attachments(message, source_file, folder_name, output_volume_path, overwrite=True):
    """
    Extract all attachments from a message.

    The message ID is the MD5 hex digest of the source file path, subject,
    sender name and delivery time concatenated together; it names the
    output folder for the message's attachments.

    Args:
        message: pypff message object
        source_file: Source PST file path
        folder_name: Folder containing the message
        output_volume_path: Base path to save attachments
        overwrite: If True, overwrite existing files

    Returns:
        Dictionary with message info and attachment results ("message_id",
        "source_file", "folder_name", "subject", "attachments_count",
        "attachments"), or None if the message could not be processed.
    """
    try:
        # Get message properties
        subject = message.subject if message.subject else ""
        sender = message.sender_name if message.sender_name else ""

        # Best effort: a missing or unreadable delivery time leaves this as None.
        # pypff reports delivery_time as a datetime object; passing that to
        # datetime.fromtimestamp() raises TypeError, which previously left every
        # message without a delivery time and made IDs collide for messages
        # sharing a subject and sender. Numeric timestamps are still converted.
        delivery_time = None
        try:
            raw_time = message.delivery_time
            if isinstance(raw_time, datetime):
                delivery_time = raw_time
            elif raw_time:
                delivery_time = datetime.fromtimestamp(raw_time)
        except Exception:
            delivery_time = None

        # Generate unique message ID
        message_id = hashlib.md5(f"{source_file}{subject}{sender}{delivery_time}".encode()).hexdigest()

        # Process attachments
        attachments_extracted = []
        if message.number_of_attachments > 0:
            for idx, attachment in enumerate(message.attachments):
                attachment_info = extract_attachment(
                    attachment,
                    message_id,
                    idx,
                    output_volume_path,
                    overwrite
                )
                attachments_extracted.append(attachment_info)

        return {
            "message_id": message_id,
            "source_file": source_file,
            "folder_name": folder_name,
            "subject": subject,
            "attachments_count": message.number_of_attachments,
            "attachments": attachments_extracted
        }

    except Exception as e:
        print(f"  Error processing message: {str(e)}")
        return None


def process_folder_attachments(folder, source_file, folder_path, output_volume_path, overwrite=True):
    """
    Recursively process folders and extract attachments from messages.

    Errors are isolated as narrowly as possible: a message that fails is
    reported and skipped without abandoning the remaining messages, and a
    failure while handling this folder's messages does not prevent its
    subfolders from being visited. Errors are printed, never raised.

    Args:
        folder: pypff folder object
        source_file: Source PST file path
        folder_path: Current folder path for hierarchy
        output_volume_path: Base path to save attachments
        overwrite: If True, overwrite existing files

    Returns:
        Tuple: (messages_with_attachments, total_attachments_extracted).
        Only attachments whose status is "saved" are counted in the second
        element; skipped and failed attachments are not.
    """
    messages_with_attachments = 0
    total_attachments = 0
    # Falls back to the parent path if this folder's own name cannot be read
    current_path = folder_path

    try:
        folder_name = folder.name if folder.name else "Unknown"
        current_path = f"{folder_path}/{folder_name}" if folder_path else folder_name

        # Process messages in current folder
        if folder.number_of_sub_messages > 0:
            for message in folder.sub_messages:
                try:
                    if message.number_of_attachments <= 0:
                        continue
                    result = process_message_attachments(
                        message,
                        source_file,
                        current_path,
                        output_volume_path,
                        overwrite
                    )
                except Exception as e:
                    # One unreadable message must not cost the rest of the folder
                    print(f"  Error processing message in {current_path}: {str(e)}")
                    continue

                if result:
                    messages_with_attachments += 1
                    # Count successfully extracted attachments
                    total_attachments += sum(
                        1 for att in result["attachments"] if att["status"] == "saved"
                    )

    except Exception as e:
        print(f"  Error processing folder {current_path}: {str(e)}")

    # Subfolders are handled separately so they are still visited when the
    # message pass above failed.
    try:
        # Recursively process subfolders
        if folder.number_of_sub_folders > 0:
            for sub_folder in folder.sub_folders:
                sub_msgs, sub_atts = process_folder_attachments(
                    sub_folder,
                    source_file,
                    current_path,
                    output_volume_path,
                    overwrite
                )
                messages_with_attachments += sub_msgs
                total_attachments += sub_atts

    except Exception as e:
        print(f"  Error processing subfolders of {current_path}: {str(e)}")

    return messages_with_attachments, total_attachments


## 3. PST File Processing Functions


In [None]:
def extract_attachments_from_pst(file_info):
    """
    Extract attachments from a single PST file.
    Designed to be called by Spark executors for parallel processing.

    Failures are reported through the returned tuple rather than raised, so
    one bad file does not fail the whole Spark job. The PST handle is closed
    whether or not processing succeeds.

    Args:
        file_info: Tuple of (file_path, file_size, output_path, overwrite)

    Returns:
        Tuple: (file_path, file_size_mb, success, messages_with_attachments, total_attachments, error_message)
        error_message is None on success; on failure it holds the exception
        text followed by the traceback, and both counts are 0.
    """
    # Imported inside the function so the names resolve in the worker process
    import pypff
    import traceback
    import os

    file_path, file_size, output_path, overwrite = file_info
    file_size_mb = file_size / (1024**2)

    print(f"[Executor {os.getpid()}] Processing: {file_path}")

    pst = None
    opened = False
    try:
        # Open PST file
        pst = pypff.file()
        pst.open(file_path)
        opened = True
        root = pst.get_root_folder()

        messages_with_attachments = 0
        total_attachments = 0

        if root:
            messages_with_attachments, total_attachments = process_folder_attachments(
                root,
                file_path,
                "",
                output_path,
                overwrite
            )

        print(f"[Executor {os.getpid()}] {file_path}: {messages_with_attachments} messages, {total_attachments} attachments")
        return (file_path, file_size_mb, True, messages_with_attachments, total_attachments, None)

    except Exception as e:
        error_msg = f"{str(e)}\n{traceback.format_exc()}"
        print(f"[Executor {os.getpid()}] Error processing {file_path}: {str(e)}")
        return (file_path, file_size_mb, False, 0, 0, error_msg)

    finally:
        # Release the handle on the failure path too; only attempt it once
        # open() has succeeded.
        if opened:
            try:
                pst.close()
            except Exception as close_error:
                print(f"[Executor {os.getpid()}] Warning: could not close {file_path}: {str(close_error)}")


## 4. Parallel Processing Pipeline


In [None]:
def extract_attachments_parallel(pst_volume_path, output_volume_path, overwrite=True, num_partitions=None):
    """
    Run attachment extraction over every PST file using a Spark RDD.

    PST files are discovered on the driver, turned into one work item per
    file, distributed with ``parallelize`` and mapped through
    ``extract_attachments_from_pst``. Per-file outcomes are collected back
    to the driver and printed, followed by an overall summary.

    Args:
        pst_volume_path: Path to volume containing PST files
        output_volume_path: Path to save extracted attachments
        overwrite: If True, overwrite existing attachment files
        num_partitions: Number of Spark partitions (None = one per file,
            capped at 100)

    Returns:
        Dictionary with extraction statistics. When no PST files are found
        it holds only "status" ("no_files"), "files_processed" and
        "total_attachments"; otherwise "status" is "complete" and the
        per-run counters plus the list of failed files are included.
    """
    rule = "=" * 80

    print(rule)
    print("PST Attachment Extraction Pipeline (Parallel Mode)")
    print(rule)
    print(f"PST Files Location: {pst_volume_path}")
    print(f"Attachments Output: {output_volume_path}")
    print(f"Overwrite Existing: {overwrite}")
    print(rule)

    # Step 1: discovery happens on the driver
    print("\n[Step 1] Discovering PST files...")
    pst_files = find_pst_files(pst_volume_path)

    if not pst_files:
        print("No PST files found. Exiting.")
        return {"status": "no_files", "files_processed": 0, "total_attachments": 0}

    # Auto mode: one partition per file, but never more than 100
    if num_partitions is None:
        num_partitions = min(len(pst_files), 100)

    print("\n[Step 2] Configuring Spark parallelism...")
    print(f"  Files to process: {len(pst_files)}")
    print(f"  Spark partitions: {num_partitions}")
    print("  Executors will process files in parallel")

    # Each work item carries everything the worker-side function needs
    work_items = [
        (path, size, output_volume_path, overwrite)
        for path, size in pst_files
    ]

    # Step 3: fan the work out and gather the per-file outcomes
    print("\n[Step 3] Extracting attachments in parallel...")
    print(rule)

    outcomes = (
        spark.sparkContext
        .parallelize(work_items, num_partitions)
        .map(extract_attachments_from_pst)
        .collect()
    )

    print("\n" + rule)
    print("EXTRACTION RESULTS")
    print(rule)

    failed_files = []
    successful_files = 0
    total_messages_with_attachments = 0
    total_attachments_extracted = 0

    for outcome in outcomes:
        file_path, file_size_mb, success, msg_count, att_count, error_msg = outcome

        if not success:
            failed_files.append((file_path, error_msg))
            print(f"✗ {file_path} - FAILED")
            if error_msg:
                print(f"    Error: {error_msg[:200]}...")
            continue

        successful_files += 1
        total_messages_with_attachments += msg_count
        total_attachments_extracted += att_count
        print(f"✓ {file_path}")
        print(f"    Size: {file_size_mb:.2f} MB | Messages: {msg_count} | Attachments: {att_count}")

    # Overall summary
    print("\n" + rule)
    print("EXTRACTION SUMMARY")
    print(rule)
    print(f"✓ Successful files: {successful_files}/{len(pst_files)}")
    print(f"✗ Failed files: {len(failed_files)}/{len(pst_files)}")
    print(f"📧 Messages with attachments: {total_messages_with_attachments}")
    print(f"📎 Total attachments extracted: {total_attachments_extracted}")
    print(f"📁 Attachments location: {output_volume_path}")
    print(rule)

    if failed_files:
        print("\n⚠️  Failed Files:")
        for failed_path, _failure_text in failed_files:
            print(f"  - {failed_path}")

    return {
        "status": "complete",
        "files_processed": successful_files,
        "files_failed": len(failed_files),
        "messages_with_attachments": total_messages_with_attachments,
        "total_attachments": total_attachments_extracted,
        "failed_files": failed_files
    }


## 5. Sequential Processing Pipeline


In [None]:
def extract_attachments_sequential(pst_volume_path, output_volume_path, overwrite=True):
    """
    Extract attachments from PST files sequentially (one at a time).

    A file that fails is recorded and the run continues with the next file.
    Each PST handle is closed whether or not its processing succeeds.

    Args:
        pst_volume_path: Path to volume containing PST files
        output_volume_path: Path to save extracted attachments
        overwrite: If True, overwrite existing attachment files

    Returns:
        Dictionary with extraction statistics. When no PST files are found
        it holds only "status" ("no_files"), "files_processed" and
        "total_attachments"; otherwise "status" is "complete" and the
        per-run counters plus "failed_files" (list of (file_path, error
        text) tuples) are included.
    """
    print("=" * 80)
    print("PST Attachment Extraction Pipeline (Sequential Mode)")
    print("=" * 80)
    print(f"PST Files Location: {pst_volume_path}")
    print(f"Attachments Output: {output_volume_path}")
    print(f"Overwrite Existing: {overwrite}")
    print("=" * 80)

    # Step 1: Find all PST files
    print("\n[Step 1] Discovering PST files...")
    pst_files = find_pst_files(pst_volume_path)

    if not pst_files:
        print("No PST files found. Exiting.")
        return {"status": "no_files", "files_processed": 0, "total_attachments": 0}

    # Step 2: Process each PST file
    print("\n[Step 2] Extracting attachments...")
    print("=" * 80)

    total_messages_with_attachments = 0
    total_attachments_extracted = 0
    successful_files = 0
    failed_files = []

    for idx, (file_path, file_size) in enumerate(pst_files, 1):
        print(f"\n[{idx}/{len(pst_files)}] Processing: {file_path}")
        print(f"  File size: {file_size / (1024**2):.2f} MB")

        pst = None
        opened = False
        try:
            pst = pypff.file()
            pst.open(file_path)
            opened = True
            root = pst.get_root_folder()

            if root:
                msg_count, att_count = process_folder_attachments(
                    root,
                    file_path,
                    "",
                    output_volume_path,
                    overwrite
                )
                total_messages_with_attachments += msg_count
                total_attachments_extracted += att_count
                print(f"  ✓ Extracted {att_count} attachments from {msg_count} messages")

            successful_files += 1

        except Exception as e:
            print(f"  ✗ Error: {str(e)}")
            failed_files.append((file_path, str(e)))

        finally:
            # Release the handle on the failure path too; only attempt it
            # once open() has succeeded.
            if opened:
                try:
                    pst.close()
                except Exception as close_error:
                    print(f"  Warning: could not close {file_path}: {str(close_error)}")

    # Summary
    print("\n" + "=" * 80)
    print("EXTRACTION SUMMARY")
    print("=" * 80)
    print(f"✓ Successful files: {successful_files}/{len(pst_files)}")
    print(f"✗ Failed files: {len(failed_files)}/{len(pst_files)}")
    print(f"📧 Messages with attachments: {total_messages_with_attachments}")
    print(f"📎 Total attachments extracted: {total_attachments_extracted}")
    print(f"📁 Attachments location: {output_volume_path}")
    print("=" * 80)

    if failed_files:
        print("\n⚠️  Failed Files:")
        for file_path, error_msg in failed_files:
            print(f"  - {file_path}")
            print(f"    Error: {error_msg[:200]}")

    return {
        "status": "complete",
        "files_processed": successful_files,
        "files_failed": len(failed_files),
        "messages_with_attachments": total_messages_with_attachments,
        "total_attachments": total_attachments_extracted,
        "failed_files": failed_files
    }


## 6. Preview/Inspect PST File (Optional)


In [None]:
def inspect_pst_attachments(pst_file_path, max_messages_to_show=20):
    """
    Inspect a PST file and display information about messages with attachments.
    Does NOT extract attachments - just shows what's available.

    The PST handle is closed even when the folder traversal fails.

    Args:
        pst_file_path: Path to PST file to inspect
        max_messages_to_show: Maximum number of messages with attachments to display

    Returns:
        Dictionary with attachment statistics ("file_path", "file_size_mb",
        "total_messages", "messages_with_attachments", "total_attachments",
        "attachment_details"), or None when the file is missing, has no root
        folder, or an error occurs (the error and traceback are printed).
    """
    print("=" * 80)
    print(f"INSPECTING PST FILE: {pst_file_path}")
    print("=" * 80)

    try:
        # Check if file exists
        if not os.path.exists(pst_file_path):
            print(f"❌ File not found: {pst_file_path}")
            return None

        file_size = os.path.getsize(pst_file_path)
        print(f"File size: {file_size / (1024**2):.2f} MB")

        # Statistics
        total_messages = 0
        messages_with_attachments = 0
        total_attachments = 0
        attachment_details = []

        # Recursive function to traverse folders
        def inspect_folder(folder, folder_path=""):
            nonlocal total_messages, messages_with_attachments, total_attachments

            folder_name = folder.name if folder.name else "Unknown"
            current_path = f"{folder_path}/{folder_name}" if folder_path else folder_name

            # Process messages in current folder
            if folder.number_of_sub_messages > 0:
                for message in folder.sub_messages:
                    total_messages += 1

                    if message.number_of_attachments <= 0:
                        continue

                    messages_with_attachments += 1

                    # Get message details
                    subject = message.subject if message.subject else "(No Subject)"
                    sender = message.sender_name if message.sender_name else "(Unknown Sender)"

                    # Best effort: an unreadable delivery time is shown as absent.
                    # pypff reports delivery_time as a datetime object, which
                    # datetime.fromtimestamp() rejects; use it directly and only
                    # convert numeric timestamps.
                    delivery_time = None
                    try:
                        raw_time = message.delivery_time
                        if isinstance(raw_time, datetime):
                            delivery_time = raw_time
                        elif raw_time:
                            delivery_time = datetime.fromtimestamp(raw_time)
                    except Exception:
                        delivery_time = None

                    # Get attachment details
                    attachments = []
                    for idx, attachment in enumerate(message.attachments):
                        att_name = attachment.name if attachment.name else f"attachment_{idx}"
                        att_size = attachment.size if hasattr(attachment, 'size') else 0
                        attachments.append({
                            "name": att_name,
                            "size": att_size,
                            "size_kb": att_size / 1024
                        })
                        total_attachments += 1

                    attachment_details.append({
                        "folder": current_path,
                        "subject": subject,
                        "sender": sender,
                        "delivery_time": delivery_time,
                        "attachment_count": len(attachments),
                        "attachments": attachments
                    })

            # Recursively process subfolders
            if folder.number_of_sub_folders > 0:
                for sub_folder in folder.sub_folders:
                    inspect_folder(sub_folder, current_path)

        # Open PST file
        pst = pypff.file()
        pst.open(pst_file_path)
        try:
            root = pst.get_root_folder()

            if not root:
                print("❌ No root folder found in PST file")
                return None

            # Inspect all folders
            inspect_folder(root)
        finally:
            # Runs on the early return and on traversal errors as well
            pst.close()

        # Display results
        print("\n" + "=" * 80)
        print("STATISTICS")
        print("=" * 80)
        print(f"📧 Total messages: {total_messages:,}")
        print(f"📎 Messages with attachments: {messages_with_attachments:,} ({messages_with_attachments/max(total_messages, 1)*100:.1f}%)")
        print(f"📁 Total attachments: {total_attachments:,}")
        if messages_with_attachments > 0:
            print(f"📊 Average attachments per message (with attachments): {total_attachments/messages_with_attachments:.1f}")

        # Display message details
        if attachment_details:
            print("\n" + "=" * 80)
            print(f"MESSAGES WITH ATTACHMENTS (showing first {min(max_messages_to_show, len(attachment_details))})")
            print("=" * 80)

            for idx, msg in enumerate(attachment_details[:max_messages_to_show], 1):
                print(f"\n📧 Message {idx}:")
                print(f"   Folder: {msg['folder']}")
                print(f"   Subject: {msg['subject'][:70]}")
                print(f"   Sender: {msg['sender']}")
                if msg['delivery_time']:
                    print(f"   Date: {msg['delivery_time']}")
                print(f"   Attachments ({msg['attachment_count']}):")

                for att in msg['attachments']:
                    print(f"      📎 {att['name']} ({att['size_kb']:.2f} KB)")

            if len(attachment_details) > max_messages_to_show:
                print(f"\n   ... and {len(attachment_details) - max_messages_to_show} more messages with attachments")
        else:
            print("\n❌ No attachments found in this PST file")

        print("\n" + "=" * 80)

        return {
            "file_path": pst_file_path,
            "file_size_mb": file_size / (1024**2),
            "total_messages": total_messages,
            "messages_with_attachments": messages_with_attachments,
            "total_attachments": total_attachments,
            "attachment_details": attachment_details
        }

    except Exception as e:
        print(f"\n❌ Error inspecting PST file: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


In [None]:
# Example: Inspect a single PST file to see what attachments are present
# Uncomment and modify the path to inspect a specific PST file

# SAMPLE_PST_PATH = "/Volumes/catalog/schema/pst_files/sample.pst"
# 
# inspection_results = inspect_pst_attachments(
#     pst_file_path=SAMPLE_PST_PATH,
#     max_messages_to_show=20  # Number of messages with attachments to display
# )
# 
# # The function returns a dictionary with statistics that you can use
# if inspection_results:
#     print(f"\n✅ Inspection complete!")
#     print(f"   Found {inspection_results['total_attachments']} attachments in {inspection_results['messages_with_attachments']} messages")


## 7. Main Execution


In [None]:
# Run the configured extraction pipeline and time it end to end
start_time = datetime.now()

# ENABLE_PARALLEL_PROCESSING selects between the Spark and the one-at-a-time pipeline
if not ENABLE_PARALLEL_PROCESSING:
    results = extract_attachments_sequential(
        pst_volume_path=PST_VOLUME_PATH,
        output_volume_path=ATTACHMENTS_VOLUME_PATH,
        overwrite=OVERWRITE_EXISTING_ATTACHMENTS,
    )
else:
    results = extract_attachments_parallel(
        pst_volume_path=PST_VOLUME_PATH,
        output_volume_path=ATTACHMENTS_VOLUME_PATH,
        overwrite=OVERWRITE_EXISTING_ATTACHMENTS,
        num_partitions=NUM_PARTITIONS,
    )

end_time = datetime.now()
duration = end_time - start_time

print(f"\n⏱️  Total processing time: {duration}")
# The rate is only reported when something was extracted; the elapsed time is
# floored at one second so very short runs do not inflate it.
if results.get('total_attachments', 0) > 0:
    print(f"📈 Extraction rate: {results['total_attachments'] / max(duration.total_seconds(), 1):.2f} attachments/second")


## 8. Verify Extracted Attachments


In [None]:
# List the extracted attachments directory structure
def list_attachment_summary(attachments_path, max_folders=10):
    """
    Print an overview of the extracted-attachments directory.

    Every sub-directory of ``attachments_path`` is treated as one message
    folder. For the first ``max_folders`` of them the entry count, the
    combined size of the regular files and up to five individual files are
    printed. Any error while reading the directory tree is printed instead
    of raised.

    Args:
        attachments_path: Directory holding one sub-folder per message ID
        max_folders: Maximum number of message folders to list in detail
    """
    print(f"\nAttachment Directory Structure: {attachments_path}")
    print("=" * 80)

    try:
        message_dirs = []
        for entry in os.listdir(attachments_path):
            if os.path.isdir(os.path.join(attachments_path, entry)):
                message_dirs.append(entry)

        dir_count = len(message_dirs)
        print(f"Total message folders: {dir_count}")
        print(f"\nShowing first {min(max_folders, dir_count)} folders:")
        print("-" * 80)

        for message_dir in message_dirs[:max_folders]:
            dir_path = os.path.join(attachments_path, message_dir)
            entries = os.listdir(dir_path)

            # Size is totalled over regular files only, before anything is printed
            combined_bytes = 0
            for name in entries:
                candidate = os.path.join(dir_path, name)
                if os.path.isfile(candidate):
                    combined_bytes += os.path.getsize(candidate)

            print(f"\n📁 Message ID: {message_dir}")
            print(f"   Attachments: {len(entries)}")
            print(f"   Total size: {combined_bytes / 1024:.2f} KB")

            # Show first 5 entries, skipping anything that is not a regular file
            for name in entries[:5]:
                candidate = os.path.join(dir_path, name)
                if not os.path.isfile(candidate):
                    continue
                print(f"     - {name} ({os.path.getsize(candidate) / 1024:.2f} KB)")

            hidden = len(entries) - 5
            if hidden > 0:
                print(f"     ... and {hidden} more files")

        remaining_dirs = dir_count - max_folders
        if remaining_dirs > 0:
            print(f"\n... and {remaining_dirs} more folders")

    except Exception as e:
        print(f"Error reading attachments directory: {str(e)}")

# Display summary of the output volume (details for the first 10 message folders, the function's default)
list_attachment_summary(ATTACHMENTS_VOLUME_PATH)


## 9. Cleanup (Optional)


In [None]:
# Uncomment to remove all extracted attachments (use with caution!)
# import shutil
# 
# def cleanup_attachments(attachments_path):
#     """
#     Remove all extracted attachments. USE WITH CAUTION!
#     """
#     try:
#         if os.path.exists(attachments_path):
#             shutil.rmtree(attachments_path)
#             print(f"✓ Removed all attachments from: {attachments_path}")
#         else:
#             print(f"Directory does not exist: {attachments_path}")
#     except Exception as e:
#         print(f"Error cleaning up: {str(e)}")
# 
# # Uncomment to execute cleanup
# # cleanup_attachments(ATTACHMENTS_VOLUME_PATH)
