In [1]:
import os
from docx import Document
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, Dict, Any
import threading

class DocxMetadataExtractor:
    def __init__(self, max_workers: int = None):
        """
        Initialize the DOCX metadata extractor.
        
        Args:
            max_workers: Maximum number of worker threads. If None, uses default ThreadPoolExecutor behavior.
        """
        self.max_workers = max_workers
        self.lock = threading.Lock()
    
    def extract_key_values(self, text: str) -> Dict[str, str]:
        """
        Extract key-value pairs from text where pairs are separated by colons.
        
        Args:
            text: Input text containing key-value pairs
            
        Returns:
            Dictionary of key-value pairs
        """
        result = {}
        for line in text.split('\n'):
            if ':' in line:
                key, value = line.split(':', 1)
                result[key.strip()] = value.strip()
        return result
    
    def get_kv(self, data: List[Tuple]) -> Dict[str, str]:
        """
        Extract key-value pairs from a list of tuples.
        
        Args:
            data: List of tuples containing text data
            
        Returns:
            Dictionary containing all extracted key-value pairs
        """
        doc_info = {}
        
        for entry in data:
            for part in entry:
                if part and part.strip():  # skip empty strings
                    doc_info.update(self.extract_key_values(part))
        
        return doc_info
    
    def process_single_file(self, filename: str) -> Tuple[str, Dict[str, str]]:
        """
        Process a single DOCX file and extract metadata from footers.
        
        Args:
            filename: Name of the DOCX file to process
            
        Returns:
            Tuple containing (filename, extracted_metadata_dict)
        """
        try:
            doc = Document(filename)
            all_metadata = {}
            
            # Get all sections in the document
            for section in doc.sections:
                footer = section.footer
                
                # Extract text from footer paragraphs
                footer_text = []
                for paragraph in footer.paragraphs:
                    if paragraph.text.strip():  # Only add non-empty paragraphs
                        footer_text.append(paragraph.text.strip())
                
                # Process footer text for key-value pairs
                for text in footer_text:
                    all_metadata.update(self.extract_key_values(text))
                
                # Extract text from footer tables
                for table in footer.tables:
                    table_data = [tuple(c.text for c in r.cells) for r in table.rows]
                    table_metadata = self.get_kv(table_data)
                    all_metadata.update(table_metadata)
            
            return (filename, all_metadata)
            
        except Exception as e:
            # Return filename with error information
            return (filename, {"error": str(e)})
    
    def get_metadata_footer_parallel(self, folder: str = '.') -> List[Tuple[str, Dict[str, str]]]:
        """
        Extract metadata from all DOCX files in a folder using parallel processing.
        
        Args:
            folder: Path to the folder containing DOCX files (default: current directory)
            
        Returns:
            List of tuples containing (filename, metadata_dict) for each file
        """
        # Get all .docx files in the specified folder
        docs = [f for f in os.listdir(folder) if f.endswith('.docx')]
        
        if not docs:
            print(f"No .docx files found in folder: {folder}")
            return []
        
        print(f"Found {len(docs)} .docx files:")
        for file in docs:
            print(f"  - {file}")
        
        # Process files in parallel
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            future_to_file = {
                executor.submit(self.process_single_file, os.path.join(folder, doc)): doc 
                for doc in docs
            }
            
            # Collect results as they complete
            for future in future_to_file:
                try:
                    result = future.result()
                    results.append(result)
                    
                    # Thread-safe printing
                    with self.lock:
                        print(f"Processed: {result[0]}")
                        if result[1]:  # If metadata was found
                            print(f"  Metadata: {result[1]}")
                        else:
                            print(f"  No metadata found")
                            
                except Exception as e:
                    filename = future_to_file[future]
                    with self.lock:
                        print(f"Error processing {filename}: {e}")
                    results.append((filename, {"error": str(e)}))
        
        return results
    
    def save_results_to_file(self, results: List[Tuple[str, Dict[str, str]]], output_file: str = "metadata_results.txt"):
        """
        Save the extraction results to a text file.
        
        Args:
            results: List of tuples from get_metadata_footer_parallel
            output_file: Name of the output file
        """
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("DOCX Metadata Extraction Results\n")
            f.write("=" * 50 + "\n\n")
            
            for filename, metadata in results:
                f.write(f"File: {filename}\n")
                f.write("-" * 30 + "\n")
                
                if metadata:
                    for key, value in metadata.items():
                        f.write(f"{key}: {value}\n")
                else:
                    f.write("No metadata found\n")
                
                f.write("\n")
        
        print(f"Results saved to: {output_file}")


# Example usage
if __name__ == "__main__":
    # Create extractor with default number of workers
    extractor = DocxMetadataExtractor()
    
    # Extract metadata from all DOCX files in current directory
    results = extractor.get_metadata_footer_parallel()
    
    # Print summary
    print(f"\nProcessing complete! Processed {len(results)} files.")
    
    # Optionally save results to file
    extractor.save_results_to_file(results)
    
    # Access individual results
    for filename, metadata in results:
        print(f"\n{filename}: {len(metadata)} metadata entries")

Found 9 .docx files:
  - CS Log file handling.docx
  - CS ExamCardsDatabaseService_Voxel_D002055389_RevA.docx
  - CS ExamCards verA.docx
  - CS Connectivity_R13.0_D002055364_RevA.docx
  - CS Exam Overview_verA.docx
  - CS PDT Voxel.docx
  - ComponentSpecifications_MR-RT_RTgo5.13_D002050479_RevA.docx
  - CS DicomConfigTool_R13.0_D002055371_RevA.docx
  - CS LayoutManagerService__verA.docx
Processed: ./CS Log file handling.docx
  Metadata: {'Document ID': 'D002054507', 'Document Revision': '', 'Template Number': '2001001526', 'Template Version': '2', 'Philips Information Classification': 'Internal'}
Processed: ./CS ExamCardsDatabaseService_Voxel_D002055389_RevA.docx
  Metadata: {'Document ID': 'D002055389', 'Document Revision': 'A', 'Template Number': '2001001526', 'Template Version': '2', 'Philips Information Classification': 'Internal'}
Processed: ./CS ExamCards verA.docx
  Metadata: {'Document ID': 'D002064158', 'Document Revision': 'A', 'Template Number': '2001001526', 'Template Versi