In [4]:
import os
from docx import Document
from concurrent.futures import ThreadPoolExecutor
from typing import List, Dict, Union
import threading

class DocxMetadataExtractor:
    def __init__(self, max_workers: int = None):
        """
        Initialize the DOCX metadata extractor.
        
        Args:
            max_workers: Maximum number of worker threads. If None, uses default ThreadPoolExecutor behavior.
        """
        self.max_workers = max_workers
        self.lock = threading.Lock()
    
    def extract_key_values(self, text: str) -> Dict[str, str]:
        """
        Extract key-value pairs from text where pairs are separated by colons.
        
        Args:
            text: Input text containing key-value pairs
            
        Returns:
            Dictionary of key-value pairs
        """
        result = {}
        for line in text.split('\n'):
            if ':' in line:
                key, value = line.split(':', 1)
                result[key.strip()] = value.strip()
        return result
    
    def get_kv(self, data: List[tuple]) -> Dict[str, str]:
        """
        Extract key-value pairs from a list of tuples.
        
        Args:
            data: List of tuples containing text data
            
        Returns:
            Dictionary containing all extracted key-value pairs
        """
        doc_info = {}
        
        for entry in data:
            for part in entry:
                if part and part.strip():  # skip empty strings
                    doc_info.update(self.extract_key_values(part))
        
        return doc_info
    
    def process_single_file(self, filepath: str) -> Dict[str, Union[str, Dict[str, str]]]:
        """
        Process a single DOCX file and extract metadata from footers.
        
        Args:
            filepath: Full path to the DOCX file to process
            
        Returns:
            Dictionary containing filename and metadata
        """
        filename = os.path.basename(filepath)
        
        try:
            doc = Document(filepath)
            all_metadata = {}
            
            # Get all sections in the document
            for section in doc.sections:
                footer = section.footer
                
                # Extract text from footer paragraphs
                footer_text = []
                for paragraph in footer.paragraphs:
                    if paragraph.text.strip():  # Only add non-empty paragraphs
                        footer_text.append(paragraph.text.strip())
                
                # Process footer text for key-value pairs
                for text in footer_text:
                    all_metadata.update(self.extract_key_values(text))
                
                # Extract text from footer tables
                for table in footer.tables:
                    table_data = [tuple(c.text for c in r.cells) for r in table.rows]
                    table_metadata = self.get_kv(table_data)
                    all_metadata.update(table_metadata)
            
            return {
                "filename": filename,
                "metadata": all_metadata
            }
            
        except Exception as e:
            # Return filename with error information
            return {
                "filename": filename,
                "metadata": {"error": str(e)}
            }
    
    def _get_file_list(self, input_source: Union[str, List[str]]) -> List[str]:
        """
        Get list of DOCX files from input source.
        
        Args:
            input_source: Either a folder path or list of file paths
            
        Returns:
            List of full file paths
        """
        if isinstance(input_source, str):
            # Input is a folder path
            if not os.path.exists(input_source):
                raise FileNotFoundError(f"Folder not found: {input_source}")
            
            if not os.path.isdir(input_source):
                raise NotADirectoryError(f"Path is not a directory: {input_source}")
            
            # Get all .docx files in the folder
            docx_files = []
            for filename in os.listdir(input_source):
                if filename.endswith('.docx'):
                    docx_files.append(os.path.join(input_source, filename))
            
            if not docx_files:
                raise ValueError(f"No .docx files found in folder: {input_source}")
            
            return docx_files
        
        elif isinstance(input_source, list):
            # Input is a list of file paths
            if not input_source:
                raise ValueError("File list is empty")
            
            # Validate files exist and are .docx files
            valid_files = []
            for filepath in input_source:
                if not os.path.exists(filepath):
                    print(f"Warning: File not found: {filepath}")
                    continue
                
                if not filepath.endswith('.docx'):
                    print(f"Warning: Not a .docx file: {filepath}")
                    continue
                
                valid_files.append(filepath)
            
            if not valid_files:
                raise ValueError("No valid .docx files found in the provided list")
            
            return valid_files
        
        else:
            raise TypeError("Input must be either a folder path (string) or list of file paths")
    
    def extract_metadata(self, input_source: Union[str, List[str]]) -> List[Dict[str, Union[str, Dict[str, str]]]]:
        """
        Extract metadata from DOCX files using parallel processing.
        
        Args:
            input_source: Either a folder path (string) or list of file paths (list)
            
        Returns:
            List of dictionaries, each containing 'filename' and 'metadata' keys
        """
        # Get list of files to process
        file_list = self._get_file_list(input_source)
        
        print(f"Found {len(file_list)} .docx files to process:")
        for filepath in file_list:
            print(f"  - {os.path.basename(filepath)}")
        
        # Process files in parallel
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit all tasks
            future_to_file = {
                executor.submit(self.process_single_file, filepath): filepath 
                for filepath in file_list
            }
            
            # Collect results as they complete
            for future in future_to_file:
                try:
                    result = future.result()
                    results.append(result)
                    
                    # Thread-safe printing
                    with self.lock:
                        print(f"Processed: {result['filename']}")
                        if result['metadata'] and 'error' not in result['metadata']:
                            print(f"  Found {len(result['metadata'])} metadata entries")
                        elif 'error' in result['metadata']:
                            print(f"  Error: {result['metadata']['error']}")
                        else:
                            print(f"  No metadata found")
                            
                except Exception as e:
                    filepath = future_to_file[future]
                    filename = os.path.basename(filepath)
                    with self.lock:
                        print(f"Error processing {filename}: {e}")
                    results.append({
                        "filename": filename,
                        "metadata": {"error": str(e)}
                    })
        
        return results
    
    def save_results_to_file(self, results: List[Dict[str, Union[str, Dict[str, str]]]], output_file: str = "metadata_results.txt"):
        """
        Save the extraction results to a text file.
        
        Args:
            results: List of dictionaries from extract_metadata
            output_file: Name of the output file
        """
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("DOCX Metadata Extraction Results\n")
            f.write("=" * 50 + "\n\n")
            
            for result in results:
                f.write(f"File: {result['filename']}\n")
                f.write("-" * 30 + "\n")
                
                metadata = result['metadata']
                if metadata:
                    for key, value in metadata.items():
                        f.write(f"{key}: {value}\n")
                else:
                    f.write("No metadata found\n")
                
                f.write("\n")
        
        print(f"Results saved to: {output_file}")


# Example usage
if __name__ == "__main__":
    # Create extractor
    extractor = DocxMetadataExtractor(max_workers=4)
    
    # Example 1: Process all DOCX files in a folder
    try:
        results = extractor.extract_metadata(".")
        print(f"\nProcessed {len(results)} files from folder.")
        
        # Access results
        for result in results:
            file_name = result['filename']
            file_type = os.path.splitext(file_name)[1].lstrip('.')
            template_id = result['metadata'].get('Template Number', 'N/A')
            print(f"{file_name} of type {file_type}: {template_id}")
    
    except Exception as e:
        print(f"Error processing folder: {e}")
    
    # # Example 2: Process specific list of files
    # file_list = [
    #     "document1.docx",
    #     "document2.docx",
    #     "path/to/document3.docx"
    # ]
    
    # try:
    #     results = extractor.extract_metadata(file_list)
    #     print(f"\nProcessed {len(results)} files from list.")
        
    #     # Optionally save results to file
    #     extractor.save_results_to_file(results)
        
    # except Exception as e:
    #     print(f"Error processing file list: {e}")

Found 9 .docx files to process:
  - CS Log file handling.docx
  - CS ExamCardsDatabaseService_Voxel_D002055389_RevA.docx
  - CS ExamCards verA.docx
  - CS Connectivity_R13.0_D002055364_RevA.docx
  - CS Exam Overview_verA.docx
  - CS PDT Voxel.docx
  - ComponentSpecifications_MR-RT_RTgo5.13_D002050479_RevA.docx
  - CS DicomConfigTool_R13.0_D002055371_RevA.docx
  - CS LayoutManagerService__verA.docx
Processed: CS Log file handling.docx
  Found 5 metadata entries
Processed: CS ExamCardsDatabaseService_Voxel_D002055389_RevA.docx
  Found 5 metadata entries
Processed: CS ExamCards verA.docx
  Found 5 metadata entries
Processed: CS Connectivity_R13.0_D002055364_RevA.docx
  Found 5 metadata entries
Processed: CS Exam Overview_verA.docx
  Found 5 metadata entries
Processed: CS PDT Voxel.docx
  Found 5 metadata entries
Processed: ComponentSpecifications_MR-RT_RTgo5.13_D002050479_RevA.docx
  Found 5 metadata entries
Processed: CS DicomConfigTool_R13.0_D002055371_RevA.docx
  Found 5 metadata entri