# In [16]:
import pandas as pd
import os
import re
from pathlib import Path
from openpyxl.styles import Font, PatternFill
from typing import Optional, Dict, Any, List

# In [17]:
class LeadershipIndicatorExtractor:
    """
    Extract leadership and consumer-information indicators from tabular files.

    Each input file (Excel or CSV) is read into a DataFrame and searched for
    rows whose ``ELEMENT_NAME_COL`` matches one of the configured element
    names; the matching ``VALUE_COL`` cell is converted to a 0/1 flag and also
    kept as raw text. Results for all files in a directory are combined into a
    single summary DataFrame that can be exported to Excel.
    """

    # Element whose flag is 1 only for an explicit "yes" (so "true",
    # "Not Applicable", etc. all map to 0).
    _CYBER_POLICY_ELEMENT = (
        'DoesTheEntityHaveAFrameworkOrPolicyOnCyberSecurityAndRisksRelatedToDataPrivacy'
    )

    def __init__(self):
        """Initializes the extractor with configuration settings."""
        # --- Configuration for Elements to Extract ---
        self.ELEMENT_NAME_COL = 'Element Name'
        self.VALUE_COL = 'Fact Value'
        self.COMPANY_NAME_ELEMENT = 'NameOfTheCompany'

        # Maps report column name -> element name to look up in each file.
        self.INDICATOR_ELEMENTS = {
            'Displays Extra Product Info': 'DoesTheEntityDisplayProductInformationOnTheProductOverAndAboveWhatIsMandatedAsPerLocalLaws',
            'Carried Out Consumer Survey': 'DidYourEntityCarryOutAnySurveyWithRegardToConsumerSatisfactionRelatingToTheMajorProductsOrServicesOfTheEntitySignificantLocationsOfOperationOfTheEntityOrTheEntityAsAWhole',
            'Provides Product Weblink': 'WeblinkWhereInformationOnProductsAndServicesOfTheEntityCanBeAccessedExplanatoryTextBlock',
            'Has Cyber Policy': self._CYBER_POLICY_ELEMENT,
            'Cyber Policy Details': 'WebLinkOfThePolicyOnCyberSecurityAndRisksRelatedToDataPrivacy',
        }

    def _get_text_value(self, df: pd.DataFrame, element_name: str) -> Optional[str]:
        """
        Return the stripped text value of the first row matching *element_name*.

        Returns ``None`` when either required column is missing, when no row
        matches, or when the first matching value is NaN.
        """
        if self.ELEMENT_NAME_COL not in df.columns or self.VALUE_COL not in df.columns:
            return None
        series = df.loc[df[self.ELEMENT_NAME_COL] == element_name, self.VALUE_COL]
        if not series.empty and not pd.isna(series.iloc[0]):
            return str(series.iloc[0]).strip()
        return None

    @staticmethod
    def _is_link_element(bucket_name: str, element_name: str) -> bool:
        """Return True when the bucket or its element name denotes a web link."""
        # Element names spell it both 'Weblink' and 'WebLink', so compare
        # case-insensitively; checking only the bucket name misses buckets
        # such as 'Cyber Policy Details' whose element is a link.
        return 'weblink' in element_name.lower() or 'weblink' in bucket_name.lower()

    def _create_binary_flag(self, value: Optional[str], is_link: bool = False, field_name: Optional[str] = None) -> int:
        """
        Convert a raw text value into a binary (1/0) flag.

        Args:
            value: Raw text of the element, or ``None`` when it was not found.
            is_link: When True, any non-blank value counts as provided (1).
            field_name: Element name the value belongs to; used to apply the
                stricter rule for the cyber-policy question.

        Returns:
            1 or 0. For links, 1 if the value is non-blank. For the
            cyber-policy element, 1 only for 'yes'. For every other element,
            1 for 'yes' or 'true'. ``None`` always yields 0.
        """
        if value is None:
            return 0

        normalized = str(value).strip().lower()

        if is_link:
            return 1 if normalized else 0

        # Special handling for cyber policy field
        if field_name == self._CYBER_POLICY_ELEMENT:
            return 1 if normalized == "yes" else 0

        return 1 if normalized in ("yes", "true") else 0

    def _process_single_file(self, file_path: Path) -> Optional[Dict[str, Any]]:
        """
        Read one file and build its row of indicator data.

        Returns a dict with the company name, one 0/1 flag and one raw-text
        entry per configured indicator, and an 'Any Indicator Provided' flag.
        Returns ``None`` for unsupported extensions or unreadable files.
        """
        # Compare extensions case-insensitively so 'REPORT.XLSX' is not skipped.
        suffix = file_path.suffix.lower()
        try:
            if suffix in ('.xlsx', '.xls'):
                # openpyxl cannot read the legacy .xls format; let pandas
                # choose the engine for those files.
                engine = 'openpyxl' if suffix == '.xlsx' else None
                df = pd.read_excel(file_path, engine=engine)
            elif suffix == '.csv':
                df = pd.read_csv(file_path, on_bad_lines='skip', encoding_errors='ignore')
            else:
                return None
        except Exception as e:
            # Best-effort batch processing: report the failure and move on.
            print(f"Error reading file {file_path.name}: {e}")
            return None

        company_name = self._get_text_value(df, self.COMPANY_NAME_ELEMENT) or "Unknown Company"

        data_entry: Dict[str, Any] = {'Company Name': company_name}

        # --- Extract data for each indicator ---
        for bucket_name, element_name in self.INDICATOR_ELEMENTS.items():
            raw_value = self._get_text_value(df, element_name)

            # Create the binary 0/1 column
            data_entry[bucket_name] = self._create_binary_flag(
                raw_value,
                is_link=self._is_link_element(bucket_name, element_name),
                field_name=element_name,
            )

            # Also store the original raw text for context
            data_entry[f'{bucket_name} (Raw Text)'] = raw_value if raw_value else "Not Provided"

        # Summary flag: 1 when at least one indicator flag above is 1.
        any_indicator_provided = any(data_entry[key] == 1 for key in self.INDICATOR_ELEMENTS)
        data_entry['Any Indicator Provided'] = 1 if any_indicator_provided else 0

        print(f"--- Successfully processed leadership data for: {company_name}")
        return data_entry

    def process_directory(self, directory: str) -> pd.DataFrame:
        """
        Process every supported file directly inside *directory*.

        Returns a DataFrame with one row per successfully processed file and
        columns ordered as: company name, binary flags, raw-text columns.
        Returns an empty DataFrame when the directory is missing or empty, or
        when no file yields data.
        """
        source_path = Path(directory)
        if not source_path.is_dir() or not any(source_path.iterdir()):
            print(f"Error: The directory '{directory}' is empty or does not exist.")
            return pd.DataFrame()

        print(f"[INFO] Starting leadership indicator analysis in '{directory}'...\n")
        all_data: List[Dict[str, Any]] = []
        # iterdir() order is arbitrary; sort so the report rows are reproducible.
        for file_path in sorted(source_path.iterdir()):
            if file_path.is_dir():
                continue
            data_entry = self._process_single_file(file_path)
            if data_entry:
                all_data.append(data_entry)

        if not all_data:
            print("No valid data could be processed from the files.")
            return pd.DataFrame()

        summary_df = pd.DataFrame(all_data)

        # --- Define the final column order for the report ---
        binary_columns = list(self.INDICATOR_ELEMENTS) + ['Any Indicator Provided']
        raw_text_columns = [f'{key} (Raw Text)' for key in self.INDICATOR_ELEMENTS]

        column_order = ['Company Name'] + binary_columns + raw_text_columns
        return summary_df.reindex(columns=column_order)

    def export_to_excel(self, df: pd.DataFrame, output_file: str):
        """
        Write *df* to *output_file* as an Excel workbook (no index column).

        Does nothing but print a message when *df* is empty. Write failures
        are reported on stdout rather than raised.
        """
        if df.empty:
            print("Cannot export empty DataFrame.")
            return

        print(f"\n[INFO] Exporting data to '{output_file}'...")
        try:
            with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
                df.to_excel(writer, index=False)

            print(f"SUCCESS: Master Excel sheet '{output_file}' has been created.")

        except PermissionError:
            print(f"ERROR: Permission denied. Is '{output_file}' open? Please close it and try again.")
        except Exception as e:
            print(f"An error occurred while saving the Excel file: {e}")



# In [18]:
# Driver: analyse every file in the source directory and export the summary.
source_directory = 'excel_files'
# Set output file name as requested
output_excel_file = 'p9_product_info.xlsx'

# Create the input folder on first run so the user has somewhere to drop files.
if not os.path.exists(source_directory):
    os.makedirs(source_directory)
    print(f"Created directory: {source_directory}")
    print(f"Please add your company data files to '{source_directory}' before running again.")

extractor = LeadershipIndicatorExtractor()
main_data_df = extractor.process_directory(source_directory)

# Write the report only when at least one file produced data; previously the
# output file name was defined but the export step was never invoked.
if main_data_df.empty:
    print("DataFrame is empty. No export.")
else:
    extractor.export_to_excel(main_data_df, output_excel_file)


# Output: DataFrame is empty. No export.
