In [1]:
import pandas as pd
import os
import re
from pathlib import Path
from typing import List, Optional, Any, Dict

In [2]:
# Headers of the two columns that are looked up in each source table
ELEMENT_NAME_COL = "Element Name"
VALUE_COL = "Fact Value"

# Element name whose fact value is reported as the company name
COMPANY_NAME_ELEMENT = "NameOfTheCompany"

# Element names of the three percentage figures pulled from each file;
# kept as a list because it is concatenated with other lists downstream
PERCENTAGE_ELEMENTS = [
    "EnvironmentalAndSocialParametersRelevantToTheProductAsAPercentageToTotalTurnover",
    "SafeAndResponsibleUsageAsAPercentageToTotalTurnover",
    "RecyclingAndOrSafeDisposalAsAPercentageToTotalTurnover",
]

In [7]:
def get_value_from_df(df: pd.DataFrame, element_name: str) -> Optional[str]:
    """
    Look up an element name in the DataFrame and return its fact value as text.

    Args:
        df: Table that should contain the ELEMENT_NAME_COL and VALUE_COL columns.
        element_name: Exact value to match against the ELEMENT_NAME_COL column.

    Returns:
        The first non-missing fact value among the matching rows, converted
        to ``str``. ``None`` when either column is absent, when no row matches,
        or when every matching row has a missing (NaN/None) fact value.
    """
    if ELEMENT_NAME_COL not in df.columns or VALUE_COL not in df.columns:
        return None

    matches = df.loc[df[ELEMENT_NAME_COL] == element_name, VALUE_COL]

    # A blank cell is read as NaN, and str(NaN) is the literal "nan". Returning
    # that would look like real data to callers (it is truthy, so "value or
    # fallback" never falls back). Drop missing entries before picking one.
    present = matches.dropna()
    if present.empty:
        return None

    # Always hand back text, whatever dtype the column was parsed as
    return str(present.iloc[0])

def clean_and_convert_to_numeric(value: Optional[str]) -> float:
    """
    Turn a raw fact value into a float, falling back to 0.0.

    ``None`` and anything pandas cannot parse as a number both yield 0.0;
    every other input yields its parsed value as a Python float.
    """
    # Nothing to parse
    if value is None:
        return 0.0

    # errors='coerce' makes unparseable input come back as NaN instead of raising
    parsed = pd.to_numeric(value, errors='coerce')
    if pd.isna(parsed):
        return 0.0

    return float(parsed)


def _read_source_table(file_path: Path) -> pd.DataFrame:
    """Load one source file into a DataFrame, picking the reader from its extension."""
    suffix = file_path.suffix.lower()
    if suffix == '.xlsx':
        return pd.read_excel(file_path, engine='openpyxl')
    if suffix == '.xls':
        # openpyxl cannot open the legacy .xls format, so let pandas choose
        # the engine for it instead of forcing openpyxl.
        return pd.read_excel(file_path)
    return pd.read_csv(file_path, on_bad_lines='skip', encoding_errors='ignore')


def extract_percentage_data(directory: str, output_file: str) -> None:
    """
    Processes files in a directory to extract three specific percentage values,
    calculates their average, and saves the results to a master Excel file.

    Every ``.xlsx``, ``.xls`` or ``.csv`` file directly inside ``directory``
    (extension matched case-insensitively, sub-directories ignored) is read in
    sorted path order. One output row is produced per file: the company name,
    the three PERCENTAGE_ELEMENTS values and their average. Missing or
    non-numeric values count as 0.0. A file that fails to load or process is
    reported on stdout and skipped; it does not abort the run.

    Progress and errors are printed rather than raised. Nothing is written when
    the directory is missing or empty, or when no file could be processed.

    Args:
        directory (str): The path to the directory containing the company data files.
        output_file (str): The name of the master Excel file to be created.
    """
    source_path = Path(directory)
    summary_data: List[Dict[str, Any]] = []
    supported_suffixes = ('.xlsx', '.xls', '.csv')

    if not source_path.is_dir() or not any(source_path.iterdir()):
        print(f"Error: The directory '{directory}' is empty or does not exist.")
        print("Please ensure the directory path is correct and contains the data files.")
        return

    print(f"[INFO] Starting analysis of files in '{directory}'...\n")

    # --- Main File Processing Loop ---
    # iterdir() yields entries in arbitrary order; sort so the rows of the
    # output sheet come out in the same order on every run.
    for file_path in sorted(source_path.iterdir()):
        if file_path.is_dir() or file_path.suffix.lower() not in supported_suffixes:
            continue
        # Excel leaves "~$<name>" lock files next to open workbooks; they are
        # not real workbooks and would only produce a read error.
        if file_path.name.startswith('~$'):
            continue

        print(f"--- Processing File: {file_path.name} ---")

        try:
            df = _read_source_table(file_path)

            # 1. Extract the Company Name
            company_name = get_value_from_df(df, COMPANY_NAME_ELEMENT) or "Company Name Not Found"

            # Raw text values for the report row
            data_entry: Dict[str, Any] = {'Company Name': company_name}
            # Clean numeric values used for the average
            numeric_values: List[float] = []

            # 2. Extract the value for each of the three percentage elements
            for element in PERCENTAGE_ELEMENTS:
                raw_value = get_value_from_df(df, element)
                data_entry[element] = raw_value if raw_value is not None else "Not Found"
                numeric_values.append(clean_and_convert_to_numeric(raw_value))

            # 3. Calculate the total and average (guard against an empty element list)
            total_percentage = sum(numeric_values)
            average_percentage = total_percentage / len(PERCENTAGE_ELEMENTS) if PERCENTAGE_ELEMENTS else 0.0
            data_entry['Average Percentage'] = average_percentage

            summary_data.append(data_entry)
            print(f"  Successfully extracted data for: {company_name}")
            print(f"  Values Found: {numeric_values} -> Average: {average_percentage:.2f}%")
            print()

        except Exception as e:
            # Deliberate best-effort boundary: one bad file must not stop the batch
            print(f"An error occurred while processing file {file_path.name}: {e}\n")

    # --- Final Step: Create and Save the Master Excel Sheet ---
    if not summary_data:
        print("\nNo valid data could be processed from the files.")
        return

    # Build the frame directly in its final column order. Selecting columns
    # from an existing frame and then assigning into the result can trigger
    # pandas' SettingWithCopyWarning.
    column_order = ['Company Name'] + PERCENTAGE_ELEMENTS + ['Average Percentage']
    summary_df = pd.DataFrame(summary_data, columns=column_order)

    # Mapping from element names to more descriptive column labels
    rename_map = {
        "EnvironmentalAndSocialParametersRelevantToTheProductAsAPercentageToTotalTurnover": "Turnover with Environmental & Social Info (%)",
        "SafeAndResponsibleUsageAsAPercentageToTotalTurnover": "Turnover with Safe Usage Info (%)",
        "RecyclingAndOrSafeDisposalAsAPercentageToTotalTurnover": "Turnover with Recycling/Disposal Info (%)"
    }

    # Convert each target column to numeric; "Not Found" and other text become 0.0
    for old_col in rename_map:
        summary_df[old_col] = pd.to_numeric(summary_df[old_col], errors='coerce').fillna(0.0)

    summary_df = summary_df.rename(columns=rename_map)

    try:
        # Save the consolidated data to the new Excel file
        summary_df.to_excel(output_file, index=False, engine='openpyxl')
    except Exception as e:
        print(f"\nERROR: Could not save the Excel file '{output_file}'. Reason: {e}")
    else:
        print(f"\nSUCCESS: Percentage summary sheet '{output_file}' has been created.")
        print(f"Total companies processed: {len(summary_df)}")

        print("\nFirst few rows of the consolidated data:")
        # Widen the display only for this preview, so the caller's global
        # pandas display options are left untouched.
        with pd.option_context('display.max_columns', None, 'display.width', 120):
            print(summary_df.head())

In [9]:
# Folder holding the per-company input files, and the workbook to write
source_directory = "excel_files"
output_excel_file = "p9_turnover.xlsx"


In [10]:
# Run the extraction over the configured folder and write the summary workbook
extract_percentage_data(directory=source_directory, output_file=output_excel_file)

[INFO] Starting analysis of files in 'excel_files'...

--- Processing File: 360_ONE_WAM_LIMITED.xlsx ---
  Successfully extracted data for: 360 One Wam Limited
  Values Found: [0.0, 0.0, 0.0] -> Average: 0.00%

--- Processing File: 3I_Infotech_Limited.xlsx ---
  Successfully extracted data for: 3I Infotech Limited
  Values Found: [0.0, 0.0, 0.0] -> Average: 0.00%

--- Processing File: 3M_INDIA_LIMITED.xlsx ---
  Successfully extracted data for: 3M India Limited
  Values Found: [1.0, 1.0, 1.0] -> Average: 1.00%

--- Processing File: 5paisa_Capital_Limited.xlsx ---
  Successfully extracted data for: 5paisa Capital Limited
  Values Found: [0.0, 0.0, 0.0] -> Average: 0.00%

--- Processing File: 63_moons_technologies_limited.xlsx ---
  Successfully extracted data for: 63 Moons Technologies Limited
  Values Found: [0.0, 0.0, 0.0] -> Average: 0.00%

--- Processing File: Aarti_Drugs_Limited.xlsx ---
  Successfully extracted data for: Aarti Drugs Limited
  Values Found: [1.0, 1.0, 1.0] -> Avera