In [1]:
import os
import pandas as pd
import glob
from tqdm import tqdm

folder_path = "excel_files"


# Get a list of all .xlsx file paths
xlsx_files = glob.glob(os.path.join(folder_path, "*.xlsx"))

In [21]:
import os
import pandas as pd
from tqdm import tqdm

data_list = []
null_assurance_companies = []
assurance_stats = {"Yes": 0, "No": 0}

excel_files = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]

for file_name in tqdm(excel_files, desc="Processing files", unit="file"):
    file_path = os.path.join(folder_path, file_name)
    df_file = pd.read_excel(file_path)

    def get_value(element):
        values = df_file.loc[df_file['Element Name'] == element, 'Fact Value'].values
        return values[0] if len(values) > 0 else ''

    corporate_identity = get_value('CorporateIdentityNumber')
    company_name = get_value('NameOfTheCompany')

    assurers_info = []
    if assurance_flag.lower() == 'true':
        assurance_units = df_file.loc[
            (df_file['Element Name'] == 'NameOfTheCompanyOrLLPOrFirmOfAssuranceProvider') &
            (df_file['Unit'].str.contains('D_AssuranceProvider', na=False)),
            'Unit'
        ].dropna().unique()

        for unit in assurance_units:
            fields = {
                'NameOfTheCompanyOrLLPOrFirmOfAssuranceProvider': '',
                'CompanyIDOrLLPIDOrFirmIDOfAssuranceProvider': '',
                'NameOfTheAssurer': '',
                'DesignationOfAssurer': '',
                'DateOfSigningByAssurer': ''
            }
            for field in fields:
                value = df_file.loc[
                    (df_file['Element Name'] == field) & (df_file['Unit'] == unit),
                    'Fact Value'
                ].values
                fields[field] = value[0] if len(value) > 0 else ''
            assurers_info.append(fields)

    # Flatten assurer data
    assurer_data = []
    for assurer in assurers_info:
        assurer_data.extend([
            assurer['NameOfTheCompanyOrLLPOrFirmOfAssuranceProvider'],
            assurer['CompanyIDOrLLPIDOrFirmIDOfAssuranceProvider'],
            assurer['NameOfTheAssurer'],
            assurer['DesignationOfAssurer'],
            assurer['DateOfSigningByAssurer']
        ])

    while len(assurer_data) < 25:
        assurer_data.append('')

    # Section-level
    section_fields = {
        'Section A Assurance': 'TypeOfAssuranceForSectionAGeneralDisclosures',
        'Section B Assurance': 'TypeOfAssuranceForSectionBManagementAndProcessDisclosures',
        'Section C Assurance': 'TypeOfAssuranceForSectionCPrincipleWisePerformanceDisclosures'
    }
    section_data = [get_value(v) for v in section_fields.values()]

    # Principle-level
    principle_data = []
    for i in range(1, 10):
        essential = get_value(f'TypeOfAssuranceForPrinciple{i}EssentialIndicators')
        leadership = get_value(f'TypeOfAssuranceForPrinciple{i}LeadershipIndicators')
        principle_data.extend([essential, leadership])

    # Final row
    row = [
        corporate_identity,
        company_name.title(),
        assurance_label,
        assurance_level,
        len(assurers_info)
    ] + assurer_data + section_data + principle_data

    # ✅ CHECK BEFORE APPENDING
    if len(row) != 51:
        print(f"❌ Row length mismatch in {file_name}: got {len(row)} values instead of 51")
        continue

    data_list.append(row)

# Define columns
columns = ['CIN', 'Company', 'Whether Assurance Obtained', 'Assurance Level', 'Number of Assurers']
for i in range(1, 6):
    columns += [
        f'Assurer {i} - Firm Name',
        f'Assurer {i} - Firm ID',
        f'Assurer {i} - Name',
        f'Assurer {i} - Designation',
        f'Assurer {i} - Signing Date'
    ]
columns += [
    'Section A Assurance',
    'Section B Assurance',
    'Section C Assurance'
]
for i in range(1, 10):
    columns.append(f'Principle {i} - Essential')
    columns.append(f'Principle {i} - Leadership')

# Create DataFrame
df = pd.DataFrame(data_list, columns=columns)
df = df.sort_values(by='Company')

# Summary
print("\n--- Summary Statistics ---")
print("Total companies processed:", len(df))
print("Companies with NULL assurance flag:", len(null_assurance_companies))
print("Assurance obtained:", assurance_stats.get("Yes", 0))
print("Assurance not obtained:", assurance_stats.get("No", 0))

print("\nCompanies with missing 'WhetherTheCompanyHasUndertakenReasonableAssuranceOfTheBRSRCore':")
for company in null_assurance_companies:
    print("-", company)



Processing files: 100%|██████████████████████████████████████████████████████████| 1174/1174 [07:04<00:00,  2.77file/s]


--- Summary Statistics ---
Total companies processed: 1174
Companies with NULL assurance flag: 12
Assurance obtained: 213
Assurance not obtained: 949

Companies with missing 'WhetherTheCompanyHasUndertakenReasonableAssuranceOfTheBRSRCore':
- ABB India Limited
- Foseco India Limited
- Gm Breweries Limited
- Huhtamaki India Limited
- Inox India Limited
- Rain Industries Limited
- R Systems International Limited
- Sanofi India Limited
- Schaeffler India Limited
- Seshasayee Paper And Boards Limited
- Transformers and Rectifiers (India) Limited
- Vesuvius India Limited





In [22]:
df

Unnamed: 0,CIN,Company,Whether Assurance Obtained,Assurance Level,Number of Assurers,Assurer 1 - Firm Name,Assurer 1 - Firm ID,Assurer 1 - Name,Assurer 1 - Designation,Assurer 1 - Signing Date,...,Principle 5 - Essential,Principle 5 - Leadership,Principle 6 - Essential,Principle 6 - Leadership,Principle 7 - Essential,Principle 7 - Leadership,Principle 8 - Essential,Principle 8 - Leadership,Principle 9 - Essential,Principle 9 - Leadership
0,L74140MH2008PLC177884,360 One Wam Limited,No,,0,,,,,,...,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,No,,0,,,,,,...,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,No,,0,,,,,,...,,,,,,,,,,
3,L67190MH2007PLC289249,5Paisa Capital Limited,No,,0,,,,,,...,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,No,,0,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1169,L34103TN2004PLC054667,Zf Commercial Vehicle Control Systems India Li...,No,,0,,,,,,...,,,,,,,,,,
1170,L93030DL2010PLC198141,Zomato Limited,Yes,Partial,1,Deloitte Haskins & Sells LLP,117366W / W-100018,Mr. Pratiq Shah,Engagement Partner,2024-08-01,...,Partial,Partial,Partial,Partial,Partial,Partial,Partial,Partial,Partial,Partial
1171,L24231GJ2000PLC038352,Zota Health Care Limited,No,,0,,,,,,...,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,Yes,All,1,Intertek India Private Limited,U74220DL1997PTC202243,Elizabeth Mielbrecht,Project Director,2024-07-08,...,All,All,All,All,All,All,All,All,All,All


In [33]:
# Define the columns to clear
columns_to_clear = [
    'Assurance Level',
    'Number of Assurers',
    *[f'Assurer {i} - {field}' for i in range(1, 6) for field in ['Firm Name', 'Firm ID', 'Name', 'Designation', 'Signing Date']],
    'Section A Assurance',
    'Section B Assurance',
    'Section C Assurance',
    *[f'Principle {i} - {ind}' for i in range(1, 10) for ind in ['Essential', 'Leadership']]
]

# Update the relevant rows
for company in null_assurance_companies:
    mask = df['Company'].str.strip().str.lower() == company.strip().lower()
    df.loc[mask, 'Whether Assurance Obtained'] = 'No'
    df.loc[mask, columns_to_clear] = ''


  df.loc[mask, columns_to_clear] = ''


In [35]:
df[df['Company'].isin(null_assurance_companies)][['Company', 'Whether Assurance Obtained']]

Unnamed: 0,Company,Whether Assurance Obtained
334,Foseco India Limited,No
369,Gm Breweries Limited,No
461,Huhtamaki India Limited,No
509,Inox India Limited,No
881,R Systems International Limited,No
832,Rain Industries Limited,No
898,Sanofi India Limited,No
911,Schaeffler India Limited,No
917,Seshasayee Paper And Boards Limited,No
1132,Vesuvius India Limited,No


In [36]:
df.loc[df['Company'].str.lower() == "abb india limited".lower()]


Unnamed: 0,CIN,Company,Whether Assurance Obtained,Assurance Level,Number of Assurers,Assurer 1 - Firm Name,Assurer 1 - Firm ID,Assurer 1 - Name,Assurer 1 - Designation,Assurer 1 - Signing Date,...,Principle 5 - Essential,Principle 5 - Leadership,Principle 6 - Essential,Principle 6 - Leadership,Principle 7 - Essential,Principle 7 - Leadership,Principle 8 - Essential,Principle 8 - Leadership,Principle 9 - Essential,Principle 9 - Leadership
10,L32202KA1949PLC032923,Abb India Limited,No,,,,,,,,...,,,,,,,,,,


In [37]:
df.to_excel('assurance_overall.xlsx', index=False)

In [107]:
summary_df = pd.read_excel('assurance_overall.xlsx')

In [108]:
# Identify the 20 assurance columns
assurance_columns = [
    'Section A Assurance', 'Section B Assurance', 'Section C Assurance',
    'Principle 1 - Essential', 'Principle 1 - Leadership',
    'Principle 2 - Essential', 'Principle 2 - Leadership',
    'Principle 3 - Essential', 'Principle 3 - Leadership',
    'Principle 4 - Essential', 'Principle 4 - Leadership',
    'Principle 5 - Essential', 'Principle 5 - Leadership',
    'Principle 6 - Essential', 'Principle 6 - Leadership',
    'Principle 7 - Essential', 'Principle 7 - Leadership',
    'Principle 8 - Essential', 'Principle 8 - Leadership',
    'Principle 9 - Essential', 'Principle 9 - Leadership'
]

# Extract and melt the relevant columns
df_melted = summary_df[assurance_columns].melt(var_name="Question", value_name="Response")

# Filter only relevant values
df_filtered = df_melted[df_melted['Response'].isin(['All', 'Partial'])]

# Count the values
summary_df = df_filtered.groupby(['Question', 'Response']).size().reset_index(name='Count')

# Pivot to get structure ready for charting
pivot_df = summary_df.pivot(index="Question", columns="Response", values="Count").fillna(0).astype(int).reset_index()


pivot_df

Response,Question,All,Partial
0,Principle 1 - Essential,83,130
1,Principle 1 - Leadership,86,127
2,Principle 2 - Essential,82,131
3,Principle 2 - Leadership,76,137
4,Principle 3 - Essential,83,130
5,Principle 3 - Leadership,82,131
6,Principle 4 - Essential,90,123
7,Principle 4 - Leadership,83,130
8,Principle 5 - Essential,85,128
9,Principle 5 - Leadership,80,133


In [112]:
pivot_df.to_excel('assurance_summary.xlsx', index=False)

In [115]:
long_df = pivot_df.melt(id_vars='Question', var_name='Response', value_name='Count')
long_df.to_excel("assurance_summary.xlsx", index=False)
long_df

Unnamed: 0,Question,Response,Count
0,Principle 1 - Essential,All,83
1,Principle 1 - Leadership,All,86
2,Principle 2 - Essential,All,82
3,Principle 2 - Leadership,All,76
4,Principle 3 - Essential,All,83
5,Principle 3 - Leadership,All,82
6,Principle 4 - Essential,All,90
7,Principle 4 - Leadership,All,83
8,Principle 5 - Essential,All,85
9,Principle 5 - Leadership,All,80


In [41]:
import os
import pandas as pd
from tqdm import tqdm

section_a_data = []

# 7 Part-level fields under Section A
section_a_parts = [
    "TypeOfAssuranceForDetailsOfTheListedEntity",
    "TypeOfAssuranceForDetailsOfProductsOrServices",
    "TypeOfAssuranceForDetailsOfOperations",
    "TypeOfAssuranceForDetailsOfEmployees",
    "TypeOfAssuranceForDetailsOfHoldingSubsidiaryAndAssociateCompaniesIncludingJointVentures",
    "TypeOfAssuranceForDetailsOfCSR",
    "TypeOfAssuranceForTransparencyAndDisclosuresCompliances"
]

# 24 Section A questions (each → whether, subtype, remarks)
section_a_questions = [
    ("CorporateIdentityNumber", "WhetherCorporateIdentityNumberIsAssuredByAssurer", "AssuranceSubTypeForCorporateIdentityNumber", "RemarksForAssuranceOfCorporateIdentityNumber"),
    ("NameOfTheCompany", "WhetherNameOfTheCompanyIsAssuredByAssurer", "AssuranceSubTypeForNameOfTheCompany", "RemarksForAssuranceOfNameOfTheCompany"),
    ("YearOfIncorporation", "WhetherYearOfIncorporationIsAssuredByAssurer", "AssuranceSubTypeForYearOfIncorporation", "RemarksForAssuranceOfYearOfIncorporation"),
    ("AddressOfRegisteredOfficeOfCompany", "WhetherAddressOfRegisteredOfficeOfCompanyIsAssuredByAssurer", "AssuranceSubTypeForAddressOfRegisteredOfficeOfCompany", "RemarksForAssuranceOfAddressOfRegisteredOfficeOfCompany"),
    ("AddressOfCorporateOfficeOfCompany", "WhetherAddressOfCorporateOfficeOfCompanyIsAssuredByAssurer", "AssuranceSubTypeForAddressOfCorporateOfficeOfCompany", "RemarksForAssuranceOfAddressOfCorporateOfficeOfCompany"),
    ("EMailOfTheCompany", "WhetherEMailOfTheCompanyIsAssuredByAssurer", "AssuranceSubTypeForEMailOfTheCompany", "RemarksForAssuranceOfEMailOfTheCompany"),
    ("TelephoneOfCompany", "WhetherTelephoneOfCompanyIsAssuredByAssurer", "AssuranceSubTypeForTelephoneOfCompany", "RemarksForAssuranceOfTelephoneOfCompany"),
    ("WebsiteOfCompany", "WhetherWebsiteOfCompanyIsAssuredByAssurer", "AssuranceSubTypeForWebsiteOfCompany", "RemarksForAssuranceOfWebsiteOfCompany"),
    ("DetailsOfFinancialYear", "WhetherDetailsOfFinancialYearForWhichReportingIsBeingDoneIsAssuredByAssurer", "AssuranceSubTypeForDetailsOfFinancialYearForWhichReportingIsBeingDone", "RemarksForAssuranceOfDetailsOfFinancialYearForWhichReportingIsBeingDone"),
    ("StockExchange", "WhetherDetailsOfTheStockExchangeWhereTheCompanyIsListedIsAssuredByAssurer", "AssuranceSubTypeForDetailsOfTheStockExchangeWhereTheCompanyIsListed", "RemarksForAssuranceOfDetailsOfTheStockExchangeWhereTheCompanyIsListed"),
    ("PaidUpCapital", "WhetherValueOfSharesPaidUpIsAssuredByAssurer", "AssuranceSubTypeForValueOfSharesPaidUp", "RemarksForAssuranceOfValueOfSharesPaidUp"),
    ("ContactPerson", "WhetherNameAndContactDetailsOfTheContactPersonInCaseOfAnyQueriesOnTheBRSRReportIsAssuredByAssurer", "AssuranceSubTypeForNameAndContactDetailsOfTheContactPersonInCaseOfAnyQueriesOnTheBRSRReport", "RemarksForAssuranceOfNameAndContactDetailsOfTheContactPersonInCaseOfAnyQueriesOnTheBRSRReport"),
    ("ReportingBoundary", "WhetherReportingBoundaryIsAssuredByAssurer", "AssuranceSubTypeForReportingBoundary", "RemarksForAssuranceOfReportingBoundary"),
    ("BusinessActivities", "WhetherDetailsOfBusinessActivitiesAccountingForNinetyPercentOfTheTurnoverIsAssuredByAssurer", "AssuranceSubTypeForDetailsOfBusinessActivitiesAccountingForNinetyPercentOfTheTurnover", "RemarksForAssuranceOfDetailsOfBusinessActivitiesAccountingForNinetyPercentOfTheTurnover"),
    ("ProductsSold", "WhetherProductsOrServicesSoldByTheEntityAccountingForNinetyPercentOfTheTurnoverIsAssuredByAssurer", "AssuranceSubTypeForProductsOrServicesSoldByTheEntityAccountingForNinetyPercentOfTheTurnover", "RemarksForAssuranceOfProductsOrServicesSoldByTheEntityAccountingForNinetyPercentOfTheTurnover"),
    ("NumberOfLocations", "WhetherDetailsOfNumberOfLocationsWherePlantsAndOrOperationsOrOfficesOfTheEntityAreSituatedIsAssuredByAssurer", "AssuranceSubTypeForDetailsOfNumberOfLocationsWherePlantsAndOrOperationsOrOfficesOfTheEntityAreSituated", "RemarksForAssuranceOfDetailsOfNumberOfLocationsWherePlantsAndOrOperationsOrOfficesOfTheEntityAreSituated"),
    ("MarketsServed", "WhetherMarketsServedByTheEntityIsAssuredByAssurer", "AssuranceSubTypeForMarketsServedByTheEntity", "RemarksForAssuranceOfMarketsServedByTheEntity"),
    ("Employees", "WhetherDetailsOfEmployeesAsAtTheEndOfFinancialYearIsAssuredByAssurer", "AssuranceSubTypeForDetailsOfEmployeesAsAtTheEndOfFinancialYear", "RemarksForAssuranceOfDetailsOfEmployeesAsAtTheEndOfFinancialYear"),
    ("WomenInclusion", "WhetherParticipationOrInclusionOrRepresentationOfWomenIsAssuredByAssurer", "AssuranceSubTypeForParticipationOrInclusionOrRepresentationOfWomen", "RemarksForAssuranceOfParticipationOrInclusionOrRepresentationOfWomen"),
    ("TurnoverRate", "WhetherTurnoverRateForPermanentEmployeesAndWorkersDiscloseTrendsForPastThreeYearsIsAssuredByAssurer", "AssuranceSubTypeForTurnoverRateForPermanentEmployeesAndWorkersDiscloseTrendsForPastThreeYears", "RemarksForAssuranceOfTurnoverRateForPermanentEmployeesAndWorkersDiscloseTrendsForPastThreeYears"),
    ("HoldingSubsidiaries", "WhetherNamesOfHoldingSubsidiaryAssociateCompaniesJointVenturesIsAssuredByAssurer", "AssuranceSubTypeForNamesOfHoldingSubsidiaryAssociateCompaniesJointVentures", "RemarksForAssuranceOfNamesOfHoldingSubsidiaryAssociateCompaniesJointVentures"),
    ("CSR", "WhetherCSRIsApplicableAsPerSection135OfCompaniesAct2013IsAssuredByAssurer", "AssuranceSubTypeForWhetherCSRIsApplicableAsPerSection135OfCompaniesAct2013", "RemarksForAssuranceOfWhetherCSRIsApplicableAsPerSection135OfCompaniesAct2013"),
    ("Complaints", "WhetherComplaintsOrGrievancesOnAnyOfThePrinciplesUnderTheNationalGuidelinesOnResponsibleBusinessConductIsAssuredByAssurer", "AssuranceSubTypeForComplaintsOrGrievancesOnAnyOfThePrinciplesUnderTheNationalGuidelinesOnResponsibleBusinessConduct", "RemarksForAssuranceOfComplaintsOrGrievancesOnAnyOfThePrinciplesUnderTheNationalGuidelinesOnResponsibleBusinessConduct"),
    ("MaterialIssues", "WhetherOverviewOfTheEntitysMaterialResponsibleBusinessConductIssuesIsAssuredByAssurer", "AssuranceSubTypeForOverviewOfTheEntitysMaterialResponsibleBusinessConductIssues", "RemarksForAssuranceOfOverviewOfTheEntitysMaterialResponsibleBusinessConductIssues")
]

# Process files
for file in tqdm([f for f in os.listdir(folder_path) if f.endswith('.xlsx')]):
    file_path = os.path.join(folder_path, file)
    df = pd.read_excel(file_path)

    def get_val(field):
        vals = df.loc[df['Element Name'] == field, 'Fact Value'].values
        return vals[0] if len(vals) > 0 else ""

    row = [
        get_val("CorporateIdentityNumber"),
        get_val("NameOfTheCompany"),
        get_val("TypeOfAssuranceForSectionAGeneralDisclosures")
    ]

    # Add 7 part-level assurances
    for field in section_a_parts:
        row.append(get_val(field))

    # Add 24 × (Whether, SubType, Remarks)
    for label, w_field, s_field, r_field in section_a_questions:
        row.append(get_val(w_field))
        row.append(get_val(s_field))
        row.append(get_val(r_field))

    section_a_data.append(row)

# Build column names
columns = ['CIN', 'Company', 'Section A - Overall Assurance']
columns += section_a_parts
for label, w, s, r in section_a_questions:
    columns += [
        f"{label} - Whether Assured",
        f"{label} - Subtype",
        f"{label} - Remarks"
    ]

# Create DataFrame
df_section_a = pd.DataFrame(section_a_data, columns=columns)
df_section_a = df_section_a.sort_values(by="Company")



100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [06:24<00:00,  3.06it/s]


In [43]:
df_section_a.shape

(1174, 82)

In [44]:
df_section_a.to_excel('section_a_assurance.xlsx', index=False)

In [45]:
import os
import pandas as pd
from tqdm import tqdm

section_b_data = []

# 2 subparts
subpart_fields = [
    "TypeOfAssuranceForPolicyAndManagementProcesses",
    "TypeOfAssuranceForGovernanceLeadershipAndOversight"
]

# 13 Qs with 3 fields each
section_b_questions = [
    (
        "AssurerHasAssuredWhetherYourEntitysPolicyOrPoliciesCoverEachPrincipleAndItsCoreElementsOfTheNGRBCs",
        "AssuranceSubTypeForWhetherYourEntitysPolicyOrPoliciesCoverEachPrincipleAndItsCoreElementsOfTheNGRBCs",
        "RemarksForAssuranceOfWhetherYourEntitysPolicyOrPoliciesCoverEachPrincipleAndItsCoreElementsOfTheNGRBCs"
    ),
    (
        "AssurerHasAssuredWhetherTheEntityHasTranslatedThePolicyIntoProcedures",
        "AssuranceSubTypeForWhetherTheEntityHasTranslatedThePolicyIntoProcedures",
        "RemarksForAssuranceOfWhetherTheEntityHasTranslatedThePolicyIntoProcedures"
    ),
    (
        "AssurerHasAssuredWhetherTheEnlistedPoliciesExtendToYourValueChainPartners",
        "AssuranceSubTypeForWhetherTheEnlistedPoliciesExtendToYourValueChainPartners",
        "RemarksForAssuranceOfWhetherTheEnlistedPoliciesExtendToYourValueChainPartners"
    ),
    (
        "WhetherNameOfTheNationalAndInternationalCodesOrCertificationsOrLabelsOrStandardsAdoptedByYourEntityAndMappedToEachPrincipleIsAssuredByAssurer",
        "AssuranceSubTypeForNameOfTheNationalAndInternationalCodesOrCertificationsOrLabelsOrStandardsAdoptedByYourEntityAndMappedToEachPrinciple",
        "RemarksForAssuranceOfNameOfTheNationalAndInternationalCodesOrCertificationsOrLabelsOrStandardsAdoptedByYourEntityAndMappedToEachPrinciple"
    ),
    (
        "WhetherSpecificCommitmentsGoalsAndTargetsSetByTheEntityWithDefinedTimelinesIsAssuredByAssurer",
        "AssuranceSubTypeForSpecificCommitmentsGoalsAndTargetsSetByTheEntityWithDefinedTimelines",
        "RemarksForAssuranceOfSpecificCommitmentsGoalsAndTargetsSetByTheEntityWithDefinedTimelines"
    ),
    (
        "WhetherPerformanceOfTheEntityAgainstTheSpecificCommitmentsGoalsAndTargetsAlongWithReasonsInCaseTheSameAreNotMetIsAssuredByAssurer",
        "AssuranceSubTypeForPerformanceOfTheEntityAgainstTheSpecificCommitmentsGoalsAndTargetsAlongWithReasonsInCaseTheSameAreNotMet",
        "RemarksForAssuranceOfPerformanceOfTheEntityAgainstTheSpecificCommitmentsGoalsAndTargetsAlongWithReasonsInCaseTheSameAreNotMet"
    ),
    (
        "WhetherStatementByDirectorResponsibleForTheBusinessResponsibilityReportHighlightingESGRelatedChallengesTargetsAndAchievementsIsAssuredByAssurer",
        "AssuranceSubTypeForStatementByDirectorResponsibleForTheBusinessResponsibilityReportHighlightingESGRelatedChallengesTargetsAndAchievements",
        "RemarksForAssuranceOfStatementByDirectorResponsibleForTheBusinessResponsibilityReportHighlightingESGRelatedChallengesTargetsAndAchievements"
    ),
    (
        "WhetherDetailsOfTheHighestAuthorityResponsibleForImplementationAndOversightOfTheBusinessResponsibilityPolicyIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfTheHighestAuthorityResponsibleForImplementationAndOversightOfTheBusinessResponsibilityPolicy",
        "RemarksForAssuranceOfDetailsOfTheHighestAuthorityResponsibleForImplementationAndOversightOfTheBusinessResponsibilityPolicy"
    ),
    (
        "AssurerHasAssuredWhetherTheEntityHaveASpecifiedCommitteeOfTheBoardOrDirectorResponsibleForDecisionMakingOnSustainabilityRelatedIssues",
        "AssuranceSubTypeForWhetherTheEntityHaveASpecifiedCommitteeOfTheBoardOrDirectorResponsibleForDecisionMakingOnSustainabilityRelatedIssues",
        "RemarksForAssuranceOfWhetherTheEntityHaveASpecifiedCommitteeOfTheBoardOrDirectorResponsibleForDecisionMakingOnSustainabilityRelatedIssues"
    ),
    (
        "WhetherPerformanceAgainstAbovePoliciesAndFollowUpActionIsAssuredByAssurer",
        "AssuranceSubTypeForPerformanceAgainstAbovePoliciesAndFollowUpAction",
        "RemarksForAssuranceOfPerformanceAgainstAbovePoliciesAndFollowUpAction"
    ),
    (
        "WhetherComplianceWithStatutoryRequirementsOfRelevanceToThePrinciplesAndRectificationOfAnyNonCompliancesIsAssuredByAssurer",
        "AssuranceSubTypeForComplianceWithStatutoryRequirementsOfRelevanceToThePrinciplesAndRectificationOfAnyNonCompliances",
        "RemarksForAssuranceOfComplianceWithStatutoryRequirementsOfRelevanceToThePrinciplesAndRectificationOfAnyNonCompliances"
    ),
    (
        "AssurerHasAssuredWhetherTheEntityHasCarriedOutIndependentAssessmentEvaluationOfTheWorkingOfItsPoliciesByAnExternalAgency",
        "AssuranceSubTypeForWhetherTheEntityHasCarriedOutIndependentAssessmentEvaluationOfTheWorkingOfItsPoliciesByAnExternalAgency",
        "RemarksForAssuranceOfWhetherTheEntityHasCarriedOutIndependentAssessmentEvaluationOfTheWorkingOfItsPoliciesByAnExternalAgency"
    ),
    (
        "WhetherReasonsIfPoliciesNotCoverEachPrincipleAndItsCoreElementsOfTheNGRBCsIsAssuredByAssurer",
        "AssuranceSubTypeForReasonsIfPoliciesNotCoverEachPrincipleAndItsCoreElementsOfTheNGRBCs",
        "RemarksForAssuranceOfReasonsIfPoliciesNotCoverEachPrincipleAndItsCoreElementsOfTheNGRBCs"
    )
]

def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Iterate through files
for file in tqdm([f for f in os.listdir(folder_path) if f.endswith(".xlsx")]):
    df = pd.read_excel(os.path.join(folder_path, file))
    
    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany"),
        get_val(df, "TypeOfAssuranceForSectionBManagementAndProcessDisclosures")
    ]

    for part in subpart_fields:
        row.append(get_val(df, part))

    for w, s, r in section_b_questions:
        row.append(get_val(df, w))
        row.append(get_val(df, s))
        row.append(get_val(df, r))

    section_b_data.append(row)

# Create columns
columns = ["CIN", "Company", "Section B - Overall Assurance"]
columns += ["MgmtProcessesAssurance", "GovernanceOversightAssurance"]
for i in range(1, 14):
    columns += [f"Q{i} - Whether", f"Q{i} - SubType", f"Q{i} - Remarks"]

df_section_b = pd.DataFrame(section_b_data, columns=columns)
df_section_b = df_section_b.sort_values(by="Company")

# Optional: Save
# df_section_b.to_excel("SectionB_Disclosures_Final.xlsx", index=False)


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [05:46<00:00,  3.39it/s]


In [46]:
df_section_b.shape

(1174, 44)

In [47]:
df_section_b.to_excel('section_b_assurance.xlsx', index=False)

In [48]:
import os
import pandas as pd
from tqdm import tqdm

section_c_data = []

principle_fields = []
for i in range(1, 10):
    principle_fields.append((f"TypeOfAssuranceForPrinciple{i}EssentialIndicators", f"P{i} - Essential"))
    principle_fields.append((f"TypeOfAssuranceForPrinciple{i}LeadershipIndicators", f"P{i} - Leadership"))

def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Iterate through Excel files
for file in tqdm([f for f in os.listdir(folder_path) if f.endswith(".xlsx")]):
    df = pd.read_excel(os.path.join(folder_path, file))
    
    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany"),
        get_val(df, "TypeOfAssuranceForSectionCPrincipleWisePerformanceDisclosures")
    ]
    
    for field, _ in principle_fields:
        row.append(get_val(df, field))
    
    section_c_data.append(row)

# Define column names
columns = ["CIN", "Company", "Section C – Overall Assurance"]
columns += [label for _, label in principle_fields]

df_section_c = pd.DataFrame(section_c_data, columns=columns)
df_section_c = df_section_c.sort_values(by="Company")


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [05:37<00:00,  3.48it/s]


In [50]:
df_section_c

Unnamed: 0,CIN,Company,Section C – Overall Assurance,P1 - Essential,P1 - Leadership,P2 - Essential,P2 - Leadership,P3 - Essential,P3 - Leadership,P4 - Essential,...,P5 - Essential,P5 - Leadership,P6 - Essential,P6 - Leadership,P7 - Essential,P7 - Leadership,P8 - Essential,P8 - Leadership,P9 - Essential,P9 - Leadership
0,L74140MH2008PLC177884,360 One Wam Limited,,,,,,,,,...,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,,,,,,,,,...,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,,,,,,,,,...,,,,,,,,,,
3,L67190MH2007PLC289249,5paisa Capital Limited,,,,,,,,,...,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,L24231GJ2000PLC038352,Zota Health Care Limited,,,,,,,,,...,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,All,All,All,All,All,All,All,All,...,All,All,All,All,All,All,All,All,All,All
1173,L15201GJ1994PLC023490,Zydus Wellness Limited,Partial,Partial,Partial,Partial,Partial,Partial,Partial,Partial,...,Partial,Partial,Partial,Partial,Partial,Partial,Partial,Partial,Partial,Partial
279,L72200MH2000PLC125319,eClerx Services Limited,,,,,,,,,...,,,,,,,,,,


In [51]:
df_section_c.to_excel('principles_assurance.xlsx', index=False)

In [52]:
import os
import pandas as pd
from tqdm import tqdm

principle1_data = []

def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Define all fields
p1_fields = {
    'TypeOfAssuranceForPrinciple1EssentialIndicators': 'P1 - Overall (E)',
    'TypeOfAssuranceForPrinciple1LeadershipIndicators': 'P1 - Overall (L)'
}

# Essential Indicators (9 Questions)
essential_questions = [
    ("WhetherPercentageCoverageByTrainingAndAwarenessProgramsOnAnyOfThePrinciplesDuringTheFinancialYearForBODOrKMPOrEmployeeOrWorker",
     "AssuranceSubTypeForPercentageCoverageByTrainingAndAwarenessProgramsOnAnyOfThePrinciplesDuringTheFinancialYearForBODOrKMPOrEmployeeOrWorker",
     "RemarksForAssuranceOfPercentageCoverageByTrainingAndAwarenessProgramsOnAnyOfThePrinciplesDuringTheFinancialYearForBODOrKMPOrEmployeeOrWorker",
     "P1Q1 - Training Coverage (E)"),

    ("WhetherDetailsOfFinesOrPenaltiesOrPunishmentOrAwardOrCompoundingFeesOrSettlementIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfFinesOrPenaltiesOrPunishmentOrAwardOrCompoundingFeesOrSettlement",
     "RemarksForAssuranceOfDetailsOfFinesOrPenaltiesOrPunishmentOrAwardOrCompoundingFeesOrSettlement",
     "P1Q2 - Fines/Penalties (E)"),

    ("WhetherDetailsOfTheAppealOrRevisionPreferredInCasesWhereMonetaryOrNonMonetaryActionHasBeenAppealedIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfTheAppealOrRevisionPreferredInCasesWhereMonetaryOrNonMonetaryActionHasBeenAppealed",
     "RemarksForAssuranceOfDetailsOfTheAppealOrRevisionPreferredInCasesWhereMonetaryOrNonMonetaryActionHasBeenAppealed",
     "P1Q3 - Appeals (E)"),

    ("WhetherDetailsAndWeblinkOfAnAntiCorruptionOrAntiBriberyPolicyIsPlaceIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsAndWeblinkOfAnAntiCorruptionOrAntiBriberyPolicyIsPlace",
     "RemarksForAssuranceOfDetailsAndWeblinkOfAnAntiCorruptionOrAntiBriberyPolicyIsPlace",
     "P1Q4 - Anti-Corruption Policy (E)"),

    ("WhetherNumberOfDirectorsOrKMPsOrEmployeesOrWorkersAgainstWhomDisciplinaryActionWasTakenByAnyLawEnforcementAgencyForTheChargesOfBriberyOrCorruptionIsAssuredByAssurer",
     "AssuranceSubTypeForNumberOfDirectorsOrKMPsOrEmployeesOrWorkersAgainstWhomDisciplinaryActionWasTakenByAnyLawEnforcementAgencyForTheChargesOfBriberyOrCorruption",
     "RemarksForAssuranceOfNumberOfDirectorsOrKMPsOrEmployeesOrWorkersAgainstWhomDisciplinaryActionWasTakenByAnyLawEnforcementAgencyForTheChargesOfBriberyOrCorruption",
     "P1Q5 - Disciplinary Action (E)"),

    ("WhetherDetailsOfComplaintsWithRegardToConflictOfInterestIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfComplaintsWithRegardToConflictOfInterest",
     "RemarksForAssuranceOfDetailsOfComplaintsWithRegardToConflictOfInterest",
     "P1Q6 - Conflict of Interest (E)"),

    ("WhetherDetailsOfAnyCorrectiveActionTakenOrUnderwayOnIssuesRelatedToFinesOrPenaltiesOrActionTakenByRegulatorsOrLawEnforcementAgenciesOrJudicialInstitutionsOnCasesOfCorruptionAndConflictsOfInterestIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfAnyCorrectiveActionTakenOrUnderwayOnIssuesRelatedToFinesOrPenaltiesOrActionTakenByRegulatorsOrLawEnforcementAgenciesOrJudicialInstitutionsOnCasesOfCorruptionAndConflictsOfInterest",
     "RemarksForAssuranceOfDetailsOfAnyCorrectiveActionTakenOrUnderwayOnIssuesRelatedToFinesOrPenaltiesOrActionTakenByRegulatorsOrLawEnforcementAgenciesOrJudicialInstitutionsOnCasesOfCorruptionAndConflictsOfInterest",
     "P1Q7 - Corrective Action (E)"),

    ("WhetherNumberOfDaysOfAccountsPayablesIsAssuredByAssurer",
     "AssuranceSubTypeForNumberOfDaysOfAccountsPayables",
     "RemarksForAssuranceOfNumberOfDaysOfAccountsPayables",
     "P1Q8 - Days Payables (E)"),

    ("WhetherDetailsOfConcentrationOfPurchasesAndSalesWithTradingHousesDealersAndRelatedPartiesAlongWithLoansAndAdvancesAndInvestmentsWithRelatedPartiesIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfConcentrationOfPurchasesAndSalesWithTradingHousesDealersAndRelatedPartiesAlongWithLoansAndAdvancesAndInvestmentsWithRelatedParties",
     "RemarksForAssuranceOfDetailsOfConcentrationOfPurchasesAndSalesWithTradingHousesDealersAndRelatedPartiesAlongWithLoansAndAdvancesAndInvestmentsWithRelatedParties",
     "P1Q9 - Openness/Concentration (E)")
]

# Leadership Indicators (2 Questions)
leadership_questions = [
    ("WhetherAwarenessProgrammesConductedForValueChainPartnersOnAnyOfThePrinciplesDuringTheFinancialYearIsAssuredByAssurer",
     "AssuranceSubTypeForAwarenessProgrammesConductedForValueChainPartnersOnAnyOfThePrinciplesDuringTheFinancialYear",
     "RemarksForAssuranceOfAwarenessProgrammesConductedForValueChainPartnersOnAnyOfThePrinciplesDuringTheFinancialYear",
     "P1Q10 - VC Awareness (L)"),

    ("WhetherTheEntityHaveProcessesInPlaceToAvoidOrManageConflictOfInterestsInvolvingMembersOfTheBoardIsAssuredByAssurer",
     "AssuranceSubTypeForTheEntityHaveProcessesInPlaceToAvoidOrManageConflictOfInterestsInvolvingMembersOfTheBoard",
     "RemarksForAssuranceOfTheEntityHaveProcessesInPlaceToAvoidOrManageConflictOfInterestsInvolvingMembersOfTheBoard",
     "P1Q11 - Board Conflicts (L)")
]

# Process each file
for file in tqdm(os.listdir(folder_path)):
    if not file.endswith(".xlsx"):
        continue
    df = pd.read_excel(os.path.join(folder_path, file))

    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany")
    ]

    # Overall E/L flags
    for field in p1_fields:
        row.append(get_val(df, field))

    # Essentials
    for f_w, f_s, f_r, _ in essential_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    # Leadership
    for f_w, f_s, f_r, _ in leadership_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    principle1_data.append(row)

# Build column headers
columns = ["CIN", "Company"]
columns += list(p1_fields.values())

for _, _, _, label in essential_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

for _, _, _, label in leadership_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

df_p1 = pd.DataFrame(principle1_data, columns=columns)
df_p1 = df_p1.sort_values(by="Company")


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [05:46<00:00,  3.38it/s]


In [57]:
df_p1.shape

(1174, 37)

In [58]:
df_p1

Unnamed: 0,CIN,Company,P1 - Overall (E),P1 - Overall (L),P1Q1 - Training Coverage (E) - Whether,P1Q1 - Training Coverage (E) - Subtype,P1Q1 - Training Coverage (E) - Remarks,P1Q2 - Fines/Penalties (E) - Whether,P1Q2 - Fines/Penalties (E) - Subtype,P1Q2 - Fines/Penalties (E) - Remarks,...,P1Q8 - Days Payables (E) - Remarks,P1Q9 - Openness/Concentration (E) - Whether,P1Q9 - Openness/Concentration (E) - Subtype,P1Q9 - Openness/Concentration (E) - Remarks,P1Q10 - VC Awareness (L) - Whether,P1Q10 - VC Awareness (L) - Subtype,P1Q10 - VC Awareness (L) - Remarks,P1Q11 - Board Conflicts (L) - Whether,P1Q11 - Board Conflicts (L) - Subtype,P1Q11 - Board Conflicts (L) - Remarks
0,L74140MH2008PLC177884,360 One Wam Limited,,,,,,,,,...,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,,,,,,,,,...,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,,,,,,,,,...,,,,,,,,,,
3,L67190MH2007PLC289249,5paisa Capital Limited,,,,,,,,,...,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,L24231GJ2000PLC038352,Zota Health Care Limited,,,,,,,,,...,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,All,All,,Reasonable,,true,Reasonable,,...,,true,Reasonable,,true,Reasonable,,true,Reasonable,
1173,L15201GJ1994PLC023490,Zydus Wellness Limited,Partial,Partial,,,,false,,,...,,true,Reasonable,,false,,,false,,
279,L72200MH2000PLC125319,eClerx Services Limited,,,,,,,,,...,,,,,,,,,,


In [59]:
df_p1.to_excel('principle1_assurance.xlsx', index=False)

In [56]:
import os
import pandas as pd
from tqdm import tqdm

principle2_data = []

def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Overall flags
p2_flags = {
    'TypeOfAssuranceForPrinciple2EssentialIndicators': 'P2 - Overall (E)',
    'TypeOfAssuranceForPrinciple2LeadershipIndicators': 'P2 - Overall (L)'
}

# Essential Indicators (4 Questions)
essential_questions = [
    (
        "WhetherPercentageOfRAndDAndCapitalExpenditureInvestmentsInSpecificTechnologiesIsAssuredByAssurer",
        "AssuranceSubTypeForPercentageOfRAndDAndCapitalExpenditureInvestmentsInSpecificTechnologies",
        "RemarksForAssuranceOfPercentageOfRAndDAndCapitalExpenditureInvestmentsInSpecificTechnologies",
        "P2Q1 - R&D in Sustainable Tech (E)"
    ),
    (
        "WhetherTheEntityHaveProceduresInPlaceForSustainableSourcingAndPercentageOfInputsWereSourcedSustainablyIsAssuredByAssurer",
        "AssuranceSubTypeForTheEntityHaveProceduresInPlaceForSustainableSourcingAndPercentageOfInputsWereSourcedSustainably",
        "RemarksForAssuranceOfTheEntityHaveProceduresInPlaceForSustainableSourcingAndPercentageOfInputsWereSourcedSustainably",
        "P2Q2 - Sustainable Sourcing (E)"
    ),
    (
        "WhetherDescribeTheProcessesInPlaceToSafelyReclaimYourProductsForReusingRecyclingAndDisposingAtTheEndOfLifeForPlasticsIncludingPackagingEWasteHazardousWasteAndOtherWasteIsAssuredByAssurer",
        "AssuranceSubTypeForDescribeTheProcessesInPlaceToSafelyReclaimYourProductsForReusingRecyclingAndDisposingAtTheEndOfLifeForPlasticsIncludingPackagingEWasteHazardousWasteAndOtherWaste",
        "RemarksForAssuranceOfDescribeTheProcessesInPlaceToSafelyReclaimYourProductsForReusingRecyclingAndDisposingAtTheEndOfLifeForPlasticsIncludingPackagingEWasteHazardousWasteAndOtherWaste",
        "P2Q3 - Product Reclaim Process (E)"
    ),
    (
        "AssurerHasAssuredWhetherTheWasteCollectionPlanIsInLineWithTheExtendedProducerResponsibilityPlanSubmittedToPollutionControlBoardsAndStepsTakenToAddressTheWasteCollectionPlanIfNotSubmitted",
        "AssuranceSubTypeForWhetherTheWasteCollectionPlanIsInLineWithTheExtendedProducerResponsibilityPlanSubmittedToPollutionControlBoardsAndStepsTakenToAddressTheWasteCollectionPlanIfNotSubmitted",
        "RemarksForAssuranceOfWhetherTheWasteCollectionPlanIsInLineWithTheExtendedProducerResponsibilityPlanSubmittedToPollutionControlBoardsAndStepsTakenToAddressTheWasteCollectionPlanIfNotSubmitted",
        "P2Q4 - Extended Producer Responsibility (E)"
    )
]

# Leadership Indicators (5 Questions)
leadership_questions = [
    (
        "AssurerHasAssuredWhetherTheEntityConductedLifeCyclePerspectiveOrAssessmentsForAnyOfItsProductsOrForItsServices",
        "AssuranceSubTypeForWhetherTheEntityConductedLifeCyclePerspectiveOrAssessmentsForAnyOfItsProductsOrForItsServices",
        "RemarksForAssuranceOfWhetherTheEntityConductedLifeCyclePerspectiveOrAssessmentsForAnyOfItsProductsOrForItsServices",
        "P2Q5 - Life Cycle Assessments (L)"
    ),
    (
        "WhetherDetailsOfSignificantSocialOrEnvironmentalConcernsFromProductionOrDisposalOfProductOrServiceWithActionTakenToMitigateTheSameIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfSignificantSocialOrEnvironmentalConcernsFromProductionOrDisposalOfProductOrServiceWithActionTakenToMitigateTheSame",
        "RemarksForAssuranceOfDetailsOfSignificantSocialOrEnvironmentalConcernsFromProductionOrDisposalOfProductOrServiceWithActionTakenToMitigateTheSame",
        "P2Q6 - Significant Concerns & Mitigation (L)"
    ),
    (
        "WhetherDetailsOfPercentageOfRecycledOrReusedInputMaterialToTotalMaterialByValueUsedInProductionOrProvidingServicesIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfPercentageOfRecycledOrReusedInputMaterialToTotalMaterialByValueUsedInProductionOrProvidingServices",
        "RemarksForAssuranceOfDetailsOfPercentageOfRecycledOrReusedInputMaterialToTotalMaterialByValueUsedInProductionOrProvidingServices",
        "P2Q7 - Recycled Input Material (L)"
    ),
    (
        "WhetherTheProductsAndPackagingReclaimedAtEndOfLifeOfProductsAmountReusedOrRecycledOrSafelyDisposedIsAssuredByAssurer",
        "AssuranceSubTypeForTheProductsAndPackagingReclaimedAtEndOfLifeOfProductsAmountReusedOrRecycledOrSafelyDisposed",
        "RemarksForAssuranceOfTheProductsAndPackagingReclaimedAtEndOfLifeOfProductsAmountReusedOrRecycledOrSafelyDisposed",
        "P2Q8 - EOL Product Reclaim Volumes (L)"
    ),
    (
        "WhetherDetailsOfReclaimedProductsAndTheirPackagingMaterialsForEachProductCategoryIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfReclaimedProductsAndTheirPackagingMaterialsForEachProductCategory",
        "RemarksForAssuranceOfDetailsOfReclaimedProductsAndTheirPackagingMaterialsForEachProductCategory",
        "P2Q9 - Reclaimed Product % per Category (L)"
    )
]

# Loop through files
for file in tqdm(os.listdir(folder_path)):
    if not file.endswith(".xlsx"):
        continue
    df = pd.read_excel(os.path.join(folder_path, file))

    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany")
    ]

    # Add overall P2 assurance flags
    for field in p2_flags:
        row.append(get_val(df, field))

    # Essential questions
    for f_w, f_s, f_r, _ in essential_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    # Leadership questions
    for f_w, f_s, f_r, _ in leadership_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    principle2_data.append(row)

# Build column headers
columns = ["CIN", "Company"]
columns += list(p2_flags.values())

for _, _, _, label in essential_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

for _, _, _, label in leadership_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

df_p2 = pd.DataFrame(principle2_data, columns=columns)
df_p2 = df_p2.sort_values(by="Company")

# Optional save
# df_p2.to_excel("Principle2_Detailed_Assurance.xlsx", index=False)


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [05:39<00:00,  3.45it/s]


In [63]:
df_p2

Unnamed: 0,CIN,Company,P2 - Overall (E),P2 - Overall (L),P2Q1 - R&D in Sustainable Tech (E) - Whether,P2Q1 - R&D in Sustainable Tech (E) - Subtype,P2Q1 - R&D in Sustainable Tech (E) - Remarks,P2Q2 - Sustainable Sourcing (E) - Whether,P2Q2 - Sustainable Sourcing (E) - Subtype,P2Q2 - Sustainable Sourcing (E) - Remarks,...,P2Q6 - Significant Concerns & Mitigation (L) - Remarks,P2Q7 - Recycled Input Material (L) - Whether,P2Q7 - Recycled Input Material (L) - Subtype,P2Q7 - Recycled Input Material (L) - Remarks,P2Q8 - EOL Product Reclaim Volumes (L) - Whether,P2Q8 - EOL Product Reclaim Volumes (L) - Subtype,P2Q8 - EOL Product Reclaim Volumes (L) - Remarks,P2Q9 - Reclaimed Product % per Category (L) - Whether,P2Q9 - Reclaimed Product % per Category (L) - Subtype,P2Q9 - Reclaimed Product % per Category (L) - Remarks
0,L74140MH2008PLC177884,360 One Wam Limited,,,,,,,,,...,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,,,,,,,,,...,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,,,,,,,,,...,,,,,,,,,,
3,L67190MH2007PLC289249,5paisa Capital Limited,,,,,,,,,...,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,L24231GJ2000PLC038352,Zota Health Care Limited,,,,,,,,,...,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,All,All,true,Reasonable,,true,Reasonable,,...,,true,Reasonable,,true,Reasonable,,true,Reasonable,
1173,L15201GJ1994PLC023490,Zydus Wellness Limited,Partial,Partial,false,,,false,,,...,,false,,,false,,,false,,
279,L72200MH2000PLC125319,eClerx Services Limited,,,,,,,,,...,,,,,,,,,,


In [62]:
df_p2.to_excel('principle2_assurance.xlsx', index=False)

In [71]:
import os
import pandas as pd
from tqdm import tqdm


# Helper function to extract value from DataFrame
def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Data container
principle3_data = []

# Overall flags
p3_flags = {
    'TypeOfAssuranceForPrinciple3EssentialIndicators': 'P3 - Overall (E)',
    'TypeOfAssuranceForPrinciple3LeadershipIndicators': 'P3 - Overall (L)'
}

# Essential Indicators
essential_questions = [
    (
        "WhetherDetailsOfMeasuresForTheWellBeingOfEmployeesAndWorkersAndSpendingOnItIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfMeasuresForTheWellBeingOfEmployeesAndWorkersAndSpendingOnIt",
        "RemarksForAssuranceOfDetailsOfMeasuresForTheWellBeingOfEmployeesAndWorkersAndSpendingOnIt",
        "P3Q1 - Wellbeing & Spend (E)"
    ),
    (
        "WhetherDetailsOfRetirementBenefitsIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfRetirementBenefits",
        "RemarksForAssuranceOfDetailsOfRetirementBenefits",
        "P3Q2 - Retirement Benefits (E)"
    ),
    (
        "WhetherThePremisesOrOfficesOfTheEntityAccessibleToDifferentlyAbledEmployeesAndWorkersAndStepsAreBeingTakenByTheEntityIfThePremisesOrOfficesOfTheEntityNotAccessibleIsAssuredByAssurer",
        "AssuranceSubTypeForThePremisesOrOfficesOfTheEntityAccessibleToDifferentlyAbledEmployeesAndWorkersAndStepsAreBeingTakenByTheEntityIfThePremisesOrOfficesOfTheEntityNotAccessible",
        "RemarksForAssuranceOfThePremisesOrOfficesOfTheEntityAccessibleToDifferentlyAbledEmployeesAndWorkersAndStepsAreBeingTakenByTheEntityIfThePremisesOrOfficesOfTheEntityNotAccessible",
        "P3Q3 - Accessibility (E)"
    ),
    (
        "AssurerHasAssuredWhetherTheEntityHaveAnEqualOpportunityPolicyAsPerTheRightsOfPersonsWithDisabilitiesAct2016",
        "AssuranceSubTypeForWhetherTheEntityHaveAnEqualOpportunityPolicyAsPerTheRightsOfPersonsWithDisabilitiesAct2016",
        "RemarksForAssuranceOfWhetherTheEntityHaveAnEqualOpportunityPolicyAsPerTheRightsOfPersonsWithDisabilitiesAct2016",
        "P3Q4 - Equal Opportunity Policy (E)"
    ),
    (
        "WhetherReturnToWorkAndRetentionRatesOfPermanentEmployeesAndWorkersThatTookParentalLeaveIsAssuredByAssurer",
        "AssuranceSubTypeForReturnToWorkAndRetentionRatesOfPermanentEmployeesAndWorkersThatTookParentalLeave",
        "RemarksForAssuranceOfReturnToWorkAndRetentionRatesOfPermanentEmployeesAndWorkersThatTookParentalLeave",
        "P3Q5 - Parental Leave (E)"
    ),
    (
        "AssurerHasAssuredWhetherIsThereAMechanismAvailableToReceiveAndRedressGrievancesForTheFollowingCategoriesOfEmployeesAndWorker",
        "AssuranceSubTypeForWhetherIsThereAMechanismAvailableToReceiveAndRedressGrievancesForTheFollowingCategoriesOfEmployeesAndWorker",
        "RemarksForAssuranceOfWhetherIsThereAMechanismAvailableToReceiveAndRedressGrievancesForTheFollowingCategoriesOfEmployeesAndWorker",
        "P3Q6 - Grievance Mechanism (E)"
    ),
    (
        "WhetherMembershipOfEmployeesAndWorkerInAssociationsOrUnionsRecognisedByTheListedEntityIsAssuredByAssurer",
        "AssuranceSubTypeForMembershipOfEmployeesAndWorkerInAssociationsOrUnionsRecognisedByTheListedEntity",
        "RemarksForAssuranceOfMembershipOfEmployeesAndWorkerInAssociationsOrUnionsRecognisedByTheListedEntity",
        "P3Q7 - Union Membership (E)"
    ),
    (
        "WhetherDetailsOfTrainingGivenToEmployeesAndWorkersIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfTrainingGivenToEmployeesAndWorkers",
        "RemarksForAssuranceOfDetailsOfTrainingGivenToEmployeesAndWorkers",
        "P3Q8 - Training (E)"
    ),
    (
        "WhetherDetailsOfPerformanceAndCareerDevelopmentReviewsOfEmployeesAndWorkerIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfPerformanceAndCareerDevelopmentReviewsOfEmployeesAndWorker",
        "RemarksForAssuranceOfDetailsOfPerformanceAndCareerDevelopmentReviewsOfEmployeesAndWorker",
        "P3Q9 - Performance Reviews (E)"
    ),
    (
        "WhetherHealthAndSafetyManagementSystemIsAssuredByAssurer",
        "AssuranceSubTypeForHealthAndSafetyManagementSystem",
        "RemarksForAssuranceOfHealthAndSafetyManagementSystem",
        "P3Q10 - Health & Safety Mgmt (E)"
    ),
    (
        "WhetherDetailsOfSafetyRelatedIncidentsIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfSafetyRelatedIncidents",
        "RemarksForAssuranceOfDetailsOfSafetyRelatedIncidents",
        "P3Q11 - Safety Incidents (E)"
    ),
    (
        "WhetherMeasuresTakenByTheEntityToEnsureASafeAndHealthyWorkPlaceIsAssuredByAssurer",
        "AssuranceSubTypeForMeasuresTakenByTheEntityToEnsureASafeAndHealthyWorkPlace",
        "RemarksForAssuranceOfMeasuresTakenByTheEntityToEnsureASafeAndHealthyWorkPlace",
        "P3Q12 - Healthy Workplace (E)"
    ),
    (
        "WhetherDetailsOfComplaintsMadeByEmployeesAndWorkersIsAssuredByAssurerAsPerP3",
        "AssuranceSubTypeForDetailsOfComplaintsMadeByEmployeesAndWorkersAsPerP3",
        "RemarksForAssuranceOfDetailsOfComplaintsMadeByEmployeesAndWorkersAsPerP3",
        "P3Q13 - Complaints (E)"
    ),
    (
        "WhetherAssessmentsOfYourPlantsAndOfficesThatWereAssessedForTheYearP3IsAssuredByAssurer",
        "AssuranceSubTypeForAssessmentsOfYourPlantsAndOfficesThatWereAssessedForTheYearP3",
        "RemarksForAssuranceOfAssessmentsOfYourPlantsAndOfficesThatWereAssessedForTheYearP3",
        "P3Q14 - Safety Assessments (E)"
    ),
    (
        "WhetherDetailsOfAnyCorrectiveActionTakenOrUnderwayToAddressSafetyRelatedIncidentsOfYourPlantsAndOfficesThatWereAssessedIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfAnyCorrectiveActionTakenOrUnderwayToAddressSafetyRelatedIncidentsOfYourPlantsAndOfficesThatWereAssessed",
        "RemarksForAssuranceOfDetailsOfAnyCorrectiveActionTakenOrUnderwayToAddressSafetyRelatedIncidentsOfYourPlantsAndOfficesThatWereAssessed",
        "P3Q15 - Corrective Action (E)"
    )
]

# Leadership Indicators
leadership_questions = [
    (
        "AssurerHasAssuredWhetherTheEntityExtendAnyLifeInsuranceOrAnyCompensatoryPackageInTheEventOfDeathOfEmployees",
        "AssuranceSubTypeForWhetherTheEntityExtendAnyLifeInsuranceOrAnyCompensatoryPackageInTheEventOfDeathOfEmployees",
        "RemarksForAssuranceOfWhetherTheEntityExtendAnyLifeInsuranceOrAnyCompensatoryPackageInTheEventOfDeathOfEmployees",
        "P3Q16 - Death Compensation (L)"
    ),
    (
        "WhetherDetailsOfMeasuresUndertakenByTheEntityToEnsureThatStatutoryDuesHaveBeenDeductedAndDepositedByTheValueChainPartnersIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfMeasuresUndertakenByTheEntityToEnsureThatStatutoryDuesHaveBeenDeductedAndDepositedByTheValueChainPartners",
        "RemarksForAssuranceOfDetailsOfMeasuresUndertakenByTheEntityToEnsureThatStatutoryDuesHaveBeenDeductedAndDepositedByTheValueChainPartners",
        "P3Q17 - Statutory Dues (L)"
    ),
    (
        "WhetherDetailsOfNumberOfEmployeesOrWorkersHavingSufferedHighConsequenceWorkRelatedInjuryOrIllHealthOrFatalitiesWhoOrWhoseFamilyMembersAreRehabilitatedAndPlacedInSuitableEmploymentIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfNumberOfEmployeesOrWorkersHavingSufferedHighConsequenceWorkRelatedInjuryOrIllHealthOrFatalitiesWhoOrWhoseFamilyMembersAreRehabilitatedAndPlacedInSuitableEmployment",
        "RemarksForAssuranceOfDetailsOfNumberOfEmployeesOrWorkersHavingSufferedHighConsequenceWorkRelatedInjuryOrIllHealthOrFatalitiesWhoOrWhoseFamilyMembersAreRehabilitatedAndPlacedInSuitableEmployment",
        "P3Q18 - Rehabilitated Workers (L)"
    ),
    (
        "AssurerHasAssuredWhetherTheEntityProvideTransitionAssistanceProgramsToFacilitateContinuedEmployabilityAndTheManagementOfCareerEndingsResultingFromRetirementOrTerminationOfEmployment",
        "AssuranceSubTypeForWhetherTheEntityProvideTransitionAssistanceProgramsToFacilitateContinuedEmployabilityAndTheManagementOfCareerEndingsResultingFromRetirementOrTerminationOfEmployment",
        "RemarksForAssuranceOfWhetherTheEntityProvideTransitionAssistanceProgramsToFacilitateContinuedEmployabilityAndTheManagementOfCareerEndingsResultingFromRetirementOrTerminationOfEmployment",
        "P3Q19 - Transition Assistance (L)"
    ),
    (
        "WhetherDetailsOnAssessmentOfValueChainPartnersP3IsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOnAssessmentOfValueChainPartnersP3",
        "RemarksForAssuranceOfDetailsOnAssessmentOfValueChainPartnersP3",
        "P3Q20 - Value Chain Assessment (L)"
    ),
    (
        "WhetherDetailsOfAnyCorrectiveActionTakenOrUnderwayToAddressSafetyRelatedIncidentsOnAssessmentOfValueChainPartnersIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfAnyCorrectiveActionTakenOrUnderwayToAddressSafetyRelatedIncidentsOnAssessmentOfValueChainPartners",
        "RemarksForAssuranceOfDetailsOfAnyCorrectiveActionTakenOrUnderwayToAddressSafetyRelatedIncidentsOnAssessmentOfValueChainPartners",
        "P3Q21 - VC Corrective Action (L)"
    )
]

# Loop through each file
for file in tqdm(os.listdir(folder_path)):
    if not file.endswith(".xlsx"):
        continue
    df = pd.read_excel(os.path.join(folder_path, file))
    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany")
    ]
    for field in p3_flags:
        row.append(get_val(df, field))
    for f_w, f_s, f_r, _ in essential_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))
    for f_w, f_s, f_r, _ in leadership_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))
    principle3_data.append(row)

# Build final DataFrame
columns = ["CIN", "Company"]
columns += list(p3_flags.values())
for _, _, _, label in essential_questions + leadership_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

df_p3 = pd.DataFrame(principle3_data, columns=columns)
df_p3 = df_p3.sort_values(by="Company")


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [07:16<00:00,  2.69it/s]


In [72]:
df_p3.shape

(1174, 67)

In [73]:
df_p3

Unnamed: 0,CIN,Company,P3 - Overall (E),P3 - Overall (L),P3Q1 - Wellbeing & Spend (E) - Whether,P3Q1 - Wellbeing & Spend (E) - Subtype,P3Q1 - Wellbeing & Spend (E) - Remarks,P3Q2 - Retirement Benefits (E) - Whether,P3Q2 - Retirement Benefits (E) - Subtype,P3Q2 - Retirement Benefits (E) - Remarks,...,P3Q18 - Rehabilitated Workers (L) - Remarks,P3Q19 - Transition Assistance (L) - Whether,P3Q19 - Transition Assistance (L) - Subtype,P3Q19 - Transition Assistance (L) - Remarks,P3Q20 - Value Chain Assessment (L) - Whether,P3Q20 - Value Chain Assessment (L) - Subtype,P3Q20 - Value Chain Assessment (L) - Remarks,P3Q21 - VC Corrective Action (L) - Whether,P3Q21 - VC Corrective Action (L) - Subtype,P3Q21 - VC Corrective Action (L) - Remarks
0,L74140MH2008PLC177884,360 One Wam Limited,,,,,,,,,...,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,,,,,,,,,...,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,,,,,,,,,...,,,,,,,,,,
3,L67190MH2007PLC289249,5paisa Capital Limited,,,,,,,,,...,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,L24231GJ2000PLC038352,Zota Health Care Limited,,,,,,,,,...,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,All,All,true,Reasonable,,true,Reasonable,,...,,true,Reasonable,,true,Reasonable,,true,Reasonable,
1173,L15201GJ1994PLC023490,Zydus Wellness Limited,Partial,Partial,true,Reasonable,,false,,,...,,false,,,false,,,false,,
279,L72200MH2000PLC125319,eClerx Services Limited,,,,,,,,,...,,,,,,,,,,


In [74]:
df_p3.to_excel('principle3_assurance.xlsx', index=False)

In [78]:
import os
import pandas as pd
from tqdm import tqdm

principle4_data = []

def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Overall flags
p4_flags = {
    'TypeOfAssuranceForPrinciple4EssentialIndicators': 'P4 - Overall (E)',
    'TypeOfAssuranceForPrinciple4LeadershipIndicators': 'P4 - Overall (L)'
}

# Essential Indicators (2 Questions)
essential_questions = [
    (
        "WhetherTheProcessesForIdentifyingKeyStakeholderGroupsOfTheEntityIsAssuredByAssurer",
        "AssuranceSubTypeForTheProcessesForIdentifyingKeyStakeholderGroupsOfTheEntity",
        "RemarksForAssuranceOfTheProcessesForIdentifyingKeyStakeholderGroupsOfTheEntity",
        "P4Q1 - Stakeholder Identification Process (E)"
    ),
    (
        "WhetherListStakeholderGroupsIdentifiedAsKeyForYourEntityAndTheFrequencyOfEngagementWithEachStakeholderGroupIsAssuredByAssurer",
        "AssuranceSubTypeForListStakeholderGroupsIdentifiedAsKeyForYourEntityAndTheFrequencyOfEngagementWithEachStakeholderGroup",
        "RemarksForAssuranceOfListStakeholderGroupsIdentifiedAsKeyForYourEntityAndTheFrequencyOfEngagementWithEachStakeholderGroup",
        "P4Q2 - Key Stakeholder Groups & Engagement (E)"
    )
]

# Leadership Indicators (3 Questions)
leadership_questions = [
    (
        "WhetherTheProcessesForConsultationBetweenStakeholdersAndTheBoardOnEconomicEnvironmentalAndSocialTopicsOrIfConsultationIsDelegatedHowIsFeedbackFromSuchConsultationsProvidedToTheBoardIsAssuredByAssurer",
        "AssuranceSubTypeForTheProcessesForConsultationBetweenStakeholdersAndTheBoardOnEconomicEnvironmentalAndSocialTopicsOrIfConsultationIsDelegatedHowIsFeedbackFromSuchConsultationsProvidedToTheBoard",
        "RemarksForAssuranceOfTheProcessesForConsultationBetweenStakeholdersAndTheBoardOnEconomicEnvironmentalAndSocialTopicsOrIfConsultationIsDelegatedHowIsFeedbackFromSuchConsultationsProvidedToTheBoard",
        "P4Q3 - Consultation Process & Board Feedback (L)"
    ),
    (
        "AssurerHasAssuredWhetherStakeholderConsultationIsUsedToSupportTheIdentificationAndManagementOfEnvironmentalAndSocialTopics",
        "AssuranceSubTypeForWhetherStakeholderConsultationIsUsedToSupportTheIdentificationAndManagementOfEnvironmentalAndSocialTopics",
        "RemarksForAssuranceOfWhetherStakeholderConsultationIsUsedToSupportTheIdentificationAndManagementOfEnvironmentalAndSocialTopics",
        "P4Q4 - Stakeholder Consultation Use (L)"
    ),
    (
        "WhetherDetailsOfInstancesOfEngagementWithAndActionsTakenToAddressTheConcernsOfVulnerableOrMarginalizedStakeholderGroupsIsAssuredByAssurer",
        "AssuranceSubTypeForDetailsOfInstancesOfEngagementWithAndActionsTakenToAddressTheConcernsOfVulnerableOrMarginalizedStakeholderGroups",
        "RemarksForAssuranceOfDetailsOfInstancesOfEngagementWithAndActionsTakenToAddressTheConcernsOfVulnerableOrMarginalizedStakeholderGroups",
        "P4Q5 - Engagement with Marginalized Stakeholders (L)"
    )
]


# Loop through files
for file in tqdm(os.listdir(folder_path)):
    if not file.endswith(".xlsx"):
        continue
    df = pd.read_excel(os.path.join(folder_path, file))

    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany")
    ]

    for field in p4_flags:
        row.append(get_val(df, field))

    for f_w, f_s, f_r, _ in essential_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    for f_w, f_s, f_r, _ in leadership_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    principle4_data.append(row)

# Build column headers
columns = ["CIN", "Company"]
columns += list(p4_flags.values())

for _, _, _, label in essential_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

for _, _, _, label in leadership_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

df_p4 = pd.DataFrame(principle4_data, columns=columns)
df_p4 = df_p4.sort_values(by="Company")


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [06:05<00:00,  3.21it/s]


In [79]:
df_p4.shape

(1174, 19)

In [80]:
df_p4

Unnamed: 0,CIN,Company,P4 - Overall (E),P4 - Overall (L),P4Q1 - Stakeholder Identification Process (E) - Whether,P4Q1 - Stakeholder Identification Process (E) - Subtype,P4Q1 - Stakeholder Identification Process (E) - Remarks,P4Q2 - Key Stakeholder Groups & Engagement (E) - Whether,P4Q2 - Key Stakeholder Groups & Engagement (E) - Subtype,P4Q2 - Key Stakeholder Groups & Engagement (E) - Remarks,P4Q3 - Consultation Process & Board Feedback (L) - Whether,P4Q3 - Consultation Process & Board Feedback (L) - Subtype,P4Q3 - Consultation Process & Board Feedback (L) - Remarks,P4Q4 - Stakeholder Consultation Use (L) - Whether,P4Q4 - Stakeholder Consultation Use (L) - Subtype,P4Q4 - Stakeholder Consultation Use (L) - Remarks,P4Q5 - Engagement with Marginalized Stakeholders (L) - Whether,P4Q5 - Engagement with Marginalized Stakeholders (L) - Subtype,P4Q5 - Engagement with Marginalized Stakeholders (L) - Remarks
0,L74140MH2008PLC177884,360 One Wam Limited,,,,,,,,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,,,,,,,,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,,,,,,,,,,,,,,,,,
3,L67190MH2007PLC289249,5paisa Capital Limited,,,,,,,,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,L24231GJ2000PLC038352,Zota Health Care Limited,,,,,,,,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,All,All,true,Reasonable,,true,Reasonable,,true,Reasonable,,true,Reasonable,,true,Reasonable,
1173,L15201GJ1994PLC023490,Zydus Wellness Limited,Partial,Partial,false,,,false,,,false,,,false,,,false,,
279,L72200MH2000PLC125319,eClerx Services Limited,,,,,,,,,,,,,,,,,


In [84]:
df_p4.to_excel('principle4_assurance.xlsx', index=False)

In [81]:
import os
import pandas as pd
from tqdm import tqdm

principle5_data = []

def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Overall flags
p5_flags = {
    'TypeOfAssuranceForPrinciple5EssentialIndicators': 'P5 - Overall (E)',
    'TypeOfAssuranceForPrinciple5LeadershipIndicators': 'P5 - Overall (L)'
}

# Essential Indicators (11 Questions)
essential_questions = [
    ("WhetherEmployeesAndWorkersWhoHaveBeenProvidedTrainingOnHumanRightsIssuesAndPoliciesOfTheEntityIsAssuredByAssurer",
     "AssuranceSubTypeForEmployeesAndWorkersWhoHaveBeenProvidedTrainingOnHumanRightsIssuesAndPoliciesOfTheEntity",
     "RemarksForAssuranceOfEmployeesAndWorkersWhoHaveBeenProvidedTrainingOnHumanRightsIssuesAndPoliciesOfTheEntity",
     "P5Q1 - Human Rights Training (E)"),
    ("WhetherDetailsOfMinimumWagesPaidToEmployeesAndWorkersIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfMinimumWagesPaidToEmployeesAndWorkers",
     "RemarksForAssuranceOfDetailsOfMinimumWagesPaidToEmployeesAndWorkers",
     "P5Q2 - Minimum Wages (E)"),
    ("WhetherDetailsOfMedianOfRemunerationOrSalaryOrWagesAndWagesPaidToFemaleIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfMedianOfRemunerationOrSalaryOrWagesAndWagesPaidToFemale",
     "RemarksForAssuranceOfDetailsOfMedianOfRemunerationOrSalaryOrWagesAndWagesPaidToFemale",
     "P5Q3 - Remuneration Details (E)"),
    ("AssurerHasAssuredWhetherDoYouHaveAFocalPointResponsibleForAddressingHumanRightsImpactsOrIssuesCausedOrContributedToByTheBusiness",
     "AssuranceSubTypeForWhetherDoYouHaveAFocalPointResponsibleForAddressingHumanRightsImpactsOrIssuesCausedOrContributedToByTheBusiness",
     "RemarksForAssuranceOfWhetherDoYouHaveAFocalPointResponsibleForAddressingHumanRightsImpactsOrIssuesCausedOrContributedToByTheBusiness",
     "P5Q4 - Focal Point for HR (E)"),
    ("WhetherTheInternalMechanismsInPlaceToRedressGrievancesRelatedToHumanRightsIssuesIsAssuredByAssurer",
     "AssuranceSubTypeForTheInternalMechanismsInPlaceToRedressGrievancesRelatedToHumanRightsIssues",
     "RemarksForAssuranceOfTheInternalMechanismsInPlaceToRedressGrievancesRelatedToHumanRightsIssues",
     "P5Q5 - Internal Grievance Redressal (E)"),
    ("WhetherDetailsOfComplaintsMadeByEmployeesAndWorkersIsAssuredByAssurerAsPerP5",
     "AssuranceSubTypeForDetailsOfComplaintsMadeByEmployeesAndWorkersAsPerP5",
     "RemarksForAssuranceOfDetailsOfComplaintsMadeByEmployeesAndWorkersAsPerP5",
     "P5Q6 - Complaints (E)"),
    ("WhetherComplaintsFiledUnderTheSexualHarassmentOfWomenAtWorkplaceIsAssuredByAssurer",
     "AssuranceSubTypeForComplaintsFiledUnderTheSexualHarassmentOfWomenAtWorkplace",
     "RemarksForAssuranceOfComplaintsFiledUnderTheSexualHarassmentOfWomenAtWorkplace",
     "P5Q7 - Sexual Harassment Cases (E)"),
    ("WhetherMechanismsToPreventAdverseConsequencesToTheComplainantInDiscriminationAndHarassmentCasesIsAssuredByAssurer",
     "AssuranceSubTypeForMechanismsToPreventAdverseConsequencesToTheComplainantInDiscriminationAndHarassmentCases",
     "RemarksForAssuranceOfMechanismsToPreventAdverseConsequencesToTheComplainantInDiscriminationAndHarassmentCases",
     "P5Q8 - Protection for Complainants (E)"),
    ("WhetherHumanRightsRequirementsFormPartOfYourBusinessAgreementsAndContractsIsAssuredByAssurer",
     "AssuranceSubTypeForHumanRightsRequirementsFormPartOfYourBusinessAgreementsAndContracts",
     "RemarksForAssuranceOfHumanRightsRequirementsFormPartOfYourBusinessAgreementsAndContracts",
     "P5Q9 - HR in Agreements (E)"),
    ("WhetherAssessmentsOfYourPlantsAndOfficesThatWereAssessedForTheYearP5IsAssuredByAssurer",
     "AssuranceSubTypeForAssessmentsOfYourPlantsAndOfficesThatWereAssessedForTheYearP5",
     "RemarksForAssuranceOfAssessmentsOfYourPlantsAndOfficesThatWereAssessedForTheYearP5",
     "P5Q10 - Assessments (E)"),
    ("WhetherDetailsOfAnyCorrectiveActionsTakenOrUnderwayToAddressSignificantRisksOrConcernsArisingFromTheAssessmentsOfPlantAndOfficeIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfAnyCorrectiveActionsTakenOrUnderwayToAddressSignificantRisksOrConcernsArisingFromTheAssessmentsOfPlantAndOffice",
     "RemarksForAssuranceOfDetailsOfAnyCorrectiveActionsTakenOrUnderwayToAddressSignificantRisksOrConcernsArisingFromTheAssessmentsOfPlantAndOffice",
     "P5Q11 - Corrective Actions on Plant Assessment (E)")
]

# Leadership Indicators (5 Questions)
leadership_questions = [
    ("WhetherDetailsOfABusinessProcessBeingModifiedOrIntroducedAsAResultOfAddressingHumanRightsGrievancesOrComplaintsIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfABusinessProcessBeingModifiedOrIntroducedAsAResultOfAddressingHumanRightsGrievancesOrComplaints",
     "RemarksForAssuranceOfDetailsOfABusinessProcessBeingModifiedOrIntroducedAsAResultOfAddressingHumanRightsGrievancesOrComplaints",
     "P5Q12 - Business Process Modified (L)"),
    ("WhetherDetailsOfTheScopeAndCoverageOfAnyHumanRightsDueDiligenceConductedIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfTheScopeAndCoverageOfAnyHumanRightsDueDiligenceConducted",
     "RemarksForAssuranceOfDetailsOfTheScopeAndCoverageOfAnyHumanRightsDueDiligenceConducted",
     "P5Q13 - HR Due Diligence (L)"),
    ("AssurerHasAssuredWhetherThePremiseOrOfficeOfTheEntityAccessibleToDifferentlyAbledVisitorsAsPerTheRequirementsOfTheRightsOfPersonsWithDisabilitiesAct2016",
     "AssuranceSubTypeForWhetherThePremiseOrOfficeOfTheEntityAccessibleToDifferentlyAbledVisitorsAsPerTheRequirementsOfTheRightsOfPersonsWithDisabilitiesAct2016",
     "RemarksForAssuranceOfWhetherThePremiseOrOfficeOfTheEntityAccessibleToDifferentlyAbledVisitorsAsPerTheRequirementsOfTheRightsOfPersonsWithDisabilitiesAct2016",
     "P5Q14 - Accessibility of Office for Visitors (L)"),
    ("WhetherDetailsOnAssessmentOfValueChainPartnersP5IsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOnAssessmentOfValueChainPartnersP5",
     "RemarksForAssuranceOfDetailsOnAssessmentOfValueChainPartnersP5",
     "P5Q15 - VC Partner Assessment (L)"),
    ("WhetherDetailsOfAnyCorrectiveActionsTakenOrUnderwayToAddressSignificantRisksOrConcernsArisingFromTheAssessmentsOfValueChainPartnerIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfAnyCorrectiveActionsTakenOrUnderwayToAddressSignificantRisksOrConcernsArisingFromTheAssessmentsOfValueChainPartner",
     "RemarksForAssuranceOfDetailsOfAnyCorrectiveActionsTakenOrUnderwayToAddressSignificantRisksOrConcernsArisingFromTheAssessmentsOfValueChainPartner",
     "P5Q16 - VC Partner Corrective Actions (L)")
]

for file in tqdm(os.listdir(folder_path)):
    if not file.endswith(".xlsx"):
        continue
    df = pd.read_excel(os.path.join(folder_path, file))

    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany")
    ]

    # Add overall flags
    for field in p5_flags:
        row.append(get_val(df, field))

    # Essential Indicators
    for f_w, f_s, f_r, _ in essential_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    # Leadership Indicators
    for f_w, f_s, f_r, _ in leadership_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    principle5_data.append(row)

# Build column headers
columns = ["CIN", "Company"]
columns += list(p5_flags.values())

for _, _, _, label in essential_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

for _, _, _, label in leadership_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

df_p5 = pd.DataFrame(principle5_data, columns=columns)
df_p5 = df_p5.sort_values(by="Company")


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [06:48<00:00,  2.87it/s]


In [82]:
df_p5.shape

(1174, 52)

In [83]:
df_p5

Unnamed: 0,CIN,Company,P5 - Overall (E),P5 - Overall (L),P5Q1 - Human Rights Training (E) - Whether,P5Q1 - Human Rights Training (E) - Subtype,P5Q1 - Human Rights Training (E) - Remarks,P5Q2 - Minimum Wages (E) - Whether,P5Q2 - Minimum Wages (E) - Subtype,P5Q2 - Minimum Wages (E) - Remarks,...,P5Q13 - HR Due Diligence (L) - Remarks,P5Q14 - Accessibility of Office for Visitors (L) - Whether,P5Q14 - Accessibility of Office for Visitors (L) - Subtype,P5Q14 - Accessibility of Office for Visitors (L) - Remarks,P5Q15 - VC Partner Assessment (L) - Whether,P5Q15 - VC Partner Assessment (L) - Subtype,P5Q15 - VC Partner Assessment (L) - Remarks,P5Q16 - VC Partner Corrective Actions (L) - Whether,P5Q16 - VC Partner Corrective Actions (L) - Subtype,P5Q16 - VC Partner Corrective Actions (L) - Remarks
0,L74140MH2008PLC177884,360 One Wam Limited,,,,,,,,,...,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,,,,,,,,,...,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,,,,,,,,,...,,,,,,,,,,
3,L67190MH2007PLC289249,5paisa Capital Limited,,,,,,,,,...,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,L24231GJ2000PLC038352,Zota Health Care Limited,,,,,,,,,...,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,All,All,true,Reasonable,,true,Reasonable,,...,,true,Reasonable,,true,Reasonable,,true,Reasonable,
1173,L15201GJ1994PLC023490,Zydus Wellness Limited,Partial,Partial,false,,,false,,,...,,false,,,false,,,false,,
279,L72200MH2000PLC125319,eClerx Services Limited,,,,,,,,,...,,,,,,,,,,


In [85]:
df_p5.to_excel('principle5_assurance.xlsx', index=False)

In [89]:
import os
import pandas as pd
from tqdm import tqdm

principle6_data = []

def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Overall flags
p6_flags = {
    'TypeOfAssuranceForPrinciple6EssentialIndicators': 'P6 - Overall (E)',
    'TypeOfAssuranceForPrinciple6LeadershipIndicators': 'P6 - Overall (L)'
}

# Essential Questions (13)
essential_questions = [
    ("WhetherDetailsOfTotalEnergyConsumptionInJoulesOrMultiplesAndEnergyIntensityIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfTotalEnergyConsumptionInJoulesOrMultiplesAndEnergyIntensity",
     "RemarksForAssuranceOfDetailsOfTotalEnergyConsumptionInJoulesOrMultiplesAndEnergyIntensity",
     "P6Q1 - Energy Consumption (E)"),

    ("AssurerHasAssuredWhetherTheEntityHaveAnySitesOrFacilitiesIdentifiedAsDesignatedConsumersUnderThePerformanceAchieveAndTradeSchemeOfTheGovernmentOfIndia",
     "AssuranceSubTypeForWhetherTheEntityHaveAnySitesOrFacilitiesIdentifiedAsDesignatedConsumersUnderThePerformanceAchieveAndTradeSchemeOfTheGovernmentOfIndia",
     "RemarksForAssuranceOfWhetherTheEntityHaveAnySitesOrFacilitiesIdentifiedAsDesignatedConsumersUnderThePerformanceAchieveAndTradeSchemeOfTheGovernmentOfIndia",
     "P6Q2 - PAT Scheme Sites (E)"),

    ("WhetherDetailsOfTheDisclosuresRelatedToWaterWithdrawalIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfTheDisclosuresRelatedToWaterWithdrawal",
     "RemarksForAssuranceOfDetailsOfTheDisclosuresRelatedToWaterWithdrawal",
     "P6Q3 - Water Withdrawal (E)"),

    ("WhetherDetailsOfTheDisclosuresRelatedToWaterDischargedIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfTheDisclosuresRelatedToWaterDischarged",
     "RemarksForAssuranceOfDetailsOfTheDisclosuresRelatedToWaterDischarged",
     "P6Q4 - Water Discharge (E)"),

    ("WhetherTheEntityImplementedAMechanismForZeroLiquidDischargeIsAssuredByAssurer",
     "AssuranceSubTypeForTheEntityImplementedAMechanismForZeroLiquidDischarge",
     "RemarksForAssuranceOfTheEntityImplementedAMechanismForZeroLiquidDischarge",
     "P6Q5 - Zero Liquid Discharge (E)"),

    ("WhetherDetailsOfAirEmissionsOtherThanGhgEmissionsByTheEntityIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfAirEmissionsOtherThanGhgEmissionsByTheEntity",
     "RemarksForAssuranceOfDetailsOfAirEmissionsOtherThanGhgEmissionsByTheEntity",
     "P6Q6 - Other Air Emissions (E)"),

    ("WhetherDetailsOfGreenHouseGasEmissionsAndItsIntensityIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfGreenHouseGasEmissionsAndItsIntensity",
     "RemarksForAssuranceOfDetailsOfGreenHouseGasEmissionsAndItsIntensity",
     "P6Q7 - GHG Emissions (E)"),

    ("AssurerHasAssuredWhetherTheEntityHaveAnyProjectRelatedToReducingGreenHouseGasEmission",
     "AssuranceSubTypeForWhetherTheEntityHaveAnyProjectRelatedToReducingGreenHouseGasEmission",
     "RemarksForAssuranceOfWhetherTheEntityHaveAnyProjectRelatedToReducingGreenHouseGasEmission",
     "P6Q8 - GHG Reduction Projects (E)"),

    ("WhetherDetailsRelatedToWasteManagementByTheEntityIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsRelatedToWasteManagementByTheEntity",
     "RemarksForAssuranceOfDetailsRelatedToWasteManagementByTheEntity",
     "P6Q9 - Waste Management (E)"),

    ("WhetherDetailsOfWasteManagementPracticesAdoptedInYourEstablishmentsAndTheStrategyAdoptedByCompanyToReduceUsageOfHazardousAndToxicChemicalsIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfWasteManagementPracticesAdoptedInYourEstablishmentsAndTheStrategyAdoptedByCompanyToReduceUsageOfHazardousAndToxicChemicals",
     "RemarksForAssuranceOfDetailsOfWasteManagementPracticesAdoptedInYourEstablishmentsAndTheStrategyAdoptedByCompanyToReduceUsageOfHazardousAndToxicChemicals",
     "P6Q10 - Hazardous Waste Strategy (E)"),

    ("WhetherDetailsOfOperationsOrOfficesInOrAroundEcologicallySensitiveAreasWhereEnvironmentalApprovalsOrClearancesAreRequiredIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfOperationsOrOfficesInOrAroundEcologicallySensitiveAreasWhereEnvironmentalApprovalsOrClearancesAreRequired",
     "RemarksForAssuranceOfDetailsOfOperationsOrOfficesInOrAroundEcologicallySensitiveAreasWhereEnvironmentalApprovalsOrClearancesAreRequired",
     "P6Q11 - Eco-Sensitive Area Ops (E)"),

    ("WhetherDetailsOfEnvironmentalImpactAssessmentsOfProjectsUndertakenByTheEntityBasedOnApplicableLawsIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfEnvironmentalImpactAssessmentsOfProjectsUndertakenByTheEntityBasedOnApplicableLaws",
     "RemarksForAssuranceOfDetailsOfEnvironmentalImpactAssessmentsOfProjectsUndertakenByTheEntityBasedOnApplicableLaws",
     "P6Q12 - EIAs Undertaken (E)"),

    ("AssurerHasAssuredWhetherTheEntityCompliantWithTheApplicableEnvironmentalLaw",
     "AssuranceSubTypeForWhetherTheEntityCompliantWithTheApplicableEnvironmentalLaw",
     "RemarksForAssuranceOfWhetherTheEntityCompliantWithTheApplicableEnvironmentalLaw",
     "P6Q13 - Environmental Compliance (E)")
]

# Leadership Questions (7)
leadership_questions = [
    ("WhetherWaterWithdrawalOrConsumptionAndDischargeInAreasOfWaterStressInKilolitresIsAssuredByAssurer",
     "AssuranceSubTypeForWaterWithdrawalOrConsumptionAndDischargeInAreasOfWaterStressInKilolitres",
     "RemarksForAssuranceOfWaterWithdrawalOrConsumptionAndDischargeInAreasOfWaterStressInKilolitres",
     "P6Q14 - Water Stress Zones (L)"),

    ("WhetherDetailsOfTotalScope3EmissionsAndItsIntensityIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfTotalScope3EmissionsAndItsIntensity",
     "RemarksForAssuranceOfDetailsOfTotalScope3EmissionsAndItsIntensity",
     "P6Q15 - Scope 3 Emissions (L)"),

    ("WhetherDetailsOfSignificantDirectAndIndirectImpactOfTheEntityOnBiodiversityInSuchAreasAlongWithPreventionAndRemediationActivitiesIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfSignificantDirectAndIndirectImpactOfTheEntityOnBiodiversityInSuchAreasAlongWithPreventionAndRemediationActivities",
     "RemarksForAssuranceOfDetailsOfSignificantDirectAndIndirectImpactOfTheEntityOnBiodiversityInSuchAreasAlongWithPreventionAndRemediationActivities",
     "P6Q16 - Biodiversity Impact (L)"),

    ("WhetherTheEntityHasUndertakenAnySpecificInitiativesOrUsedInnovativeTechnologyOrSolutionsToImproveResourceEfficiencyIsAssuredByAssurer",
     "AssuranceSubTypeForTheEntityHasUndertakenAnySpecificInitiativesOrUsedInnovativeTechnologyOrSolutionsToImproveResourceEfficiency",
     "RemarksForAssuranceOfTheEntityHasUndertakenAnySpecificInitiativesOrUsedInnovativeTechnologyOrSolutionsToImproveResourceEfficiency",
     "P6Q17 - Resource Efficiency Initiatives (L)"),

    ("AssurerHasAssuredWhetherTheEntityHaveABusinessContinuityAndDisasterManagementPlan",
     "AssuranceSubTypeForWhetherTheEntityHaveABusinessContinuityAndDisasterManagementPlan",
     "RemarksForAssuranceOfWhetherTheEntityHaveABusinessContinuityAndDisasterManagementPlan",
     "P6Q18 - BCP/DR Plan (L)"),

    ("WhetherDiscloseAnySignificantAdverseImpactToTheEnvironmentArisingFromTheValueChainOfTheEntityWhatMitigationOrAdaptationMeasuresHaveBeenTakenByTheEntityInThisRegardIsAssuredByAssurer",
     "AssuranceSubTypeForDiscloseAnySignificantAdverseImpactToTheEnvironmentArisingFromTheValueChainOfTheEntityWhatMitigationOrAdaptationMeasuresHaveBeenTakenByTheEntityInThisRegard",
     "RemarksForAssuranceOfDiscloseAnySignificantAdverseImpactToTheEnvironmentArisingFromTheValueChainOfTheEntityWhatMitigationOrAdaptationMeasuresHaveBeenTakenByTheEntityInThisRegard",
     "P6Q19 - Value Chain Impact (L)"),

    ("WhetherPercentageOfValueChainPartnersByValueOfBusinessDoneWithSuchPartnersThatWereAssessedForEnvironmentalImpactsIsAssuredByAssurer",
     "AssuranceSubTypeForPercentageOfValueChainPartnersByValueOfBusinessDoneWithSuchPartnersThatWereAssessedForEnvironmentalImpacts",
     "RemarksForAssuranceOfPercentageOfValueChainPartnersByValueOfBusinessDoneWithSuchPartnersThatWereAssessedForEnvironmentalImpacts",
     "P6Q20 - Value Chain Partner Assessment (L)")
]

# Loop through files
for file in tqdm(os.listdir(folder_path)):
    if not file.endswith(".xlsx"):
        continue
    df = pd.read_excel(os.path.join(folder_path, file))

    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany")
    ]

    # Overall flags
    for field in p6_flags:
        row.append(get_val(df, field))

    # Essential
    for f_w, f_s, f_r, _ in essential_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    # Leadership
    for f_w, f_s, f_r, _ in leadership_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    principle6_data.append(row)

# Column headers
columns = ["CIN", "Company"]
columns += list(p6_flags.values())

for _, _, _, label in essential_questions + leadership_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

df_p6 = pd.DataFrame(principle6_data, columns=columns)
df_p6 = df_p6.sort_values(by="Company")


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [06:47<00:00,  2.88it/s]


In [90]:
df_p6.shape

(1174, 64)

In [91]:
df_p6

Unnamed: 0,CIN,Company,P6 - Overall (E),P6 - Overall (L),P6Q1 - Energy Consumption (E) - Whether,P6Q1 - Energy Consumption (E) - Subtype,P6Q1 - Energy Consumption (E) - Remarks,P6Q2 - PAT Scheme Sites (E) - Whether,P6Q2 - PAT Scheme Sites (E) - Subtype,P6Q2 - PAT Scheme Sites (E) - Remarks,...,P6Q17 - Resource Efficiency Initiatives (L) - Remarks,P6Q18 - BCP/DR Plan (L) - Whether,P6Q18 - BCP/DR Plan (L) - Subtype,P6Q18 - BCP/DR Plan (L) - Remarks,P6Q19 - Value Chain Impact (L) - Whether,P6Q19 - Value Chain Impact (L) - Subtype,P6Q19 - Value Chain Impact (L) - Remarks,P6Q20 - Value Chain Partner Assessment (L) - Whether,P6Q20 - Value Chain Partner Assessment (L) - Subtype,P6Q20 - Value Chain Partner Assessment (L) - Remarks
0,L74140MH2008PLC177884,360 One Wam Limited,,,,,,,,,...,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,,,,,,,,,...,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,,,,,,,,,...,,,,,,,,,,
3,L67190MH2007PLC289249,5paisa Capital Limited,,,,,,,,,...,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,L24231GJ2000PLC038352,Zota Health Care Limited,,,,,,,,,...,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,All,All,true,Reasonable,,true,Reasonable,,...,,true,Reasonable,,true,Reasonable,,true,Reasonable,
1173,L15201GJ1994PLC023490,Zydus Wellness Limited,Partial,Partial,true,Reasonable,,false,,,...,,false,,,false,,,false,,
279,L72200MH2000PLC125319,eClerx Services Limited,,,,,,,,,...,,,,,,,,,,


In [92]:
df_p6.to_excel('principle6_assurance.xlsx', index=False)

In [94]:
import os
import pandas as pd
from tqdm import tqdm

principle7_data = []

def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Overall flags
p7_flags = {
    'TypeOfAssuranceForPrinciple7EssentialIndicators': 'P7 - Overall (E)',
    'TypeOfAssuranceForPrinciple7LeadershipIndicators': 'P7 - Overall (L)'
}

# Essential Indicators (2)
essential_questions = [
    ("WhetherTheEntityIsAMemberOfOrAffiliatedToTradeAndIndustryChambersOrAssociationsDeterminedBasedOnTheTotalMembersOfSuchBodyIsAssuredByAssurer",
     "AssuranceSubTypeForTheEntityIsAMemberOfOrAffiliatedToTradeAndIndustryChambersOrAssociationsDeterminedBasedOnTheTotalMembersOfSuchBody",
     "RemarksForAssuranceOfTheEntityIsAMemberOfOrAffiliatedToTradeAndIndustryChambersOrAssociationsDeterminedBasedOnTheTotalMembersOfSuchBody",
     "P7Q1 - Chamber Membership (E)"),

    ("WhetherDetailsOfCorrectiveActionTakenOrUnderwayOnAnyIssuesRelatedToAntiCompetitiveConductByTheEntityBasedOnAdverseOrdersFromRegulatoryAuthoritiesIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfCorrectiveActionTakenOrUnderwayOnAnyIssuesRelatedToAntiCompetitiveConductByTheEntityBasedOnAdverseOrdersFromRegulatoryAuthorities",
     "RemarksForAssuranceOfDetailsOfCorrectiveActionTakenOrUnderwayOnAnyIssuesRelatedToAntiCompetitiveConductByTheEntityBasedOnAdverseOrdersFromRegulatoryAuthorities",
     "P7Q2 - Anti-Competitive Conduct (E)")
]

# Leadership Indicators (1)
leadership_questions = [
    ("WhetherDetailsOfPublicPolicyPositionsAdvocatedByTheEntityIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfPublicPolicyPositionsAdvocatedByTheEntity",
     "RemarksForAssuranceOfDetailsOfPublicPolicyPositionsAdvocatedByTheEntity",
     "P7Q3 - Public Policy Advocacy (L)")
]

# Loop through files
for file in tqdm(os.listdir(folder_path)):
    if not file.endswith(".xlsx"):
        continue
    df = pd.read_excel(os.path.join(folder_path, file))

    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany")
    ]

    # Overall flags
    for field in p7_flags:
        row.append(get_val(df, field))

    # Essential
    for f_w, f_s, f_r, _ in essential_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    # Leadership
    for f_w, f_s, f_r, _ in leadership_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    principle7_data.append(row)

# Column headers
columns = ["CIN", "Company"]
columns += list(p7_flags.values())

for _, _, _, label in essential_questions + leadership_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

df_p7 = pd.DataFrame(principle7_data, columns=columns)
df_p7 = df_p7.sort_values(by="Company")


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [06:10<00:00,  3.17it/s]


In [95]:
df_p7.shape

(1174, 13)

In [96]:
df_p7

Unnamed: 0,CIN,Company,P7 - Overall (E),P7 - Overall (L),P7Q1 - Chamber Membership (E) - Whether,P7Q1 - Chamber Membership (E) - Subtype,P7Q1 - Chamber Membership (E) - Remarks,P7Q2 - Anti-Competitive Conduct (E) - Whether,P7Q2 - Anti-Competitive Conduct (E) - Subtype,P7Q2 - Anti-Competitive Conduct (E) - Remarks,P7Q3 - Public Policy Advocacy (L) - Whether,P7Q3 - Public Policy Advocacy (L) - Subtype,P7Q3 - Public Policy Advocacy (L) - Remarks
0,L74140MH2008PLC177884,360 One Wam Limited,,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,,,,,,,,,,,
3,L67190MH2007PLC289249,5paisa Capital Limited,,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,L24231GJ2000PLC038352,Zota Health Care Limited,,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,All,All,true,Reasonable,,true,Reasonable,,true,Reasonable,
1173,L15201GJ1994PLC023490,Zydus Wellness Limited,Partial,Partial,false,,,false,,,false,,
279,L72200MH2000PLC125319,eClerx Services Limited,,,,,,,,,,,


In [97]:
df_p7.to_excel('principle7_assurance.xlsx', index=False)

In [98]:
import os
import pandas as pd
from tqdm import tqdm

principle8_data = []

def get_val(df, field):
    val = df.loc[df['Element Name'] == field, 'Fact Value'].values
    return val[0] if len(val) > 0 else ""

# Overall assurance fields
p8_flags = {
    'TypeOfAssuranceForPrinciple8EssentialIndicators': 'P8 - Overall (E)',
    'TypeOfAssuranceForPrinciple8LeadershipIndicators': 'P8 - Overall (L)'
}

# Essential Indicators (5)
essential_questions = [
    ("WhetherDetailsOfSocialImpactAssessmentsOfProjectsUndertakenByTheEntityBasedOnApplicableLawsIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfSocialImpactAssessmentsOfProjectsUndertakenByTheEntityBasedOnApplicableLaws",
     "RemarksForAssuranceOfDetailsOfSocialImpactAssessmentsOfProjectsUndertakenByTheEntityBasedOnApplicableLaws",
     "P8Q1 - Social Impact Assessments (E)"),

    ("WhetherDetailsOfProjectsForWhichOngoingRehabilitationAndResettlementIsBeingUndertakenByEntityIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfProjectsForWhichOngoingRehabilitationAndResettlementIsBeingUndertakenByEntity",
     "RemarksForAssuranceOfDetailsOfProjectsForWhichOngoingRehabilitationAndResettlementIsBeingUndertakenByEntity",
     "P8Q2 - Rehabilitation & Resettlement (E)"),

    ("WhetherDescribeTheMechanismsToReceiveAndRedressGrievancesOfTheCommunityIsAssuredByAssurer",
     "AssuranceSubTypeForDescribeTheMechanismsToReceiveAndRedressGrievancesOfTheCommunity",
     "RemarksForAssuranceOfDescribeTheMechanismsToReceiveAndRedressGrievancesOfTheCommunity",
     "P8Q3 - Grievance Mechanisms (E)"),

    ("WhetherPercentageOfInputMaterialInputsToTotalInputsByValueSourcedFromSuppliersIsAssuredByAssurer",
     "AssuranceSubTypeForPercentageOfInputMaterialInputsToTotalInputsByValueSourcedFromSuppliers",
     "RemarksForAssuranceOfPercentageOfInputMaterialInputsToTotalInputsByValueSourcedFromSuppliers",
     "P8Q4 - Sourcing from Suppliers (E)"),

    ("WhetherJobCreationInSmallerTownsDiscloseWagesPaidToPersonsEmployedIncludingEmployeesOrWorkersEmployedOnAPermanentOrNonPermanentOrOnContractBasisIsAssuredByAssurer",
     "AssuranceSubTypeForJobCreationInSmallerTownsDiscloseWagesPaidToPersonsEmployedIncludingEmployeesOrWorkersEmployedOnAPermanentOrNonPermanentOrOnContractBasis",
     "RemarksForAssuranceOfJobCreationInSmallerTownsDiscloseWagesPaidToPersonsEmployedIncludingEmployeesOrWorkersEmployedOnAPermanentOrNonPermanentOrOnContractBasis",
     "P8Q5 - Job Creation in Smaller Towns (E)")
]

# Leadership Indicators (6)
leadership_questions = [
    ("WhetherDetailsOfActionsTakenToMitigateAnyNegativeSocialImpactsIdentifiedInTheSocialImpactAssessmentsIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfActionsTakenToMitigateAnyNegativeSocialImpactsIdentifiedInTheSocialImpactAssessments",
     "RemarksForAssuranceOfDetailsOfActionsTakenToMitigateAnyNegativeSocialImpactsIdentifiedInTheSocialImpactAssessments",
     "P8Q6 - Actions to Mitigate Social Impact (L)"),

    ("WhetherDetailsOfCSRProjectsUndertakenInDesignatedAspirationalDistrictsAsIdentifiedByGovernmentBodiesIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfCSRProjectsUndertakenInDesignatedAspirationalDistrictsAsIdentifiedByGovernmentBodies",
     "RemarksForAssuranceOfDetailsOfCSRProjectsUndertakenInDesignatedAspirationalDistrictsAsIdentifiedByGovernmentBodies",
     "P8Q7 - CSR in Aspirational Districts (L)"),

    ("WhetherAPreferentialProcurementPolicyWherePreferenceToPurchaseFromSuppliersComprisingMarginalizedOrVulnerableGroupsAndItsPercentageOfTotalProcurementByValueDoesItConstituteIsAssuredByAssurer",
     "AssuranceSubTypeForAPreferentialProcurementPolicyWherePreferenceToPurchaseFromSuppliersComprisingMarginalizedOrVulnerableGroupsAndItsPercentageOfTotalProcurementByValueDoesItConstitute",
     "RemarksForAssuranceOfAPreferentialProcurementPolicyWherePreferenceToPurchaseFromSuppliersComprisingMarginalizedOrVulnerableGroupsAndItsPercentageOfTotalProcurementByValueDoesItConstitute",
     "P8Q8 - Preferential Procurement (L)"),

    ("WhetherDetailsOfTheBenefitsDerivedAndSharedFromTheIntellectualPropertiesOwnedOrAcquiredIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfTheBenefitsDerivedAndSharedFromTheIntellectualPropertiesOwnedOrAcquired",
     "RemarksForAssuranceOfDetailsOfTheBenefitsDerivedAndSharedFromTheIntellectualPropertiesOwnedOrAcquired",
     "P8Q9 - IP Benefits from Traditional Knowledge (L)"),

    ("WhetherDetailsOfCorrectiveActionsTakenOrUnderwayBasedOnAnyAdverseOrderInIntellectualPropertyRelatedDisputesWhereinUsageOfTraditionalKnowledgeIsInvolvedIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfCorrectiveActionsTakenOrUnderwayBasedOnAnyAdverseOrderInIntellectualPropertyRelatedDisputesWhereinUsageOfTraditionalKnowledgeIsInvolved",
     "RemarksForAssuranceOfDetailsOfCorrectiveActionsTakenOrUnderwayBasedOnAnyAdverseOrderInIntellectualPropertyRelatedDisputesWhereinUsageOfTraditionalKnowledgeIsInvolved",
     "P8Q10 - IP Dispute Corrective Actions (L)"),

    ("WhetherDetailsOfBeneficiariesOfCSRProjectsIsAssuredByAssurer",
     "AssuranceSubTypeForDetailsOfBeneficiariesOfCSRProjects",
     "RemarksForAssuranceOfDetailsOfBeneficiariesOfCSRProjects",
     "P8Q11 - CSR Project Beneficiaries (L)")
]

# Loop through files
for file in tqdm(os.listdir(folder_path)):
    if not file.endswith(".xlsx"):
        continue
    df = pd.read_excel(os.path.join(folder_path, file))

    row = [
        get_val(df, "CorporateIdentityNumber"),
        get_val(df, "NameOfTheCompany")
    ]

    # Overall flags
    for field in p8_flags:
        row.append(get_val(df, field))

    # Essential
    for f_w, f_s, f_r, _ in essential_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    # Leadership
    for f_w, f_s, f_r, _ in leadership_questions:
        row.append(get_val(df, f_w))
        row.append(get_val(df, f_s))
        row.append(get_val(df, f_r))

    principle8_data.append(row)

# Column headers
columns = ["CIN", "Company"]
columns += list(p8_flags.values())

for _, _, _, label in essential_questions + leadership_questions:
    columns += [f"{label} - Whether", f"{label} - Subtype", f"{label} - Remarks"]

df_p8 = pd.DataFrame(principle8_data, columns=columns)
df_p8 = df_p8.sort_values(by="Company")


100%|██████████████████████████████████████████████████████████████████████████████| 1174/1174 [06:00<00:00,  3.26it/s]


In [99]:
df_p8.shape

(1174, 37)

In [100]:
df_p8

Unnamed: 0,CIN,Company,P8 - Overall (E),P8 - Overall (L),P8Q1 - Social Impact Assessments (E) - Whether,P8Q1 - Social Impact Assessments (E) - Subtype,P8Q1 - Social Impact Assessments (E) - Remarks,P8Q2 - Rehabilitation & Resettlement (E) - Whether,P8Q2 - Rehabilitation & Resettlement (E) - Subtype,P8Q2 - Rehabilitation & Resettlement (E) - Remarks,...,P8Q8 - Preferential Procurement (L) - Remarks,P8Q9 - IP Benefits from Traditional Knowledge (L) - Whether,P8Q9 - IP Benefits from Traditional Knowledge (L) - Subtype,P8Q9 - IP Benefits from Traditional Knowledge (L) - Remarks,P8Q10 - IP Dispute Corrective Actions (L) - Whether,P8Q10 - IP Dispute Corrective Actions (L) - Subtype,P8Q10 - IP Dispute Corrective Actions (L) - Remarks,P8Q11 - CSR Project Beneficiaries (L) - Whether,P8Q11 - CSR Project Beneficiaries (L) - Subtype,P8Q11 - CSR Project Beneficiaries (L) - Remarks
0,L74140MH2008PLC177884,360 One Wam Limited,,,,,,,,,...,,,,,,,,,,
1,L67120MH1993PLC074411,3I Infotech Limited,,,,,,,,,...,,,,,,,,,,
2,L31300KA1987PLC013543,3M India Limited,,,,,,,,,...,,,,,,,,,,
3,L67190MH2007PLC289249,5paisa Capital Limited,,,,,,,,,...,,,,,,,,,,
4,L29142TN1988PLC015586,63 Moons Technologies Limited,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1171,L24231GJ2000PLC038352,Zota Health Care Limited,,,,,,,,,...,,,,,,,,,,
1172,L24230GJ1995PLC025878,Zydus Lifesciences Limited,All,All,true,Reasonable,,true,Reasonable,,...,,true,Reasonable,,true,Reasonable,,true,Reasonable,
1173,L15201GJ1994PLC023490,Zydus Wellness Limited,Partial,Partial,false,,,false,,,...,,false,,,false,,,false,,
279,L72200MH2000PLC125319,eClerx Services Limited,,,,,,,,,...,,,,,,,,,,


In [101]:
df_p8.to_excel('principle8_assurance.xlsx', index=False)

### Data Breaches and their impact

In [2]:
import pandas as pd
import os
import re
from openpyxl.styles import Font, PatternFill, Alignment




In [3]:
class DataBreachExtractor:
    """
    A class to extract and process data breach data from files.
    """
    def __init__(self):
        """Initializes the extractor with configuration settings."""
        self.ELEMENT_NAME_COL = 'Element Name'
        self.VALUE_COL = 'Fact Value'
        self.COMPANY_NAME_ELEMENT = 'NameOfTheCompany'
        # Element names for data breach information
        self.DATA_BREACH_INSTANCES_ELEMENT = 'NumberOfInstancesOfDataBreachesAlongWithImpact'
        self.DATA_BREACH_PERCENTAGE_ELEMENT = 'PercentageOfDataBreachesInvolvingPersonallyIdentifiableInformationOfCustomers'
        self.DATA_BREACH_IMPACT_ELEMENT = 'DetailsOfImpactOfDataBreachesExplanatoryTextBlock'


    def _get_text_value(self, df, element_name, default_value='0'):
        """
        Helper to find an element name in a DataFrame and return its value as a string.
        """
        if self.ELEMENT_NAME_COL not in df.columns or self.VALUE_COL not in df.columns:
            return default_value
        series = df.loc[df[self.ELEMENT_NAME_COL] == element_name, self.VALUE_COL]
        if not series.empty and not pd.isna(series.iloc[0]):
            return str(series.iloc[0])
        return default_value

    def _process_single_file(self, file_path):
        """
        Processes a single file (Excel or CSV) and extracts all required data as text.
        """
        try:
            if file_path.endswith(('.xlsx', '.xls')):
                df = pd.read_excel(file_path)
            elif file_path.endswith('.csv'):
                df = pd.read_csv(file_path, on_bad_lines='skip')
            else:
                print(f"Skipping unsupported file type: {os.path.basename(file_path)}")
                return None
        except Exception as e:
            print(f"Error reading file {os.path.basename(file_path)}: {e}")
            return None

        # Extract all required fields as text
        company_name = self._get_text_value(df, self.COMPANY_NAME_ELEMENT, "Unknown Company")
        data_breach_instances = self._get_text_value(df, self.DATA_BREACH_INSTANCES_ELEMENT)
        data_breach_percentage = self._get_text_value(df, self.DATA_BREACH_PERCENTAGE_ELEMENT)
        data_breach_impact = self._get_text_value(df, self.DATA_BREACH_IMPACT_ELEMENT, "N/A")


        data_entry = {
            'Company Name': company_name,
            'Data Breach Instances': data_breach_instances,
            '% Breaches with Customer PII': data_breach_percentage,
            'Details of Breach Impact': data_breach_impact
        }
        
        print(f"--- Successfully extracted text data for: {company_name}")
        return data_entry

    def _convert_df_to_numeric(self, df):
        """
        Converts specified data columns in the DataFrame from text to numeric types.
        """
        print("\n[INFO] Converting extracted text data to numbers...")
        
        def clean_and_convert(value):
            if pd.isna(value): return 0
            value_str = str(value).strip().lower()

            # Specific phrase from user request, converted to lower case
            no_complaints_phrase = "there are no customer complaints or any penalties by regulatory authorities related to data privacy or cyber security during fy24."

            # List of strings to be treated as zero
            zero_strings = ['not applicable', 'n/a', '-', 'nil', 'none', '', no_complaints_phrase]

            if value_str in zero_strings:
                return 0
            
            if value_str.startswith('(') and value_str.endswith(')'):
                value_str = '-' + value_str[1:-1]
            cleaned_str = re.sub(r'[^0-9.-]', '', value_str)
            if not cleaned_str or cleaned_str in ['-', '.']: return 0
            try:
                return float(cleaned_str)
            except (ValueError, TypeError):
                return 0

        # Define which columns should be converted to numeric format
        numeric_cols = ['Data Breach Instances', '% Breaches with Customer PII']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = df[col].apply(clean_and_convert)
        
        print("[INFO] Data conversion complete.")
        return df

    def process_directory(self, directory):
        """
        Processes all valid files in a given directory, extracts data, and returns a cleaned numeric DataFrame.
        """
        if not os.path.isdir(directory) or not os.listdir(directory):
            print(f"Error: The directory '{directory}' is empty or does not exist.")
            return pd.DataFrame()

        print(f"[INFO] Starting analysis of files in '{directory}'...\n")
        all_data = []
        for filename in os.listdir(directory):
            file_path = os.path.join(directory, filename)
            data_entry = self._process_single_file(file_path)
            if data_entry:
                all_data.append(data_entry)

        if not all_data:
            print("No valid data could be processed from the files.")
            return pd.DataFrame()

        # Create DataFrame and ensure correct column order
        summary_df = pd.DataFrame(all_data)
        column_order = [
            'Company Name', 'Data Breach Instances', '% Breaches with Customer PII', 'Details of Breach Impact'
        ]
        # Reorder DataFrame and fill missing columns with 0 or 'N/A'
        summary_df = summary_df.reindex(columns=column_order)
        
        # Convert the specified text data to numeric
        return self._convert_df_to_numeric(summary_df)

    def export_to_excel(self, df, output_file):
        """
        Exports the main data to a simple Excel file.
        """
        if df.empty:
            print("Cannot export empty DataFrame.")
            return

        print(f"\n[INFO] Exporting data to '{output_file}'...")
        try:
            with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='Data Breach Data', index=False)
                
                # Format the header
                ws = writer.sheets['Data Breach Data']
                header_font = Font(bold=True, color="FFFFFF")
                header_fill = PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid")
                
                for i, col_name in enumerate(df.columns, 1):
                    cell = ws.cell(row=1, column=i)
                    cell.font = header_font
                    cell.fill = header_fill

                # Auto-adjust column widths
                for column in ws.columns:
                    max_length = max(len(str(cell.value)) for cell in column if cell.value)
                    ws.column_dimensions[column[0].column_letter].width = min(max_length + 2, 50)
            
            print(f"SUCCESS: Master Excel sheet '{output_file}' has been created.")

        except PermissionError:
            print(f"ERROR: Permission denied. Is '{output_file}' open? Please close it and try again.")
        except Exception as e:
            print(f"An error occurred while saving the Excel file: {e}")


In [4]:

# --- Main execution block ---
if __name__ == '__main__':
    # Define source directory and output filename
    source_directory = 'excel_files'
    output_excel_file = 'p9_data_breaches.xlsx'

    # Create source directory if it doesn't exist
    if not os.path.exists(source_directory):
        os.makedirs(source_directory)
        print(f"Created directory: {source_directory}")
     
    # Initialize and run the extractor
    extractor = DataBreachExtractor()
    main_data_df = extractor.process_directory(source_directory)

main_data_df

[INFO] Starting analysis of files in 'excel_files'...

--- Successfully extracted text data for: 360 One Wam Limited
--- Successfully extracted text data for: 3I Infotech Limited
--- Successfully extracted text data for: 3M India Limited
--- Successfully extracted text data for: 5paisa Capital Limited
--- Successfully extracted text data for: 63 Moons Technologies Limited
--- Successfully extracted text data for: Aarti Drugs Limited
--- Successfully extracted text data for: Aarti Industries Limited
--- Successfully extracted text data for: Aarti Pharmalabs Limited
--- Successfully extracted text data for: Aavas Financiers Limited
--- Successfully extracted text data for: Abans Holdings Limited
--- Successfully extracted text data for: ABB India Limited
--- Successfully extracted text data for: Accelya Solutions India Limited
--- Successfully extracted text data for: ACC Limited
--- Successfully extracted text data for: Action Construction Equipment Limited
--- Successfully extracted te

Unnamed: 0,Company Name,Data Breach Instances,% Breaches with Customer PII,Details of Breach Impact
0,360 One Wam Limited,0.0,0.0,Nil
1,3I Infotech Limited,0.0,0.0,No incident of any data breach has taken place...
2,3M India Limited,0.0,0.0,Not Applicable
3,5paisa Capital Limited,0.0,0.0,Not Applicable
4,63 Moons Technologies Limited,0.0,0.0,
...,...,...,...,...
1169,ZF Commercial Vehicle Control Systems India Li...,0.0,0.0,No data incidents and/or breaches were reporte...
1170,Zomato Limited,0.0,0.0,Not applicable
1171,Zota Health Care Limited,0.0,0.0,N.A.
1172,Zydus Lifesciences Limited,0.0,0.0,Nil


In [5]:
extractor.export_to_excel(main_data_df, output_excel_file)


[INFO] Exporting data to 'p9_data_breaches.xlsx'...
SUCCESS: Master Excel sheet 'p9_data_breaches.xlsx' has been created.
