In [1]:
import sys
from pathlib import Path
sys.path.append("../1_code")  # since you're in 2_output

# Import your libraries first
from libraries.imports import *  
from stage3 import (
    full_content_preserving_standardize,
    extract_relevant_pages,
    clean_relevant_pages_from_range,
    extract_section_reference
)

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4


In [2]:
# Paths are derived from the notebook's working directory: the repository root is
# taken to be its parent (the notebook is run from inside 2_output).
REPO_ROOT = Path().resolve().parent  # Adjust path as needed
filtered_tables_dir = REPO_ROOT.joinpath("2_output", "filtered_tables")
merged_output_dir = REPO_ROOT.joinpath("2_output", "standardized_merged_by_company")
os.makedirs(merged_output_dir, exist_ok=True)  # no-op when the folder already exists

# Accumulators filled by the steps below.
last_merged_df = None                # copy of the most recently saved company table
company_error_log = dict()           # company -> list of (csv path, error) pairs
company_groups = defaultdict(list)   # company -> list of CSV paths found for it

# Step 1: Locate company CSVs, even inside page_### subfolders
# The company is the path component directly below the first "filtered_tables"
# component; when there is none, the name of the directory holding the file is used.
for dirpath, _subdirs, filenames in os.walk(str(filtered_tables_dir)):
    csv_names = [name for name in filenames if name.endswith(".csv")]
    if not csv_names:
        continue

    dir_parts = Path(dirpath).parts
    company = Path(dirpath).name  # fallback, overridden below when possible
    if 'filtered_tables' in dir_parts:
        anchor = dir_parts.index('filtered_tables')
        if anchor + 1 < len(dir_parts):
            company = dir_parts[anchor + 1]

    for name in csv_names:
        company_groups[company].append(os.path.join(dirpath, name))

# Step 2: Merge per company
for company, file_list in company_groups.items():
    # Seed with an empty frame so the five standard columns always exist and come first.
    merged_df = pd.DataFrame(columns=["Code", "Name", "Page Range", "Description", "Relevant Pages"])
    errors = []

    for csv_path in sorted(file_list):
        table, failure = full_content_preserving_standardize(csv_path)
        if failure:
            # Remember the failure and move on to the next file.
            errors.append((csv_path, failure))
            continue
        if table is None:
            continue

        # Override Page Range and Relevant Pages using folder name (e.g. "page_012" -> "12").
        # "Relevant Pages" is recomputed for every row further down, so the value set
        # here only acts as an initial default.
        folder_match = re.search(r'page[_\-]?0*([1-9]\d{0,3})', Path(csv_path).parent.name)
        if folder_match:
            page_number = folder_match.group(1)
            table["Page Range"] = page_number
            table["Relevant Pages"] = page_number

        # NOTE(review): concatenating once per file copies the accumulated frame each
        # time; fine for a handful of pages, worth revisiting for large inputs.
        merged_df = pd.concat([merged_df, table], ignore_index=True)

    if errors:
        company_error_log[company] = errors

    # Nothing usable for this company: skip derivation, grouping and saving.
    if merged_df.empty:
        continue

    # ✅ Normalize: strip a leading "ESRS" plus optional digits (e.g. "ESRS 2 ") from the code
    code_prefix = r'(?i)^ESRS\s*\d*\s*'
    merged_df["Code"] = merged_df["Code"].str.replace(code_prefix, '', regex=True).str.strip()

    # Derive per-row page and section fields with the stage3 helpers
    # (their exact rules live in stage3 and are not visible here).
    merged_df["Relevant Pages"] = merged_df.apply(
        lambda row: extract_relevant_pages(row["Description"], row["Name"], row["Page Range"]),
        axis=1,
    )
    merged_df["Relevant Pages"] = merged_df.apply(clean_relevant_pages_from_range, axis=1)
    merged_df["Section Reference"] = merged_df.apply(
        lambda row: extract_section_reference(row["Name"], row["Description"]),
        axis=1,
    )

    # Step: Clean it against page range and standalone number rules
    def clean_section_against_pages(row):
        """Remove bare numbers that look like page numbers from a section reference.

        The row's "Section Reference" is split on " | ". A part made only of decimal
        digits is dropped when it equals a 2-4 digit number found in "Page Range" or
        "Relevant Pages", or when it is greater than 1000. Every other part is kept,
        in order.

        Args:
            row: Mapping-like row (DataFrame row or dict) with the keys
                "Section Reference", "Page Range" and "Relevant Pages".

        Returns:
            The surviving parts joined with " | ", or "" when the section reference
            is missing (None / NaN) or blank.
        """
        raw_section = row["Section Reference"]
        # A missing value must not be stringified into a literal "nan" / "None" reference.
        if pd.api.types.is_scalar(raw_section) and pd.isna(raw_section):
            return ""
        section = str(raw_section).strip()
        if not section:
            return ""

        # Extract all page numbers from Page Range and Relevant Pages.
        # NOTE(review): the pattern only matches 2-4 digit runs, so single-digit pages
        # never count as page numbers here - confirm that this is intended.
        page_vals = set(re.findall(r'\d{2,4}', str(row["Page Range"])))
        rel_vals = set(re.findall(r'\d{2,4}', str(row["Relevant Pages"])))
        combined_pages = page_vals | rel_vals

        # Keep only valid parts
        filtered_parts = []
        for part in section.split(" | "):
            clean = part.strip()
            # isdecimal() (unlike isdigit()) is False for characters such as "²" or "①",
            # which int() cannot parse; such parts are kept as ordinary text.
            if clean.isdecimal() and (clean in combined_pages or int(clean) > 1000):
                continue
            filtered_parts.append(clean)

        return " | ".join(filtered_parts)

    merged_df["Section Reference"] = merged_df.apply(clean_section_against_pages, axis=1)

    # Per-column reducers used when rows sharing a Code are collapsed into one row.
    def _join_unique(values):
        """Join the distinct non-missing values with " | " in first-seen order."""
        # astype(str) keeps the join from failing on non-string cells.
        return " | ".join(values.dropna().astype(str).unique())

    def _first_non_empty(values):
        """Return the first value that is neither missing nor blank, or "" if none."""
        return next((v for v in values if pd.notna(v) and str(v).strip()), "")

    def _merge_page_lists(values):
        """Union comma-separated page tokens; numeric tokens first, ascending."""
        tokens = {
            token.strip()
            # dropna() keeps missing cells from turning into literal "nan"/"None" tokens
            for cell in values.dropna().astype(str)
            for token in cell.split(",")
            if token.strip()
        }
        # isdecimal() guarantees int() succeeds; the text tie-break makes the order of
        # non-numeric tokens (e.g. "12-14") deterministic instead of set-order dependent.
        return ",".join(sorted(
            tokens,
            key=lambda t: (0, int(t), t) if t.isdecimal() else (1, 0, t),
        ))

    # ✅ Group but retain first non-empty Page Range
    # "Section Reference" was assigned just above, so it is always present here.
    group_dict = {
        "Name": _join_unique,
        "Description": _join_unique,
        "Page Range": _first_non_empty,
        "Relevant Pages": _merge_page_lists,
        "Section Reference": _first_non_empty,
    }

    # NOTE(review): groupby's default dropna=True excludes rows whose Code is missing -
    # confirm that dropping them is intended.
    merged_df = merged_df.groupby("Code", as_index=False).agg(group_dict)

    # ✅ Save result (report success only after the file has actually been written)
    output_path = merged_output_dir / f"{company}_standardized_full.csv"
    merged_df.to_csv(output_path, index=False)
    print(f"✅ Saved updated: {output_path} (rows: {len(merged_df)})")

    last_merged_df = merged_df.copy()

# Step 3: Summary and preview
print("📂 Output Directory:", merged_output_dir)

# Only the first three companies that had failures are listed, with their error counts.
print("\n⚠️ Errors (if any):")
error_items = list(company_error_log.items())
for company, errors in error_items[:3]:
    print(f"- {company}: {len(errors)} error(s)")

# Show the first ten rows of the last table saved by the merge loop, if any.
print("\n✅ Preview of Last Merged Company Table:")
if last_merged_df is None:
    print("No valid company tables processed.")
else:
    print(last_merged_df.head(10))

✅ Saved updated: /Users/valeriiaklynna/Documents/GitHub/bachelor-thesis-group32-new/bachelor-thesis-group32-folder/2_output/standardized_merged_by_company1/Volkswagen_standardized_full.csv (rows: 23)
✅ Saved updated: /Users/valeriiaklynna/Documents/GitHub/bachelor-thesis-group32-new/bachelor-thesis-group32-folder/2_output/standardized_merged_by_company1/RFA_ELO_2024_EN-1_standardized_full.csv (rows: 1)
✅ Saved updated: /Users/valeriiaklynna/Documents/GitHub/bachelor-thesis-group32-new/bachelor-thesis-group32-folder/2_output/standardized_merged_by_company1/Commerzbank_Group_Annual_Report_2024_standardized_full.csv (rows: 48)
✅ Saved updated: /Users/valeriiaklynna/Documents/GitHub/bachelor-thesis-group32-new/bachelor-thesis-group32-folder/2_output/standardized_merged_by_company1/upm-annual-report-2024_standardized_full.csv (rows: 15)
✅ Saved updated: /Users/valeriiaklynna/Documents/GitHub/bachelor-thesis-group32-new/bachelor-thesis-group32-folder/2_output/standardized_merged_by_company1/