In [57]:
!pip install python-docx




In [1]:
# ---------------------------
# INSTALL DEPENDENCIES
# ---------------------------
!pip install -q pdfplumber python-docx requests bitsandbytes transformers accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m102.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m119.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# ---------------------------
# IMPORTS
# ---------------------------
import os
import json
import re
import time
import pdfplumber
import requests
from docx import Document
from docx.shared import Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
from google.colab import files


In [12]:
import os
import json
import time
import requests
import pdfplumber
from typing import Dict, List, Any

# --- CONFIGURATION ---
# Public base URL of the Ngrok tunnel that fronts the model server
# (e.g., "https://abcd-1234.ngrok-free.app"). A trailing slash is harmless:
# BRSRJsonEngine strips it before building the request URL.
# The entry-point guard at the bottom of this cell refuses to run while this
# value still contains the placeholder text "YOUR_NGROK_URL".
NGROK_BASE_URL = "https://buddy-scorpionic-hilda.ngrok-free.dev/"
# Model identifier sent in the "model" field of every chat request.
MODEL_NAME = "llama3"

class BRSRJsonEngine:
    """
    Fills a structured BRSR JSON template from an Annual Report PDF by querying
    a Llama 3 chat endpoint reached through an Ngrok tunnel.

    Intended call order: ``load_files()`` -> ``process_sections()`` ->
    ``save_result(path)``.

    Tunables are class attributes so they can be overridden on the class or on
    a single instance without changing the constructor signature:

    * ``MAX_PDF_PAGES``     - number of leading PDF pages whose text is read.
    * ``MAX_CONTEXT_CHARS`` - number of characters of report text put in each prompt.
    * ``MAX_RETRIES``       - attempts per model request.
    * ``REQUEST_TIMEOUT``   - per-request HTTP timeout in seconds.
    * ``SECTION_DELAY``     - pause in seconds after each section request.
    """

    MAX_PDF_PAGES = 80
    MAX_CONTEXT_CHARS = 30000
    MAX_RETRIES = 5
    REQUEST_TIMEOUT = 300
    SECTION_DELAY = 1

    def __init__(self, pdf_path: str, json_path: str, ngrok_url: str):
        """
        Args:
            pdf_path: Path of the Annual Report PDF to read.
            json_path: Path of the BRSR JSON template to fill.
            ngrok_url: Base URL of the tunnel; a trailing slash is stripped.
        """
        self.pdf_path = pdf_path
        self.json_path = json_path
        self.ngrok_url = ngrok_url.rstrip('/')
        self.report_json = {}      # template exactly as loaded from json_path
        self.context_text = ""     # text extracted from the PDF
        self.final_output = {}     # working copy of the template that gets filled

    @staticmethod
    def _extract_json(text: str) -> Any:
        """
        Parse model output as JSON, tolerating chatter around the payload.

        The outermost ``{...}`` span is tried first; if that span is absent or
        is not valid JSON (e.g. the model returned a top-level array holding
        several objects), the whole stripped text is parsed instead.

        Raises:
            json.JSONDecodeError: if neither candidate is valid JSON.
        """
        cleaned = text.strip()
        start = cleaned.find('{')
        end = cleaned.rfind('}') + 1
        if start != -1 and end > start:
            try:
                return json.loads(cleaned[start:end])
            except json.JSONDecodeError:
                # Fall through: the brace slice can be invalid even though the
                # full text is fine (top-level arrays, for instance).
                pass
        return json.loads(cleaned)

    def _query_llama(self, prompt: str) -> Any:
        """
        POST ``prompt`` to ``{ngrok_url}/api/chat`` and return the decoded JSON answer.

        The request asks for a non-streamed, JSON-formatted reply
        (presumably an Ollama-compatible server - confirm against the deployment).
        Every failure (HTTP error, timeout, malformed or empty reply, invalid
        JSON) is retried with exponential backoff, up to ``MAX_RETRIES`` attempts.

        Returns:
            The parsed JSON value, or ``None`` once all attempts have failed
            (the last error is printed).
        """
        url = f"{self.ngrok_url}/api/chat"

        system_instruction = (
            "You are a professional BRSR Auditor. You will be given a portion of a JSON template "
            "and text from an Annual Report. Your task is to fill the 'answer' or 'tableData' fields "
            "based on the report. Maintain absolute numeric precision. If info is missing, use 'Not Disclosed'. "
            "Return ONLY a valid JSON object."
        )

        payload = {
            "model": MODEL_NAME,
            "messages": [
                {"role": "system", "content": system_instruction},
                {"role": "user", "content": prompt}
            ],
            "stream": False,
            "format": "json"
        }

        last_error = "no attempt was made"
        for attempt in range(self.MAX_RETRIES):
            try:
                response = requests.post(url, json=payload, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                result = response.json()

                # Malformed replies are raised as errors so they share the
                # backoff and final-error reporting of transport failures.
                if not isinstance(result, dict):
                    raise ValueError(f"unexpected response type {type(result).__name__}")

                message = result.get('message', {})
                if isinstance(message, dict):
                    text_response = message.get('content', '')
                else:
                    text_response = str(message)

                if not text_response:
                    raise ValueError("empty response content")

                return self._extract_json(text_response)

            except Exception as e:
                # Deliberately broad: one bad section must not abort the run.
                last_error = e
                if attempt < self.MAX_RETRIES - 1:
                    time.sleep(2 ** attempt)

        print(f"[!] Llama 3 API Final Error: {last_error}")
        return None

    def load_files(self):
        """
        Load the JSON template and extract text from the PDF.

        The template is read as UTF-8 (a leading BOM is accepted) so the result
        does not depend on the platform's default encoding. Text is taken from
        the first ``MAX_PDF_PAGES`` pages of the PDF.

        Returns:
            ``True`` when the template parsed and some PDF text was extracted,
            ``False`` otherwise (the reason is printed).
        """
        if not os.path.exists(self.json_path):
            print(f"[-] Error: JSON path {self.json_path} not found.")
            return False

        print(f"[*] Loading JSON template: {self.json_path}")
        try:
            with open(self.json_path, 'r', encoding='utf-8-sig') as f:
                self.report_json = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"[-] Error: Failed to parse JSON template: {e}")
            return False

        print(f"[*] Reading Annual Report PDF: {self.pdf_path}")
        try:
            page_texts = []
            with pdfplumber.open(self.pdf_path) as pdf:
                # Only the leading pages are read to bound the amount of text.
                for page in pdf.pages[:self.MAX_PDF_PAGES]:
                    extracted = page.extract_text()
                    if extracted:
                        page_texts.append(extracted + "\n")
            text = "".join(page_texts)
        except Exception as e:
            print(f"[-] PDF Error: {e}")
            return False

        self.context_text = text
        if not text:
            print(f"[-] Error: No extractable text found in the first {self.MAX_PDF_PAGES} pages of the PDF.")
            return False
        return True

    def _discover_sections(self):
        """
        Locate the list of section objects inside ``self.final_output``.

        Lookup order: a list root; the ``"sections"`` key (list, or dict whose
        values are the sections); the root itself when it carries section-like
        keys; finally the first list-of-dicts value whose first element carries
        section-like keys.

        Returns:
            The list of sections (elements are the same objects that live in
            ``self.final_output``), or ``None`` when nothing usable was found
            (the reason is printed).
        """
        root = self.final_output

        if isinstance(root, list):
            return root
        if not isinstance(root, dict):
            print("[-] Error: JSON template structure is invalid.")
            return None

        declared = root.get("sections")
        if isinstance(declared, list):
            return declared
        if isinstance(declared, dict):
            # 'sections' maps ID -> section; the values are the sections.
            return list(declared.values())

        # No usable 'sections' key: the root may itself be a single section.
        if any(k in root for k in ["subsections", "fields", "principles", "content"]):
            return [root]

        # Fallback: first nested list whose first element looks like a section.
        for val in root.values():
            if isinstance(val, list) and len(val) > 0 and isinstance(val[0], dict):
                if any(k in val[0] for k in ["subsections", "fields", "question"]):
                    return val

        print("[-] Error: Could not locate a list of sections in the JSON.")
        print(f"[*] Root keys found: {list(root.keys())}")
        return None

    def process_sections(self):
        """
        Ask the model to fill every section of the template.

        ``self.final_output`` becomes a deep copy of ``self.report_json`` (the
        loaded template is never mutated). For each section dict a prompt is
        built from the first ``MAX_CONTEXT_CHARS`` characters of the report
        text plus the section's JSON, and a dict reply is merged into the
        section with ``dict.update``. Sections without a usable reply keep
        their template values and a warning is printed.
        """
        # JSON round-trip yields an independent deep copy of the template.
        self.final_output = json.loads(json.dumps(self.report_json))

        sections_list = self._discover_sections()
        if sections_list is None:
            return
        if not sections_list:
            print("[!] Warning: JSON template contains no sections; nothing to process.")
            return

        # The excerpt is identical for every section, so slice it once.
        context_excerpt = self.context_text[:self.MAX_CONTEXT_CHARS]

        for section in sections_list:
            if not isinstance(section, dict):
                continue

            section_id = section.get("sectionId", section.get("id", "General"))
            section_title = section.get("sectionTitle", section.get("title", "Untitled Section"))
            print(f"[*] Llama 3 is processing Section {section_id}: {section_title}...")

            prompt = f"""
            ANNUAL REPORT CONTEXT (EXCERPT):
            {context_excerpt}

            TASK:
            Below is a JSON schema for a BRSR reporting section.
            Please provide values for all 'fields' and 'tables' defined in this snippet using data from the report.

            JSON SCHEMA TO FILL:
            {json.dumps(section, indent=2)}

            RETURN:
            Return the SAME JSON structure, but for every object in 'fields', update the 'answer' key.
            For every object in 'tables', update the 'dataRows' or relevant value keys based on report evidence.
            """

            filled_section = self._query_llama(prompt)
            if filled_section and isinstance(filled_section, dict):
                # Merge the model's answer back into the working copy.
                section.update(filled_section)
            else:
                print(f"[!] Warning: No usable answer for Section {section_id}: "
                      f"{section_title}; template values kept.")

            # Small pause between requests to avoid hammering the endpoint.
            time.sleep(self.SECTION_DELAY)

    def save_result(self, output_path: str):
        """
        Write ``self.final_output`` to ``output_path`` as indented JSON.

        Args:
            output_path: Destination file; it is overwritten if it exists.
        """
        print(f"[*] Saving populated report to: {output_path}")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(self.final_output, f, indent=4)
        print("[+] SUCCESS: Process complete.")

if __name__ == "__main__":
    # Input and output locations for this run
    INPUT_JSON = "brsr_questions.json"
    INPUT_PDF = "BFUTILITIE_27112025155734_BRSR_BFUL_27112025.pdf"
    OUTPUT_FILE = "BRSR_Populated_Llama3.json"

    # Only run the pipeline once the placeholder tunnel URL has been replaced.
    if "YOUR_NGROK_URL" not in NGROK_BASE_URL:
        # Constructor order is (pdf_path, json_path, ngrok_url)
        engine = BRSRJsonEngine(INPUT_PDF, INPUT_JSON, NGROK_BASE_URL)

        try:
            if not engine.load_files():
                print("[-] Error: Failed to initialize source files.")
            else:
                # Fill the template, then persist the populated copy
                engine.process_sections()
                engine.save_result(OUTPUT_FILE)
        except Exception as pipeline_error:
            print(f"[-] Pipeline Error: {pipeline_error}")
    else:
        print("[!] Warning: Please update NGROK_BASE_URL with your public tunnel address.")

[*] Loading JSON template: brsr_questions.json
[*] Reading Annual Report PDF: BFUTILITIE_27112025155734_BRSR_BFUL_27112025.pdf
[*] Llama 3 is processing Section General: SECTION A: GENERAL DISCLOSURES...
[*] Llama 3 is processing Section General: SECTION B: MANAGEMENT AND PROCESS DISCLOSURES...
[*] Llama 3 is processing Section General: SECTION C: PRINCIPLE WISE PERFORMANCE DISCLOSURE...
[*] Saving populated report to: BRSR_Populated_Llama3.json
[+] SUCCESS: Process complete.
