# requirements.txt

In [1]:
# python==3.10.16
# numpy==1.26.4
# pdf2image==1.17.0
# pillow==10.4.0
# langchain==0.3.12
# langchain-openai==0.2.12
# openai==1.57.4
# rich==13.9.4
# pandas==2.2.3
# xlsxwriter==3.2.0
# openpyxl==3.1.5
# chromadb==0.6.3
# nltk==3.9.1
# rouge_score==0.1.2
# bert_score==0.3.13
# ipywidgets==8.1.5
# conda install -c conda-forge poppler #for pdf2image
# sudo apt-get install poppler-utils #for pdf2image

# data_ingestion.py

In [2]:
"""
Data Ingestion Module
=====================
This module provides utilities to process PDF files for text extraction using LLM-based methods.
Features include:
- Conversion of PDF pages to images for extraction.
- Extracting text from PDFs using LLM.
- Batch processing of PDFs, supporting individual files or directories.
"""

import io
import os
import openai
import shutil
import base64
import warnings
import pandas as pd
from openai import OpenAI
from PIL.Image import Image
from typing import Dict, Optional, List
from pdf2image import convert_from_path

# Config loading
from config import (
    OPENAI_API_KEY,
    MODEL_NAME_PDF,
    MODEL_TOKEN_PDF,
    SYSTEM_PROMPT_PDF,
    PDF_FOLDER,
    PDF_RAW_TEXT_FOLDER,
)


class DataIngestion:
    """
    Handles PDF processing and text extraction using LLMs.

    A PDF is rendered to page images, the images are sent to an OpenAI chat
    model in batches of ``k`` pages, and the text returned for each batch is
    written to ``<raw_text_folder>/<pdf name>/pages_<a>_to_<b>.txt``.
    """

    def __init__(self):
        # All settings come from the project's config module.
        self.openai_api_key = OPENAI_API_KEY
        self.model_name = MODEL_NAME_PDF
        self.model_token = MODEL_TOKEN_PDF
        self.pdf_folder = PDF_FOLDER
        self.raw_text_folder = PDF_RAW_TEXT_FOLDER

    @staticmethod
    def _is_pdf(path: str) -> bool:
        """Return True when ``path`` ends with ``.pdf`` (case-insensitive)."""
        return path.lower().endswith(".pdf")

    # Convert pdf pages to images
    def convert_doc_to_images(self, path: str) -> List[Image]:
        """
        Convert a PDF document into a list of image objects.

        Parameters
        ----------
        path : str
            Path to the PDF file.

        Returns
        -------
        list
            A list of PIL Image objects, each representing a page of the PDF.
        """
        return convert_from_path(path)

    def get_img_uri(self, img: Image) -> str:
        """
        Encode a PIL Image object as a Base64 data URI.

        Parameters
        ----------
        img : PIL.Image.Image
            The image to encode.

        Returns
        -------
        str
            The PNG-encoded image as a ``data:image/png;base64,...`` URI.
        """
        png_buffer = io.BytesIO()
        img.save(png_buffer, format="PNG")
        # getvalue() returns the whole buffer regardless of the stream position.
        base64_png = base64.b64encode(png_buffer.getvalue()).decode("utf-8")
        return f"data:image/png;base64,{base64_png}"

    def analyze_image(self, imag_info: List[dict], system_prompt: str = SYSTEM_PROMPT_PDF) -> str:
        """
        Use LLM to extract text from image information.

        Parameters
        ----------
        imag_info : List[dict]
            Chat "content" parts, one ``{"type": "image_url", ...}`` entry per
            page image (Base64 data URIs).
        system_prompt : str, optional
            System prompt to guide the LLM's behavior.

        Returns
        -------
        str
            Text content of the first choice in the LLM's response; an empty
            string when the response carries no text content.
        """
        # Kept so any code relying on the module-level key keeps working.
        openai.api_key = self.openai_api_key
        # The client instance does not pick up the module-level key, so the
        # configured key has to be handed over explicitly.
        client = OpenAI(api_key=self.openai_api_key)

        response = client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": imag_info},
            ],
            max_tokens=self.model_token,
            temperature=0,
            top_p=0.1,
        )
        # message.content may be None; normalise so callers always get a str.
        return response.choices[0].message.content or ""

    def extract_text_from_pdf(
        self,
        pdf_path: str,
        raw_text_folder: Optional[str] = None,
        k: int = 4,
        system_prompt: str = SYSTEM_PROMPT_PDF,
    ) -> None:
        """
        Extract text from a single PDF file.

        Any existing output folder for this PDF is deleted and recreated, so
        the folder only ever holds the results of the latest run.

        Parameters
        ----------
        pdf_path : str
            Path to the PDF file.
        raw_text_folder : Optional[str], optional
            Folder to store extracted raw text files (default: self.raw_text_folder).
        k : int, optional
            Number of pages to process per batch (default: 4). Must be >= 1.
        system_prompt : str, optional
            System prompt for the LLM (default: SYSTEM_PROMPT_PDF).

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if raw_text_folder is None:
            raw_text_folder = self.raw_text_folder
        # Validate before touching the disk: a non-positive batch size would
        # otherwise wipe the output folder and then produce nothing.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")

        file_name = os.path.splitext(os.path.basename(pdf_path))[0]
        images = self.convert_doc_to_images(pdf_path)
        image_cnt = len(images)

        # Create a fresh folder for storing output
        folder_path = os.path.join(raw_text_folder, file_name)
        if os.path.exists(folder_path):
            shutil.rmtree(folder_path)
        os.makedirs(folder_path)

        # Process images in batches of k pages
        for start in range(0, image_cnt, k):
            stop = min(start + k, image_cnt)
            imag_info = [
                {"type": "image_url", "image_url": {"url": self.get_img_uri(img)}}
                for img in images[start:stop]
            ]

            print(f"Processing [{pdf_path}]; Pages: {start + 1} to {stop}")

            # Extract data using LLM, honouring the caller-supplied prompt
            extract_content = self.analyze_image(
                imag_info, system_prompt=system_prompt)

            # Save the extracted data to a text file (page numbers are 1-based)
            text_path = os.path.join(
                folder_path, f"pages_{start + 1}_to_{stop}.txt")
            # Explicit UTF-8 so non-ASCII LLM output is written the same way
            # on every platform.
            with open(text_path, "w", encoding="utf-8") as file:
                file.write(extract_content)
        return None

    def extract_text_from_pdf_list(
        self,
        pdf_paths: List[str],
        raw_text_folder: Optional[str] = None,
        k: int = 4,
        system_prompt: str = SYSTEM_PROMPT_PDF,
    ) -> None:
        """
        Extract text from a list of PDF files.

        Entries that do not end with ``.pdf`` (case-insensitive) are skipped.
        A warning is issued when no PDF remains.

        Parameters
        ----------
        pdf_paths : List[str]
            List of paths to PDF files.
        raw_text_folder : Optional[str], optional
            Folder to store extracted raw text files (default: self.raw_text_folder).
        k : int, optional
            Number of pages to process per batch (default: 4).
        system_prompt : str, optional
            System prompt for the LLM (default: SYSTEM_PROMPT_PDF).

        Returns
        -------
        None
        """
        if raw_text_folder is None:
            raw_text_folder = self.raw_text_folder

        # Keep only the PDF files from the provided list
        pdf_files = [path for path in pdf_paths if self._is_pdf(path)]
        if not pdf_files:
            warnings.warn("There is not pdf in the provided list!")
            return None

        for pdf_path in pdf_files:
            print(f"Start to extract data from {pdf_path}")
            self.extract_text_from_pdf(
                pdf_path=pdf_path, raw_text_folder=raw_text_folder, k=k, system_prompt=system_prompt)
        return None

    # extract text from all pdfs in a folder
    def extract_text_from_pdf_all(
        self,
        pdf_folder_path: Optional[str] = None,
        raw_text_folder: Optional[str] = None,
        k: int = 4,
        system_prompt: str = SYSTEM_PROMPT_PDF,
    ) -> None:
        """
        Extract text from all PDFs in a specified folder.

        Only files directly inside the folder are considered (no recursion).

        Parameters
        ----------
        pdf_folder_path : Optional[str], optional
            Path to the folder containing PDF files (default: self.pdf_folder).
        raw_text_folder : Optional[str], optional
            Folder to store extracted raw text files (default: self.raw_text_folder).
        k : int, optional
            Number of pages to process per batch (default: 4).
        system_prompt : str, optional
            System prompt for the LLM (default: SYSTEM_PROMPT_PDF).

        Returns
        -------
        None
        """
        if pdf_folder_path is None:
            pdf_folder_path = self.pdf_folder
        if raw_text_folder is None:
            raw_text_folder = self.raw_text_folder

        # Sorted so the processing order does not depend on the file system.
        pdf_paths = [
            os.path.join(pdf_folder_path, file)
            for file in sorted(os.listdir(pdf_folder_path))
            if self._is_pdf(file)
        ]

        self.extract_text_from_pdf_list(
            pdf_paths=pdf_paths, raw_text_folder=raw_text_folder, k=k, system_prompt=system_prompt)

        return None

## test

In [3]:
# Extract text from a single PDF file. Output goes to the raw-text folder
# configured in config (PDF_RAW_TEXT_FOLDER) because raw_text_folder is not passed.
DI=DataIngestion()
pdf_path="./pdfs/fwc_sample_financial_statement.pdf"
DI.extract_text_from_pdf(pdf_path=pdf_path)

# # Extract text from the PDFs in a list
# DI=DataIngestion()
# pdf_path_list=["./pdfs/fwc_sample_financial_statement.pdf","./pdfs/fwc_sample_financial_statement.pdf"]
# DI.extract_text_from_pdf_list(pdf_paths=pdf_path_list)

# # Extract text from all PDFs in a folder (the method is extract_text_from_pdf_all)
# DI=DataIngestion()
# DI.extract_text_from_pdf_all(pdf_folder_path=PDF_FOLDER)
# DI.extract_text_from_pdf_all()


Processing [./pdfs/fwc_sample_financial_statement.pdf]; Pages: 1 to 4
Processing [./pdfs/fwc_sample_financial_statement.pdf]; Pages: 5 to 7


# data_processing.py

In [4]:
"""
Data Processing Module
======================
This module handles text preprocessing, data extraction and transformation for further analysis.
Features include:
- Preprocessing text with LLMs using LangChain and OpenAI.
- Extracting structured-formatted data from raw text content.
- Processing text files in single or batch mode to further clean/process the extracted data.
"""

import os
import re
import json
import warnings
from langchain import OpenAI
from langchain.chains import LLMChain
from typing import Dict, Optional, List

# Config loading
from config import (
    OPENAI_API_KEY,
    MODEL_TEXT_PREPRO,
    PROMPT_PREPRO,
    PDF_RAW_TEXT_FOLDER,
    PDF_DATA_FOLDER,
)


class DataProcessing:
    """
    Handles text preprocessing, JSON extraction, and processing extracted text data.
    """

    # Patterns for a Markdown code fence, tried in order:
    # 1. fences on their own lines (the layout the original slicing expected),
    # 2. any ```...``` pair, e.g. a fence opened and closed on a single line.
    _FENCE_PATTERNS = (
        re.compile(r"```(?:json)?[ \t]*\r?\n(.*?)\r?\n[ \t]*```",
                   re.DOTALL | re.IGNORECASE),
        re.compile(r"```(?:json)?(.*?)```", re.DOTALL | re.IGNORECASE),
    )

    def __init__(self):
        # All settings come from the project's config module.
        self.openai_api_key = OPENAI_API_KEY
        self.model_name = MODEL_TEXT_PREPRO
        self.pdf_data_folder = PDF_DATA_FOLDER
        self.raw_text_folder = PDF_RAW_TEXT_FOLDER

    @classmethod
    def _extract_fenced_block(cls, text: str) -> str:
        """Return the body of the first code fence in ``text``, or the stripped text if there is none."""
        for pattern in cls._FENCE_PATTERNS:
            match = pattern.search(text)
            if match:
                return match.group(1)
        return text.strip()

    def data_preprocess(self, str_data: str, prompt: str = PROMPT_PREPRO) -> str:
        """
        Preprocess text using an LLM through LangChain.

        Parameters
        ----------
        str_data : str
            The input text data to preprocess.
        prompt : str, optional
            Prompt text guiding the LLM behavior (default: PROMPT_PREPRO).
            Every occurrence of the literal ``TEXT_INPUT`` is replaced by
            ``str_data``.

        Returns
        -------
        str
            Processed text response from the LLM.
        """
        llm = OpenAI(openai_api_key=self.openai_api_key,
                     model_name=self.model_name)
        prompt_text = prompt.replace("TEXT_INPUT", str_data)
        # The prompt is already a complete string, so call the model directly.
        # Wrapping a plain str in LLMChain fails because LLMChain expects a
        # prompt-template object, and going through a template would also
        # misread the braces of JSON content as template variables.
        response = llm.invoke(prompt_text)
        return response

    def extract_json(self, str_content: str, llm_flag: int = 0) -> Dict:
        """
        Extract JSON data from a string. Optionally preprocess with an LLM.

        The JSON is taken from the first Markdown code fence in the string;
        when no fence is present the whole string is parsed.

        Parameters
        ----------
        str_content : str
            The string containing JSON content.
        llm_flag : int, optional
            When 1, the extracted JSON text is first sent through
            ``data_preprocess`` and the fenced block of the LLM answer is
            parsed instead (default: 0).

        Returns
        -------
        Dict
            Extracted JSON content as a dictionary.

        Raises
        ------
        json.JSONDecodeError
            If the extracted text is not valid JSON.
        """
        json_chunk = self._extract_fenced_block(str_content)

        if llm_flag == 1:
            # Preprocess extracted json text using Langchain+OpenAI, then
            # unwrap the answer the same way (it may or may not be fenced).
            cleaned = self.data_preprocess(
                str_data=json_chunk, prompt=PROMPT_PREPRO)
            json_chunk = self._extract_fenced_block(cleaned)

        # Parse the JSON string
        return json.loads(json_chunk)

    def data_processing(self, input_text_folder: str, output_folder: Optional[str] = None) -> None:
        """
        Process raw text files in a folder to generate consolidated JSON data.

        The ``.txt`` files are processed in order of the first number in their
        file name, and the result is written to
        ``<output_folder>/<input folder name>.json``.

        Parameters
        ----------
        input_text_folder : str
            Folder containing raw text files.
        output_folder : Optional[str]
            Folder to store the processed JSON file (default: self.pdf_data_folder).

        Returns
        -------
        None
        """
        if output_folder is None:
            output_folder = self.pdf_data_folder
        text_folder_name = os.path.basename(
            os.path.normpath(input_text_folder))

        def first_number(file_name: str) -> int:
            # Files without any digit sort first (key 0).
            found = re.search(r'\d+', file_name)
            return int(found.group()) if found else 0

        # List all text files in the folder, in page order
        text_files = sorted(
            (f for f in os.listdir(input_text_folder) if f.endswith(".txt")),
            key=first_number,
        )

        pdf_json_data = {}
        print(f"Start to process data from {input_text_folder}")
        # Process each text file
        for file_name in text_files:
            file_path = os.path.join(input_text_folder, file_name)
            with open(file_path, "r", encoding="utf-8") as file:
                text_data = file.read()

            # Convert extracted text to JSON
            content_json = self.extract_json(text_data)
            # NOTE(review): update() is a shallow merge, so a top-level key
            # that appears in several page batches keeps only the last
            # batch's value - confirm this is intended.
            pdf_json_data.update(content_json)

        # Save consolidated JSON data
        output_file_path = os.path.join(
            output_folder, f"{text_folder_name}.json")
        output_dir = os.path.dirname(output_file_path)
        if output_dir:  # makedirs("") raises, so skip it for the current directory
            os.makedirs(output_dir, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII characters, so the file
        # encoding is pinned to UTF-8 instead of the platform default.
        with open(output_file_path, 'w', encoding="utf-8") as f:
            json.dump(pdf_json_data, f, ensure_ascii=False)
        print(f"Complete processing data from {input_text_folder}")

        return None

    def data_processing_list(self, input_text_folder_list: List[str], output_folder: Optional[str] = None) -> None:
        """
        Process text files from multiple folders.

        Parameters
        ----------
        input_text_folder_list : List[str]
            List of folder paths containing text files.
        output_folder : Optional[str]
            Folder to store processed JSON files (default: self.pdf_data_folder).

        Returns
        -------
        None
        """
        if output_folder is None:
            output_folder = self.pdf_data_folder
        if not input_text_folder_list:
            warnings.warn("There is not file in the provided list!")
            return None

        for folder in input_text_folder_list:
            self.data_processing(
                input_text_folder=folder, output_folder=output_folder)
        return None

    def data_processing_all(self, input_text_parent_folder: Optional[str] = None, output_folder: Optional[str] = None) -> None:
        """
        Process all subfolders within a parent folder.

        Parameters
        ----------
        input_text_parent_folder : Optional[str]
            Parent folder containing subfolders with text files (default: self.raw_text_folder).
        output_folder : Optional[str]
            Folder to store processed JSON files (default: self.pdf_data_folder).

        Returns
        -------
        None
        """
        if input_text_parent_folder is None:
            input_text_parent_folder = self.raw_text_folder
        if output_folder is None:
            output_folder = self.pdf_data_folder

        # List all direct subfolders, sorted so the processing order does not
        # depend on the file system.
        candidates = (os.path.join(input_text_parent_folder, name)
                      for name in sorted(os.listdir(input_text_parent_folder)))
        subfolders = [path for path in candidates if os.path.isdir(path)]
        if not subfolders:
            warnings.warn("There is not file in the provided Folder!")
            return None

        for folder in subfolders:
            self.data_processing(
                input_text_folder=folder, output_folder=output_folder)
        return None


## test

In [5]:
# Process the extracted text of a single folder (one PDF file) and write the
# consolidated JSON to ./pdfs_data/<folder name>.json
DP=DataProcessing()
input_text_folder="./raw_texts/fwc_sample_financial_statement"
DP.data_processing(input_text_folder=input_text_folder, output_folder="./pdfs_data")

# # Process extracted text in a list of folders (multiple PDF files)
# DP=DataProcessing()
# input_text_folder_list=["./raw_texts/fwc_sample_financial_statement"]
# DP.data_processing_list(input_text_folder_list=input_text_folder_list, output_folder="./pdfs_data")

# # Process all extracted text (ALL PDF files); passing None selects the folders from config
# DP=DataProcessing()
# # DP.data_processing_all(input_text_parent_folder="./raw_texts", output_folder="./pdfs_data")
# DP.data_processing_all(input_text_parent_folder=None, output_folder=None)

Start to process data from ./raw_texts/fwc_sample_financial_statement
Complete processing data from ./raw_texts/fwc_sample_financial_statement


# reports.py

In [20]:
"""
Reports Module
==============
This module provides utilities to generate financial reports from (extracted) data, including:
- Generating Excel reports with multiple sheets.
- Extracting financial metrics using LLMs.
- Generating financial summaries using LangChain with LLM.
"""
import os
import json
import csv
import pandas as pd
import warnings
import pathlib
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from typing import Any, List, Optional, Union

# Config loading
from config import (
    OPENAI_API_KEY,
    MODEL_NAME_REPORT,
    TEMPERATURE_REPORT,
    CHUNK_SIZE_REPORT,
    CHUNK_OVERLAP_REPORT,
    MODEL_TEXT_EMBED,
    REPORT_FOLDER,
)


class Reports:
    """
    Handles the generation of financial reports, including key metric extraction,
    tabular format generation, and financial data summarization.
    """

    def __init__(self):
        # Vector store cache; stays None until build_vector_store() fills it.
        self.vector_store = None

        # Where generated reports are written by default.
        self.report_folder = REPORT_FOLDER

        # Text chunking and embedding settings.
        self.model_text_embed = MODEL_TEXT_EMBED
        self.chunk_overlap = CHUNK_OVERLAP_REPORT
        self.chunk_size = CHUNK_SIZE_REPORT

        # Chat model settings and credentials.
        self.model_temperature = TEMPERATURE_REPORT
        self.model_name = MODEL_NAME_REPORT
        self.openai_api_key = OPENAI_API_KEY

    def extract_data_from_json(self, json_file_path: str) -> dict:
        """
        Reads and parses a JSON file.

        The file is decoded as UTF-8, matching how ``extract_financial_metrics``
        reads JSON files, so the result does not depend on the platform's
        default encoding.

        Parameters
        ----------
        json_file_path : str
            Path to the JSON file.

        Returns
        -------
        dict
            Parsed JSON data as a dictionary.

        Raises
        ------
        OSError
            If the file cannot be opened.
        json.JSONDecodeError
            If the file content is not valid JSON.
        """
        with open(json_file_path, 'r', encoding="utf-8") as file:
            return json.load(file)

    def flatten_json_to_rows(self, json_obj: dict, parent_keys: Optional[List[str]] = None) -> List[dict]:
        """
        Recursively flattens a nested JSON object into a list of dictionaries, each representing a row.

        Parameters
        ----------
        json_obj : dict
            The nested JSON object to flatten.
        parent_keys : Optional[List[str]]
            A list of parent keys representing the hierarchy (default: None).

        Returns
        -------
        List[dict]
            A list of dictionaries, each representing a flattened row.
        """

        if parent_keys is None:
            parent_keys = []

        rows = []
        for key, value in json_obj.items():
            current_keys = parent_keys + [key]
            if isinstance(value, dict):
                rows.extend(self.flatten_json_to_rows(value, current_keys))
            else:
                row = {f"Level {i+1}": parent_keys[i]
                       for i in range(len(parent_keys))}
                row[f"Level {len(current_keys)}"] = key
                row["Value"] = value
                rows.append(row)
        return rows

    def save_to_excel_all(self, input_file_path: str, report_path: Optional[str] = None) -> None:
        """
        Saves the flattened JSON data into an Excel file with multiple sheets.

        The workbook ``<report_path>/<input file name>.xlsx`` gets one
        "Allinone" sheet with every row, plus one sheet per first-level key
        holding that key's rows without the "Level 1" column. Sheet titles are
        derived from the keys but made Excel-safe: forbidden characters are
        replaced by "_", titles are cut to 31 characters, and clashes are
        resolved with a numeric suffix. Keys that are already valid, unique
        titles are used unchanged.

        Parameters
        ----------
        input_file_path : str
            Path to the JSON file to process.
        report_path : Optional[str], optional
            Directory to save the generated Excel report (default: self.report_folder).

        Returns
        -------
        None
        """
        max_title_len = 31                 # Excel's limit for a sheet title
        forbidden_chars = set('[]:*?/\\')  # characters Excel rejects in titles
        used_titles = {"allinone"}         # lower-cased; Excel compares titles case-insensitively

        def unique_title(raw_key) -> str:
            # Build an Excel-safe title for raw_key and reserve it.
            cleaned = "".join(
                "_" if ch in forbidden_chars else ch for ch in str(raw_key))
            # Titles may not start or end with an apostrophe.
            cleaned = cleaned.strip("'")[:max_title_len] or "Sheet"
            title = cleaned
            counter = 1
            while title.lower() in used_titles:
                suffix = f"_{counter}"
                title = cleaned[:max_title_len - len(suffix)] + suffix
                counter += 1
            used_titles.add(title.lower())
            return title

        if report_path is None:
            report_path = self.report_folder

        # Open and read the JSON file (UTF-8, matching extract_financial_metrics)
        with open(input_file_path, 'r', encoding="utf-8") as file:
            input_json_data = json.load(file)

        file_name = os.path.splitext(os.path.basename(input_file_path))[0]
        file_path = os.path.join(report_path, f"{file_name}.xlsx")

        # Flatten the JSON data
        all_rows = self.flatten_json_to_rows(json_obj=input_json_data)
        df_all = pd.DataFrame(all_rows)  # Create a DataFrame for all data

        # Group the rows by first-level key in a single pass. Every flattened
        # row carries "Level 1", and the column is dropped because it would be
        # constant within a sheet.
        rows_by_section = {key: [] for key in input_json_data}
        for row in all_rows:
            rows_by_section[row["Level 1"]].append(
                {k: v for k, v in row.items() if k != "Level 1"})

        print(f"Start to translate the data [{input_file_path}] to excel ...")
        output_dir = os.path.dirname(file_path)
        if output_dir:  # makedirs("") raises, so skip it for the current directory
            os.makedirs(output_dir, exist_ok=True)
        with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
            # Save all data to "Allinone" sheet
            df_all.to_excel(writer, index=False, sheet_name="Allinone")

            # One sheet per first-level key
            for first_level_key, section_rows in rows_by_section.items():
                pd.DataFrame(section_rows).to_excel(
                    writer, index=False,
                    sheet_name=unique_title(first_level_key))

        print(f"Data has been written to {file_path}")

        return None

    def value_convert(self, value: Any) -> Union[int, float, None, str]:
        """
        Convert a scalar to a number when its text form allows it.

        Rules, applied in order: int/float values are returned untouched;
        ``None``, the empty string and any value whose string form is "null"
        (case-insensitive) become ``None``; otherwise ``int()`` and then
        ``float()`` are tried, and the original value is returned when both
        reject it.

        Parameters
        ----------
        value : Any
            The value to convert.

        Returns
        -------
        Union[int, float, None, str]
            The converted value.
        """
        if isinstance(value, (int, float)):
            # Already numeric: nothing to do.
            return value

        null_like = value is None or value == "" or str(value).lower() == "null"
        if null_like:
            return None

        # Try the strictest parser first so "12" stays an int, not 12.0.
        for caster in (int, float):
            try:
                return caster(value)
            except ValueError:
                pass
        # Not a number: hand the input back unchanged.
        return value

    def convert_str_to_num(self, data: Any) -> Any:
        """
        Recursively converts string values in a dictionary or JSON object to numerical values.

        Parameters
        ----------
        data : Any
            The dictionary, list, or value to process.

        Returns
        -------
        Any
            The processed data with numerical values converted.
        """

        # Process the dictionary recursively
        if isinstance(data, dict):
            return {key: self.convert_str_to_num(value) for key, value in data.items()}
        elif isinstance(data, list):
            return [convert_str_to_num(item) for item in data]
        else:
            return self.value_convert(data)

    def build_vector_store(
        self,
        input_data: Union[str, dict, list],
        separator: str = "\n",
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None
    ) -> Any:
        """
        Split the input into text chunks, embed them and keep the resulting store.

        The store is both returned and saved on ``self.vector_store``.

        Parameters
        ----------
        input_data : Union[str, dict, list]
            The data to index. Dicts and lists are serialised to a JSON
            string; anything else is converted with ``str()``.
        separator : str, optional
            Separator handed to the text splitter (default: a newline).
        chunk_size : Optional[int], optional
            Maximum chunk size (default: self.chunk_size).
        chunk_overlap : Optional[int], optional
            Overlap size between chunks (default: self.chunk_overlap).

        Returns
        -------
        Any
            A Chroma vector store object.
        """
        # Fall back to the configured sizes when none are given.
        size = self.chunk_size if chunk_size is None else chunk_size
        overlap = self.chunk_overlap if chunk_overlap is None else chunk_overlap

        print("Building vector store...")

        # Bring the input down to one string.
        raw_text = (
            json.dumps(input_data, ensure_ascii=False)
            if isinstance(input_data, (dict, list))
            else str(input_data)
        )

        # Cut the string into chunks.
        text_splitter = CharacterTextSplitter(
            separator=separator, chunk_size=size, chunk_overlap=overlap)
        chunks = text_splitter.split_text(raw_text)
        print(f"Created {len(chunks)} chunks from input data.")

        # Embed the chunks and remember the store on the instance.
        embedder = OpenAIEmbeddings(
            openai_api_key=self.openai_api_key, model=self.model_text_embed)
        self.vector_store = Chroma.from_texts(chunks, embedder)
        print("Vector store built and stored.")

        return self.vector_store

    def financial_data_retriever(self, vector_store=None, user_query: Optional[str] = None, k: int = 5) -> str:
        """
        Retrieves relevant text about the company's financial health from a vector store.

        Parameters
        ----------
        user_query : str, optional (default None)
            Additional user request specifying what to focus on (e.g. 'cash flow changes').
        vector_store : Any, optional (default None)
            A vector store object (e.g. Chroma or FAISS). If None, uses self.vector_store.
        k : int, default 5
            Number of top relevant chunks to retrieve.

        Returns
        -------
        str
            A single string concatenating all retrieved text chunks.
            (If no docs are found or the vector store is missing, returns an empty string.)
        """

        # 1. Use self.vector_store if no store is passed
        store = vector_store if vector_store is not None else self.vector_store
        if not store:
            warnings.warn(
                "Warning: No vector store available. Returning empty string.")
            return ""

        # 2. Construct the retrieval prompt
        combined_prompt = (
            "Provide relevant text about the company's financial statements, including:\n"
            "- Key metrics such as revenue, net income, operating expenses, cash flow\n"
            "- Any trends or observations relevant to the company's financial health\n"
            "- Any details relevant for constructing a high-level financial summary\n"
        )

        # If user_query is provided, incorporate it
        if user_query:
            combined_prompt += f"\nSpecific user request: {user_query}"

        # 3. Retrieve top-k docs
        retriever = store.as_retriever(search_kwargs={"k": k})
        docs = retriever.invoke(combined_prompt)

        # 4. Concatenate doc text
        if not docs:
            print("No documents retrieved from the vector store.")
            return ""

        retrieved_text = "\n\n".join(doc.page_content for doc in docs)
        return retrieved_text

    def extract_financial_metrics(
        self,
        metrics_schemas: List[ResponseSchema],
        data_input: Any,
        csv_file: Optional[str] = None
    ) -> dict:
        """
        Extracts key financial metrics from input data and saves them to a CSV file if specified.

        Parameters
        ----------
        metrics_schemas : List[ResponseSchema]
            A list of `ResponseSchema` objects defining the metrics to extract.
            Example:
                [ResponseSchema(name="revenue", description="Total revenue recognized"),
                 ResponseSchema(name="net_income", description="Net income after taxes")]
        data_input : Any
            Input data containing financial information. Can be a file path, dictionary, list, or plain text.
        csv_file : Optional[str], optional
            Path to save the extracted metrics in a CSV file (default: None).

        Returns
        -------
        dict
            Extracted financial metrics as a dictionary.
        """

        # 1. Convert input to string
        data_str = None

        # Case A: data_input is a file path
        if isinstance(data_input, str) and os.path.isfile(data_input):
            file_path = pathlib.Path(data_input)
            try:
                if file_path.suffix.lower() == ".json":
                    # It's a JSON file, parse it into a dict or list
                    with open(file_path, "r", encoding="utf-8") as f:
                        parsed_json = json.load(f)
                    # Convert it back to a string for the LLM
                    data_str = json.dumps(parsed_json, ensure_ascii=False)
                else:
                    # It's a text file (or unknown extension)
                    with open(file_path, "r", encoding="utf-8") as f:
                        data_str = f.read()
            except Exception as e:
                warnings.warn(
                    f"Failed to read or parse file '{data_input}': {e}")
                return {}

        # Case B: data_input is a Python object (dict, list, etc.)
        elif isinstance(data_input, (dict, list)):
            data_str = json.dumps(data_input, ensure_ascii=False)

        # Case C: data_input is a string (not an existing file path)
        elif isinstance(data_input, str):
            data_str = data_input

        # Optional: If there are other cases (int, float, custom objects), convert them to string
        else:
            data_str = str(data_input)

        # If data_str is empty or whitespace, return empty
        if not data_str or not data_str.strip():
            warnings.warn(
                "Resulting string is empty or whitespace only. Returning empty metrics.")
            return {}

        # 2. Build structured parser from provided metrics_schemas
        output_parser = StructuredOutputParser.from_response_schemas(
            metrics_schemas)
        format_instructions = output_parser.get_format_instructions()

        # 3. Craft system & user messages
        system_message = SystemMessage(
            content=(
                "You are a financial data extraction assistant. "
                "You will receive arbitrary data (now converted to a string). "
                "Your task is to extract the requested metrics in valid JSON format, "
                "with no extra commentary."
            )
        )

        user_message_content = f"""
        Below is the input data:
    
        {data_str}
    
        Please parse it and return a JSON object with these fields:
        {format_instructions}
        """.strip()
        user_message = HumanMessage(content=user_message_content)

        # 4. Call LLM OPENAI_API_KEY,
        llm = ChatOpenAI(openai_api_key=self.openai_api_key,
                         model_name=self.model_name, temperature=self.model_temperature)
        response_content = llm.invoke([system_message, user_message]).content

        # 5. Parse the LLM response
        try:
            parsed_metrics = output_parser.parse(response_content)
        except Exception as e:
            warnings.warn(f"Failed to parse structured LLM output: {e}", )
            return {}
        parsed_metrics = self.convert_str_to_num(parsed_metrics)

        # 6. Save to CSV if csv_file is provided
        if csv_file is not None and parsed_metrics:
            try:
                with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
                    fieldnames = ["Financial Metrics", "Value"]
                    writer = csv.DictWriter(f, fieldnames=fieldnames)
                    writer.writeheader()
                    # Each key-value pair in parsed_metrics becomes one row
                    for metric_key, metric_value in parsed_metrics.items():
                        writer.writerow({
                            "Financial Metrics": metric_key,
                            "Value": metric_value
                        })
                print(
                    f"Metrics saved to {csv_file} with columns 'Financial Metrics' and 'Value'")
            except Exception as e:
                warnings.warn(f"Error writing to CSV file '{csv_file}': {e}")

        # 7. Return the dictionary
        return parsed_metrics

    def generate_financial_summary(
        self,
        key_metrics: Optional[dict] = None,
        financial_data: Optional[Union[dict, str]] = None,
        add_request: Optional[str] = None,
        output_file: Optional[str] = None
    ) -> str:
        """
        Generates a textual summary report highlighting the company's financial health.

        The metrics and/or financial data are converted to strings with ``str()``,
        embedded in a prompt together with the base instructions (and the optional
        ``add_request``), and sent to a ``ChatOpenAI`` model configured from
        ``self.openai_api_key``, ``self.model_name`` and ``self.model_temperature``.

        Parameters
        ----------
        key_metrics : Optional[dict]
            A dictionary of key financial metrics (e.g., revenue, net income).
        financial_data : Optional[Union[dict, str]]
            Additional financial information, either as a dictionary or plain text.
        add_request : Optional[str], optional
            Additional instructions or custom requests for the summary (default: None).
        output_file : Optional[str], optional
            Path to save the generated summary report (default: None).

        Returns
        -------
        str
            A summary report of the company's financial health.

        Raises
        ------
        ValueError
            If both 'key_metrics' and 'financial_data' are missing, empty, or
            (for strings) consist of whitespace only.
        """

        def _has_content(value) -> bool:
            # A whitespace-only string carries no information for the LLM, so it
            # is treated like an empty value instead of being sent as "data".
            if isinstance(value, str):
                return bool(value.strip())
            return bool(value)

        # 1. Validate that there's at least some data to summarize
        metrics_is_valid = _has_content(key_metrics)
        financial_data_is_valid = _has_content(financial_data)

        if not (metrics_is_valid or financial_data_is_valid):
            raise ValueError(
                "Insufficient data: Both 'key_metrics' and 'financial_data' are missing or empty."
            )

        # 2. Convert 'key_metrics' and 'financial_data' to strings
        metrics_str = str(key_metrics) if metrics_is_valid else ""
        data_str = str(financial_data) if financial_data_is_valid else ""

        # 3. Build the system and user messages for the LLM
        system_message_content = (
            "You are a financial analysis assistant.\n"
            "Your goal is to generate a concise summary of the company's financial health. "
            "Only consider the provided metrics/data, and do not invent facts."
        )
        system_message = SystemMessage(content=system_message_content)

        # Base instructions
        user_instructions = """
        Based on the provided data, generate a summary report that highlights the financial health of the company. The report should include:
          - Key financial metrics
          - Any notable trends or observations
          - A short narrative summary in natural language
        """

        # If 'add_request' exists, append it
        if add_request:
            user_instructions += f"\nAdditional user request:\n{add_request}\n"

        user_message_content = f"""
            --- Financial Statement Metrics ---
            {metrics_str}
    
            --- Additional Financial Data ---
            {data_str}
    
            {user_instructions}
            """.strip()

        user_message = HumanMessage(content=user_message_content)

        # 4. Call LLM
        llm = ChatOpenAI(openai_api_key=self.openai_api_key,
                         model_name=self.model_name, temperature=self.model_temperature)

        # 5. Generate summary
        print("Requesting summary from LLM...")
        response = llm.invoke([system_message, user_message])
        summary_text = response.content.strip()

        # 6. Save summary to 'output_file' if provided.
        # A write failure only warns: the summary is still returned to the caller.
        if output_file:
            try:
                with open(output_file, mode="w", encoding="utf-8") as f:
                    f.write(summary_text)
                print(f"Summary is saved to {output_file} \n")
            except Exception as e:
                warnings.warn(
                    f"Error writing summary to file '{output_file}': {e}")

        # display(Markdown(summary_text))
        return summary_text


## test

In [7]:
# Export all financial metrics from the sample statement JSON in a tabular format.
# Per the recorded cell output, the result is written as an .xlsx workbook
# under the given report_path.
input_file="./pdfs_data/fwc_sample_financial_statement.json"
RP=Reports()
RP.save_to_excel_all(input_file_path=input_file, report_path="./reports")

Start to translate the data [./pdfs_data/fwc_sample_financial_statement.json] to excel ...
Data has been written to ./reports/fwc_sample_financial_statement.xlsx




In [21]:
# Extract selected financial metrics with the LLM and save them to a CSV file.
input_file="./pdfs_data/fwc_sample_financial_statement.json"
# One ResponseSchema per metric: 'name' becomes the key in the returned dict
# (see the recorded output below) and 'description' tells the LLM what to extract.
metrics_schemas=[
    ResponseSchema(name="Revenue Last Year", description="Total revenue recognized in Last Year"),
    ResponseSchema(name="Revenue Previous Year", description="Total revenue recognized in Previous Year"),
    ResponseSchema(name="Net Income Last Year", description="Net income after taxes in Last Year"),
    ResponseSchema(name="Net Income Previous Year", description="Net income after taxes in Previous Year"),
    ResponseSchema(name="Operating Expenses Last Year", description="Total operating expenses in Last Year"),
    ResponseSchema(name="Operating Expenses Previous Year", description="Total operating expenses in Previous Year"),
    ResponseSchema(name="Cash Flow Last Year", description="Cash flow from operations in Last Year"),
    ResponseSchema(name="Cash Flow Previous Year", description="Cash flow from operations in Previous Year")
]
# The CSV is written with the columns 'Financial Metrics' and 'Value'.
output_file="./reports/fwc_key_metrics.csv"
RP=Reports()
key_metrics=RP.extract_financial_metrics(metrics_schemas=metrics_schemas, data_input=input_file, csv_file=output_file)
# Bare expression so the notebook displays the parsed metrics dict.
key_metrics

Metrics saved to ./reports/fwc_key_metrics.csv with columns 'Financial Metrics' and 'Value'


{'Revenue Last Year': 7797000,
 'Revenue Previous Year': 7701000,
 'Net Income Last Year': 529000,
 'Net Income Previous Year': 1025000,
 'Operating Expenses Last Year': 7268000,
 'Operating Expenses Previous Year': 6676000,
 'Cash Flow Last Year': 536000,
 'Cash Flow Previous Year': 902000}

In [22]:
# Retrieve relevant info/data from large statements via the vector store.
input_file="./pdfs_data/fwc_sample_financial_statement.json"
# Optional focused query (disabled): enable together with the user_query call below.
# specific_request="Focus on any changes in cash flow over last two fiscal years"
RP=Reports()
# presumably loads the statement JSON into a Python object -- helper not shown here
financial_statements=RP.extract_data_from_json(input_file)
# Index the statements; the recorded output reports a single chunk for this file.
RP.build_vector_store(input_data=financial_statements)
# relevant_data=RP.financial_data_retriever(user_query=specific_request)
# Called without a query, so the retriever falls back to whatever default it defines.
relevant_data=RP.financial_data_retriever()
# Preview only the first 500 characters of the retrieved data.
print(relevant_data[:500])

Building vector store...
Created 1 chunks from input data.
Vector store built and stored.




{"Statement of Comprehensive Income": {"For the year ended 30 June": {"Revenue": {"Membership subscriptions": {"Last Year": 6748000, "Previous Year": 6571000}, "Interest": {"Last Year": 251000, "Previous Year": 231000}, "Rental income": {"Last Year": 185000, "Previous Year": 244000}, "Other revenue": {"Notes": "3A", "Last Year": 613000, "Previous Year": 655000}, "Total revenue": {"Last Year": 7797000, "Previous Year": 7701000}}, "Expenses": {"Employee expenses": {"Notes": "4A", "Last Year": 3167


In [23]:
# Generate a summary report for the Statement of comprehensive income.
# NOTE: the full statement data is passed as financial_data; the focus on a
# single statement comes only from the add_request instruction.
from rich.markdown import Markdown
input_file="./pdfs_data/fwc_sample_financial_statement.json"
output_summary_file="./reports/financial_summary_statement_comprehensive_income.txt"
RP=Reports()
financial_statements=RP.extract_data_from_json(input_file)
financial_summary=RP.generate_financial_summary(
    key_metrics=None,
    financial_data=financial_statements,
    add_request="Generate summary report based on Statement of comprehensive income",
    output_file=output_summary_file
)

# Render the returned summary text as Markdown in the notebook.
Markdown(financial_summary)

Requesting summary from LLM...
Summary is saved to ./reports/financial_summary_statement_comprehensive_income.txt 



In [24]:
# Generate a summary report for the Statement of financial position.
# NOTE: the full statement data is passed as financial_data; the focus on a
# single statement comes only from the add_request instruction.
from rich.markdown import Markdown
input_file="./pdfs_data/fwc_sample_financial_statement.json"
output_summary_file="./reports/financial_summary_statement_financial_position.txt"
RP=Reports()
financial_statements=RP.extract_data_from_json(input_file)
financial_summary=RP.generate_financial_summary(
    key_metrics=None,
    financial_data=financial_statements,
    add_request="Generate summary report based on Statement of financial position",
    output_file=output_summary_file
)

# Render the returned summary text as Markdown in the notebook.
Markdown(financial_summary)

Requesting summary from LLM...
Summary is saved to ./reports/financial_summary_statement_financial_position.txt 



In [25]:
# Generate a summary report for the Statement of changes in equity.
# NOTE: the full statement data is passed as financial_data; the focus on a
# single statement comes only from the add_request instruction.
from rich.markdown import Markdown
input_file="./pdfs_data/fwc_sample_financial_statement.json"
output_summary_file="./reports/financial_summary_statement_changes_in_equity.txt"
RP=Reports()
financial_statements=RP.extract_data_from_json(input_file)
financial_summary=RP.generate_financial_summary(
    key_metrics=None,
    financial_data=financial_statements,
    add_request="Generate summary report based on Statement of changes in equity",
    output_file=output_summary_file
)

# Render the returned summary text as Markdown in the notebook.
Markdown(financial_summary)

Requesting summary from LLM...
Summary is saved to ./reports/financial_summary_statement_changes_in_equity.txt 



In [26]:
# Generate a summary report for the Statement of cash flows.
# NOTE: the full statement data is passed as financial_data; the focus on a
# single statement comes only from the add_request instruction.
from rich.markdown import Markdown
input_file="./pdfs_data/fwc_sample_financial_statement.json"
output_summary_file="./reports/financial_summary_statement_cash_flows.txt"
RP=Reports()
financial_statements=RP.extract_data_from_json(input_file)
financial_summary=RP.generate_financial_summary(
    key_metrics=None,
    financial_data=financial_statements,
    add_request="Generate summary report based on Statement of cash flows",
    output_file=output_summary_file
)

# Render the returned summary text as Markdown in the notebook.
Markdown(financial_summary)

Requesting summary from LLM...
Summary is saved to ./reports/financial_summary_statement_cash_flows.txt 



In [27]:
# Generate a summary report covering all financial statements.
# The full statement data is passed as financial_data and the add_request
# asks the LLM to consider every statement.
from rich.markdown import Markdown
input_file="./pdfs_data/fwc_sample_financial_statement.json"
output_summary_file="./reports/financial_summary_all.txt"
RP=Reports()
financial_statements=RP.extract_data_from_json(input_file)
financial_summary=RP.generate_financial_summary(
    key_metrics=None,
    financial_data=financial_statements,
    add_request="Generate summary report based on all financial statements",
    output_file=output_summary_file
)

# Render the returned summary text as Markdown in the notebook.
Markdown(financial_summary)

Requesting summary from LLM...
Summary is saved to ./reports/financial_summary_all.txt 



In [28]:
# Generate a summary report based only on the provided key metrics
# (no additional financial data and no extra request).
from rich.markdown import Markdown
# Metric values are given as strings here; generate_financial_summary
# stringifies the whole dict before building the prompt.
key_metrics={
    'Revenue Last Year': '7797000',
    'Revenue Previous Year': '7701000',
    'Net Income Last Year': '529000',
    'Net Income Previous Year': '1025000',
    'Operating Expenses Last Year': '7268000',
    'Operating Expenses Previous Year': '6676000',
    'Cash Flow Last Year': '536000',
    'Cash Flow Previous Year': '902000'
}
input_file="./pdfs_data/fwc_sample_financial_statement.json"
output_summary_file="./reports/financial_summary.txt"
RP=Reports()
# NOTE(review): financial_statements is loaded but not used by the call below
# (financial_data=None) -- this load looks like a leftover; confirm and remove.
financial_statements=RP.extract_data_from_json(input_file)
financial_summary=RP.generate_financial_summary(
    key_metrics=key_metrics, 
    financial_data=None,
    add_request=None,
    output_file=output_summary_file
)

# Render the returned summary text as Markdown in the notebook.
Markdown(financial_summary)

Requesting summary from LLM...
Summary is saved to ./reports/financial_summary.txt 



# data_evaluation.py

In [16]:
"""
Data Evaluation Module
======================
This module provides methods to evaluate:
1) Data Extraction: Measures the precision, recall, and overall extraction score.
2) Key Metric Accuracy: Evaluates numeric errors (MAE, RMSE) and exact match metrics.
3) Summary Quality: Uses traditional NLP metrics (BLEU, ROUGE, BERTScore) and LLM-based evaluation.
"""

import re
import os
import json
import math
import warnings
from typing import Dict, List, Optional, Union
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from langchain_openai import ChatOpenAI
from langchain.schema import SystemMessage, HumanMessage
from langchain.prompts import PromptTemplate

from config import (
    OPENAI_API_KEY,
    MODEL_NAME_EVAL,
    TEMPERATURE_EVAL,
)


class DataEvaluation:
    """
    Provides methods to evaluate:
    1) Data Extraction: Precision, Recall, and overall extraction score.
    2) Key Metric Accuracy: Using MAE, RMSE, and exact match metrics.
    3) Summary Quality: NLP-based metrics (BLEU, ROUGE, BERTScore) and LLM-based evaluation.
    """

    def evaluate_data_extraction(
        self,
        extracted_data: Union[str, dict, list],
        ground_truth_data: Union[str, dict, list],
    ) -> Dict[str, float]:
        """
        Evaluate the quality of data extraction on lower-cased word tokens (regex ``\\w+``):
        - Partial Recall: The proportion of ground truth tokens found in the extracted data.
        - Partial Precision: 1.0 when at least one ground truth token is found in the
          extracted data, otherwise 0.0 (see the review note in the body).
        - Extraction Score: An average of partial precision and partial recall.

        Parameters
        ----------
        extracted_data : Union[str, dict, list]
            Extracted text, an already-loaded object, or a path to a file containing
            the extracted data. A file is parsed as JSON when possible and otherwise
            used as raw text.
        ground_truth_data : Union[str, dict, list]
            Reference data confirmed to be correct. Non-string values are converted
            with ``str()``.

        Returns
        -------
        dict
            A dictionary containing partial precision, partial recall, and extraction score.
            All three are 0.0 (with a warning) when the ground truth yields no tokens.
        """

        # A falsy ground truth becomes "" so the empty-token guard below handles it
        # (previously the variable was left unbound and raised UnboundLocalError).
        ground_truth_str = str(ground_truth_data) if ground_truth_data else ""

        # Read extracted data if it is a file path
        if isinstance(extracted_data, str) and os.path.isfile(extracted_data):
            with open(extracted_data, 'r', encoding="utf-8") as file:
                raw_text = file.read()
            try:
                extracted_str = str(json.loads(raw_text))
            except json.JSONDecodeError:
                # Not a JSON file: evaluate the raw text instead of failing.
                extracted_str = raw_text
        elif extracted_data is None:
            extracted_str = ""
        else:
            # Accept dict/list inputs the same way the ground truth is accepted.
            extracted_str = str(extracted_data)

        # Tokenize ground truth and extracted data
        gt_tokens = set(re.findall(r"\w+", ground_truth_str.lower()))
        ex_tokens = set(re.findall(r"\w+", extracted_str.lower()))

        # Ensure ground truth tokens exist
        if not gt_tokens:
            warnings.warn("Ground truth data is empty or no tokens found. "
                          "Cannot compute meaningful partial metrics.")
            return {
                "partial_precision": 0.0,
                "partial_recall": 0.0,
                "extraction_score": 0.0
            }

        # Calculate true positives
        true_positives = gt_tokens & ex_tokens
        partial_recall = len(true_positives) / len(gt_tokens)

        # NOTE(review): precision is computed only over the extracted tokens that
        # also occur in the ground truth, i.e. |TP| / |TP|. It can therefore only
        # be 1.0 (some overlap) or 0.0 (no overlap). This looks intentional, to
        # tolerate a ground truth that covers just part of the document, and is
        # kept as is. Confirm whether |TP| / |extracted tokens| was intended.
        partial_precision = 1.0 if true_positives else 0.0

        extraction_score = 0.5 * partial_precision + 0.5 * partial_recall

        return {
            "partial_precision": partial_precision,
            "partial_recall": partial_recall,
            "extraction_score": extraction_score
        }

    def evaluate_key_metric_accuracy(self,
                                     extracted_metrics: Dict[str, Union[int, float]],
                                     ground_truth_metrics: Dict[str,
                                                                Union[int, float]]
                                     ) -> Dict[str, float]:
        """
        Evaluate the accuracy of key financial metrics extracted by the pipeline.

        Only keys present in the ground truth are evaluated; extra keys in the
        extracted metrics are ignored.

        Metrics:
        - Numeric Error Metrics: MAE, RMSE, naive accuracy_score
          (``max(0, 1 - rmse / mean(ground truth values))``, 0.0 if that mean is not positive)
        - Exact Match Metrics: Precision, Recall, F1

        Parameters
        ----------
        extracted_metrics : Dict[str, Union[int, float]]
            Metrics extracted by the pipeline.
        ground_truth_metrics : Dict[str, Union[int, float]]
            Correct reference metrics.

        Returns
        -------
        dict
            A dictionary with numeric error metrics (MAE, RMSE, accuracy_score) and
            exact match metrics (Precision, Recall, F1).
        """

        # Treat a missing dict like an empty one instead of failing on .get/.items
        extracted_metrics = extracted_metrics or {}
        ground_truth_metrics = ground_truth_metrics or {}

        absolute_errors = []
        squared_errors = []

        # Exact-match counters:
        # - "true positive": key in both dicts with the same numeric value.
        # - "false negative": key missing from the extraction, or value differs.
        # - "false positive": key extracted but value differs or is not numeric.
        tp = 0
        fn = 0
        fp = 0

        # Single pass over the ground truth feeds both metric families
        for key, gt_val in ground_truth_metrics.items():
            ex_val = extracted_metrics.get(key, None)
            if ex_val is None:
                # Missing in extraction: no numeric error, counts as a miss
                fn += 1
                continue
            try:
                diff = float(gt_val) - float(ex_val)
            except (ValueError, TypeError):
                warnings.warn(
                    f"Could not convert extracted metric {ex_val} to float for key '{key}'. Skipping.")
                fp += 1
                fn += 1
                continue

            absolute_errors.append(abs(diff))
            squared_errors.append(diff ** 2)
            if abs(diff) < 1e-12:
                tp += 1
            else:
                fp += 1
                fn += 1

        # ---------------------------
        #  Numeric Error Computation
        # ---------------------------
        total_items = len(absolute_errors)
        if total_items == 0:
            # No comparable numeric items
            mae = 0.0
            rmse = 0.0
            naive_accuracy = 0.0
        else:
            mae = sum(absolute_errors) / total_items
            rmse = math.sqrt(sum(squared_errors) / total_items)

            # The average is taken over every numeric ground truth value,
            # including keys that were missing from the extraction.
            gt_sum = 0.0
            gt_count = 0
            for val in ground_truth_metrics.values():
                try:
                    gt_sum += float(val)
                except (ValueError, TypeError):
                    # Non-numeric reference values do not contribute to the average
                    continue
                gt_count += 1

            avg_gt = gt_sum / gt_count if gt_count else 0.0
            if avg_gt > 0:
                naive_accuracy = max(0.0, 1.0 - (rmse / avg_gt))
            else:
                naive_accuracy = 0.0

        # --------------------------------
        # Exact Match (Precision/Recall)
        # --------------------------------
        exact_precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        exact_recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        if (exact_precision + exact_recall) > 0:
            exact_f1 = 2.0 * (exact_precision * exact_recall) / \
                (exact_precision + exact_recall)
        else:
            exact_f1 = 0.0

        return {
            "mae": mae,
            "rmse": rmse,
            "accuracy_score": naive_accuracy,
            "exact_match_precision": exact_precision,
            "exact_match_recall": exact_recall,
            "exact_match_f1": exact_f1
        }

    def evaluate_summary(self, generated_summary: str, ground_truth: str) -> Dict[str, Union[float, str]]:
        """
        Evaluate the quality of a generated summary using traditional NLP metrics (BLEU, ROUGE, BERTScore)
        and LLM-based evaluation for human-like scoring.

        Parameters
        ----------
        generated_summary : str
            Summary generated by the pipeline.
        ground_truth : str
            Reference summary to compare against.

        Returns
        -------
        Dict[str, Union[float, str]]
            Float scores under "BLEU", "ROUGE-1", "ROUGE-2", "ROUGE-L",
            "BERTScore-Precision", "BERTScore-Recall", "BERTScore-F1", plus the
            LLM's free-text assessment under "LLM_Evaluation".
        """

        evaluation_results = {}

        # Traditional NLP Metrics

        # BLEU Score (whitespace tokenization, smoothed so short texts do not collapse to 0)
        print("Start caculating BLEU Score...")
        smoothing_function = SmoothingFunction().method1
        bleu_score = sentence_bleu(
            [ground_truth.split()],
            generated_summary.split(),
            smoothing_function=smoothing_function
        )
        evaluation_results["BLEU"] = bleu_score

        # ROUGE Score
        print("Start caculating ROUGE Score...")
        rouge = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        # RougeScorer.score expects (target, prediction): the reference goes first.
        # Only the F-measure is reported, which does not depend on the order,
        # so the reported numbers are unchanged by passing them correctly.
        rouge_scores = rouge.score(ground_truth, generated_summary)
        evaluation_results["ROUGE-1"] = rouge_scores['rouge1'].fmeasure
        evaluation_results["ROUGE-2"] = rouge_scores['rouge2'].fmeasure
        evaluation_results["ROUGE-L"] = rouge_scores['rougeL'].fmeasure

        # BERTScore (candidates first, references second)
        print("Start caculating BERTScore...")
        P, R, F1 = bert_score([generated_summary], [ground_truth], lang="en")
        evaluation_results["BERTScore-Precision"] = P.mean().item()
        evaluation_results["BERTScore-Recall"] = R.mean().item()
        evaluation_results["BERTScore-F1"] = F1.mean().item()

        # LLM-Based Evaluation
        print("Start LLM-Based Evaluation...")
        system_message_content = (
            "You are an expert in financial analysis and language assessment. Evaluate the following generated summary of a financial statement against the reference summary. \n"
            "Assess the quality of the summary based on these criteria: \n\n"
            "1. Fluency: Is the language clear, grammatically correct, and professional?\n"
            "2. Coherence: Does the summary logically flow and connect relevant points effectively?\n"
            "3. Relevance: Does the summary accurately reflect the key financial information and metrics?\n"
            "4. Conciseness: Is the summary brief yet comprehensive, avoiding unnecessary details?\n\n"
            "Provide a score (out of 10) for each criterion and include a brief explanation for your ratings. Conclude with an overall evaluation of the summary's quality."
        )
        system_message = SystemMessage(content=system_message_content)

        # input_variables must name the placeholders actually used in the template
        prompt_template = PromptTemplate(
            template="""
            Reference Summary:
            {GROUND_TRUTH}
    
            Generated Summary:
            {GENERATED_SUMMARY}
            """,
            input_variables=["GROUND_TRUTH", "GENERATED_SUMMARY"]
        )

        evaluation_prompt = prompt_template.format(
            GROUND_TRUTH=ground_truth,
            GENERATED_SUMMARY=generated_summary
        )
        user_message = HumanMessage(content=evaluation_prompt)

        # The evaluator model is configured from the module-level config constants
        llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY,
                         model_name=MODEL_NAME_EVAL, temperature=TEMPERATURE_EVAL)
        response = llm.invoke([system_message, user_message])
        llm_evaluation = response.content.strip()

        evaluation_results["LLM_Evaluation"] = llm_evaluation

        return evaluation_results


## test

In [17]:
# Data extraction evaluation: compare the extracted JSON file against a
# hand-written partial ground truth (revenue section only).
extracted_data_file="./pdfs_data/fwc_sample_financial_statement.json"
# The "xxxxxx" value stands in for a wrong reference value; in the recorded
# output it is the token that keeps partial_recall below 1.0.
ground_truth_data={
    "Revenue": {"Membership subscriptions": {"Last Year": 6748000, "Previous Year": 6571000}, 
    "Interest": {"Last Year": 251000, "Previous Year": 231000}, 
    "Rental income": {"Last Year": 185000, "Previous Year": 244000}, 
    "Other revenue": {"Notes": "3A", "Last Year": 613000, "Previous Year": 655000}, 
    "Total revenue": {"Last Year": 7797000, "Previous Year": "xxxxxx"}} 
}

DE=DataEvaluation()
# The ground truth dict is converted to a string inside evaluate_data_extraction.
eval_data_extraction=DE.evaluate_data_extraction(extracted_data=extracted_data_file, ground_truth_data=ground_truth_data)

# Print each returned metric on its own line.
print("Data Extraction Accuracy:")
for key, value in eval_data_extraction.items():
    print(f"  {key}: {value}")

Data Extraction Accuracy:
  partial_precision: 1.0
  partial_recall: 0.9565217391304348
  extraction_score: 0.9782608695652174


In [18]:
# Extracted metric evaluation: compare extracted metrics with reference values.
# 'Revenue Last Year' is set to 0 here while the ground truth is 7797000, so
# one of the eight metrics is wrong and the error metrics are non-zero.
extracted_metrics={
    'Revenue Last Year': 0,
    'Revenue Previous Year': 7701000,
    'Net Income Last Year': 529000,
    'Net Income Previous Year': 1025000,
    'Operating Expenses Last Year': 7268000,
    'Operating Expenses Previous Year': 6676000,
    'Cash Flow Last Year': 536000,
    'Cash Flow Previous Year': 902000
}

# Reference values for the same eight metric keys.
ground_truth_metrics={
    'Revenue Last Year': 7797000,
    'Revenue Previous Year': 7701000,
    'Net Income Last Year': 529000,
    'Net Income Previous Year': 1025000,
    'Operating Expenses Last Year': 7268000,
    'Operating Expenses Previous Year': 6676000,
    'Cash Flow Last Year': 536000,
    'Cash Flow Previous Year': 902000
}

DE=DataEvaluation()
metric_accuracy=DE.evaluate_key_metric_accuracy(extracted_metrics=extracted_metrics, ground_truth_metrics=ground_truth_metrics)

# Print each returned metric (mae, rmse, accuracy_score, exact-match scores).
print("Key Metric Extraction Accuracy:")
for key, value in metric_accuracy.items():
    print(f"  {key}: {value}")

Key Metric Extraction Accuracy:
  mae: 974625.0
  rmse: 2756655.7864557556
  accuracy_score: 0.32005776988203594
  exact_match_precision: 0.875
  exact_match_recall: 0.875
  exact_match_f1: 0.875


In [19]:
# Summary result Evaluation
generated_summary = """
The company has shown a slight increase in revenue but a notable decrease in net income due to rising 
operating expenses. The significant investment in land and buildings has increased total assets but also led to 
higher liabilities due to new borrowings. While the company's equity position has improved, the decrease in cash 
flow from operations and the substantial cash outflow for investments highlight potential liquidity concerns. The 
company may need to focus on managing expenses and improving operational efficiency to enhance profitability and 
cash flow stability.                                                                                               
"""
ground_truth = """
The company experienced a modest increase in revenue, but a notable decline in net income, primarily driven by 
rising operating expenses. Significant investments in land and buildings have boosted total assets but also increased 
liabilities due to new borrowings. While the equity position has strengthened, the decrease in cash flow from 
operations and substantial cash outflows for investments raise potential liquidity concerns. To address these challenges,
the company may need to prioritize expense management and enhance operational efficiency to improve profitability and 
ensure cash flow stability.                                                                                              
"""

# Score the generated summary against the reference summary defined above.
DE=DataEvaluation()
summary_accuracy=DE.evaluate_summary(generated_summary=generated_summary, ground_truth=ground_truth)

print("\nSummary Report Accuracy:")
for key, value in summary_accuracy.items():
    if key == "LLM_Evaluation":
        # The LLM assessment is multi-line free text: print it line by line,
        # stripped and indented, under its own heading.
        print(f"\n{key} - Detailed Evaluation:")
        lines = value.splitlines()
        for line in lines:
            print(f"  {line.strip()}")
    else:
        # Numeric scores (BLEU, ROUGE-*, BERTScore-*) are printed as-is.
        print(f"  {key}: {value}")

Start caculating BLEU Score...
Start caculating ROUGE Score...
Start caculating BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start LLM-Based Evaluation...

Summary Report Accuracy:
  BLEU: 0.33644277918974813
  ROUGE-1: 0.8160919540229885
  ROUGE-2: 0.5581395348837209
  ROUGE-L: 0.7586206896551724
  BERTScore-Precision: 0.9605397582054138
  BERTScore-Recall: 0.9528236389160156
  BERTScore-F1: 0.956666111946106

LLM_Evaluation - Detailed Evaluation:
  1. **Fluency: 9/10**
  
  The language in the generated summary is clear, grammatically correct, and professional. It effectively communicates the financial situation of the company without any noticeable errors. The only reason for not giving a perfect score is the lack of variation in sentence structure, which could enhance readability.
  
  2. **Coherence: 9/10**
  
  The summary logically flows and connects relevant points effectively. It follows a clear structure, starting with revenue and net income, moving to investments and liabilities, and concluding with equity and liquidity concerns. The transition between these points is smooth, maintaining coherence