In [3]:
import pandas as pd
import re


# doc load
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import TokenTextSplitter


# schema
from typing import List
from langchain_core.pydantic_v1 import BaseModel, Field


# extractor
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate
from langchain_core.messages import SystemMessage

FILE = "../data/input/1. SGX Group reports 1H FY2024 net profit of S$251 million.pdf"

## Read Files

In [5]:
# Load PDF file and split into pages
pages = PyPDFLoader(FILE).load_and_split()

# Clean up page content by replacing multiple newlines with a single one
text = " ".join(re.sub("\n \n+", "\n", page.page_content) for page in pages)

text

'Singapore Exchange Limited  \nCompany Reg No. 199904940D  \n2 Shenton Way, #02 -02 SGX Centre 1, Singapore 068804  \nmain: +65 6236 8888   fax: +65 6535 6994  \nsgx.com  \n  \n   \n \n \nNews Release  \n1 February 202 4 \nSGX Group reports 1H FY2024 net profit of S$251 million  \n1H FY202 4 Financial Summary  \n 1H FY2024  1H FY2024  Adjusted * \nRevenue  S$592.2 million, up 3.6%  \nEBITDA  S$344.6 million, up 3.2%  S$344.6 million, up 3.2%  \nNet profit attributable to equity holders \nof the company (NPAT)  S$281.6 million, down 1.0%  S$251.4 million, up 6.2%  \nEarnings per share (EPS)  26.3 cents  23.5 cents  \nInterim quarterly dividend per share  8.5 cents, up 0.5 cents  \n* Adjusted EBITDA, NPAT and EPS exclude certain non -cash and non -recurring items that have less bearing on SGX’s \noperating performance. Hence, they better reflect the group’s underlying performance. Adjusted figures are non -SFRS(I) \nmeasures. Please refer to Section 7 of our financial results for reconci

In [6]:
text_splitter = TokenTextSplitter(
    # you may try different chunk_size and chunk_overlap for a better extraction
    chunk_size=1000,
    chunk_overlap=20,
)

texts = text_splitter.split_text(text)
len(texts)

4

## The Schema

In [6]:
# Reference
# * https://python.langchain.com/v0.2/docs/tutorials/extraction/
# * https://blog.langchain.dev/open-source-extraction-service/

class FinancialCategory(BaseModel):
    """Information about a financial category"""

    # ^ Doc-string for the entity FinancialCategory.
    # This doc-string is sent to the LLM as the description of the schema FinancialCategory,
    # and it can help to improve extraction results.

    # Note that:
    # 1. In the original article, each field is an `optional` -- this allows the model to decline to extract it! You could have a try.
    # 2. Each field has a `description` -- this description is used by the LLM.
    # Having a good description can help improve extraction results.
    name: str = Field(description="Name of the financial figure, such as total revenue.")
    value: str = Field(description="Complete value of the financial figure, including numbers, currency symbols, and scale (e.g., $123 million).")
    period: str = Field(description="Reported period of the financial figure.")
    source_sentence: str = Field(description="Source sentence of text where financial figure was found.")


class Data(BaseModel):
    """Extracted data about financial category."""

    # Creates a model so that we can extract multiple entities.
    finfigure: List[FinancialCategory]

## The Extractor

In [7]:
prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=
            (
                "Your goal is to extract structured information from the user's input that matches the form described below. "
                "When extracting information, please make sure it matches the type information exactly. "
                "Do not add any attributes that do not appear in the schema shown below. "
                """
                TypeScript
                financial_figure: { // Extract financial figures from the text, including their names, values, reported periods, and source sentences.
                    name: string // Name of the financial figure, such as revenue, expense, NPAT, EBITDA
                    value: string // Complete value of the financial figure, including numbers, currency symbols, and scale (e.g., $123 million)
                    period: string // Reported period for the financial figure.
                    source_sentence: string // The sentence from the text where this financial figure is mentioned.
                }                
                """
                "Please output the extracted information in JSON format. Do not output anything except for the extracted information. "
                "Do not add any clarifying information. Do not add any fields that are not in the schema. If the text contains attributes that do not appear in the schema, please return null. "
                "All output must be in JSON format and follow the schema specified above. Wrap the JSON in <json> tags."
                "Below are some examples"
                """
                Input: In the first quarter of 2021, Company A reported a revenue of $500 million
                Output: <json>{"financial_figure": {"name": "revenue", "value": "$500 million", "period": "first quarter of 2021", "source_sentence": "In the first quarter of 2021, Company A reported a revenue of $500 million."}}</json>
                Input: The total expenses for fiscal year 2020 were approximately $200 million
                Output: <json>{"financial_figure": {"name": "expenses", "value": "$200 million", "period": "fiscal year 2020", "source_sentence": "The total expenses for fiscal year 2020 were approximately $200 million."}}</json>                
                """
            )
        ),
        # Placeholder for user input.
        HumanMessagePromptTemplate.from_template("{text}"),
    ]
)

In [8]:
prompt.messages

[SystemMessage(content='Your goal is to extract structured information from the user\'s input that matches the form described below. When extracting information, please make sure it matches the type information exactly. Do not add any attributes that do not appear in the schema shown below. \n                TypeScript\n                financial_figure: { // Extract financial figures from the text, including their names, values, reported periods, and source sentences.\n                    name: string // Name of the financial figure, such as revenue, expense, NPAT, EBITDA\n                    value: string // Complete value of the financial figure, including numbers, currency symbols, and scale (e.g., $123 million)\n                    period: string // Reported period for the financial figure.\n                    source_sentence: string // The sentence from the text where this financial figure is mentioned.\n                }                \n                Please output the extract

In [9]:
from langchain_mistralai import ChatMistralAI
from langchain_openai import ChatOpenAI

llm = ChatMistralAI(
    api_key="",
    model_name="mistral-large-latest",
    temperature=0,
    max_tokens=32000
)

# llm = ChatOpenAI(
#     api_key="",
#     model="gpt-4-1106-preview",
#     temperature=0)

runnable = prompt | llm.with_structured_output(
    schema=Data,
    include_raw=True,
    )

## Extract

In [18]:
extractions = [runnable.invoke({"text": text}) for text in texts]
extractions

[{'raw': AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'BwHmaFogK', 'function': {'name': 'Data', 'arguments': '{"finfigure": [{"name": "revenue", "value": "S$592.2 million", "period": "1H FY2024", "source_sentence": "Revenue increased 3.6% to S$592.2 million ( S$571.4 million), mainly driven by higher revenues from Currencies and Commodities2 and Platform and Others, partially offset by lower Equities \\u2013 Cash2 and Equities \\u2013 Derivatives2 revenue."}, {"name": "adjusted EBITDA", "value": "S$344.6 million", "period": "1H FY2024", "source_sentence": "Adjusted EBITDA rose to S$344.6 million ( S$334.1 million) , while adjusted earnings per share increased to 23.5 cents (22.2 cents)."}, {"name": "adjusted net profit", "value": "S$251.4 million", "period": "1H FY2024", "source_sentence": "Singapore Exchange (SGX Group ) today reported 1H FY202 4 adjusted net profit of S$ 251.4 million (S$2 36.8 million)."}]}'}}]}, response_metadata={'token_usage': {'prompt_tokens': 

In [17]:
result_df = pd.DataFrame()

for extraction in extractions:
    result_df = pd.concat([
        result_df,
        pd.DataFrame(data.dict() for data in extraction['parsed'].finfigure)
        ])
result_df

Unnamed: 0,name,value,period,source_sentence
0,revenue,S$592.2 million,1H FY2024,Revenue increased 3.6% to S$592.2 million ( S$...
1,adjusted EBITDA,S$344.6 million,1H FY2024,Adjusted EBITDA rose to S$344.6 million ( S$33...
2,adjusted net profit,S$251.4 million,1H FY2024,Singapore Exchange (SGX Group ) today reported...
0,FICC revenue,$151.9 million,not specified,FICC revenue increased 28.1% to S$151.9 millio...
1,Fixed Income revenue,$3.9 million,not specified,Fixed Income revenue decreased 8.4% to S$3.9 m...
2,Listing revenue,$2.5 million,not specified,"Listing revenue: S$2.5 million, down 1.4% from..."
3,Corporate actions and other revenue,$1.4 million,not specified,Corporate actions and other revenue: S$1.4 mil...
4,Currencies and Commodities revenue,$148.0 million,not specified,Currencies and Commodities revenue increased 2...
5,OTC FX revenue,$40.9 million,not specified,OTC FX revenue was S$40.9 million ( S$36.2 mil...
6,Trading and clearing revenue,$111.3 million,not specified,"Trading and clearing revenue: S$111.3 million,..."


In [19]:
result_df.to_csv('../data/output/extractions.csv', index=False)

In [10]:
result_df = pd.read_csv('../data/output/extractions.csv')
print(result_df.to_markdown())

|    | name                                                                              | value           | period               | source_sentence                                                                                                                                                                                                                          |
|---:|:----------------------------------------------------------------------------------|:----------------|:---------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|  0 | revenue                                                                           | S$592.2 million | 1H FY2024            | Revenue increased 3.6% to S$592.2 million ( S$571.4 million), mainly driven by higher revenues from Currencies and Commodities2 and Pl