In [63]:
import dotenv

dotenv.load_dotenv()

True

### Data format

Sometimes data is not available as plaintext but as pdf files.
In this case one need to pay close attention to the format of extracted output. 

Tables from pdf files are often extracted as list of 

As an example we will use a small artifical table with subscription plans and two pages from the financial report of the European Investment Bank (EIB) which we also included in data folder.

source:
https://www.eib.org/attachments/lucalli/20220270_eib_financial_report_2022_en.pdf

In [4]:
# those are installed with requirements.txt
from langchain.document_loaders import PDFMinerLoader, PDFPlumberLoader, PyPDFLoader, UnstructuredPDFLoader

# whole available list of loaders
# each of them may require installation of additional packages
# from langchain.document_loaders import (
#     AmazonTextractPDFLoader,
#     MathpixPDFLoader,
#     OnlinePDFLoader,
#     PDFMinerLoader,
#     PDFMinerPDFasHTMLLoader,
#     PDFPlumberLoader,
#     PyMuPDFLoader,
#     PyPDFDirectoryLoader,
#     PyPDFium2Loader,
#     PyPDFLoader,
#     UnstructuredPDFLoader,
# )

In [5]:
PATH_SMALL = './data/small_table.pdf'
PATH_BIG = './data/financial_graphs.pdf'
# PATH_BIG = './data/financial_report.pdf'

In [9]:
# probably PyPDFLoader is the best

# doc = PDFMinerLoader(PATH_SMALL).load()[0].page_content
# doc = PDFPlumberLoader(PATH_SMALL).load()[0].page_content
doc = PyPDFLoader(PATH_SMALL).load()[0].page_content
# doc = UnstructuredPDFLoader(PATH_SMALL).load()[0].page_content

print(doc)

Service Period Subscription Price
TVmonthS 20 USD
M 30 USD
L 400 USD
yearS 200 USD
M 300 USD
L 400 USD
InternetmonthS 10 USD
M 25 USD
L 50 USD
yearS 100 USD
M 250 USD
L 500 USD
PhonemonthS 6 USD
M 12 USD
L 18 USD
yearS 60 USD
M 120 USD
L 180 USD


Let's check if the model can get the answer from this data.

In [13]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)
from langchain.chat_models import ChatOpenAI
from langchain.schema import StrOutputParser

system_template = """Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Check if the context is relevant to the question. If it is not, ust say that you don't know, don't try to make up an answer.
----------------
{context}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

qa_chain = CHAT_PROMPT | ChatOpenAI(model='gpt-3.5-turbo', temperature=0.) | StrOutputParser()

qa_chain.invoke({
    'context': doc,
    'question': "What is the Internet S subscription price for one year?"
})

'The Internet S subscription price for one year is 100 USD.'

In [15]:
qa_chain.invoke({
    'context': doc,
    'question': "What is the cheapest subscription price for one year for TV?"
})

'The cheapest subscription price for one year for TV is 200 USD.'

In [None]:
qa_chain.invoke({
    'context': doc,
    'question': "What is the Internet S subscription price for one year in Lithuania?"
})

'The Internet S subscription price for one year in Lithuania is 100 USD.'

In [16]:
# doc2 = PDFMinerLoader(PATH_BIG).load()[0].page_content
# doc2 = PDFPlumberLoader(PATH_BIG).load()[0].page_content
doc2 = PyPDFLoader(PATH_BIG).load()[0].page_content
# doc2 = UnstructuredPDFLoader(PATH_BIG).load()[0].page_content

print(doc2)

3EUROPEAN INVESTMENT BANK: HIGHLIGHTS 2022Member StatesOther outside EU*Enlargement countriesEastern NeighbourhoodAfrica and Middle EastAsiaLatin America & Caribbean86%1%1%0%8%2%2%* Computed under CRR/CRD IV and based on the Bank’s standalone financial statements.Own funds of EUR 78 billion represent 14% of the balance sheet total.
The increase in the CET1 ratio is largely driven by the positive development of the stock and the increase in Tier 1 capital generated from the annual surplus accumulation.OWN FUNDS COMPOSITION (EUR m) 
COMMON EQUITY TIER 1 CET1 RATIO*
SIGNATURES BY REGION IN 202231/12/202231/12/2021Called capitalReservesProﬁt for the ﬁnancial year22 19122 19153 8782 36651 3122 566
56 7629 327
31/12/202231/12/202135.1%32.3%56 7629 327
*  Other outside EU amounts to 0.04% and refers to signatures for counterparties located in Norway and Switzerland.


In [17]:
qa_chain.invoke({
    'context': doc2,
    'question': "What percentage of signatures per region in 2022 are from Asia?"
})

# 8%

'Based on the given information, the percentage of signatures from Asia in 2022 is not provided.'

In [18]:
qa_chain.invoke({
    'context': doc2,
    'question': "What is the reason for the increase in the CET1 ratio?"
})

# The increase in the CET1 ratio is largely driven by the positive development of the stock and the increase in Tier 1 capital generated from the annual surplus accumulation.

'The reason for the increase in the CET1 ratio is largely driven by the positive development of the stock and the increase in Tier 1 capital generated from the annual surplus accumulation.'

### MetaData filters

In [22]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI

from langchain.chains.query_constructor.schema import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.docstore.document import Document

In [23]:
docs = [
    Document(
        page_content="doc content 1",
        metadata={
            "offer": "TV",
            "period": 'year',
            "is_available": True,
            "subscription": "S",
            "price": 100
        },
    ),
    Document(
        page_content="doc content 2",
        metadata={
            "offer": "Internet",
            "period": 'year',
            "is_available": True,
            "subscription": "M",
            "price": 250
        },
    ),
    Document(
        page_content="doc content 3",
        metadata={
            "offer": "Internet",
            "period": 'year',
            "is_available": False,
            "subscription": "M",
            "price": 100
        },
    ),
    Document(
        page_content="doc content 4",
        metadata={
            "offer": "TV",
            "period": 'year',
            "is_available": False,
            "subscription": "L",
            "price": 150
        },
    ),
]

metadata_field_info = [
    AttributeInfo(
        name="offer",
        description="The offer type, can be one of ['TV', 'Internet']",
        type="string",
    ),
    AttributeInfo(
        name="period",
        description="The period of the offer, can be one of ['year', 'month']",
        type="integer",
    ),
    AttributeInfo(
        name="is_available",
        description="Whether the offer is available",
        type="boolean",
    ),
    AttributeInfo(
        name="subscription", 
        description="The subscription type, can be one of ['S', 'M', 'L']", 
        type="string"
    ),
    AttributeInfo(
        name="price",
        description="The price of the offer",
        type="integer",
    ),
]

In [25]:
embeddings = OpenAIEmbeddings()

vectorstore = Chroma.from_documents(docs, embeddings)

llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0.)

document_content_description = "Details for all the products offers"

retriever = SelfQueryRetriever.from_llm(
    llm, vectorstore, document_content_description, metadata_field_info, verbose=True
)


In [29]:
retriever.query_constructor.invoke(
    {
        "query" : "What is the cheapest subscription for one year for TV?"
    })

StructuredQuery(query=' ', filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='offer', value='TV'), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='period', value=1), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='subscription', value='S')]), limit=None)

In [27]:
retriever.query_constructor.invoke(
    {
        "query" : "What are some unavailable offers?"
    })

StructuredQuery(query=' ', filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='is_available', value=False), limit=None)