In [2]:
"""
Question answering over an unstructured PDF (text + tables) using Haystack's TableTextRetriever and TableReader.

Similar to https://docs.haystack.deepset.ai/docs/table_qa
"""
import os
from pathlib import Path

from haystack import Document
from haystack.nodes import AzureConverter, PreProcessor, TableTextRetriever, TableReader
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.utils import print_answers

# Credential handed to the Azure converter below. It is looked up in the
# environment so the key never lives in the notebook; the value is None when
# the variable is not set.
AZURE_CONVERTER_KEY = os.getenv("AZURE_CONVERTER_KEY")

In [3]:
# Azure Cognitive Services endpoint the converter talks to.
azure_endpoint = "https://azureconverter.cognitiveservices.azure.com/"

# Converter used to turn the PDF into Haystack documents.
# NOTE(review): save_json=True presumably keeps the raw service response as a
# JSON file -- confirm against the AzureConverter docs where it is written.
converter = AzureConverter(endpoint=azure_endpoint, credential_key=AZURE_CONVERTER_KEY, save_json=True)

In [4]:
# Location of the report to convert. The fallback is the original
# machine-specific path; set the TABLE_QA_PDF_PATH environment variable to run
# the notebook on another machine without editing this cell.
PDF_PATH = Path(
    os.environ.get("TABLE_QA_PDF_PATH")
    or "/home/tomw/unifi-pdf-llm/data/test/Sasol Sustainability Report_2021_22Sep21_10h30_0_0 - short.pdf"
)

# Fail before handing the path to the converter if the file is not there, and
# say how to point the notebook at the right file.
if not PDF_PATH.is_file():
    raise FileNotFoundError(
        f"PDF not found at {PDF_PATH}. "
        "Set the TABLE_QA_PDF_PATH environment variable to the file to convert."
    )

# meta=None: no extra metadata is passed along with the file.
docs = converter.convert(file_path=PDF_PATH, meta=None)

In [16]:
# Size of the embedding vectors the store is configured for.
# NOTE(review): presumably this has to agree with the output size of the
# encoders given to the retriever -- confirm if the models are changed.
embedding_size = 512

# In-memory store that will hold the converted documents.
document_store = InMemoryDocumentStore(embedding_dim=embedding_size)

In [17]:
# Put the documents produced by the converter into the document store.
document_store.write_documents(documents=docs)

In [18]:
# One encoder model per input type handled by the retriever: the query,
# text passages, and tables.
encoder_models = {
    "query_embedding_model": "deepset/bert-small-mm_retrieval-question_encoder",
    "passage_embedding_model": "deepset/bert-small-mm_retrieval-passage_encoder",
    "table_embedding_model": "deepset/bert-small-mm_retrieval-table_encoder",
}
retriever = TableTextRetriever(document_store=document_store, **encoder_models)

# Compute embeddings for everything already written to the store, using the
# retriever defined above.
document_store.update_embeddings(retriever=retriever)

Documents Processed: 10000 docs [00:01, 8657.42 docs/s]      


In [19]:
# Reader node that receives the retriever's output in the pipeline below.
# It is built with no arguments, so it runs entirely on TableReader's
# library defaults.
reader = TableReader()

In [20]:
# Two-stage pipeline: the query goes to the retriever, and the retriever's
# output goes to the reader.
table_qa_pipeline = Pipeline()

# (component, node name, upstream node names), in the order they are added.
node_specs = (
    (retriever, "TableTextRetriever", ["Query"]),
    (reader, "TableReader", ["TableTextRetriever"]),
)
for node, node_name, upstream in node_specs:
    table_qa_pipeline.add_node(component=node, name=node_name, inputs=upstream)

In [32]:
# Send a single natural-language question through the retriever -> reader
# pipeline and keep the full result for inspection.
question = "How many employees were there in 2018?"
prediction = table_qa_pipeline.run(query=question)

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


In [33]:
# Show the raw result returned by the pipeline run; the saved output includes
# the 'query' and the list of 'answers' with their table context.
prediction

{'query': 'How many employees were there in 2018?',
 'answers': [<Answer {'answer': '31 270', 'type': 'extractive', 'score': 1.0, 'context': [['Human Capital - Our people', 'Footnote', '2021', '2020', '2019', '2018', 'Level of assurance 2021'], ['Employee numbers', '1', '28 949', '31 001', '31 429', '31 270', ''], ['Employee turnover', '1', '3 869', '1 936', '1780', '1 560', ''], ['Safety', '', '', '', '', '', ''], ['Recordable Case Rate', '2', '0,26', '0,27', '0,26', '0,27', 'Limited'], ['- Employee', '', '0,29', '0,31', '0,31', '0,30', ''], ['- Service provider', '', '0,22', '0,23', '0,22', '0,25', ''], ['Lost Work Day Case Rate (LWDCR)', '', '0,14', '0,11', '0,10', '0,11', ''], ['- Employee', '', '0,16', '0,14', '0,13', '0,15', ''], ['- Service provider', '', '0,11', '0,08', '0,08', '0,09', ''], ['Employee and service provider fatalities*', '', '2', '6', '3', '4', 'Limited'], ['- Employee', '', '1', '3', '2', '3', ''], ['- Service provider', '', '1', '3', '1', '1', ''], ['Employee a

In [107]:
# Take the text of the first answer in the result.
# NOTE(review): presumably the list is ranked best-first -- confirm.
# NOTE(review): the saved output of this cell (execution count 107) differs
# from the first answer displayed for the earlier run (execution count 33),
# so `prediction` was likely reassigned in between; re-run the notebook top
# to bottom before relying on the printed value.
answers = prediction["answers"]

# Guard the empty case: a query with no answers yields None here instead of
# raising IndexError on answers[0].
answer = answers[0].answer if answers else None
print(answer)

31 001
