In [53]:
import os
from pathlib import Path

from haystack import Document
from haystack.nodes import AzureConverter, PreProcessor, TableTextRetriever, TableReader
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.utils import print_answers

# Credential for the Azure converter, taken from the environment so the secret
# is never written into the notebook. Evaluates to None when the variable is unset.
AZURE_CONVERTER_KEY = os.getenv("AZURE_CONVERTER_KEY")

In [4]:
# Converter client pointed at the Azure Cognitive Services endpoint below and
# authenticated with the key loaded from the environment.
converter = AzureConverter(
    credential_key=AZURE_CONVERTER_KEY,
    endpoint="https://azureconverter.cognitiveservices.azure.com/",
    # NOTE(review): presumably persists the raw service response as a JSON file —
    # confirm against the installed Haystack version.
    save_json=True,
)

In [5]:
# Location of the PDF to convert. The default is the original author's local
# copy; set the PDF_PATH environment variable to run the notebook on another
# machine without editing this cell.
PDF_PATH = Path(
    os.environ.get(
        "PDF_PATH",
        "/home/tomw/unifi-pdf-llm/data/test/Sasol Sustainability Report_2021_22Sep21_10h30_0_0 - short.pdf",
    )
)

# Fail fast with an actionable message before any request is sent to Azure.
if not PDF_PATH.is_file():
    raise FileNotFoundError(
        f"PDF not found at {PDF_PATH}; set the PDF_PATH environment variable to the report's location."
    )

# Convert the PDF into Haystack documents; no extra metadata is attached.
docs = converter.convert(file_path=PDF_PATH, meta=None)

In [109]:
# Inspect the second converted document; the output below shows it holds a table.
docs[1].content

Unnamed: 0,Human Capital - Our people,Footnote,2021,2020,2019,2018,Level of assurance 2021
0,Employee numbers,1,28 949,31 001,31 429,31 270,
1,Employee turnover,1,3 869,1 936,1780,1 560,
2,Safety,,,,,,
3,Recordable Case Rate,2,026,027,026,027,Limited
4,- Employee,,029,031,031,030,
5,- Service provider,,022,023,022,025,
6,Lost Work Day Case Rate (LWDCR),,014,011,010,011,
7,- Employee,,016,014,013,015,
8,- Service provider,,011,008,008,009,
9,Employee and service provider fatalities*,,2,6,3,4,Limited


In [41]:
# In-memory store that will hold the converted documents.
# NOTE(review): embedding_dim=512 presumably matches the output size of the
# bert-small encoders configured on the retriever — confirm.
document_store = InMemoryDocumentStore(embedding_dim=512)

In [42]:
# Index every document produced by the converter.
document_store.write_documents(docs)

In [43]:
# Retriever with separate encoders for queries, text passages and tables,
# all bound to the document store created earlier.
retriever = TableTextRetriever(
    query_embedding_model="deepset/bert-small-mm_retrieval-question_encoder",
    passage_embedding_model="deepset/bert-small-mm_retrieval-passage_encoder",
    table_embedding_model="deepset/bert-small-mm_retrieval-table_encoder",
    document_store=document_store,
)

# Compute embeddings for the stored documents with the retriever's encoders.
document_store.update_embeddings(retriever=retriever)

Documents Processed: 10000 docs [00:00, 359003.01 docs/s]    


In [48]:
# Table reader built with the library defaults (no model is specified here).
reader = TableReader()

In [49]:
# Two-stage QA pipeline: the query feeds the retriever, whose output feeds the reader.
table_qa_pipeline = Pipeline()
table_qa_pipeline.add_node(
    name="TableTextRetriever",
    component=retriever,
    inputs=["Query"],
)
table_qa_pipeline.add_node(
    name="TableReader",
    component=reader,
    inputs=["TableTextRetriever"],
)

In [105]:
# Ask a question whose answer sits in a table cell of the converted report.
prediction = table_qa_pipeline.run(
    query="How many employees were there in 2020?",
)

  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]
  text = normalize_for_match(row[col_index].text)
  cell = row[col_index]


In [106]:
# Show the raw pipeline result (a dict with the query and the extracted answers).
prediction

{'query': 'How many employees were there in 2020?',
 'answers': [<Answer {'answer': '31 001', 'type': 'extractive', 'score': 1.0, 'context': [['Human Capital - Our people', 'Footnote', '2021', '2020', '2019', '2018', 'Level of assurance 2021'], ['Employee numbers', '1', '28 949', '31 001', '31 429', '31 270', ''], ['Employee turnover', '1', '3 869', '1 936', '1780', '1 560', ''], ['Safety', '', '', '', '', '', ''], ['Recordable Case Rate', '2', '0,26', '0,27', '0,26', '0,27', 'Limited'], ['- Employee', '', '0,29', '0,31', '0,31', '0,30', ''], ['- Service provider', '', '0,22', '0,23', '0,22', '0,25', ''], ['Lost Work Day Case Rate (LWDCR)', '', '0,14', '0,11', '0,10', '0,11', ''], ['- Employee', '', '0,16', '0,14', '0,13', '0,15', ''], ['- Service provider', '', '0,11', '0,08', '0,08', '0,09', ''], ['Employee and service provider fatalities*', '', '2', '6', '3', '4', 'Limited'], ['- Employee', '', '1', '3', '2', '3', ''], ['- Service provider', '', '1', '3', '1', '1', ''], ['Employee a

In [107]:
# Pull the text of the first answer out of the pipeline result.
# An empty answer list would raise IndexError here.
top_answer = prediction["answers"][0]
answer = top_answer.answer
print(answer)

31 001
