In [14]:
import os
import PyPDF2
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
from haystack.nodes import BM25Retriever
from haystack.nodes import FARMReader
from haystack.pipelines import ExtractiveQAPipeline
from pprint import pprint
from haystack.utils import print_answers

In [15]:
# Convert PDF file to text
text = ''
with open('eng12.pdf', 'rb') as f:
    pdf = PyPDF2.PdfReader(f)
    for page_idx in range(len(pdf.pages)):
        page_obj = pdf.pages[page_idx]
        text += page_obj.extract_text()
with open('source_documents/example.txt', 'w') as txt_file:
    txt_file.write(text)

In [16]:
document_store = InMemoryDocumentStore(use_bm25=True)
doc_dir = "source_documents"
files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir)]
indexing_pipeline = TextIndexingPipeline(document_store)
indexing_pipeline.run_batch(file_paths=files_to_index)

Converting files: 100%|██████████| 1/1 [00:00<00:00, 13.34it/s]
Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]We found one or more sentences whose word count is higher than the split length.
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  4.25docs/s]
Updating BM25 representation...: 100%|██████████| 302/302 [00:00<00:00, 6485.38 docs/s]


{'documents': [<Document: {'content': 'GOVERNMENT OF TAMIL NADU\nA publication under Free T extbook Programme of Government of Tamil Nadu\nDepartment of School EducationHIGHER SECONDARY - SECOND YEARCOMMUNICATIVE\nENGLISH\nUntouchability is Inhuman and a Crime\n12th Communicative English Book.indb   1 02/02/19   5:08 PM\nwww.tntextbooks.inGovernment of Tamil Nadu\nFirst Edition -  2019\n(Published under New Syllabus)\nTamil NaduTextbook and Educational\nServices Corporation\nwww.textbooksonline.tn.nic.inState Council of Educational\nResearch and Training\n© SCERT 2019\nPrinting & PublishingContent Creation\nII\nThe wise\npossess allNOT FOR SALE\n12th Communicative English Book.indb   2 02/02/19   5:08 PM\nwww.tntextbooks.inTHE NATIONAL ANTHEM\nJana-gana-mana-adhinayaka jaya he\nBharata-bhagya-vidhata.\nPunjaba-Sindhu-Gujarata-Maratha-\nDravida-Utkala-Banga\nVindhya-Himachala-Yamuna-Ganga\nUchchhala-jaladhi-taranga\nTava subha name jage,\nTava Subha asisa mage,\nGahe tava jaya-gatha.\nJ

In [17]:
retriever = BM25Retriever(document_store=document_store)

In [18]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

In [19]:
pipe = ExtractiveQAPipeline(reader, retriever)

In [23]:
prediction = pipe.run(
    query="what is Paper presentation ?",
    params={
        "Retriever": {"top_k": 2},
        "Reader": {"top_k": 1}
    }
)

Inferencing Samples: 100%|██████████| 1/1 [00:03<00:00,  3.51s/ Batches]


In [24]:
pprint(prediction)

{'answers': [<Answer {'answer': 'all about how\nyou present your topic in front of an audience', 'type': 'extractive', 'score': 0.47157591581344604, 'context': 'DayP APER PRESENTATION\nPaper presentation is all about how\nyou present your topic in front of an audience. Hence, the way you present the topic, the w', 'offsets_in_document': [{'start': 253, 'end': 313}], 'offsets_in_context': [{'start': 45, 'end': 105}], 'document_ids': ['ead6cee3799b7efe1ede2006834a4f49'], 'meta': {'_split_id': 77}}>],
 'documents': [<Document: {'content': 'receives data displays position\ntransmits signal relays information Listening\n12th Communicative English Book.indb   43 02/02/19   5:09 PM\nwww.tntextbooks.inPage 45 Communicative English Page 44 Have Another DayP APER PRESENTATION\nPaper presentation is all about how\nyou present your topic in front of an audience. Hence, the way you present the topic, the way you put across your points, presentation style and language are important.\nPaper presentat

In [26]:
print_answers(prediction)

'Query: what is Paper presentation ?'
'Answers:'
[   <Answer {'answer': 'all about how\nyou present your topic in front of an audience', 'type': 'extractive', 'score': 0.47157591581344604, 'context': 'DayP APER PRESENTATION\nPaper presentation is all about how\nyou present your topic in front of an audience. Hence, the way you present the topic, the w', 'offsets_in_document': [{'start': 253, 'end': 313}], 'offsets_in_context': [{'start': 45, 'end': 105}], 'document_ids': ['ead6cee3799b7efe1ede2006834a4f49'], 'meta': {'_split_id': 77}}>]


In [25]:
prediction['answers'][0].context

'DayP APER PRESENTATION\nPaper presentation is all about how\nyou present your topic in front of an audience. Hence, the way you present the topic, the w'