In [None]:
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, BM25Retriever
from haystack.schema import Document
from haystack.utils import convert_files_to_docs, fetch_archive_from_http, print_answers

# Download the tutorial's sample archive and unpack it into doc_dir.
# NOTE(review): doc_dir is a machine-specific absolute path — adjust for your environment.
doc_dir = "/mnt/sda/haystack/enrich_data"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip"
fetch_archive_from_http(
    url=s3_url,
    output_dir=doc_dir,
)

In [None]:
# Time to preprocess. Note that you can also run the document classifier before the
# PreProcessor, i.e. before the documents are split.

# Convert every file found in doc_dir into Document objects.
all_docs = convert_files_to_docs(dir_path=doc_dir)

# Sliding-window split: units of length 10 that overlap by 3, without forcing
# splits to respect sentence boundaries.
preprocessor_sliding_window = PreProcessor(
    split_overlap=3,
    split_length=10,
    split_respect_sentence_boundary=False,
)
docs_sliding_window = preprocessor_sliding_window.process(all_docs)

In [None]:
# Candidate classes for the zero-shot classification task.
zero_shot_labels = ["music", "natural language processing", "history"]

doc_classifier = TransformersDocumentClassifier(
    model_name_or_path="cross-encoder/nli-distilroberta-base",
    task="zero-shot-classification",
    labels=zero_shot_labels,
    batch_size=16,
)

# Classify the split documents (on GPU, per the original note); batch_size is set
# so that we do not run out of memory.
classified_docs = doc_classifier.predict(docs_sliding_window)

# Show the first classified document, including its classification result.
first_classified_doc = classified_docs[0]
print(first_classified_doc.to_dict())

In [None]:
# Initialize DocumentStore and index documents
import os
from getpass import getpass

# Credentials must not be committed with the notebook: they are read from the
# environment, and the password is prompted for interactively when
# ELASTIC_PASSWORD is not set. Username and CA bundle path keep their previous
# values as defaults so existing setups continue to work.
es_username = os.environ.get("ELASTIC_USERNAME", "elastic")
es_password = os.environ.get("ELASTIC_PASSWORD") or getpass("Elasticsearch password: ")
es_ca_certs = os.environ.get("ELASTIC_CA_CERTS", "/home/tanvir/work/qa-experiment/http_ca.crt")

document_store = ElasticsearchDocumentStore(
    host="localhost",
    port=9200,
    username=es_username,
    password=es_password,
    index="document",
    scheme="https",
    verify_certs=True,
    ca_certs=es_ca_certs,
)

# Start from an empty index. delete_documents() is used instead of the deprecated
# delete_all_documents(), matching the call made in the indexing-pipeline cell.
document_store.delete_documents()
document_store.write_documents(classified_docs)

# check if indexed docs contain classification results
test_doc = document_store.get_all_documents()[0]
print(f'document {test_doc.id} with content \n\n{test_doc.content}\n\nhas label {test_doc.meta["classification"]["label"]}')

In [None]:
# For querying the data, all we have to do to filter for one of our classes is to set a filter on "classification.label".
from haystack.pipelines import ExtractiveQAPipeline

# Extractive QA pipeline: a BM25 retriever over the document store feeds a reader model.
retriever = BM25Retriever(document_store=document_store)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
pipe = ExtractiveQAPipeline(reader, retriever)

# Per-node run parameters: the retriever is limited to documents whose
# "classification.label" is "music".
query_params = {
    "Retriever": {"top_k": 10, "filters": {"classification.label": ["music"]}},
    "Reader": {"top_k": 5},
}
prediction = pipe.run(query="What is heavy metal?", params=query_params)

print_answers(prediction, details="high")

In [7]:
# Now let's wrap everything up with an indexing pipeline

from pathlib import Path
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter

# Components for routing files by type and converting each type to text.
file_type_classifier = FileTypeClassifier()
text_converter = TextConverter()
pdf_converter = PDFToTextConverter()
docx_converter = DocxToTextConverter()

# Pipeline graph: file-type routing -> per-type converter -> preprocessing ->
# classification -> document store. Nodes are registered in dependency order.
indexing_pipeline_with_classification = Pipeline()
pipeline_nodes = [
    (file_type_classifier, "FileTypeClassifier", ["File"]),
    (text_converter, "TextConverter", ["FileTypeClassifier.output_1"]),
    (pdf_converter, "PdfConverter", ["FileTypeClassifier.output_2"]),
    (docx_converter, "DocxConverter", ["FileTypeClassifier.output_4"]),
    (preprocessor_sliding_window, "Preprocessor", ["TextConverter", "PdfConverter", "DocxConverter"]),
    (doc_classifier, "DocumentClassifier", ["Preprocessor"]),
    (document_store, "DocumentStore", ["DocumentClassifier"]),
]
for node_component, node_name, node_inputs in pipeline_nodes:
    indexing_pipeline_with_classification.add_node(
        component=node_component, name=node_name, inputs=node_inputs
    )

# Empty the index before re-indexing through the pipeline.
document_store.delete_documents()

# Scan the directory once and group the files by extension; each run receives
# files of a single type.
doc_dir_entries = list(Path(doc_dir).iterdir())
txt_files = [f for f in doc_dir_entries if f.suffix == ".txt"]
pdf_files = [f for f in doc_dir_entries if f.suffix == ".pdf"]
docx_files = [f for f in doc_dir_entries if f.suffix == ".docx"]

for file_batch in (txt_files, pdf_files, docx_files):
    # Skip file types that are absent from doc_dir.
    # NOTE(review): FileTypeClassifier appears to inspect the first path of the
    # batch, so an empty list would presumably raise — confirm against the
    # installed Haystack version.
    if file_batch:
        indexing_pipeline_with_classification.run(file_paths=file_batch)

# Display one indexed document to confirm the classification metadata was stored.
document_store.get_all_documents()[0]

pdftotext version 22.02.0
Copyright 2005-2022 The Poppler Developers - http://poppler.freedesktop.org
Copyright 1996-2011 Glyph & Cog, LLC
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 160.18docs/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 65.67docs/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 3228.87docs/s]


<Document: {'content': 'Classics or classical studies is the study of classical antiquity,', 'content_type': 'text', 'score': None, 'meta': {'_split_id': 0, 'classification': {'sequence': 'Classics or classical studies is the study of classical antiquity,', 'labels': ['music', 'natural language processing', 'history'], 'scores': [0.3458789885044098, 0.3373076319694519, 0.3168133497238159], 'label': 'music'}}, 'embedding': None, 'id': '5f06721d4e5ddd207e8de318274a89b6'}>

In [8]:
# The pipeline definition can be persisted to a YAML file.
# NOTE(review): the YAML presumably contains the components' init parameters —
# check that the Elasticsearch password does not end up in the saved file.
pipeline_yaml_path = "indexing_pipeline_with_classification.yaml"
indexing_pipeline_with_classification.save_to_yaml(pipeline_yaml_path)