# Indexing pipeline: Save and load document store

In [None]:
import os
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TransformersSummarizer
from transformers import GenerationConfig
from haystack.nodes import PromptNode
from haystack.nodes import EmbeddingRetriever

In [35]:
# Per-iteration registries: each maps an experiment iteration number to the
# component object that was built for that iteration.
retriever_dict = {}
text_converter_dict = {}
preprocessor_dict = {}
document_store_dict = {}
prompt_node_dict = {}

In [36]:
iteration = 6

# File converter: https://docs.haystack.deepset.ai/docs/file_converters
text_converter = TextConverter()
text_converter_dict[iteration] = text_converter

# Preprocessor: https://docs.haystack.deepset.ai/docs/preprocessor
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    remove_substrings=None,
    # Unit for splitting the document. Can be "word", "sentence", or "passage".
    # Set to None to disable splitting.
    split_by=None,
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0,
    max_chars_check=10000,
)
preprocessor_dict[iteration] = preprocessor

In [16]:
from datetime import datetime

def append_timestamp(string, ext=None, now=None):
    """Return `string` suffixed with a minute-resolution timestamp.

    Args:
        string: Base name to decorate.
        ext: Optional file extension. A leading dot is tolerated, so both
            ``'db'`` and ``'.db'`` produce ``'<string>_<timestamp>.db'``.
            Falsy values (``None``, ``''``) add no extension.
        now: Optional ``datetime`` to format instead of the current time.
            Pass the same value to several calls so that related filenames
            share one timestamp even if the clock ticks over between calls.

    Returns:
        ``'<string>_<YYYY-MM-DD_HHMM>'`` or ``'<string>_<YYYY-MM-DD_HHMM>.<ext>'``.
    """
    moment = datetime.now() if now is None else now
    stamped = f'{string}_{moment.strftime("%Y-%m-%d_%H%M")}'
    # Drop leading dots so '.db' does not turn into a '..db' suffix.
    suffix = ext.lstrip('.') if ext else ''
    return f'{stamped}.{suffix}' if suffix else stamped

# Timestamped names for the FAISS index file, its config file and the SQLite DB.
index_filename = append_timestamp('journal_article_index')
config_filename = append_timestamp('journal_article_config')
# Keep the DB stem in a variable so later cells can refer to the same file name.
faiss_filename = append_timestamp('faiss_document_store')
path = '../data/testing_2023-11-22/'
# `path` already ends with '/', so it is concatenated directly into the URL
# instead of repeating the directory as a second hard-coded literal.
document_store = FAISSDocumentStore( # https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstore
    sql_url=f"sqlite:///{path}{faiss_filename}.db",
    faiss_index_factory_str="Flat"
    )

In [37]:
# Embedding retriever attached to whatever `document_store` is bound when this
# cell runs; it is also registered under the current iteration number.
retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers",
    document_store=document_store,
)
retriever_dict[iteration] = retriever

In [38]:
from datetime import datetime

def append_timestamp(string, ext=None, now=None):
    """Return `string` suffixed with a minute-resolution timestamp.

    Args:
        string: Base name to decorate.
        ext: Optional file extension. A leading dot is tolerated, so both
            ``'db'`` and ``'.db'`` produce ``'<string>_<timestamp>.db'``.
            Falsy values (``None``, ``''``) add no extension.
        now: Optional ``datetime`` to format instead of the current time.
            Pass the same value to several calls so that related filenames
            share one timestamp even if the clock ticks over between calls.

    Returns:
        ``'<string>_<YYYY-MM-DD_HHMM>'`` or ``'<string>_<YYYY-MM-DD_HHMM>.<ext>'``.
    """
    moment = datetime.now() if now is None else now
    stamped = f'{string}_{moment.strftime("%Y-%m-%d_%H%M")}'
    # Drop leading dots so '.db' does not turn into a '..db' suffix.
    suffix = ext.lstrip('.') if ext else ''
    return f'{stamped}.{suffix}' if suffix else stamped

# Timestamped names for the FAISS index file, its config file and the SQLite DB.
index_filename = append_timestamp('journal_article_index')
config_filename = append_timestamp('journal_article_config')
faiss_filename = append_timestamp('faiss_document_store')
path = '../data/testing_2023-11-22/'
# `path` already ends with '/', so it is concatenated directly into the URL
# instead of repeating the directory as a second hard-coded literal.
# NOTE(review): `retriever` is constructed in the cell above this one, i.e.
# before `document_store` is rebound here - confirm the retriever is meant to
# keep pointing at the earlier store object.
document_store = FAISSDocumentStore( # https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstore
    sql_url=f"sqlite:///{path}{faiss_filename}.db",
    faiss_index_factory_str="Flat"
    )
document_store_dict[iteration] = document_store

In [39]:
# The Hugging Face token is read from the environment; os.getenv returns None
# when the variable is not set.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
hf_access_token = os.getenv('access_token_huggingface')
prompt_node = PromptNode(
    model_name,
    api_key=hf_access_token,
    max_length=256,
)
prompt_node_dict[iteration] = prompt_node

In [40]:

# Indexing pipeline: File -> TextConverter -> PreProcessor -> DocumentStore -> Retriever.
p = Pipeline()
p.add_node(component=text_converter, name="TextConverter", inputs=["File"])
p.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
p.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
p.add_node(component=retriever, name="Retriever", inputs=["DocumentStore"])

# p.add_node(component=prompt_node, name="PromptNode", inputs=["DocumentStore"])

# `path` already ends with '/'; os.path.join avoids the doubled separator that
# f"{path}/journal_article.txt" would produce.
p.run(file_paths=[os.path.join(path, "journal_article.txt")])

Converting files: 100%|██████████| 1/1 [00:00<00:00, 12.64it/s]
Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]Document 60b653a8599c46ea25daf045f69f22e9 is 14092 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 54.45docs/s]
Writing Documents:   0%|          | 0/2 [00:00<?, ?it/s]

Writing Documents: 10000it [00:00, 49016.00it/s]        


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'documents': [<Document: {'content': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nDiscussion\nThe interest in this study was spurred by the intriguing observation that while NSAIDs might have a negative impact on acute exercise responses (satellite cell activity, translational signalling and protein synthesis), previous human studies have failed to demonstrate a detrimental effect of NSAIDs on the development of muscle hypertrophy in response to chronic resistance training in young adults, possibly due to differences in drug dosage across studies. Accordingly, in the current study, healthy young men and women performed 8 weeks of supervised resistance training with concomitant high- or low-dose NSAID treatment. The major and novel findings were that 1) maximal overthe-counter doses of ibuprofen compromised resistance exercise-induced muscle hypertropy\nindependent of training mode; 2) increase

In [44]:
def print_n_docs(iteration):
    """Print the document counts seen through both registries for `iteration`.

    Reads the store registered in `document_store_dict` and the store attached
    to the retriever registered in `retriever_dict`, and prints how many
    documents `get_all_documents()` returns for each.
    """
    store_count = len(document_store_dict[iteration].get_all_documents())
    print(f'document_stores length: {store_count}\n')

    retriever_store = retriever_dict[iteration].document_store
    retriever_count = len(retriever_store.get_all_documents())
    print(f'retriever document_stores length: {retriever_count}')

print_n_docs(iteration)

document_stores length: 2

retriever document_stores length: 2


## Update embeddings before saving

In [56]:
# Compute embeddings with the retriever, then write the index and its config
# to disk under `path`.
document_store.update_embeddings(retriever)
# `path` already ends with '/'; os.path.join avoids a doubled separator.
document_store.save(
    index_path=os.path.join(path, index_filename),
    config_path=os.path.join(path, config_filename),
)

Updating Embedding:   0%|          | 0/2 [00:00<?, ? docs/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Documents Processed: 10000 docs [00:10, 986.87 docs/s]       


In [57]:
import sys


# saved_document_store
# Try to reload the store from the saved index/config files; if that raises,
# report where it failed and fall back to building a store on the SQLite file.
try:
    saved_document_store = FAISSDocumentStore.load(
        # sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
        index_path=os.path.join(path, index_filename),
        config_path=os.path.join(path, config_filename),
    )
except Exception as error:
    tb = error.__traceback__
    message = f"Error in line {tb.tb_lineno} of {tb.tb_frame.f_code.co_filename}: {str(error)}"
    print(message)
    # NOTE(review): this constructs a new FAISSDocumentStore on the existing
    # SQLite file instead of reloading the saved index - confirm that this is
    # the intended recovery path.
    saved_document_store = FAISSDocumentStore(
        sql_url=f"sqlite:///{path}{faiss_filename}.db"
        )

# Check if the DocumentStore is loaded correctly. The check runs once, outside
# the try block, so a failed assertion is reported instead of being caught by
# the broad `except` above and silently triggering the fallback.
assert saved_document_store.faiss_index_factory_str == "Flat"
    

# Summarization pipeline

In [12]:
import os
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TransformersSummarizer
from transformers import GenerationConfig
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser
from haystack.nodes import EmbeddingRetriever

In [19]:
# Per-iteration registries for the summarization experiments. Rebinding
# `prompt_node_dict` here starts it from an empty dict again.
prompt_node_dict = {}
output_dict = {}
system_message_dict = {}
prompt_dict = {}

In [9]:
# Reload the store from the saved index/config files and show its documents.
# `path` already ends with '/'; os.path.join avoids a doubled separator.
saved_document_store = FAISSDocumentStore.load(
    index_path=os.path.join(path, index_filename),
    config_path=os.path.join(path, config_filename),
)
print(type(saved_document_store))
saved_document_store.get_all_documents()

<class 'haystack.document_stores.faiss.FAISSDocumentStore'>


[<Document: {'content': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nDiscussion\nThe interest in this study was spurred by the intriguing observation that while NSAIDs might have a negative impact on acute exercise responses (satellite cell activity, translational signalling and protein synthesis), previous human studies have failed to demonstrate a detrimental effect of NSAIDs on the development of muscle hypertrophy in response to chronic resistance training in young adults, possibly due to differences in drug dosage across studies. Accordingly, in the current study, healthy young men and women performed 8 weeks of supervised resistance training with concomitant high- or low-dose NSAID treatment. The major and novel findings were that 1) maximal overthe-counter doses of ibuprofen compromised resistance exercise-induced muscle hypertropy\nindependent of training mode; 2) increases in muscle st

In [24]:
iteration = 1

# Prompt templates: https://docs.haystack.deepset.ai/docs/prompt_node#prompttemplates
# The template text references `{query}` and `{join(documents)}`.
prompt = PromptTemplate(prompt='{query}\n\n Messages: {join(documents)} \n\nSummary: ')
prompt_dict[iteration] = prompt


In [43]:
# Instruction text; it is passed to the pipeline below as the `query`.
system_message = "\nSummarize the messages to create a Frequently Asked Questions document.\n"
system_message_dict[iteration] = system_message

# Prompt node using the template defined in `prompt`.
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
hf_access_token = os.getenv('access_token_huggingface')
prompt_node = PromptNode(
    model_name,
    api_key=hf_access_token,
    max_length=256,
    default_prompt_template=prompt,
)
prompt_node_dict[iteration] = prompt_node # https://docs.haystack.deepset.ai/docs/prompt_node#in-a-pipeline

# Single-node pipeline: the query feeds straight into the prompt node.
summarize_pipeline = Pipeline()
summarize_pipeline.add_node(component=prompt_node, name="PromptNode", inputs=["Query"])

# NOTE(review): the recorded run of this cell logged a "sequence length is
# longer than the specified maximum" warning (3101 > 1024) - check whether the
# prompt is being truncated before it reaches the model.
output = summarize_pipeline.run(
    query=system_message,
    documents=saved_document_store.get_all_documents(),
)
output_dict[iteration] = output
output

Token indices sequence length is longer than the specified maximum sequence length for this model (3101 > 1024). Running this sequence through the model will result in indexing errors


{'results': ['uprofen and acetylsalicylic acid.\n\nFAQs\n\nQ: What is the study about?\nA: The study investigates the effects of high doses of anti-inflammatory drugs on muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nQ: What were the findings of the study?\nA: The study found that high doses of ibuprofen compromised resistance exercise-induced muscle hypertrophy independent of training mode, while increases in muscle strength were attenuated by ibuprofen only when training was performed with maximal all-out repetitions. The study also found that while the resistance-training intervention resulted in several muscle molecular adaptations, the only marked difference across medical treatment was a significant ibuprofen-induced downregulation of the inflammatory cytokine IL-6 mRNA, compared with an upregulation in the ASA group.\n\nQ: What is the significance of the findings?\nA: The study provides strong proof-of-principle evidence that the effect o

In [47]:
print(len(output['results']))
print(output['results'][0])

1
uprofen and acetylsalicylic acid.

FAQs

Q: What is the study about?
A: The study investigates the effects of high doses of anti-inflammatory drugs on muscle strength and hypertrophic adaptations to resistance training in young adults.

Q: What were the findings of the study?
A: The study found that high doses of ibuprofen compromised resistance exercise-induced muscle hypertrophy independent of training mode, while increases in muscle strength were attenuated by ibuprofen only when training was performed with maximal all-out repetitions. The study also found that while the resistance-training intervention resulted in several muscle molecular adaptations, the only marked difference across medical treatment was a significant ibuprofen-induced downregulation of the inflammatory cytokine IL-6 mRNA, compared with an upregulation in the ASA group.

Q: What is the significance of the findings?
A: The study provides strong proof-of-principle evidence that the effect of NSAIDs on muscle adap

# *End of Page*