# Copied from [previous notebook](silvia%202023-11-20%20Haystack%20experiment.ipynb)

In [13]:
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TransformersSummarizer

In [14]:
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)


In [18]:

text_converter = TextConverter() # https://docs.haystack.deepset.ai/docs/file_converters

preprocessor = PreProcessor( # https://docs.haystack.deepset.ai/docs/preprocessor
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
  	remove_substrings=None,
    split_by='word', # Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0,
  	max_chars_check = 10000
)

In [19]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = append_timestamp('journal_article_index')
config_filename = append_timestamp('journal_article_config')
path = '../data/testing_2023-11-22/'
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
document_store.save(index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}')

In [None]:


p = Pipeline()
p.add_node(component=text_converter, name="TextConverter", inputs=["File"])
p.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
p.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

p.run(file_paths=[f"{path}/journal_article.txt"])

# Failed attempts

## Iteration 1

In [None]:
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TransformersSummarizer

In [20]:

text_converter = TextConverter() # https://docs.haystack.deepset.ai/docs/file_converters

preprocessor = PreProcessor( # https://docs.haystack.deepset.ai/docs/preprocessor
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
  	remove_substrings=None,
    split_by='word', # Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0,
  	max_chars_check = 10000
)

In [22]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = append_timestamp('journal_article_index')
config_filename = append_timestamp('journal_article_config')
path = '../data/testing_2023-11-22/'
document_store = FAISSDocumentStore( # https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstore
    sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
    faiss_index_factory_str="Flat"
    )

In [23]:

document_store.save(index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}')

In [24]:


p = Pipeline()
p.add_node(component=text_converter, name="TextConverter", inputs=["File"])
p.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
p.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

p.run(file_paths=[f"{path}/journal_article.txt"])

Converting files: 100%|██████████| 1/1 [00:00<00:00, 53.96it/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 53.00docs/s]
Writing Documents: 10000it [00:00, 105976.71it/s]        


{'documents': [<Document: {'content': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nDiscussion\nThe interest in this study was spurred by the intriguing observation that while NSAIDs might have a negative impact on acute exercise responses (satellite cell activity, translational signalling and protein synthesis), previous human studies have failed to demonstrate a detrimental effect of NSAIDs on the development of muscle hypertrophy in response to chronic resistance training in young adults, possibly due to differences in drug dosage across studies. Accordingly, in the current study, healthy young men and women performed 8 weeks of supervised resistance training with concomitant high- or low-dose NSAID treatment. The major and novel findings were that 1) maximal overthe-counter doses of ibuprofen compromised resistance exercise-induced muscle hypertropy\nindependent of training mode; 2) increase

### Load the documents

### 1.1

In [27]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = 'journal_article_index_2023-11-22_0711'
config_filename = 'journal_article_config_2023-11-22_0711'
path = '../data/testing_2023-11-22'
# saved_document_store
saved_document_store = FAISSDocumentStore(
    sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
    faiss_index_path=f'{path}/{index_filename}', faiss_config_path=f'{path}/{config_filename}'
    )

# Check if the DocumentStore is loaded correctly
assert saved_document_store.faiss_index_factory_str == "Flat"

ValueError: If faiss_index_path is passed, no other params besides faiss_config_path are allowed.

### 1.2

In [28]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = 'journal_article_index_2023-11-22_0711'
config_filename = 'journal_article_config_2023-11-22_0711'
path = '../data/testing_2023-11-22'
# saved_document_store
saved_document_store = FAISSDocumentStore(
    sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
    # faiss_index_path=f'{path}/{index_filename}', faiss_config_path=f'{path}/{config_filename}'
    )

# Check if the DocumentStore is loaded correctly
assert saved_document_store.faiss_index_factory_str == "Flat"

ValueError: The number of documents in the SQL database (12) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

### 1.3

In [29]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = 'journal_article_index_2023-11-22_0711'
config_filename = 'journal_article_config_2023-11-22_0711'
path = '../data/testing_2023-11-22'
# saved_document_store
saved_document_store = FAISSDocumentStore(
    # sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
    faiss_index_path=f'{path}/{index_filename}', faiss_config_path=f'{path}/{config_filename}'
    )

# Check if the DocumentStore is loaded correctly
assert saved_document_store.faiss_index_factory_str == "Flat"

ValueError: The number of documents in the SQL database (12) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

## Iteration 2 Save document_store after running pipeline

In [None]:
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TransformersSummarizer

In [35]:

text_converter = TextConverter() # https://docs.haystack.deepset.ai/docs/file_converters

preprocessor = PreProcessor( # https://docs.haystack.deepset.ai/docs/preprocessor
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
  	remove_substrings=None,
    split_by='word', # Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0,
  	max_chars_check = 10000
)

In [36]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = append_timestamp('journal_article_index')
config_filename = append_timestamp('journal_article_config')
path = '../data/testing_2023-11-22/'
document_store = FAISSDocumentStore( # https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstore
    sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
    faiss_index_factory_str="Flat"
    )

In [37]:

p = Pipeline()
p.add_node(component=text_converter, name="TextConverter", inputs=["File"])
p.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
p.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])

p.run(file_paths=[f"{path}/journal_article.txt"])

Converting files: 100%|██████████| 1/1 [00:00<00:00, 139.08it/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 161.84docs/s]
Writing Documents: 10000it [00:00, 160202.89it/s]        


{'documents': [<Document: {'content': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nDiscussion\nThe interest in this study was spurred by the intriguing observation that while NSAIDs might have a negative impact on acute exercise responses (satellite cell activity, translational signalling and protein synthesis), previous human studies have failed to demonstrate a detrimental effect of NSAIDs on the development of muscle hypertrophy in response to chronic resistance training in young adults, possibly due to differences in drug dosage across studies. Accordingly, in the current study, healthy young men and women performed 8 weeks of supervised resistance training with concomitant high- or low-dose NSAID treatment. The major and novel findings were that 1) maximal overthe-counter doses of ibuprofen compromised resistance exercise-induced muscle hypertropy\nindependent of training mode; 2) increase

In [38]:

document_store.save(index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}')

In [47]:
document_store.get_all_documents()

[<Document: {'content': 'Specifically, in the present cohort of young subjects, it is possible that blunting of important inflammatory processes, as reflected by the downregulated IL-6 gene expression, contributed to the attenuated hypertrophic response because previous studies have shown that induction of IL-6, and also other inflammatory and proteolytic factors, has an important role in muscle regenerative processes during hypertrophic conditions.32,33 In contrast, the role of inflammatory processes may be different in the elderly where an ibuprofen-induced downregulation of IL-6 could reduce chronic low-grade inflammation and thereby restore the blunted anabolic response to resistance exercise typically seen in aged populations.34 Thus, based on the current data and given the complex regulation of mRNA and protein levels,35,36 future studies should directly compare the response to resistance exercise and NSAID treatment in young vs. old populations as well as incorporate better time

In [48]:
document_store.get_all_documents()[0]

<Document: {'content': 'Specifically, in the present cohort of young subjects, it is possible that blunting of important inflammatory processes, as reflected by the downregulated IL-6 gene expression, contributed to the attenuated hypertrophic response because previous studies have shown that induction of IL-6, and also other inflammatory and proteolytic factors, has an important role in muscle regenerative processes during hypertrophic conditions.32,33 In contrast, the role of inflammatory processes may be different in the elderly where an ibuprofen-induced downregulation of IL-6 could reduce chronic low-grade inflammation and thereby restore the blunted anabolic response to resistance exercise typically seen in aged populations.34 Thus, based on the current data and given the complex regulation of mRNA and protein levels,35,36 future studies should directly compare the response to resistance exercise and NSAID treatment in young vs. old populations as well as incorporate better time 

## Load saved documents

In [None]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = 'journal_article_index_2023-11-22_0732'
config_filename = 'journal_article_config_2023-11-22_0732'
path = '../data/testing_2023-11-22'
# saved_document_store
saved_document_store = FAISSDocumentStore.load(
    # sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
    index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}'
    )

# Check if the DocumentStore is loaded correctly
assert saved_document_store.faiss_index_factory_str == "Flat"

ValueError: The number of documents in the SQL database (12) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

In [43]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = 'journal_article_index_2023-11-22_0732'
config_filename = 'journal_article_config_2023-11-22_0732'
path = '../data/testing_2023-11-22'
# saved_document_store
saved_document_store = FAISSDocumentStore(
    sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
    # faiss_index_path=f'{path}/{index_filename}', faiss_config_path=f'{path}/{config_filename}'
    )

# Check if the DocumentStore is loaded correctly
assert saved_document_store.faiss_index_factory_str == "Flat"

ValueError: The number of documents in the SQL database (12) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

In [44]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = 'journal_article_index_2023-11-22_0732'
config_filename = 'journal_article_config_2023-11-22_0732'
path = '../data/testing_2023-11-22'
# saved_document_store
saved_document_store = FAISSDocumentStore(
    # sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
    faiss_index_path=f'{path}/{index_filename}', faiss_config_path=f'{path}/{config_filename}'
    )

# Check if the DocumentStore is loaded correctly
assert saved_document_store.faiss_index_factory_str == "Flat"

ValueError: The number of documents in the SQL database (12) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

## Iteration 3 add summarize node

In [1]:
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TransformersSummarizer

In [2]:

text_converter = TextConverter() # https://docs.haystack.deepset.ai/docs/file_converters

preprocessor = PreProcessor( # https://docs.haystack.deepset.ai/docs/preprocessor
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
  	remove_substrings=None,
    split_by='word', # Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0,
  	max_chars_check = 10000
)

In [3]:
from datetime import datetime

def append_timestamp(string, ext=None):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    if ext:
        return f'{string}_{timestamp}.{ext}'
    else:
        return f'{string}_{timestamp}' 

index_filename = append_timestamp('journal_article_index')
config_filename = append_timestamp('journal_article_config')
path = '../data/testing_2023-11-22/'
document_store = FAISSDocumentStore( # https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstore
    sql_url=f"sqlite:///../data/testing_2023-11-22/{append_timestamp('faiss_document_store')}.db",
    faiss_index_factory_str="Flat"
    )

In [4]:
summarizer = TransformersSummarizer(model_name_or_path="google/pegasus-xsum") # https://docs.haystack.deepset.ai/docs/summarizer

Downloading config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


: 

In [None]:

p = Pipeline()
p.add_node(component=text_converter, name="TextConverter", inputs=["File"])
p.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
p.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
p.add_node(component=)

p.run(file_paths=[f"{path}/journal_article.txt"])

Converting files: 100%|██████████| 1/1 [00:00<00:00, 139.08it/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 161.84docs/s]
Writing Documents: 10000it [00:00, 160202.89it/s]        


{'documents': [<Document: {'content': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nDiscussion\nThe interest in this study was spurred by the intriguing observation that while NSAIDs might have a negative impact on acute exercise responses (satellite cell activity, translational signalling and protein synthesis), previous human studies have failed to demonstrate a detrimental effect of NSAIDs on the development of muscle hypertrophy in response to chronic resistance training in young adults, possibly due to differences in drug dosage across studies. Accordingly, in the current study, healthy young men and women performed 8 weeks of supervised resistance training with concomitant high- or low-dose NSAID treatment. The major and novel findings were that 1) maximal overthe-counter doses of ibuprofen compromised resistance exercise-induced muscle hypertropy\nindependent of training mode; 2) increase

In [None]:

document_store.save(index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}')

## Iteration 4

In [6]:
import os
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TransformersSummarizer
from transformers import GenerationConfig
from haystack.nodes import PromptNode


In [7]:

text_converter = TextConverter() # https://docs.haystack.deepset.ai/docs/file_converters

preprocessor = PreProcessor( # https://docs.haystack.deepset.ai/docs/preprocessor
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
  	remove_substrings=None,
    split_by='word', # Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0,
  	max_chars_check = 10000
)

In [8]:
from datetime import datetime

def append_timestamp(string, ext=None):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    if ext:
        return f'{string}_{timestamp}.{ext}'
    else:
        return f'{string}_{timestamp}' 

index_filename = append_timestamp('journal_article_index')
config_filename = append_timestamp('journal_article_config')
path = '../data/testing_2023-11-22/'
document_store = FAISSDocumentStore( # https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstore
    sql_url=f"sqlite:///../data/testing_2023-11-22/{append_timestamp('faiss_document_store')}.db",
    faiss_index_factory_str="Flat"
    )

In [5]:
hf_access_token = os.getenv('access_token_huggingface')
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
prompt_node = PromptNode(model_name, api_key=hf_access_token, max_length=256)

Downloading tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [9]:

p = Pipeline()
p.add_node(component=text_converter, name="TextConverter", inputs=["File"])
p.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
p.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
p.add_node(component=prompt_node, name="PromptNode", inputs=["DocumentStore"])

p.run(file_paths=[f"{path}/journal_article.txt"])

Converting files: 100%|██████████| 1/1 [00:00<00:00, 19.63it/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 20.33docs/s]
Writing Documents: 10000it [00:00, 86681.56it/s]         


{'results': [],
 'invocation_context': {'file_paths': ['../data/testing_2023-11-22//journal_article.txt'],
  'documents': [<Document: {'content': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nDiscussion\nThe interest in this study was spurred by the intriguing observation that while NSAIDs might have a negative impact on acute exercise responses (satellite cell activity, translational signalling and protein synthesis), previous human studies have failed to demonstrate a detrimental effect of NSAIDs on the development of muscle hypertrophy in response to chronic resistance training in young adults, possibly due to differences in drug dosage across studies. Accordingly, in the current study, healthy young men and women performed 8 weeks of supervised resistance training with concomitant high- or low-dose NSAID treatment. The major and novel findings were that 1) maximal overthe-counter doses of ib

In [10]:

document_store.save(index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}')

In [11]:
from datetime import datetime

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = 'journal_article_index_2023-11-22_1458'
config_filename = 'journal_article_config_2023-11-22_1458'
path = '../data/testing_2023-11-22'
# saved_document_store
saved_document_store = FAISSDocumentStore.load(
    # sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
    index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}'
    )

# Check if the DocumentStore is loaded correctly
assert saved_document_store.faiss_index_factory_str == "Flat"

ValueError: The number of documents in the SQL database (12) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

## Iteration 5

In [13]:
import os
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TransformersSummarizer
from transformers import GenerationConfig
from haystack.nodes import PromptNode
from haystack.nodes import EmbeddingRetriever

In [14]:

text_converter = TextConverter() # https://docs.haystack.deepset.ai/docs/file_converters

preprocessor = PreProcessor( # https://docs.haystack.deepset.ai/docs/preprocessor
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
  	remove_substrings=None,
    split_by=None, # Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0,
  	max_chars_check = 10000
)

In [15]:
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers"
)

Downloading .gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading 1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading README.md:   0%|          | 0.00/8.66k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [16]:
from datetime import datetime

def append_timestamp(string, ext=None):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    if ext:
        return f'{string}_{timestamp}.{ext}'
    else:
        return f'{string}_{timestamp}' 

index_filename = append_timestamp('journal_article_index')
config_filename = append_timestamp('journal_article_config')
path = '../data/testing_2023-11-22/'
document_store = FAISSDocumentStore( # https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstore
    sql_url=f"sqlite:///../data/testing_2023-11-22/{append_timestamp('faiss_document_store')}.db",
    faiss_index_factory_str="Flat"
    )

In [None]:
hf_access_token = os.getenv('access_token_huggingface')
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
prompt_node = PromptNode(model_name, api_key=hf_access_token, max_length=256)

Downloading tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

In [17]:

p = Pipeline()
p.add_node(component=text_converter, name="TextConverter", inputs=["File"])
p.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
p.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
p.add_node(component=retriever, name="Retriever", inputs=["DocumentStore"])

# p.add_node(component=prompt_node, name="PromptNode", inputs=["DocumentStore"])

p.run(file_paths=[f"{path}/journal_article.txt"])

Converting files:   0%|          | 0/1 [00:00<?, ?it/s]

Converting files: 100%|██████████| 1/1 [00:00<00:00, 24.18it/s]
Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]Document 60b653a8599c46ea25daf045f69f22e9 is 14092 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 182.03docs/s]
Writing Documents: 10000it [00:00, 130512.43it/s]       


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'documents': [<Document: {'content': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nDiscussion\nThe interest in this study was spurred by the intriguing observation that while NSAIDs might have a negative impact on acute exercise responses (satellite cell activity, translational signalling and protein synthesis), previous human studies have failed to demonstrate a detrimental effect of NSAIDs on the development of muscle hypertrophy in response to chronic resistance training in young adults, possibly due to differences in drug dosage across studies. Accordingly, in the current study, healthy young men and women performed 8 weeks of supervised resistance training with concomitant high- or low-dose NSAID treatment. The major and novel findings were that 1) maximal overthe-counter doses of ibuprofen compromised resistance exercise-induced muscle hypertropy\nindependent of training mode; 2) increase

In [18]:

document_store.save(index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}')

In [22]:
from datetime import datetime
import sys

def append_timestamp(string):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    return f'{string}_{timestamp}' 

index_filename = 'journal_article_index_2023-11-22_1519'
config_filename = 'journal_article_config_2023-11-22_1519'
path = '../data/testing_2023-11-22'
# saved_document_store
try:
    saved_document_store = FAISSDocumentStore.load(
        # sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
        index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}'
        )
    # Check if the DocumentStore is loaded correctly
    assert saved_document_store.faiss_index_factory_str == "Flat"
except Exception as error:
    exc_type, exc_obj, tb = sys.exc_info()
    f = tb.tb_frame
    lineno = tb.tb_lineno
    filename = f.f_code.co_filename
    message = f"Error in line {lineno} of {filename}: {str(error)}"
    print(message)
    saved_document_store = FAISSDocumentStore(
        sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store_2023-11-22_1519.db"
        )
    # Check if the DocumentStore is loaded correctly
    assert saved_document_store.faiss_index_factory_str == "Flat"
    

Error in line 13 of /tmp/ipykernel_15158/2990856550.py: The number of documents in the SQL database (2) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.


ValueError: The number of documents in the SQL database (2) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

In [27]:
print(document_store.embedding_dim)
document_store

768


<haystack.document_stores.faiss.FAISSDocumentStore at 0x7fadcef16140>

In [33]:
document_store.get_all_documents()

[<Document: {'content': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nDiscussion\nThe interest in this study was spurred by the intriguing observation that while NSAIDs might have a negative impact on acute exercise responses (satellite cell activity, translational signalling and protein synthesis), previous human studies have failed to demonstrate a detrimental effect of NSAIDs on the development of muscle hypertrophy in response to chronic resistance training in young adults, possibly due to differences in drug dosage across studies. Accordingly, in the current study, healthy young men and women performed 8 weeks of supervised resistance training with concomitant high- or low-dose NSAID treatment. The major and novel findings were that 1) maximal overthe-counter doses of ibuprofen compromised resistance exercise-induced muscle hypertropy\nindependent of training mode; 2) increases in muscle st

In [32]:
retriever.document_store.get_all_documents()

[<Document: {'content': 'Specifically, in the present cohort of young subjects, it is possible that blunting of important inflammatory processes, as reflected by the downregulated IL-6 gene expression, contributed to the attenuated hypertrophic response because previous studies have shown that induction of IL-6, and also other inflammatory and proteolytic factors, has an important role in muscle regenerative processes during hypertrophic conditions.32,33 In contrast, the role of inflammatory processes may be different in the elderly where an ibuprofen-induced downregulation of IL-6 could reduce chronic low-grade inflammation and thereby restore the blunted anabolic response to resistance exercise typically seen in aged populations.34 Thus, based on the current data and given the complex regulation of mRNA and protein levels,35,36 future studies should directly compare the response to resistance exercise and NSAID treatment in young vs. old populations as well as incorporate better time

# Success: Able to save and load document store - iteration 6

In [None]:
import os
from haystack.pipelines import Pipeline
from haystack.nodes import TextConverter
from haystack.nodes import PreProcessor
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import TransformersSummarizer
from transformers import GenerationConfig
from haystack.nodes import PromptNode
from haystack.nodes import EmbeddingRetriever

In [35]:
retriever_dict = dict()
text_converter_dict = dict() 
preprocessor_dict = dict()
document_store_dict = dict()
prompt_node_dict = dict()

In [36]:
iteration = 6
text_converter = TextConverter() # https://docs.haystack.deepset.ai/docs/file_converters
text_converter_dict[iteration] = text_converter

preprocessor = PreProcessor( # https://docs.haystack.deepset.ai/docs/preprocessor
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
  	remove_substrings=None,
    split_by=None, # Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=0,
  	max_chars_check = 10000
)
preprocessor_dict[iteration] = preprocessor

In [37]:
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1",
    model_format="sentence_transformers"
)
retriever_dict[iteration] = retriever

In [38]:
from datetime import datetime

def append_timestamp(string, ext=None):
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M")
    if ext:
        return f'{string}_{timestamp}.{ext}'
    else:
        return f'{string}_{timestamp}' 

index_filename = append_timestamp('journal_article_index')
config_filename = append_timestamp('journal_article_config')
faiss_filename = append_timestamp('faiss_document_store')
path = '../data/testing_2023-11-22/'
document_store = FAISSDocumentStore( # https://docs.haystack.deepset.ai/reference/document-store-api#faissdocumentstore
    sql_url=f"sqlite:///../data/testing_2023-11-22/{faiss_filename}.db",
    faiss_index_factory_str="Flat"
    )
document_store_dict[iteration] = document_store

In [39]:
hf_access_token = os.getenv('access_token_huggingface')
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
prompt_node = PromptNode(model_name, api_key=hf_access_token, max_length=256)
prompt_node_dict[iteration] = prompt_node

In [40]:

p = Pipeline()
p.add_node(component=text_converter, name="TextConverter", inputs=["File"])
p.add_node(component=preprocessor, name="PreProcessor", inputs=["TextConverter"])
p.add_node(component=document_store, name="DocumentStore", inputs=["PreProcessor"])
p.add_node(component=retriever, name="Retriever", inputs=["DocumentStore"])

# p.add_node(component=prompt_node, name="PromptNode", inputs=["DocumentStore"])

p.run(file_paths=[f"{path}/journal_article.txt"])

Converting files: 100%|██████████| 1/1 [00:00<00:00, 12.64it/s]
Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]Document 60b653a8599c46ea25daf045f69f22e9 is 14092 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time. This document will be now hard-split at 10000 chars recursively.
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 54.45docs/s]
Writing Documents:   0%|          | 0/2 [00:00<?, ?it/s]

Writing Documents: 10000it [00:00, 49016.00it/s]        


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

{'documents': [<Document: {'content': 'High doses of anti-inflammatory drugs compromise muscle strength and hypertrophic adaptations to resistance training in young adults.\n\nDiscussion\nThe interest in this study was spurred by the intriguing observation that while NSAIDs might have a negative impact on acute exercise responses (satellite cell activity, translational signalling and protein synthesis), previous human studies have failed to demonstrate a detrimental effect of NSAIDs on the development of muscle hypertrophy in response to chronic resistance training in young adults, possibly due to differences in drug dosage across studies. Accordingly, in the current study, healthy young men and women performed 8 weeks of supervised resistance training with concomitant high- or low-dose NSAID treatment. The major and novel findings were that 1) maximal overthe-counter doses of ibuprofen compromised resistance exercise-induced muscle hypertropy\nindependent of training mode; 2) increase

In [44]:
def print_n_docs(iteration):
    print(f'document_stores length: {len(document_store_dict[iteration].get_all_documents())}\n')
    print(f'retriever document_stores length: {len(retriever_dict[iteration].document_store.get_all_documents())}')
    
print_n_docs(iteration)

document_stores length: 2

retriever document_stores length: 2


In [45]:

document_store.save(index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}')

In [48]:
import sys


# saved_document_store
try:
    saved_document_store = FAISSDocumentStore.load(
        # sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
        index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}'
        )
    # Check if the DocumentStore is loaded correctly
    assert saved_document_store.faiss_index_factory_str == "Flat"
except Exception as error:
    exc_type, exc_obj, tb = sys.exc_info()
    f = tb.tb_frame
    lineno = tb.tb_lineno
    filename = f.f_code.co_filename
    message = f"Error in line {lineno} of {filename}: {str(error)}"
    print(message)
    saved_document_store = FAISSDocumentStore(
        sql_url=f"sqlite:///../data/testing_2023-11-22/{faiss_filename}.db"
        )
    # Check if the DocumentStore is loaded correctly
    assert saved_document_store.faiss_index_factory_str == "Flat"
    

Error in line 6 of /tmp/ipykernel_15158/2061927404.py: The number of documents in the SQL database (2) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.


ValueError: The number of documents in the SQL database (2) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

In [54]:
saved_document_store = FAISSDocumentStore.load(
index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}'
)
# Check if the DocumentStore is loaded correctly
assert saved_document_store.faiss_index_factory_str == "Flat"

ValueError: The number of documents in the SQL database (2) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

In [55]:
saved_document_store = FAISSDocumentStore.load(
        # sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
        index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}'
        )

ValueError: The number of documents in the SQL database (2) doesn't match the number of embeddings in FAISS (0). Make sure your FAISS configuration file points to the same database that you used when you saved the original index.

## Update embeddings before saving

In [56]:
document_store.update_embeddings(retriever)
document_store.save(index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}')

Updating Embedding:   0%|          | 0/2 [00:00<?, ? docs/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Documents Processed: 10000 docs [00:10, 986.87 docs/s]       


In [57]:
import sys


# saved_document_store
try:
    saved_document_store = FAISSDocumentStore.load(
        # sql_url="sqlite:///../data/testing_2023-11-22/faiss_document_store.db",
        index_path=f'{path}/{index_filename}', config_path=f'{path}/{config_filename}'
        )
    # Check if the DocumentStore is loaded correctly
    assert saved_document_store.faiss_index_factory_str == "Flat"
except Exception as error:
    exc_type, exc_obj, tb = sys.exc_info()
    f = tb.tb_frame
    lineno = tb.tb_lineno
    filename = f.f_code.co_filename
    message = f"Error in line {lineno} of {filename}: {str(error)}"
    print(message)
    saved_document_store = FAISSDocumentStore(
        sql_url=f"sqlite:///../data/testing_2023-11-22/{faiss_filename}.db"
        )
    # Check if the DocumentStore is loaded correctly
    assert saved_document_store.faiss_index_factory_str == "Flat"
    

In [59]:
saved_document_store.faiss_index_factory_str

'Flat'

# 

# *End of Page*