In [1]:
from haystack.document_stores.in_memory import InMemoryDocumentStore

document_store = InMemoryDocumentStore()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pathlib import Path

In [5]:
!tree ../data

[1;36m../data[0m
└── [1;36mrick_and_morty_episodes[0m
    ├── README.md
    ├── season_1.txt
    ├── season_2.txt
    ├── season_3.txt
    ├── season_4.txt
    ├── season_5.txt
    ├── season_6.txt
    └── season_7.txt

2 directories, 8 files


In [8]:
doc_files = list(Path("../data").glob("**/*.txt"))
doc_files

[PosixPath('../data/rick_and_morty_episodes/season_2.txt'),
 PosixPath('../data/rick_and_morty_episodes/season_3.txt'),
 PosixPath('../data/rick_and_morty_episodes/season_1.txt'),
 PosixPath('../data/rick_and_morty_episodes/season_4.txt'),
 PosixPath('../data/rick_and_morty_episodes/season_5.txt'),
 PosixPath('../data/rick_and_morty_episodes/season_7.txt'),
 PosixPath('../data/rick_and_morty_episodes/season_6.txt')]

In [9]:
doc_content = doc_files[0].read_text()

In [10]:
from haystack import Document 

In [12]:
Document?

[0;31mInit signature:[0m [0mDocument[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Base data class containing some data to be queried.

Can contain text snippets, tables, and file paths to images or audios. Documents can be sorted by score and saved
to/from dictionary and JSON.

:param id: Unique identifier for the document. When not set, it's generated based on the Document fields' values.
:param content: Text of the document, if the document contains text.
:param dataframe: Pandas dataframe with the document's content, if the document contains tabular data.
:param blob: Binary data associated with the document, if the document has any binary data associated with it.
:param meta: Additional custom metadata for the document. Must be JSON-serializable.
:param score: Score of the document. Used for ranking, usually assigned by retrievers.
:param embedding: dense vector representation of 

In [13]:
docs = [Document(content=file.read_text()) for file in doc_files]


In [14]:
docs

[Document(id=f8983c0f698b86be3030cf14350638ec6c8cf91144ddee62e5030997e74bcadf, content: 'Season 2, Episode 1: "A Rickle in Time"	Wes Archer	Matt Roller	July 26, 2015	RAM-201	2.12[11]
 Contin...'),
 Document(id=ef2c3018a40de20870c01561bc688afdba94b96ee2c328720211087a58e1f01c, content: 'Season 3, Episode 1: "The Rickshank Rickdemption"	Juan Meza-León	Mike McMahan	April 1, 2017	RAM-301	...'),
 Document(id=741d1c671c90dc09293cd1ec37336e10a2ffabcd6f5726ef45b77feb5e64b470, content: 'Season 1, Episode 1: "Pilot"	Justin Roiland	Dan Harmon & Justin Roiland	December 2, 2013	RAM-001	1.1...'),
 Document(id=aa45160f26471d1390155cb9d9cfbee8add2e7c78e531364de9fe4c0f555be00, content: 'Season 4, Episode 1: "Edge of Tomorty: Rick Die Rickpeat"	Erica Hayes	Mike McMahan	November 10, 2019...'),
 Document(id=caa40bd1111646f7091ce50c531d94c42ecbf6f908c998d7e79800c3294d8aee, content: 'Season 5, Episode 1: "Mort Dinner Rick Andre"	Jacob Hair	Jeff Loveness	June 20, 2021	RAM-501	1.30[9]...'),
 Document(id=940dca

In [15]:
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
doc_embedder.warm_up()




In [16]:
ls ~/.cache/huggingface/hub | grep sentence-transformers

[1m[36mmodels--sentence-transformers--all-MiniLM-L6-v2[m[m/


In [17]:
docs_with_embeddings = doc_embedder.run(docs)
document_store.write_documents(docs_with_embeddings["documents"])

Batches: 100%|██████████| 1/1 [00:02<00:00,  2.57s/it]


7

In [18]:
document_store

<haystack.document_stores.in_memory.document_store.InMemoryDocumentStore at 0x162e626f0>

> Notice that you used sentence-transformers/all-MiniLM-L6-v2 model to create embeddings for your documents before. This is why you need to use the same model to embed the user queries.

In [19]:
from haystack.components.embedders import SentenceTransformersTextEmbedder

text_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")

In [20]:
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever

retriever = InMemoryEmbeddingRetriever(document_store)


In [21]:
from haystack.components.builders import PromptBuilder

template = """
Given the following information, answer the question.

Context:
{% for document in documents %}
    {{ document.content }}
{% endfor %}

Question: {{question}}
Answer:
"""

prompt_builder = PromptBuilder(template=template)

In [22]:
prompt_builder.run(documents=docs, question="Who is the best character?")



In [23]:
import os
from getpass import getpass
from haystack.components.generators import OpenAIGenerator

if "OPENAI_API_KEY" not in os.environ:
    raise ValueError("No OPENAI_API_KEY env variable")
    # os.environ["OPENAI_API_KEY"] = getpass("Enter OpenAI API key:")

generator = OpenAIGenerator(model="gpt-4o-mini")

In [24]:
from haystack import Pipeline

basic_rag_pipeline = Pipeline()
# Add components to your pipeline
basic_rag_pipeline.add_component("text_embedder", text_embedder)
basic_rag_pipeline.add_component("retriever", retriever)
basic_rag_pipeline.add_component("prompt_builder", prompt_builder)
basic_rag_pipeline.add_component("llm", generator)

# Now, connect the components to each other
basic_rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
basic_rag_pipeline.connect("retriever", "prompt_builder.documents")
basic_rag_pipeline.connect("prompt_builder", "llm")


<haystack.core.pipeline.pipeline.Pipeline object at 0x34ca5c4a0>
🚅 Components
  - text_embedder: SentenceTransformersTextEmbedder
  - retriever: InMemoryEmbeddingRetriever
  - prompt_builder: PromptBuilder
  - llm: OpenAIGenerator
🛤️ Connections
  - text_embedder.embedding -> retriever.query_embedding (List[float])
  - retriever.documents -> prompt_builder.documents (List[Document])
  - prompt_builder.prompt -> llm.prompt (str)

In [25]:
question = "What is a Meeseeks Box?"

response = basic_rag_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])


Batches: 100%|██████████| 1/1 [00:00<00:00,  2.17it/s]


A Meeseeks Box is a fictional device in the animated television show *Rick and Morty*. It allows users to summon beings known as Meeseeks, blue creatures that are created specifically to fulfill a single request or task for the user. Once they complete their task, the Meeseeks dissolve and cease to exist. The concept serves as a commentary on the nature of existence and purpose, as the Meeseeks become increasingly frustrated and desperate if their assigned tasks are overly complicated or difficult to achieve. The Meeseeks Box was introduced in Season 1, Episode 5, titled "Meeseeks and Destroy."


In [26]:
question = """
Give me a list of all the episodes about Jessica, in JSON format

e.g.

[
    {
        "season_number": 1,
        "episode_number": 3,
        "episode_name": "...",
        "episode_description": "..."
    },
    ...
]
""".strip()

response = basic_rag_pipeline.run({"text_embedder": {"text": question}, "prompt_builder": {"question": question}})

print(response["llm"]["replies"][0])

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s]


```json
[
    {
        "season_number": 1,
        "episode_number": 1,
        "episode_name": "Pilot",
        "episode_description": "Rick takes Morty to another dimension to collect seeds of 'Mega Trees', which Morty is forced to hide in his own rectum to get past intergalactic customs. However, their cover is blown, leading to chaos and a potential love interest for Morty with Jessica."
    },
    {
        "season_number": 1,
        "episode_number": 24,
        "episode_name": "Total Rickall",
        "episode_description": "The family can't distinguish real people from aliens, resulting in the appearance of many characters, including Jessica, who is initially believed to be real but is eventually deemed a parasite."
    },
    {
        "season_number": 2,
        "episode_number": 1,
        "episode_name": "A Rickle in Time",
        "episode_description": "Rick, Morty, and Summer split into separate timelines. Jessica appears in one of the timelines as Morty's love interes

In [27]:
response

{'llm': {'replies': ['```json\n[\n    {\n        "season_number": 1,\n        "episode_number": 1,\n        "episode_name": "Pilot",\n        "episode_description": "Rick takes Morty to another dimension to collect seeds of \'Mega Trees\', which Morty is forced to hide in his own rectum to get past intergalactic customs. However, their cover is blown, leading to chaos and a potential love interest for Morty with Jessica."\n    },\n    {\n        "season_number": 1,\n        "episode_number": 24,\n        "episode_name": "Total Rickall",\n        "episode_description": "The family can\'t distinguish real people from aliens, resulting in the appearance of many characters, including Jessica, who is initially believed to be real but is eventually deemed a parasite."\n    },\n    {\n        "season_number": 2,\n        "episode_number": 1,\n        "episode_name": "A Rickle in Time",\n        "episode_description": "Rick, Morty, and Summer split into separate timelines. Jessica appears in o

In [28]:
import tiktoken
tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")

prompt = prompt_builder.run(documents=docs, question=question)["prompt"]
tokens = tokenizer.encode(prompt)
len(tokens)

23130

In [29]:
prompt

