In [1]:
%pip install haystack-ai trafilatura datasets txtai pandas datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = api_key  

### dataset usado

In [3]:
from datasets import load_dataset

dataset = load_dataset("rajpurkar/squad_v2")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [5]:
import pandas as pd

data_df = pd.DataFrame(dataset['train'])


In [6]:
data_df

Unnamed: 0,id,title,context,question,answers
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start'..."
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,"{'text': ['singing and dancing'], 'answer_star..."
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,"{'text': ['2003'], 'answer_start': [526]}"
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"{'text': ['Houston, Texas'], 'answer_start': [..."
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,"{'text': ['late 1990s'], 'answer_start': [276]}"
...,...,...,...,...,...
130314,5a7e070b70df9f001a875439,Matter,"The term ""matter"" is used throughout physics i...",Physics has broadly agreed on the definition o...,"{'text': [], 'answer_start': []}"
130315,5a7e070b70df9f001a87543a,Matter,"The term ""matter"" is used throughout physics i...",Who coined the term partonic matter?,"{'text': [], 'answer_start': []}"
130316,5a7e070b70df9f001a87543b,Matter,"The term ""matter"" is used throughout physics i...",What is another name for anti-matter?,"{'text': [], 'answer_start': []}"
130317,5a7e070b70df9f001a87543c,Matter,"The term ""matter"" is used throughout physics i...",Matter usually does not need to be used in con...,"{'text': [], 'answer_start': []}"


In [7]:
def consolidate_contexts(group):
    # Remover duplicatas dentro de um mesmo título
    unique_contexts = set(group)
    # Juntar os contextos em um único texto corrido
    return " ".join(unique_contexts)

contextos_df = data_df.groupby("title")["context"].apply(consolidate_contexts).reset_index()
contextos_df.rename(columns={"context": "contextos"}, inplace=True)

In [8]:
contextos_df.to_csv("contextos_por_titulo.csv", index=False)

### haystack

- rag com arquivo simples

In [None]:
import os

from haystack import Pipeline, PredefinedPipeline
from dotenv import load_dotenv
import urllib.request

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = api_key

# urllib.request.urlretrieve("https://archive.org/stream/leonardodavinci00brocrich/leonardodavinci00brocrich_djvu.txt",
#                            "davinci.txt")  

indexing_pipeline =  Pipeline.from_template(PredefinedPipeline.INDEXING)
indexing_pipeline.run(data={"sources": ['contextos_por_titulo.csv']})
# indexing_pipeline.run(data={"sources": ["davinci.txt"]})

rag_pipeline =  Pipeline.from_template(PredefinedPipeline.RAG)

query = "Which film featured Destiny's Child's first major single?"
result = rag_pipeline.run(data={"prompt_builder": {"query":query}, "text_embedder": {"text": query}})
print(result["llm"]["replies"][0])


- mesmo rag so que definindo manualmente as pipelines

In [None]:
import os
import urllib.request

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.converters import TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.writers import DocumentWriter
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = api_key
urllib.request.urlretrieve("https://archive.org/stream/leonardodavinci00brocrich/leonardodavinci00brocrich_djvu.txt",
                           "davinci.txt")    

document_store = InMemoryDocumentStore()

text_file_converter = TextFileToDocument()
cleaner = DocumentCleaner()
splitter = DocumentSplitter()
embedder = OpenAIDocumentEmbedder()
writer = DocumentWriter(document_store)

indexing_pipeline = Pipeline()
indexing_pipeline.add_component("converter", text_file_converter)
indexing_pipeline.add_component("cleaner", cleaner)
indexing_pipeline.add_component("splitter", splitter)
indexing_pipeline.add_component("embedder", embedder)
indexing_pipeline.add_component("writer", writer)

indexing_pipeline.connect("converter.documents", "cleaner.documents")
indexing_pipeline.connect("cleaner.documents", "splitter.documents")
indexing_pipeline.connect("splitter.documents", "embedder.documents")
indexing_pipeline.connect("embedder.documents", "writer.documents")
indexing_pipeline.run(data={"sources": ["davinci.txt"]})

text_embedder = OpenAITextEmbedder()
retriever = InMemoryEmbeddingRetriever(document_store)
template = """Given these documents, answer the question.
                Documents:
                {% for doc in documents %}
                    {{ doc.content }}
                {% endfor %}
                Question: {{query}}
                Answer:"""
prompt_builder = PromptBuilder(template=template)
llm = OpenAIGenerator()

rag_pipeline = Pipeline()
rag_pipeline.add_component("text_embedder", text_embedder)
rag_pipeline.add_component("retriever", retriever)
rag_pipeline.add_component("prompt_builder", prompt_builder)
rag_pipeline.add_component("llm", llm)

rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")

query = "Com quantos anos o leonardo foi de arrasta pra cima?"
result = rag_pipeline.run(data={"prompt_builder": {"query":query}, "text_embedder": {"text": query}})
print(result["llm"]["replies"][0])


### haystack usando meu dataset

In [None]:
from haystack import Pipeline, PredefinedPipeline

indexing_pipeline =  Pipeline.from_template(PredefinedPipeline.INDEXING)
indexing_pipeline.run(data={"sources": ['contextos_por_titulo.csv']})

In [71]:
rag_pipeline =  Pipeline.from_template(PredefinedPipeline.RAG)

In [None]:

query = "What sect of Buddhism is the only remaining one based in Sanskrit?"
result = rag_pipeline.run(data={"prompt_builder": {"query":query}, "text_embedder": {"text": query}})
print(result["llm"]["replies"][0])

In [None]:
import os

from haystack import Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.converters import TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.writers import DocumentWriter
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator

document_store = InMemoryDocumentStore()

text_file_converter = TextFileToDocument()
cleaner = DocumentCleaner()
splitter = DocumentSplitter()
embedder = OpenAIDocumentEmbedder()
writer = DocumentWriter(document_store)

In [None]:
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("converter", text_file_converter)
indexing_pipeline.add_component("cleaner", cleaner)
indexing_pipeline.add_component("splitter", splitter)
indexing_pipeline.add_component("embedder", embedder)
indexing_pipeline.add_component("writer", writer)

indexing_pipeline.connect("converter.documents", "cleaner.documents")
indexing_pipeline.connect("cleaner.documents", "splitter.documents")
indexing_pipeline.connect("splitter.documents", "embedder.documents")
indexing_pipeline.connect("embedder.documents", "writer.documents")
indexing_pipeline.run(data={"sources": ["contextos_por_titulo.csv"]})

In [6]:

text_embedder = OpenAITextEmbedder()
retriever = InMemoryEmbeddingRetriever(document_store)
template = """Given these documents, answer the question.
                Documents:
                {% for doc in documents %}
                    {{ doc.content }}
                {% endfor %}
                Question: {{query}}
                Answer:"""
prompt_builder = PromptBuilder(template=template)
llm = OpenAIGenerator()

In [None]:
rag_pipeline = Pipeline()
rag_pipeline.add_component("text_embedder", text_embedder)
rag_pipeline.add_component("retriever", retriever)
rag_pipeline.add_component("prompt_builder", prompt_builder)
rag_pipeline.add_component("llm", llm)

rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")

In [None]:
query = "Which film featured Destiny's Child's first major single?"
result = rag_pipeline.run(data={"prompt_builder": {"query":query}, "text_embedder": {"text": query}})
print(result["llm"]["replies"][0])

### txtai

In [10]:
import torch
print(torch.__version__)

2.6.0


In [11]:
%pip install torch torchvision torchaudio txtai autoawq

Collecting torchvision
  Downloading torchvision-0.21.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.6.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting autoawq
  Using cached autoawq-0.2.8.tar.gz (71 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mGetting requirements to build wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[23 lines of output][0m
  [31m   [0m Traceback (most recent call last):
  [31m   [0m   File [35m"/Users/vx/Desktop/coding/machineLearning/.venv/lib/python3.13/site-packages/pip/_vendor/pyproject_hooks/_in_process/_in_process.py"[0m, line [35m389[0m, in [35m<module>[0m
  [31m   [0m     [31mmain[0m[1;31m()[0m
  [31m   [0m     [31m~~~~[0m[1;31m^^[0m
  [31m   [0m   File [35m"/Users/vx/Deskto

In [12]:
from txtai import Embeddings, RAG, LLM
from txtai.pipeline import Extractor

embeddings = Embeddings(path="sentence-transformers/nli-mpnet-base-v2", content=True, autoid="uuid5")
embeddings.index(contextos_df)

In [13]:
extractor = Extractor(embeddings, "google/flan-t5-large")

Device set to use mps


In [14]:
query = "Who killed the Shakyas?"
context = lambda question: [{"query": question, "question": f"Answer the following question using the context below. \n Question: {question} \n Context:"}]

In [16]:
context_query = extractor(context(query))

In [17]:
rag = RAG(embeddings, "google/flan-t5-large", template="""
    Answer the following question using the provided context.

    Question:
    {question}

    Context:
    {context}
"""
)

Device set to use mps


In [19]:
rag("Who killed the Shakyas?")

{'answer': 'el hombre'}