#### RAG

![alt](pics/RAG.PNG)

In [73]:
from utils import Settings
from openai import OpenAI
import json
import requests
from langchain_openai import ChatOpenAI

api_key = Settings().openai_api_key

#### 1. ładowanie dokumentów do bazy wiedzy

In [74]:
from langchain_community.document_loaders import WebBaseLoader

In [75]:
loader = WebBaseLoader("https://mikulskibartosz.name/cupid-principles-in-data-engineering", )


data = loader.load()

#### 2. text splitting - podział dłuzszego tekstu na mniejsze fragmenty
- rozmiar chunku
- overlap - nakladanie sie na siebie sasiadujacych chunkow

In [76]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [77]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=300)
chunks = text_splitter.split_documents(data)

#### 3. tworzenie bazy wektorowej
- embedding chunków
- zapis do bazy

In [78]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity

In [79]:
embeddings = OpenAIEmbeddings(openai_api_key=api_key)
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings)

In [80]:
retriever = vectorstore.as_retriever(search_kwargs = {'k': 2})

#### 4. semantic search

In [81]:
results = retriever.invoke('How to use unix philosophy in data engineering')

In [82]:
results

[Document(id='6e160abd-155f-42b4-b411-dd7d85ad04dd', metadata={'title': 'CUPID properties in data engineering', 'source': 'https://mikulskibartosz.name/cupid-principles-in-data-engineering', 'language': 'en', 'description': 'Does it make sense to use SOLID principles in data engineering? What about CUPID properties in data pipelines?'}, page_content='Composable\nUnix Philosophy\nPredictable\nIdiomatic\nDomain-based\n\n\nComposable\nData pipelines are composable by definition. Every output dataset may become an input for something else.\nAt the code level, it gets harder. Rarely can we extract a function encapsulating a part of one data transformation and use it in another. However, maybe that’s a good thing. We should reuse output datasets to avoid calculating the same thing multiple times. We don’t need code reuse when we have data reuse.\n\nUnix Philosophy\nUnix Philosophy means we can build a new program on top of another. We can use one tool’s output as another application’s data s

In [83]:
query_vector = embeddings.embed_query('How to use unix philosophy in data engineering')
answer_vector = embeddings.embed_query(results[0].page_content)

In [84]:
cosine_similarity([query_vector], [answer_vector])

array([[0.86553863]])

In [85]:
results = retriever.invoke('co to jest kot')

In [86]:
results

[Document(id='3b831065-37c3-4a1b-94a4-b44ef2cbfbea', metadata={'title': 'CUPID properties in data engineering', 'source': 'https://mikulskibartosz.name/cupid-principles-in-data-engineering', 'description': 'Does it make sense to use SOLID principles in data engineering? What about CUPID properties in data pipelines?', 'language': 'en'}, page_content='Bartosz Mikulski\n\n\n\n\n\n\n\n\n\n\n\n\n\nAI Risk Prevention | I help SaaS startups with customer-facing LLMs stop AI hallucinations before they trigger public failures, compliance issues, or customer churn.\n\n\n\n\n\n\n\n\n2025 © Bartosz Mikulski | This website does NOT use cookies. Check the about page for the privacy policy | Crafted & Designed by Artem Sheludko.'),
 Document(id='40d8a793-357d-41d2-9aab-0d338a685d35', metadata={'source': 'https://mikulskibartosz.name/cupid-principles-in-data-engineering', 'description': 'Does it make sense to use SOLID principles in data engineering? What about CUPID properties in data pipelines?', '

In [87]:
query_vector = embeddings.embed_query('co to jest kot')
answer_vector = embeddings.embed_query(results[0].page_content)

In [88]:
cosine_similarity([query_vector], [answer_vector])

array([[0.72361774]])

In [None]:
results = vectorstore.similarity_search_with_score('How to use unix philosophy in data engineering', k=2) # dystans euklidesowy znormalisowany 0-1 im bliżej 0 tym mniejszy dystans i są bardziej podobne sementatycznie

In [93]:
results[0][0]
results[0][1]

0.26892274618148804

In [94]:
results[0][0]

Document(id='6e160abd-155f-42b4-b411-dd7d85ad04dd', metadata={'description': 'Does it make sense to use SOLID principles in data engineering? What about CUPID properties in data pipelines?', 'source': 'https://mikulskibartosz.name/cupid-principles-in-data-engineering', 'title': 'CUPID properties in data engineering', 'language': 'en'}, page_content='Composable\nUnix Philosophy\nPredictable\nIdiomatic\nDomain-based\n\n\nComposable\nData pipelines are composable by definition. Every output dataset may become an input for something else.\nAt the code level, it gets harder. Rarely can we extract a function encapsulating a part of one data transformation and use it in another. However, maybe that’s a good thing. We should reuse output datasets to avoid calculating the same thing multiple times. We don’t need code reuse when we have data reuse.\n\nUnix Philosophy\nUnix Philosophy means we can build a new program on top of another. We can use one tool’s output as another application’s data so

#### 5. rozszerzenie rag o funkcje pomocnicze

In [96]:
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

In [97]:
prompt = ChatPromptTemplate.from_messages([
    ('system',"""Answer user's question using the context provided below:
     
     Context:
     {context}
     """),
     MessagesPlaceholder(variable_name='question'),
     ])

model = ChatOpenAI(model='gpt-4o-mini', openai_api_key=api_key)

In [98]:
document_chain = create_stuff_documents_chain(llm=model, prompt=prompt)

In [None]:
from langchain_core.messages import HumanMessage

In [101]:
question = "How to use unix philosophy in data engineering"


documents = retriever.invoke(question)

answer = document_chain.invoke({
    "context": documents,
    "question": [HumanMessage(content=question)]
})

In [102]:
print(answer)

To use the Unix Philosophy in data engineering, you should focus on building data pipelines that are modular and composed of small, single-purpose tools that can be combined together. Here are some key principles to follow:

1. **Modularity**: Create small programs or components that perform specific tasks. Each tool should do one thing well and can be reused in different contexts or workflows.

2. **Data as Input/Output**: Use the output of one tool as the input for another. This allows you to build complex pipelines by chaining together simpler programs effectively.

3. **Text and Data Streams**: Work with data as streams of text and prefer using standard input and output formats. This provides flexibility in how data can be processed and manipulated.

4. **Interoperability**: Ensure that your tools can work with each other easily. This often means sticking to common data formats and protocols.

5. **Simplicity and Clarity**: Keep your processes simple and your code clear. Avoid unne

#### RAG jako chain 

In [103]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough # pozwala przekazac argument do chaina

parser = StrOutputParser()

prompt_str = """Answer user's question using the context provided below:
     
    Context:
    {context}
"""
prompt = ChatPromptTemplate.from_template(prompt_str)

chain =  {"context": retriever, "question": RunnablePassthrough()} | prompt | model | parser

In [104]:
chain.invoke('How to use unix philosophy in data engineering')

'What are the CUPID properties in data engineering?\n\nCUPID properties in data engineering include:\n\n1. **Composable**: Data pipelines are inherently composable, allowing output datasets to be reused as inputs for other processes. While reusing code in terms of functions may be difficult, data reuse can prevent redundant calculations.\n\n2. **Unix Philosophy**: This principle emphasizes building tools on top of other tools, using the output of one application as the input for another. It encourages following the Unix philosophy in data engineering.\n\n3. **Predictable**: Predictability is crucial, as unpredictable data pipelines are deemed useless. This can be achieved through automated testing, which helps ensure that complex code behaves as expected.\n\n4. **Idiomatic**: This property pertains to writing code in a way that follows the conventions and idioms of a given programming language, making it easier to read and maintain.\n\n5. **Domain-based**: This concept focuses on align

In [None]:
# [
#     {
#         'doc_content': 'totaj jest odpowiedz na twoje pytanie',
#         'start_line': 45,
#         'finish_line': 160,
#         'start_doc_page': 1,
#         "finish_doc_page": 2
#         "cosine_similarity": 0.87
#     },
#     {
#         'doc_content': 'totaj jest odpowiedz na twoje pytanie',
#         'start_line': 45,
#         'finish_line': 160,
#         'start_doc_page': 1,
#         "finish_doc_page": 2
#         "cosine_similarity": 0.87
#     },

# ]