# Visualize your RAG Data - EDA for Retrieval-Augmented Generation
---

https://itnext.io/visualize-your-rag-data-eda-for-retrieval-augmented-generation-0701ee98768f

https://github.com/Renumics/rag-demo/blob/main/notebooks/visualize_rag_tutorial.ipynb

## How to use UMAP dimensionality reduction for Embeddings to show Questions, Answers and their relationships to source documents with OpenAI, Langchain and ChromaDB

In [9]:
from pprint import pprint

### Prepare Documents

In [1]:
# Create embeddings model and vector store
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma

embeddings_model = OpenAIEmbeddings(model="text-embedding-ada-002")
docs_vectorstore = Chroma(
    collection_name="docs_store",
    embedding_function=embeddings_model,
    persist_directory="docs-db",
)

In [2]:
# Load documents with the LangChain document loader
from langchain_community.document_loaders import BSHTMLLoader, DirectoryLoader

loader = DirectoryLoader(
    "../../../docs/f1-wiki",
    glob="*.html",
    loader_cls=BSHTMLLoader,
    loader_kwargs={"open_encoding": "utf-8"},
    recursive=True,
    show_progress=True,
)
docs = loader.load()

100%|██████████| 979/979 [01:13<00:00, 13.29it/s]


In [47]:
# Divide documents into chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
splits = text_splitter.split_documents(docs)


pprint(f"Document Metadata:\n{splits[0].metadata}")
pprint(f"Document Schema:\n{splits[0].schema_json()}")
pprint(f"Document Page Content:\n{splits[0].page_content[:30]}")

('Document Metadata:\n'
 "{'source': '..\\\\..\\\\..\\\\docs\\\\f1-wiki\\\\1 (2013 film).html', "
 "'title': '1 (2013 film) - Wikipedia', 'start_index': 3}")
('Document Schema:\n'
 '{"title": "Document", "description": "Class for storing a piece of text and '
 'associated metadata.", "type": "object", "properties": {"page_content": '
 '{"title": "Page Content", "type": "string"}, "metadata": {"title": '
 '"Metadata", "type": "object"}, "type": {"title": "Type", "default": '
 '"Document", "enum": ["Document"], "type": "string"}}, "required": '
 '["page_content"]}')
'Document Page Content:\n1 (2013 film) - Wikipedia\n\n\n\n\n'


In [50]:
# Add documents to the vector store - use an id that can be reconstructed from the metadata
import json
import hashlib
from langchain_core.documents import Document


def stable_hash(doc: Document) -> str:
    """
    Stable hash document based on its metadata.
    """
    return hashlib.sha1(json.dumps(doc.metadata, sort_keys=True).encode()).hexdigest()


# Hashing splits
split_ids = list(map(stable_hash, splits))

# Removing duplicates based on split hash
unique_splits = []
unique_split_ids = set()
for split, split_id in zip(splits, split_ids):
    if split_id not in unique_split_ids:
        unique_splits.append(split)
        unique_split_ids.add(split_id)

# Now unique_splits contains the items from splits with unique 'split_id'
splits = unique_splits
split_ids = list(unique_split_ids)

# Adding splits to vector store
docs_vectorstore.add_documents(splits, ids=split_ids)
docs_vectorstore.persist()

### Build the LangChain

In [51]:
# Create Langchain model and retriever
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4", temperature=0.0)
retriever = docs_vectorstore.as_retriever(search_kwargs={"k": 20})

In [52]:
# Create a RAG prompt that includes the question and the source documents
from langchain_core.prompts import ChatPromptTemplate

template = """
You are an assistant for question-answering tasks.
Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
ALWAYS return a "SOURCES" part in your answer.

QUESION: {question}
==========
{source_documents}
==========
FINAL_ANSWER: """
prompt = ChatPromptTemplate.from_template(template)

In [53]:
# Create a RAG chain that retrieves documents, generates an answer, and formats the answer
from typing import List
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
 

def format_docs(docs: List[Document]) -> str:
    return "\n\n".join(
        f"Content: {doc.page_content}\nSource: {doc.metadata['source']}" for doc in docs
    )


rag_chain_from_docs = (
    RunnablePassthrough.assign(
        source_documents=(lambda x: format_docs(x['source_documents']))
    )
    | prompt
    | llm
    | StrOutputParser()
)
rag_chain = RunnableParallel(
    {
        "source_documents": retriever,
        "question": RunnablePassthrough()
    }
).assign(answer=rag_chain_from_docs)

### Ask a Question

In [54]:
question = "Who built the nurburgring?"
response = rag_chain.invoke(question)
answer = response['answer']
print(answer)

The Nürburgring was built in the 1920s in the town of Nürburg, Rhineland-Palatinate, Germany. The construction of the track was proposed around 1925, following the examples of Italy's Monza and Targa Florio courses, and Berlin's AVUS, yet with a different character. The original Nürburgring was to be a showcase for German automotive engineering and racing talent. The track was designed by the Eichler Architekturbüro from Ravensburg, led by architect Gustav Eichler, and construction began in September 1925 (SOURCES: ..\..\..\docs\f1-wiki\Nürburgring.html).


### Visualise

In [57]:
# Extract embeddings for the documents from the vector store and store them in a dataframe
import pandas as pd

response = docs_vectorstore.get(include=['metadatas', 'documents', 'embeddings'])
df = pd.DataFrame(
    {
        'id': response['ids'],
        'source': [metadata.get('source') for metadata in response['metadatas']],
        'page': [metadata.get('page', -1) for metadata in response['metadatas']],
        'document': response['documents'],
        'embedding': response['embeddings'],
    }
)
df['contains_answer'] = df['document'].apply(lambda x: "Eichler" in x)  # This is a known answer based on the question above
df['contains_answer'].to_numpy().nonzero()

(array([19395], dtype=int64),)

In [61]:
df.sample(5)

Unnamed: 0,id,source,page,document,embedding,contains_answer
11181,5e475538c7ed6aea5f55906409e3a500635ff7cc,..\..\..\docs\f1-wiki\List of Formula One circ...,-1,Street circuit\n\nClockwise\n\nLong Beach\n\n ...,"[-0.0008848432917147875, 0.016657445579767227,...",False
20829,b15dc10ee9d45f867cb679550b6da24926b5f47c,..\..\..\docs\f1-wiki\1988 Formula One World C...,-1,Round 3 – Monaco[edit]\nDespite what many[who?...,"[0.0006762072443962097, 0.01972772181034088, 6...",False
8007,43d237895a247326d18a9f0c85d25e7ff42eae1d,..\..\..\docs\f1-wiki\Arrows A19.html,-1,JPN Taki Inoue\n ITA Gianni Morbidelli\n ITA M...,"[-0.005464091431349516, 0.013111073523759842, ...",False
25927,dd9b915560104153351301088b2cabec599fba87,..\..\..\docs\f1-wiki\Porsche in Formula One.html,-1,"During the 2010 Paris Motor Show, Porsche chai...","[0.0033621552865952253, -0.0021703827660530806...",False
16981,9010da3770279864dd86d0da6271924c98df15a2,..\..\..\docs\f1-wiki\Hungaroring.html,-1,GT4\n1:53.579[39]\nSimon Knap[40]\nBMW M4 GT4\...,"[-0.009206829592585564, 0.003902455559000373, ...",False


In [62]:
# Add the question and answer with their embeddings to the dataframe
question_row = pd.DataFrame(
    {
        'id': ['question'],
        'question': [question],
        'embedding': [embeddings_model.embed_query(question)],
    }
)
answer_row = pd.DataFrame(
    {
        'id': ['answer'],
        'answer': [answer],
        'embedding': [embeddings_model.embed_query(answer)],
    }
)
df = pd.concat([question_row, answer_row, df])
df

Unnamed: 0,id,question,embedding,answer,source,page,document,contains_answer
0,question,Who built the nurburgring?,"[0.007708625985154646, -0.003937094532203096, ...",,,,,
0,answer,,"[-0.006048189310263024, -0.021133312375925372,...",The Nürburgring was built in the 1920s in the ...,,,,
0,0000cb0b906a51abc1fd1321d9f127028d419aab,,"[-0.003770043607801199, 0.02416779100894928, 0...",,..\..\..\docs\f1-wiki\Death of Ayrton Senna.html,-1.0,Autopsy[edit]\nDuring legal proceedings before...,False
1,000123016ef55eb0832a39ea2fa076177c602993,,"[-0.016073334962129593, -0.0030615876894444227...",,..\..\..\docs\f1-wiki\2022 Formula One World C...,-1.0,show Alonso a black and orange flag due to his...,False
2,000266f20a6da4fa752f9aad63a6f9e1f4dd4f7a,,"[0.01503509096801281, 0.003755377372726798, -0...",,..\..\..\docs\f1-wiki\Alonso–Hamilton rivalry....,-1.0,The season finale was marred by controversy ov...,False
...,...,...,...,...,...,...,...,...
29846,fff3ef225a118025d9b0911da917bde1eb768855,,"[-0.0016454989090561867, -0.007826153188943863...",,..\..\..\docs\f1-wiki\Benetton Formula.html,-1.0,"^ ""1992 F1 Season"". Stats F1. Archived from th...",False
29847,fff4797a64b4a2396aed8a561ba7face9decf7fd,,"[-0.0032286755740642548, -0.003210729453712702...",,..\..\..\docs\f1-wiki\Alfa Romeo Racing C41.html,-1.0,Development[edit]\nThe car is an evolution of ...,False
29848,fff50ab1152410c731c3fd658e64c380648dbfdf,,"[-0.002899846062064171, 0.004342384170740843, ...",,..\..\..\docs\f1-wiki\Ferrari SF90.html,-1.0,An anonymous aerodynamicist commented that bec...,False
29849,fff57ac499fc6e488840fc839d3a199ab38f3712,,"[0.0042950911447405815, -0.006919308099895716,...",,..\..\..\docs\f1-wiki\Pirelli.html,-1.0,Shareholder\n\nStake %\n\n\nMarco Polo Interna...,False


In [63]:
# Calculate the distance (L2 norm) between the question and the document embeddings
import numpy as np

question_embedding = embeddings_model.embed_query(question)
df['dist'] = df.apply(
    lambda row: np.linalg.norm(np.array(row['embedding']) - question_embedding), 
    axis=1
)

In [65]:
# Show the dataframe with the question and answer in spotlight
from renumics import spotlight

spotlight.show(df)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


VBox(children=(Label(value='Spotlight running on http://127.0.0.1:64279/'), HBox(children=(Button(description=…

Exception in thread Thread-9:
Traceback (most recent call last):
  File "c:\Users\Winson Yeap\miniconda3\envs\f1-analytics\lib\threading.py", line 980, in _bootstrap_inner
    self.run()
  File "c:\Users\Winson Yeap\miniconda3\envs\f1-analytics\lib\threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\Winson Yeap\miniconda3\envs\f1-analytics\lib\site-packages\renumics\spotlight\server.py", line 342, in _handle_connections
    msg = self.connection.recv()
  File "c:\Users\Winson Yeap\miniconda3\envs\f1-analytics\lib\multiprocessing\connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "c:\Users\Winson Yeap\miniconda3\envs\f1-analytics\lib\multiprocessing\connection.py", line 414, in _recv_bytes
    buf = self._recv(4)
  File "c:\Users\Winson Yeap\miniconda3\envs\f1-analytics\lib\multiprocessing\connection.py", line 379, in _recv
    chunk = read(handle, remaining)
ConnectionResetError: [WinError 10054] An existing connection was f