# RAG Comparison

### Import

In [1]:
# Imports

# Standard library
import glob  # used to load all markdown files in the news folder
import os
import random
import typing as t
from dataclasses import dataclass

# Third-party
import chromadb
from devtools import debug, pprint
from docling.document_converter import DocumentConverter
from IPython.display import clear_output
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from ragas.embeddings.base import embedding_factory
from ragas.llms.base import llm_factory
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.graph import Node, NodeType
from ragas.testset.synthesizers.multi_hop.base import (
    MultiHopQuerySynthesizer,
    MultiHopScenario,
)
from ragas.testset.transforms import Parallel, apply_transforms
from ragas.testset.transforms import (
    HeadlinesExtractor,
    HeadlineSplitter,
    KeyphrasesExtractor,
    OverlapScoreBuilder,
)
from tqdm import tqdm

# Flock
from flock.core import FlockFactory
from flock.core.logging.formatters.themes import OutputTheme
from flock.core.tools.basic_tools import extract_links_from_markdown, get_web_content_as_markdown
from flock.core.tools.llm_tools import chunk_text_for_embedding
from flock.evaluators.memory.memory_evaluator import MemoryEvaluator, MemoryEvaluatorConfig



### Download News

In [None]:
# NEWS import
# Fetch the CNN Lite index page, collect the links found on it, and save each
# linked page as a markdown file in the local "news" folder.

path = "https://lite.cnn.com"

index_markdown = get_web_content_as_markdown(path)
links = extract_links_from_markdown(index_markdown, path)

# Create the news folder if it doesn't exist. exist_ok=True replaces the
# separate existence check, so re-running the cell is safe.
os.makedirs("news", exist_ok=True)

converter = DocumentConverter()
for link in tqdm(links):
    try:
        result = converter.convert(link)
        markdown = result.document.export_to_markdown()
        # The file name is the second-to-last "/"-separated segment of the link.
        # NOTE(review): two links sharing that segment overwrite each other —
        # confirm the link shape returned by extract_links_from_markdown.
        with open(f"news/{link.split('/')[-2]}.md", "w", encoding="utf-8") as f:
            f.write(markdown)
    except Exception as e:
        # Best effort: report the failing link and continue with the next one.
        print(f"Error converting {link}: {e}")


### ChromaDB

#### Generate Embeddings and Collection

In [None]:

# Read every downloaded article as a (content, path) pair. The context manager
# closes each file handle as soon as its content has been read.
news_files = []
for news_path in glob.glob("news/*.md"):
    with open(news_path, "r", encoding="utf-8") as handle:
        news_files.append((handle.read(), news_path))

# Split each article into overlapping chunks for embedding.
news_files_chunks = []
for file_content, file_name in tqdm(news_files):
    # os.path.basename strips the directory part on every platform; splitting
    # on "\\" only did so for Windows-style paths.
    news_files_chunks.extend(
        chunk_text_for_embedding(
            file_content,
            os.path.basename(file_name),
            chunk_size=2000,
            overlap=250,
        )
    )

clear_output(wait=True)

# Store the chunks in a persistent Chroma collection under ".chroma".
chroma_client = chromadb.PersistentClient(path=".chroma")
collection = chroma_client.get_or_create_collection(name="my_cnn_news")
for chunk in tqdm(news_files_chunks):
    # upsert instead of add: the store persists between runs, so re-running
    # this cell must cope with chunk ids that are already present.
    collection.upsert(
        ids=[chunk["chunk_id"]],
        documents=[chunk["text"]],
        metadatas=[{"source": chunk["file"]}],
    )
    

#### Sample Query

In [None]:
# Open the persisted Chroma store and fetch (or create) the news collection.
chroma_client = chromadb.PersistentClient(path=".chroma")
collection = chroma_client.get_or_create_collection(name="my_cnn_news")

# Run one sample question against the collection and dump the three results.
query = "What is the vaccination rate goal set by HHS to help prevent outbreaks of measles?"
results = collection.query(
    n_results=3,
    query_texts=[query],
)
debug(results)



#### Generate Queries with Flock

Chunk Query

In [None]:
chroma_client = chromadb.PersistentClient(path=".chroma")
collection = chroma_client.get_or_create_collection(name="my_cnn_news")
entries = collection.get()
# Get 10 random entries from the collection
random_ids = random.sample(entries.get("ids"), 10)
random_news_chunks = collection.get(ids=random_ids).get("documents")

# Easiest form of query - answer a question based on the chunk
simple_qa_generator = FlockFactory.create_default_agent(name="simple_qa_generator",
                                                     model="openai/gpt-4o",
                                                     input="rag_context: str",
                                                     output="factoid_question: str, factoid_answer: str",
                                                     enable_rich_tables=True)

for news_chunk in random_news_chunks: 
    qa_pair = await simple_qa_generator.run_async(inputs={"rag_context": news_chunk})
    print(qa_pair)
    






Document Query

In [None]:
# More complex form of query - answer a question based on the whole document
# get 10 random news markdown files from the news folder

news_files = glob.glob("news/*.md")
news_files = [open(file, "r", encoding="utf-8") for file in news_files]
news_files = [(file.read(),file.name) for file in news_files]
random_news_files = random.sample(news_files, 10)

simple_qa_generator = FlockFactory.create_default_agent(name="simple_qa_generator",
                                                     model="openai/gpt-4o",
                                                     input="rag_context: str",
                                                     output="factoid_question: str, factoid_answer: str, ground_truth: str",
                                                     enable_rich_tables=True)

# let the agent iterate over the news files and generate a question and answer for each file    
for news_file in random_news_files: 
    qa_pair = await simple_qa_generator.run_async(inputs={"rag_context": news_file})
    print(qa_pair)

In [None]:

# Read every article as a (content, path) pair; the context manager closes
# each file handle as soon as its content has been read.
news_files = []
for news_path in glob.glob("news/*.md"):
    with open(news_path, "r", encoding="utf-8") as handle:
        news_files.append((handle.read(), news_path))

write_to_kg_agent = FlockFactory.create_default_agent(
    model="openai/gpt-4o",
    name="news_to_kg_agent",
    input="data",
    output_theme=OutputTheme.aardvark_blue,
    shorten_long_lists=True,
)

# Replace the agent's default evaluator with a MemoryEvaluator configured for
# character-based splitting and three concepts per extraction.
# NOTE(review): presumably this writes the extracted concepts into the agent's
# memory / knowledge graph — confirm against the flock MemoryEvaluator docs.
write_to_kg_agent.evaluator = MemoryEvaluator(
    name="news_to_kg_agent",
    config=MemoryEvaluatorConfig(
        splitting_mode="characters",
        number_of_concepts_to_extract=3,
    ),
)

# Feed each article's text to the agent and print what it returns.
for file_content, file_name in news_files:
    result = write_to_kg_agent.run(inputs={"data": file_content})
    print(result)




