# RAG Quanam Challenge

## Install dependencies

In [1]:
%pip install -r requirements.txt



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Initialize LLM

In [2]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before they are read below.
load_dotenv()

# The API key and both model names come from the environment, so no
# credentials or model choices are hard-coded in the notebook.
OPEN_API_KEY, GENERATION_LLM_MODEL, EVALUATION_LLM_MODEL = (
    os.environ.get(var_name)
    for var_name in ("OPEN_API_KEY", "GENERATION_LLM_MODEL", "EVALUATION_LLM_MODEL")
)

# Chat model used for answer generation.
llm = ChatOpenAI(api_key=OPEN_API_KEY, model=GENERATION_LLM_MODEL)

# Test LLM without retrieval

In [3]:
# Baseline: send the question straight to the generation model, with no
# retrieved reference texts in the prompt, and show only the reply text.
llm.invoke("¿Which magazine was started first Arthur's Magazine or First for Women?").content

"Arthur's Magazine was started first. It was an American literary periodical that began publication in 1844. On the other hand, First for Women is a women's magazine that was first published in 1989."

# Load JSON and create documents

In [4]:
import json
from langchain.schema import Document 

# Read the raw records from hotpotqa_docs_reduced.json.
with open("hotpotqa_docs_reduced.json", "r", encoding="utf-8") as docs_file:
    raw_data = json.load(docs_file)

# Keep only each record's text (as page content) and title (as metadata);
# any other keys in the record are not carried into the Document.
cleaned_docs = [
    Document(
        page_content=record["text"],
        metadata={"title": record["title"]},
    )
    for record in raw_data
]

# Quick sanity check: how many documents were built and what one looks like.
print(f"{len(cleaned_docs)} documents created.")
print("Document example:")
print(cleaned_docs[0])


1000 documents created.
Document example:
page_content='Radio City is India's first private FM radio station and was started on 3 July 2001.  It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).  It plays Hindi, English and regional songs.  It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.  Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.  The Radio station currently plays a mix of Hindi and Regional music.  Abraham Thomas is the CEO of the company.' metadata={'title': 'Radio City (Indian radio station)'}


# Save embeddings in ChromaDB

In [5]:
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding model handed to the vector store.
embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large-instruct")

# Deterministic, position-based ids keep this cell idempotent: without explicit
# ids the store generates fresh ones on every call, so re-running the cell in
# the same kernel would append a second copy of every document and let
# duplicates crowd the top-k results.
doc_ids = [f"doc-{position}" for position in range(len(cleaned_docs))]

# Index documents
vectorstore = Chroma.from_documents(cleaned_docs, embeddings, ids=doc_ids)

print(f"Indexed {len(cleaned_docs)} documents in ChromaDB")


  embeddings = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-large-instruct")
  from .autonotebook import tqdm as notebook_tqdm


Indexed 1000 documents in ChromaDB


# Visualize what was saved in ChromaDB

In [6]:
# Fetch the stored entries together with their vectors and metadata.
retrieved_docs = vectorstore.get(include=["embeddings", "metadatas", "documents"])

# Preview at most 5 entries; bounding by the number of stored documents avoids
# an IndexError when the collection holds fewer than 5.
preview_count = min(5, len(retrieved_docs["documents"]))

for i in range(preview_count):
    print(f"📌 Document {i+1}:")
    print(f"🔹 Metadata: {retrieved_docs['metadatas'][i]}")
    # Only the first 200 characters of the text and the first 5 vector
    # components are shown to keep the output short.
    print(f"🔹 Original text: {retrieved_docs['documents'][i][:200]}...")
    print(f"🔹 Embedding (first 5 values): {retrieved_docs['embeddings'][i][:5]}")
    print("-" * 80)


📌 Document 1:
🔹 Metadata: {'title': 'Radio City (Indian radio station)'}
🔹 Original text: Radio City is India's first private FM radio station and was started on 3 July 2001.  It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengalur...
🔹 Embedding (first 5 values): [ 0.01276961 -0.00303665 -0.02338252 -0.02053304  0.02734209]
--------------------------------------------------------------------------------
📌 Document 2:
🔹 Metadata: {'title': 'History of Albanian football'}
🔹 Original text: Football in Albania existed before the Albanian Football Federation (FSHF) was created.  This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1...
🔹 Embedding (first 5 values): [ 0.03809442  0.01872035 -0.05253534 -0.00839981  0.03744646]
--------------------------------------------------------------------------------
📌 Document 3:
🔹 Metadata: {'title': 'Echosmith'}
🔹 Original text: Echos

# Generate answer with LLM

In [7]:
from langchain_core.language_models import BaseLanguageModel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import Runnable

# Shared parser instance; create_chain appends it as the last step of the chain.
output_parser = StrOutputParser()

def create_prompt() -> ChatPromptTemplate:
    """Build the prompt template used for answer generation.

    The template is a single system message written as an XML-like document.
    It carries the answering instructions and rules plus two placeholders
    that must be supplied at invocation time:

    - ``query``: the question to answer.
    - ``reference_texts``: the retrieved passages the answer must rely on.

    Returns:
        A ``ChatPromptTemplate`` containing that one system message.
    """
    # The text is kept exactly as authored (including indentation), since it
    # is sent to the model verbatim.
    system_template = """<?xml version="1.0" encoding="UTF-8"?>
        <Prompt>
            <Instructions>
                You are an intelligent assistant that answers questions using only the reference texts provided below.
                Use the most relevant information from these references to provide a concise and accurate answer.
            </Instructions>
            <Rules>
                <Rule>Do not add any external knowledge.</Rule>
                <Rule>Try to keep the answer under 5 words.</Rule>
                <Rule>Respond in the same language as the question and the data, English.</Rule>
                <Rule>Answer strictly what you are asked. Do not provide extra information.</Rule>
            </Rules>
            <Question>{query}</Question>
            <ReferenceTexts>{reference_texts}</ReferenceTexts>
        </Prompt>
        <Answer></Answer>
        """
    return ChatPromptTemplate.from_messages([("system", system_template)])

def create_chain(llm: BaseLanguageModel) -> Runnable:
    """Compose the answer-generation pipeline: prompt -> model -> output parser.

    Args:
        llm: The language model that receives the formatted prompt.

    Returns:
        The composed runnable. It is invoked with a dict holding the ``query``
        and ``reference_texts`` values expected by the prompt template. Note
        that the result is the whole pipeline, not a language model, which is
        why the return annotation is ``Runnable``.
    """
    prompt = create_prompt()
    return prompt | llm | output_parser


## LLM-assisted evaluation

In [13]:
# Second chat model instance, configured from EVALUATION_LLM_MODEL; it is the
# model evaluate_with_llm calls to grade generated answers.
evaluator_llm = ChatOpenAI(model=EVALUATION_LLM_MODEL, api_key=OPEN_API_KEY)


def _parse_correct_flag(raw_text: str) -> bool:
    """Extract the boolean ``correct`` verdict from the evaluator's reply text.

    Accepts a bare JSON object as well as one wrapped in extra text (for
    example Markdown code fences). Anything that cannot be interpreted as
    ``{"correct": <bool>}`` yields ``False``.
    """
    candidate = raw_text.strip()
    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span so replies such as
        # "```json\n{...}\n```" are not silently scored as wrong.
        start, end = candidate.find("{"), candidate.rfind("}")
        if start == -1 or end <= start:
            return False
        try:
            parsed = json.loads(candidate[start:end + 1])
        except json.JSONDecodeError:
            return False

    # Valid JSON that is not an object (e.g. a list) carries no verdict.
    if not isinstance(parsed, dict):
        return False

    verdict = parsed.get("correct", False)
    if isinstance(verdict, str):
        # A quoted "false" must not be treated as truthy.
        return verdict.strip().lower() == "true"
    return bool(verdict)


def evaluate_with_llm(question: str, expected: str, actual: str) -> bool:
    """Ask the evaluator LLM whether ``actual`` matches ``expected``.

    The question and both answers are sent in an XML-like user message; the
    evaluator is instructed to reply with ``{"correct": true}`` or
    ``{"correct": false}``.

    Args:
        question: The question that was asked.
        expected: The reference answer.
        actual: The answer produced by the generation chain.

    Returns:
        ``True`` if the evaluator marks the answer correct, ``False`` if it
        marks it incorrect or if its reply cannot be parsed.
    """
    # Each instruction goes on its own line. Relying on implicit concatenation
    # of adjacent literals would glue sentences and bullets together
    # ("...expected answer.Allow minor variations, including:- Differences...").
    system_prompt = {
        "role": "system",
        "content": "\n".join([
            "You are an evaluator. Determine if the generated answer is correct based on the expected answer.",
            "Allow minor variations, including:",
            "- Differences in phrasing, capitalization, punctuation, synonyms, and word order.",
            "- Slightly more detailed answers (e.g., '2006' vs. 'September 2006').",
            "- Equivalent expressions (e.g., 'ten' vs. '10').",
            "However, responses that introduce factual inaccuracies, contradict the expected answer, or are incomplete in a way that changes meaning should be marked incorrect.",
            "Respond only with a JSON object: {\"correct\": true} or {\"correct\": false}. No extra text or explanations.",
        ]),
    }

    user_prompt = {
        "role": "user",
        "content": f"""
            <evaluation>
                <question>{question}</question>
                <expected>{expected}</expected>
                <actual>{actual}</actual>
                <instructions>
                    <format>{{"correct": true}} OR {{"correct": false}}</format>
                    <examples>
                        <example expected="2006" actual="September 2006">{{"correct": true}}</example>
                        <example expected="New York City" actual="NYC">{{"correct": true}}</example>
                        <example expected="15 years old" actual="16">{{"correct": false}}</example>
                        <example expected="Standard gauge track" actual="Standard-gauge track">{{"correct": true}}</example>
                    </examples>
                </instructions>
            </evaluation>
        """

    }

    response = evaluator_llm.invoke([system_prompt, user_prompt])

    # Use the message's content when present; otherwise fall back to the
    # response itself. Non-string payloads are stringified before parsing.
    raw_reply = response.content if hasattr(response, "content") else response
    if not isinstance(raw_reply, str):
        raw_reply = str(raw_reply)

    return _parse_correct_flag(raw_reply)


# Evaluate each question and calculate accuracy

In [14]:
QNA_JSON_PATH = "hotpotqa_docs_reduced_qa.json"
# Number of nearest documents retrieved per question and passed to the LLM.
TOP_K = 10

with open(QNA_JSON_PATH, "r", encoding="utf-8") as f:
    qna_data = json.load(f)

# The chain does not depend on the question, so it is built once here rather
# than on every loop iteration.
chain = create_chain(llm)
results = []

for idx, entry in enumerate(qna_data):
    query = entry["question"]
    expected_answer = entry["answer"]

    # Retrieve top documents KNN (the similarity scores are not used below).
    search_results = vectorstore.similarity_search_with_score(query, k=TOP_K)

    # Concatenate title + text of every hit into one reference block; fall
    # back to a fixed message when nothing was retrieved.
    reference_texts = "\n\n".join(
        [f"Title: {doc.metadata['title']}\nText: {doc.page_content}" for doc, _ in search_results]
    ) if search_results else "No relevant text found."

    actual_answer = chain.invoke({"query": query, "reference_texts": reference_texts})

    # Grade the generated answer with the evaluator LLM.
    is_correct = evaluate_with_llm(query, expected_answer, actual_answer)

    results.append({
        "question": query,
        "expected_answer": expected_answer,
        "actual_answer": actual_answer,
        "correct": is_correct
    })

    print(f"\n🔹 **Test {idx + 1}:**")
    print(f"📌 Question: {query}")
    print(f"✅ Expected Answer: {expected_answer}")
    print(f"🤖 Generated Answer: {actual_answer}")
    print(f"✔ Correct: {is_correct}")
    print("-" * 80)

total_correct = sum(1 for r in results if r["correct"])
# Guard against division by zero when the QA file is empty.
accuracy = total_correct / len(results) if results else 0

print(f"\n🎯 **Final Accuracy: {accuracy * 100:.2f}%** ({total_correct}/{len(results)})")

# Every entry was already printed inside the loop, so the raw `results` list is
# not dumped again; it remains available in `results` for later cells.



🔹 **Test 1:**
📌 Question: Which magazine was started first Arthur's Magazine or First for Women?
✅ Expected Answer: Arthur's Magazine
🤖 Generated Answer: Arthur's Magazine.
✔ Correct: True
--------------------------------------------------------------------------------

🔹 **Test 2:**
📌 Question: The Oberoi family is part of a hotel company that has a head office in what city?
✅ Expected Answer: Delhi
🤖 Generated Answer: Delhi
✔ Correct: True
--------------------------------------------------------------------------------

🔹 **Test 3:**
📌 Question: Musician and satirist Allie Goertz wrote a song about the "The Simpsons" character Milhouse, who Matt Groening named after who?
✅ Expected Answer: President Richard Nixon
🤖 Generated Answer: Richard Nixon
✔ Correct: True
--------------------------------------------------------------------------------

🔹 **Test 4:**
📌 Question:  What nationality was James Henry Miller's wife?
✅ Expected Answer: American
🤖 Generated Answer: Not provided in the

### Check wrong results

In [None]:
# Collect every evaluated question that was graded as incorrect.
wrong = [r for r in results if not r["correct"]]

wrong

# Take the first failure, if any; a run with no wrong answers must not raise
# IndexError here.
w1 = wrong[0] if wrong else None

# Re-run retrieval for that question with k=10, the same k the evaluation loop
# uses, so the documents inspected here are the ones the model was given.
search_results = (
    vectorstore.similarity_search_with_score(w1['question'], k=10) if w1 else []
)

search_results

[(Document(metadata={'title': 'Cadmium chloride'}, page_content='Cadmium chloride is a white crystalline compound of cadmium and chlorine, with the formula CdCl.  It is a hygroscopic solid that is highly soluble in water and slightly soluble in alcohol.  Although it is considered to be ionic, it has considerable covalent character to its bonding.  The crystal structure of cadmium chloride (described below), composed of two-dimensional layers of ions, is a reference for describing other crystal structures.  Also known are CdCl•HO and CdCl•5HO.'),
  0.24115949869155884),
 (Document(metadata={'title': 'Benzamide'}, page_content='Benzamide is an off-white solid with the chemical formula of CHCONH.  It is a derivative of benzoic acid.  It is slightly soluble in water, and soluble in many organic solvents.'),
  0.31586769223213196),
 (Document(metadata={'title': 'Chloride'}, page_content='The chloride ion is the anion (negatively charged ion) Cl.  It is formed when the element chlorine (a ha

# Simple front end

In [10]:
!streamlit run asistenteInteligente.py


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://192.168.1.9:8501[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
1000 documents created.
Document example:
page_content='Radio City is India's first private FM radio station and was started on 3 July 2001.  It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).  It plays Hindi, English and regional songs.  It was launched in Hyderabad in March 2006, in Chennai 