### 1- Documents Loader

In [1]:
from tqdm.auto import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import json
from datasets import load_dataset

pd.set_option("display.max_colwidth", None)

In [2]:
from langchain.docstore.document import Document as LangchainDocument


In [3]:
from typing import List, Optional
from langchain.docstore.document import Document
from langchain_community.document_loaders import (
   PyPDFLoader,
   TextLoader,
   Docx2txtLoader,
   CSVLoader,
   UnstructuredExcelLoader
   )
import os
from pathlib import Path
def load_document(file_path: str) -> List[Document]:
    """
    Load a document and convert it to a list of Langchain Documents.
    
    Args:
        file_path (str): Path to the document
        
    Returns:
        List[Document]: List of Langchain Documents with page_content and metadata
    """
    # Get the file extension
    file_extension = Path(file_path).suffix.lower()
    
    try:
        # Choose appropriate loader based on file extension
        if file_extension == '.pdf':
            loader = PyPDFLoader(file_path)
        elif file_extension == '.txt':
            loader = TextLoader(file_path)
        elif file_extension in ['.docx', '.doc']:
            loader = Docx2txtLoader(file_path)
        elif file_extension == '.csv':
            loader = CSVLoader(file_path)
        elif file_extension in ['.xlsx', '.xls']:
            loader = UnstructuredExcelLoader(file_path)
        else:
            raise ValueError(f"Unsupported file extension: {file_extension}")
        
        # Load the document
        documents = loader.load()
        
        # Add additional metadata
        for doc in documents:
            doc.metadata.update({
                'source': file_path,
                'file_type': file_extension,
                'file_name': os.path.basename(file_path),
                'creation_date': os.path.getctime(file_path),
                'last_modified': os.path.getmtime(file_path),
                'len':len(doc.page_content)
            })
        
        return documents
    
    except Exception as e:
        print(f"Error loading document {file_path}: {str(e)}")
        return []
# Example usage:
def load_documents_from_directory(directory_path: str, 
                               accepted_extensions: Optional[List[str]] = None) -> List[Document]:
    """
    Load all documents from a directory.
    
    Args:
        directory_path (str): Path to the directory
        accepted_extensions (List[str], optional): List of accepted file extensions
        
    Returns:
        List[Document]: List of all loaded documents
    """
    if accepted_extensions is None:
        accepted_extensions = ['.pdf', '.txt', '.docx', '.doc', '.csv', '.xlsx', '.xls']
    
    all_documents = []
    
    for root, _, files in os.walk(directory_path):
        for file in files:
            if any(file.lower().endswith(ext) for ext in accepted_extensions):
                file_path = os.path.join(root, file)
                documents = load_document(file_path)
                all_documents.extend(documents)
    
    return all_documents

In [4]:
data = load_document('Attenion_is_all_you_need.pdf')

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [9]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=700,
    chunk_overlap=70,
    add_start_index=True,
    separators=["\n\n", "\n", ".", " ", ""],
)

docs_processed = []
for doc in data:
    # Extend
    docs_processed += text_splitter.split_documents([doc])

In [10]:
len(docs_processed)

68

In [13]:
import os

from groq import Groq

client = Groq(
    api_key=api_key, # removed from the notebook
)
def call_llm(prompt:str)->str:
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model="llama3-70b-8192",
        
    )

    return chat_completion.choices[0].message.content

In [14]:
len(docs_processed)

68

In [15]:
# create context
n = 10
contexts = []
for i in range(0, len(docs_processed), n):
   batch = docs_processed[i:i+n]
   combined_text = " ".join([doc.page_content for doc in batch])
   contexts.append(combined_text)

In [16]:
import prompts

outputs = []
for context in tqdm(contexts):
    # Generate QA couple
    output_QA_couple = call_llm(prompts.QA_generation_prompt.format(context=context,n=10))
    try:
        for QAs in output_QA_couple.split('Output:::')[1:]:
            
            question = QAs.split("Factoid question: ")[-1].split("Answer: ")[0]
            answer = QAs.split("Answer: ")[-1]
            outputs.append(
                {
                    "context": context,
                    "question": question.strip(),
                    "answer": answer.strip(),
                }
            )
            print(len(outputs))
    except Exception as e:
        print(e)
        continue

  0%|          | 0/7 [00:00<?, ?it/s]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41


In [17]:
len(outputs)

41

In [121]:
pd.DataFrame(outputs).to_json('output/QAs.json',orient='records',indent=4)

In [19]:
critique_prompt = """
You will be given a context and a question.

Your task is to evaluate the question based on three criteria and provide ratings for each:

1. GROUNDEDNESS (1-5): How well can one answer the question unambiguously with the given context?
   - 1: Question cannot be answered at all from the context
   - 5: Question can be clearly and unambiguously answered from the context

2. RELEVANCE (1-5): How useful is this question for machine learning developers building NLP applications?
   - 1: Question is not useful at all
   - 5: Question is extremely useful

Note:
- DO NOT add any extra text outside the specified format.
- DO NOT include bold formatting, asterisks, or additional prefixes like "Answer:::" unless explicitly mentioned.
- Follow the format exactly as shown below:
### Question: the question
### Context: the context
Answer:::
Groundedness Evaluation: (your rationale)
Groundedness Rating: (1-5)
Relevance Evaluation: (your rationale)
Relevance Rating: (1-5)
### Question: {question}
### Context: {context}
Answer:::
"""


In [20]:
evaluation_list = []
for output in tqdm(outputs):
    eval_data = call_llm(critique_prompt.format(context=output["context"], question=output["question"]))
    eval_data_parsed = eval_data.split('Answer:::')[-1]
    eval_data_list = [ev for ev in eval_data_parsed.split('\n') if ev]
    evaluation_dict = {}
    try:
        for eval_row in eval_data_list:
            evaluation_dict[eval_row.split(':')[0].strip()] = eval_row.split(':')[1].strip()
        evaluation_dict.update({'question':output["question"],'answer':output["answer"]})
    except Exception as e:
        print(e)
        continue
    evaluation_list.append(evaluation_dict) 
    print(len(evaluation_list))


  0%|          | 0/41 [00:00<?, ?it/s]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41


In [21]:
len(evaluation_list)

41

In [22]:
evaluation_list[5]

{'Groundedness Evaluation': 'The context provides a detailed explanation of the scaled dot-product attention mechanism, including the formula for computing attention and the reasoning behind the scaling factor of 1/√dk. The question can be answered unambiguously based on the given context.',
 'Groundedness Rating': '5',
 'Relevance Evaluation': 'This question is extremely relevant to machine learning developers building NLP applications, as it pertains to a key component of the Transformer architecture and its implementation details.',
 'Relevance Rating': '5',
 'question': 'What is the purpose of the scaling factor of 1/√dk in the Scaled Dot-Product Attention?',
 'answer': 'To counteract the effect of large dot products pushing the softmax function into regions with extremely small gradients'}

In [23]:
import pandas as pd
df_data = pd.DataFrame(evaluation_list)

In [122]:
df_data.to_json('output/Critiques.json',orient='records',indent=4)

In [24]:
df_data.columns

Index(['Groundedness Evaluation', 'Groundedness Rating',
       'Relevance Evaluation', 'Relevance Rating', 'question', 'answer', ''],
      dtype='object')

In [26]:
generated_questions = df_data.loc[
    (df_data["Groundedness Rating"] >= '4')
    & (df_data["Relevance Rating"] >= '4')
]

In [27]:
df_data.shape

(41, 7)

In [28]:
generated_questions.shape

(37, 7)

In [29]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer


def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
) -> List[LangchainDocument]:
    """
    Split documents into chunks of size `chunk_size` characters and return a list of documents.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=int(chunk_size / 10),
        separators=["\n\n", "\n", ".", " ", ""],
    )

    docs_processed = []
    for doc in knowledge_base:
        docs_processed += text_splitter.split_documents([doc])

    return docs_processed

In [65]:
from langchain.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_community.embeddings import OpenAIEmbeddings
import os


def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model_name: Optional[str] = "text-embedding-3-small",
) -> FAISS:
    """
    Creates a FAISS index from the given embedding model and documents. Loads the index directly if it already exists.

    Args:
        langchain_docs: list of documents
        chunk_size: size of the chunks to split the documents into
        embedding_model_name: name of the embedding model to use

    Returns:
        FAISS index
    """
    # load embedding_model
    embedding_model = OpenAIEmbeddings(
        model=embedding_model_name
    )    

    # Check if embeddings already exist on disk
    index_name = f"Faiss_database"
    index_folder_path = f"./data/indexes/{index_name}.faiss"
    try:
        if os.path.isdir(index_folder_path):
            return FAISS.load_local(
                index_folder_path,
                embedding_model,
                allow_dangerous_deserialization = True,
                distance_strategy=DistanceStrategy.COSINE,
            )

        else:
            print("Index not found, generating it...")
            docs_processed = split_documents(
                chunk_size,
                langchain_docs,
                embedding_model_name,
            )
            knowledge_index = FAISS.from_documents(
                docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
            )
            knowledge_index.save_local(index_folder_path)
            return knowledge_index
            
    except (KeyError, Exception) as e:
        print(f"Error loading existing index: {e}")
        print("Creating new index...")
   

In [62]:
def load_embeddings(
    langchain_docs: List[LangchainDocument],
    chunk_size: int,
    embedding_model_name: Optional[str] = "text-embedding-3-small",
) -> FAISS:
    """Creates a FAISS index from the given embedding model and documents."""
    
    # load embedding_model
    embedding_model = OpenAIEmbeddings(
        model=embedding_model_name,
        openai_api_key=os.environ["OPENAI_API_KEY"]
    )

    # Create index name
    index_name = f"index_chunk:{chunk_size}_embeddings:{embedding_model_name}"
    index_folder_path = f"data/indexes/Faiss_database/{index_name}"
    
    try:
        if os.path.isdir(index_folder_path):
            return FAISS.load_local(
                index_folder_path,
                embedding_model,
                allow_dangerous_deserialization=True,
                distance_strategy=DistanceStrategy.COSINE,
            )
    except (KeyError, Exception) as e:
        print(f"Error loading existing index: {e}")
        print("Creating new index...")
        
    # If loading fails or index doesn't exist, create new index
    texts = [doc.page_content for doc in langchain_docs]
    metadatas = [doc.metadata for doc in langchain_docs]
    
    # Create and save new index
    vectorstore = FAISS.from_texts(
        texts,
        embedding_model,
        metadatas=metadatas,
        distance_strategy=DistanceStrategy.COSINE,
    )
    
    # Create directory if it doesn't exist
    os.makedirs(index_folder_path, exist_ok=True)
    
    # Save the index
    vectorstore.save_local(index_folder_path)
    
    return vectorstore

In [31]:
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [33]:
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.llms import LLM


def answer_with_rag(
    question: str,
    llm: LLM,
    knowledge_index: VectorStore,
    num_retrieved_docs: int = 3,
) -> Tuple[str, List[LangchainDocument]]:
    """Answer a question using RAG with the given knowledge index."""
    # Gather documents with retriever
    relevant_docs = knowledge_index.similarity_search(query=question, k=num_retrieved_docs)
    relevant_docs = [doc.page_content for doc in relevant_docs]  # keep only the text

    # Optionally rerank results
    


    # Build the final prompt
    context = "\nExtracted documents:\n"
    context += "".join([f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs)])

    final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

    # Redact an answer
    answer = llm(final_prompt)

    return answer, relevant_docs

In [72]:
from langchain_core.language_models import BaseChatModel


def run_rag_tests(
    eval_dataset: pd.DataFrame,
    llm,
    knowledge_index: VectorStore,
    output_file: str,
):
    """Runs RAG tests on the given dataset and saves the results to the given output file."""
    
    outputs = []  
    
    for _, example in tqdm(eval_dataset.iterrows(),total=len(eval_dataset)):
        question = example["question"]
        answer, relevant_docs = answer_with_rag(question, llm, knowledge_index)
        
        result = {
            "question": question,
            "true_answer": example["answer"],
            "generated_answer": answer,
            "retrieved_docs": relevant_docs,  # Don't create a new list comprehension here
        }
        
        outputs.append(result)
    
    # Write to file once after the loop is complete
    with open(output_file, "w") as f:
        json.dump(outputs, f)
    
    return outputs  # Optionally return the results

In [84]:
evaluation_prompt_template = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

In [99]:
from langchain.chat_models import ChatOpenAI

eval_chat_model = ChatOpenAI(model="gpt-4o", temperature=0)
def evaluate_answers(
    answer_path:str,
    Rag_Dataset:pd.DataFrame,
    eval_chat_model,
    evaluation_prompt_template: str,
) -> None:
    """Evaluates generated answers. Modifies the given answer file in place for better checkpointing."""
    answers = []
    for experiment in tqdm(Rag_Dataset,total= len(Rag_Dataset)):
        
        eval_prompt = evaluation_prompt_template.format(
            instruction=experiment["question"],
            response=experiment["generated_answer"],
            reference_answer=experiment["true_answer"],
        )
        eval_result = eval_chat_model.predict(eval_prompt)
        feedback, score = [item.strip() for item in eval_result.split("[RESULT]")]
        experiment[f"eval_score"] = score
        experiment[f"eval_feedback"] = feedback
        answers.append(experiment)
        with open(answer_path, "w") as f:
            json.dump(answers, f,indent=4)

In [74]:
if not os.path.exists("./output"):
    os.mkdir("./output")
       
output_file_name = f"./output/rag_final_result.json"
print("Loading knowledge base embeddings...")
knowledge_index = load_embeddings(
    data,
    chunk_size=300,
)
print("Running RAG...")
Rag_outputs = run_rag_tests(
    eval_dataset=generated_questions,
    llm=call_llm,
    knowledge_index=knowledge_index,
    output_file=output_file_name,
)


Loading knowledge base embeddings...
Running RAG...


  0%|          | 0/37 [00:00<?, ?it/s]

In [100]:
print("Running evaluation...")
evaluate_answers(
    'output/Evaluation_Result.json',
    Rag_outputs,
    eval_chat_model,
    evaluation_prompt_template,
)

Running evaluation...


  0%|          | 0/37 [00:00<?, ?it/s]

In [101]:
df_result = pd.read_json('output/Evaluation_Result.json')

In [107]:
df_result.drop(['retrieved_docs'],axis=1,inplace=True)

In [109]:
df_result.columns

Index(['question', 'true_answer', 'generated_answer', 'eval_score',
       'eval_feedback'],
      dtype='object')

In [119]:
print('final accuracy:',df_result['eval_score'].apply(lambda x:x/5).sum() / df_result.shape[0])

final accuracy: 0.6378378378378379


## Done