In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load it; the next line expects it (or the shell) to have
# provided OPENAI_API_KEY.
_ = load_dotenv(find_dotenv())
# Indexing os.environ raises KeyError right here if OPENAI_API_KEY is not set.
# NOTE(review): openai_api_key is not passed to any client in the visible cells;
# presumably the LangChain clients read the environment variable themselves (confirm).
openai_api_key = os.environ["OPENAI_API_KEY"]

## Basic App for Evaluation

In [2]:
from langchain import OpenAI

In [3]:
# temperature=0 asks for the least random completions, so the answers and the
# CORRECT/INCORRECT grades produced later by the evaluation chain are as repeatable
# as the API allows (the wrapper's default temperature is non-zero).
llm = OpenAI(temperature=0)

**Load document**

In [4]:
from langchain.document_loaders import TextLoader

In [5]:
# Load the text file; `document` is a sequence whose first entry holds the file's text.
# The encoding is pinned because TextLoader otherwise falls back to the platform
# default, which is not UTF-8 everywhere (e.g. Windows).
# NOTE(review): assumes the file is UTF-8 encoded; confirm.
loader = TextLoader("data/be-good-and-how-not-to-die.txt", encoding="utf-8")
document = loader.load()

In [6]:
# Sanity check: size of the loaded text. `document` is a sequence; the first
# entry's page_content holds the file's text.
print(f"The document has {len(document[0].page_content)} characters")

The document has 27419 characters


**Split the document into smaller chunks**

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
# Splitter settings: target chunk size and the overlap shared by neighbouring chunks
# (presumably measured in characters, the splitter's default length measure; confirm).
splitter_settings = {"chunk_size": 3000, "chunk_overlap": 400}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [9]:
# Split the loaded document into chunks using the splitter configured above.
document_chunks = text_splitter.split_documents(document)

In [10]:
# Sanity check: how many chunks the splitter produced.
print(f"Now you have {len(document_chunks)} chunks")

Now you have 12 chunks


In [11]:
# Confirm the splitter returned a plain Python list.
print(type(document_chunks))

<class 'list'>


**Convert the text chunks into numeric embeddings and load them into the vector database**

In [17]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [18]:
# Embedding client used to turn the text chunks into vectors; created with
# default settings (no arguments are passed).
embeddings = OpenAIEmbeddings()

In [19]:
from langchain.vectorstores import FAISS

In [20]:
# Build a FAISS vector store from the chunks, using `embeddings` to vectorise them.
stored_embeddings = FAISS.from_documents(document_chunks, embeddings)

**Create a Retrieval Question & Answering Chain**

In [21]:
from langchain.chains import RetrievalQA

In [22]:
# Expose the vector store as a retriever, then wire it together with the LLM.
# input_key="question" matches the 'question' key of the dicts passed to the chain
# in the evaluation cells.
retriever = stored_embeddings.as_retriever()
QA_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, input_key="question"
)

Notice that we have added `input_key` to the QA_chain configuration. This tells the chain which key of the input dictionary holds the user's question.

**We are going to evaluate this app with 2 questions and answers we already know (these answers are technically known as "ground truth answers")**

In [27]:
# Evaluation set as (question, known answer) pairs; the strings are kept verbatim.
_qa_pairs = [
    (
        "Where is a whole neighborhood of YC-funded startups?",
        "In San Francisco",
    ),
    (
        "What may be the most valuable  thing Paul Buchheit made for Google?",
        "The motto Don't be evil",
    ),
]
# Each entry becomes a dict with the 'question' and 'answer' keys that the QA chain
# (input_key="question") and the evaluation step (answer_key="answer") look up.
questions_and_answers = [
    {"question": question, "answer": answer} for question, answer in _qa_pairs
]

In [28]:
# Run the chain on every dict in the list; the 'question' entry is the prompt
# (input_key="question"). The 'answer' key is carried through to the output
# next to the generated 'result'.
predictions = QA_chain.apply(questions_and_answers)

In [29]:
# Display the chain output: each input dict comes back with an added 'result' entry.
predictions

[{'question': 'Where is a whole neighborhood of YC-funded startups?',
  'answer': 'In San Francisco',
  'result': ' A whole neighborhood of YC-funded startups is in San Francisco.'},
 {'question': 'What may be the most valuable  thing Paul Buchheit made for Google?',
  'answer': "The motto Don't be evil",
  'result': ' Paul Buchheit is credited with creating the phrase "Don\'t be evil," which serves as the motto for Google. This phrase may be the most valuable thing Buchheit made for Google, as it serves as a reminder for the company to stay true to its mission.'}]

**The evaluation of this App has been positive, since the App has answered both evaluation questions correctly.**

**But instead of confirming that manually ourselves, we can ask the LLM to check whether the responses match the "ground truth answers".**

In [30]:
from langchain.evaluation.qa import QAEvalChain

In [31]:
# Grader chain: the same `llm` that answered the questions is reused to judge the answers.
evaluation_chain = QAEvalChain.from_llm(llm)

In [34]:
# Grade each generated answer against its ground-truth answer.
# `examples` supplies the 'question'/'answer' pairs; `predictions` carries the
# chain output, whose generated text sits under the 'result' key.
evaluate_responses = evaluation_chain.evaluate(
    examples=questions_and_answers,
    predictions=predictions,
    question_key="question",
    answer_key="answer",
)

In [35]:
# Display the grades: one dict per evaluated question, with the verdict under 'results'.
evaluate_responses

[{'results': ' CORRECT'}, {'results': ' CORRECT'}]