## Question Answering
https://python.langchain.com/en/latest/use_cases/evaluation/question_answering.html

In [2]:
%pip install langchain llama_index
%pip install --upgrade llama_index langchain

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import openai

openai.api_key = os.environ['API_KEY']
os.environ["OPENAI_API_KEY"] = os.environ['API_KEY']


In [4]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI


In [25]:
# from langchain import PromptTemplate, OpenAI, LLMChain

# template = """質問: {question}

# 回答: 段階的に考えてください。"""
# prompt = PromptTemplate(template=template, input_variables=["question"])
# llm_chain = LLMChain(prompt=prompt, llm=OpenAI(temperature=0), verbose=True)

# question = "関ヶ原の戦いで勝ったのは?"

# llm_chain.predict(question=question)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m質問: 関ヶ原の戦いで勝ったのは?

回答: 段階的に考えてください。[0m

[1m> Finished chain.[0m


'関ヶ原の戦いは、1575年に行われた戦いです。この戦いで勝利したのは、徳川家康率いる徳川軍です。'

In [5]:
prompt = PromptTemplate(
    template="Question: {question}\nAnswer:", input_variables=["question"])

In [6]:
llm = OpenAI(model_name="text-davinci-003", temperature=0)
chain = LLMChain(llm=llm, prompt=prompt)


In [7]:
examples = [
    {
        "question": "Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?",
        "answer": "11"
    },
    {
        "question": 'Is the following sentence plausible? "Joao Moutinho caught the screen pass in the NFC championship."',
        "answer": "No"
    }
]

In [8]:
predictions = chain.apply(examples)


In [9]:
predictions


[{'text': ' 11 tennis balls'},
 {'text': ' No, this sentence is not plausible. Joao Moutinho is a professional soccer player, not an American football player, so it is not likely that he would be catching a screen pass in the NFC championship.'}]

In [10]:
from langchain.evaluation.qa import QAEvalChain


In [11]:
llm = OpenAI(temperature=0)
eval_chain = QAEvalChain.from_llm(llm)
graded_outputs = eval_chain.evaluate(examples, predictions, question_key="question", prediction_key="text")


In [12]:
for i, eg in enumerate(examples):
    print(f"Example {i}:")
    print("Question: " + eg['question'])
    print("Real Answer: " + eg['answer'])
    print("Predicted Answer: " + predictions[i]['text'])
    print("Predicted Grade: " + graded_outputs[i]['text'])
    print()

Example 0:
Question: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
Real Answer: 11
Predicted Answer:  11 tennis balls
Predicted Grade:  CORRECT

Example 1:
Question: Is the following sentence plausible? "Joao Moutinho caught the screen pass in the NFC championship."
Real Answer: No
Predicted Answer:  No, this sentence is not plausible. Joao Moutinho is a professional soccer player, not an American football player, so it is not likely that he would be catching a screen pass in the NFC championship.
Predicted Grade:  CORRECT



In [13]:
from langchain.prompts.prompt import PromptTemplate

_PROMPT_TEMPLATE = """You are an expert professor specialized in grading students' answers to questions.
You are grading the following question:
{query}
Here is the real answer:
{answer}
You are grading the following predicted answer:
{result}
What grade do you give from 0 to 10, where 0 is the lowest (very low similarity) and 10 is the highest (very high similarity)?
"""

PROMPT = PromptTemplate(input_variables=["query", "answer", "result"], template=_PROMPT_TEMPLATE)

In [14]:
evalchain = QAEvalChain.from_llm(llm=llm,prompt=PROMPT)
evalchain.evaluate(examples, predictions, question_key="question", answer_key="answer", prediction_key="text")

[{'text': '\n10'},
 {'text': "\nI would give this answer a 9 out of 10. The student correctly identified that Joao Moutinho is a soccer player and not an American football player, and they provided a reasonable explanation for why the sentence is not plausible. The only thing missing is a reference to the NFC championship specifically, but the student's answer is still very accurate and well-reasoned."}]

In [15]:
context_examples = [
    {
        "question": "How old am I?",
        "context": "I am 30 years old. I live in New York and take the train to work everyday.",
    },
    {
        "question": 'Who won the NFC championship game in 2023?"',
        "context": "NFC Championship Game 2023: Philadelphia Eagles 31, San Francisco 49ers 7"
    }
]
QA_PROMPT = "Answer the question based on the  context\nContext:{context}\nQuestion:{question}\nAnswer:"
template = PromptTemplate(input_variables=["context", "question"], template=QA_PROMPT)
qa_chain = LLMChain(llm=llm, prompt=template)
predictions = qa_chain.apply(context_examples)

In [16]:
predictions

[{'text': 'You are 30 years old.'},
 {'text': ' The Philadelphia Eagles won the NFC championship game in 2023.'}]

In [17]:
from langchain.evaluation.qa import ContextQAEvalChain
eval_chain = ContextQAEvalChain.from_llm(llm)
graded_outputs = eval_chain.evaluate(context_examples, predictions, question_key="question", prediction_key="text")

In [18]:
graded_outputs


[{'text': ' CORRECT'}, {'text': ' CORRECT'}]

In [19]:
# Some data munging to get the examples in the right format
for i, eg in enumerate(examples):
    eg['id'] = str(i)
    eg['answers'] = {"text": [eg['answer']], "answer_start": [0]}
    predictions[i]['id'] = str(i)
    predictions[i]['prediction_text'] = predictions[i]['text']

for p in predictions:
    del p['text']

new_examples = examples.copy()
for eg in new_examples:
    del eg ['question']
    del eg['answer']

In [20]:
%pip install scipy==1.7.1
%pip install -qqq evaluate


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [22]:
from evaluate import load

squad_metric = load("squad")
results = squad_metric.compute(
    references=new_examples,
    predictions=predictions,
)

In [23]:
results


{'exact_match': 0.0, 'f1': 0.0}