In [2]:
import logging

from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

In [3]:
loader = TextLoader("../../state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

In [4]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}

hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [5]:
vector_db = FAISS.from_documents(docs, hf_embeddings)
vector_db.save_local("faiss_index")

In [6]:
question = "What did the president say about economics?"
docs = vector_db.similarity_search(question)
docs[0].page_content

'We’re going after the criminals who stole billions in relief money meant for small businesses and millions of Americans.  \n\nAnd tonight, I’m announcing that the Justice Department will name a chief prosecutor for pandemic fraud. \n\nBy the end of this year, the deficit will be down to less than half what it was before I took office.  \n\nThe only president ever to cut the deficit by more than one trillion dollars in a single year. \n\nLowering your costs also means demanding more competition. \n\nI’m a capitalist, but capitalism without competition isn’t capitalism. \n\nIt’s exploitation—and it drives up prices. \n\nWhen corporations don’t have to compete, their profits go up, your prices go up, and small businesses and family farmers and ranchers go under. \n\nWe see it happening with ocean carriers moving goods in and out of America. \n\nDuring the pandemic, these foreign-owned companies raised prices by as much as 1,000% and made record profits.'

In [7]:
gpt35_azure_llm = AzureChatOpenAI(
    temperature=0,
    openai_api_key="02e3dbabaf334ccb959cbeadbd3f99c3",
    openai_api_base="https://llm-x-gpt.openai.azure.com/",
    deployment_name='LLM-X-GPT35-TURBO',
    openai_api_version="2023-03-15-preview"
)

In [32]:
from langchain.embeddings.openai import OpenAIEmbeddings

openai_azure_embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key="02e3dbabaf334ccb959cbeadbd3f99c3",
    openai_api_base="https://llm-x-gpt.openai.azure.com/",
    deployment_name='LLM-X-Embedding'
)

In [8]:
logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(logging.INFO)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=gpt35_azure_llm
)

In [9]:
unique_docs = retriever_from_llm.get_relevant_documents(query=question)
for doc in unique_docs:
    print(doc.page_content)

INFO:langchain.retrievers.multi_query:Generated queries: ["1. Can you provide any information on the president's statements regarding the field of economics?", "2. I'm interested in knowing the president's views and comments on the subject of economics. Could you share any relevant information?", '3. Could you please share any insights or remarks made by the president in relation to economics?']


We’re going after the criminals who stole billions in relief money meant for small businesses and millions of Americans.  

And tonight, I’m announcing that the Justice Department will name a chief prosecutor for pandemic fraud. 

By the end of this year, the deficit will be down to less than half what it was before I took office.  

The only president ever to cut the deficit by more than one trillion dollars in a single year. 

Lowering your costs also means demanding more competition. 

I’m a capitalist, but capitalism without competition isn’t capitalism. 

It’s exploitation—and it drives up prices. 

When corporations don’t have to compete, their profits go up, your prices go up, and small businesses and family farmers and ranchers go under. 

We see it happening with ocean carriers moving goods in and out of America. 

During the pandemic, these foreign-owned companies raised prices by as much as 1,000% and made record profits.
I have a better plan to fight inflation. 

Lower your

In [10]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=gpt35_azure_llm,
    retriever=vector_db.as_retriever())

question = "What did the president say about economics?"
qa_chain({"query": question})

{'query': 'What did the president say about economics?',
 'result': "The president discussed several economic topics in the given context. Here are some key points he made:\n\n1. The president announced the appointment of a chief prosecutor for pandemic fraud, indicating a focus on combating fraud related to relief money meant for small businesses and Americans.\n\n2. He highlighted his achievement of reducing the deficit by more than one trillion dollars in a single year, emphasizing his commitment to fiscal responsibility.\n\n3. The president emphasized the importance of competition in capitalism, stating that when corporations don't have to compete, it leads to exploitation, higher prices, and negative impacts on small businesses and farmers.\n\n4. He mentioned the issue of ocean carriers raising prices during the pandemic, leading to record profits and increased costs for goods.\n\n5. The president outlined his plan to fight inflation, which includes lowering costs, increasing dome

In [11]:
from langchain.chains import QAGenerationChain
from langchain import PromptTemplate

templ = """You are a smart assistant designed to help high school teachers come up with reading comprehension questions.
Given a piece of text, you must come up with a {k} different question and answer pairs that can be used to test a student's reading comprehension abilities.
When coming up with this question/answer pair, each pair must be respond in the following format:

{{
    "question": "$YOUR_QUESTION_HERE",
    "answer": "$THE_ANSWER_HERE"
}}

So in your final answer you should response with a list of {k} pairs in this format:

```
[{{
    "question": "$YOUR_QUESTION_HERE",
    "answer": "$THE_ANSWER_HERE"
}},
 {{
    "question": "$YOUR_QUESTION_HERE",
    "answer": "$THE_ANSWER_HERE"
}},
 {{
    "question": "$YOUR_QUESTION_HERE",
    "answer": "$THE_ANSWER_HERE"
    }}
]
```

Please come up with a list of {k} question/answer pairs, in the specified list of JSONS format, for the following text:
----------------
{text}
"""

multi_qa_prompt = PromptTemplate.from_template(template=templ, partial_variables={"k": 20})
qa_generation_chain = QAGenerationChain.from_llm(llm=gpt35_azure_llm, prompt=multi_qa_prompt)

In [12]:
questions_and_answers_GT = qa_generation_chain.run(docs[0].page_content)[0]
questions_and_answers_GT

[{'question': 'What is the purpose of the chief prosecutor for pandemic fraud?',
  'answer': 'To go after the criminals who stole relief money'},
 {'question': 'What will happen to the deficit by the end of this year?',
  'answer': 'It will be less than half of what it was before'},
 {'question': 'What is the significance of the president cutting the deficit by more than one trillion dollars?',
  'answer': 'No other president has achieved this'},
 {'question': "What does the speaker mean by 'capitalism without competition isn't capitalism'?",
  'answer': 'Competition is an essential aspect of capitalism'},
 {'question': 'What are the consequences of corporations not having to compete?',
  'answer': 'Profits go up, prices go up, and small businesses suffer'},
 {'question': 'What is an example of corporations not having to compete?',
  'answer': 'Ocean carriers raising prices during the pandemic'},
 {'question': 'What did foreign-owned companies do during the pandemic?',
  'answer': 'Rai

In [13]:
questions_and_answers_llm = []
for i, qa in enumerate(questions_and_answers_GT):
    question, answer = qa["question"], qa["answer"]
    llm_answer = qa_chain({"query": question})["result"]
    questions_and_answers_llm.append({"question": question, "result": llm_answer})
    print(f"QA number {i + 1} \n")
    print(f"Question: {question}\n")
    print(f"Answer: {answer}\n")
    print(f"LLM Answer: {llm_answer}\n")
    print("--------------------------------------------------\n\n")

QA number 1 

Question: What is the purpose of the chief prosecutor for pandemic fraud?

Answer: To go after the criminals who stole relief money

LLM Answer: The purpose of the chief prosecutor for pandemic fraud is to go after the criminals who stole billions in relief money meant for small businesses and millions of Americans. They will be responsible for investigating and prosecuting cases of fraud related to the misuse of funds intended for pandemic relief.

--------------------------------------------------


QA number 2 

Question: What will happen to the deficit by the end of this year?

Answer: It will be less than half of what it was before

LLM Answer: The given context does not provide specific information about the projected deficit by the end of this year. Therefore, I don't have the information to answer your question.

--------------------------------------------------


QA number 3 

Question: What is the significance of the president cutting the deficit by more than o

In [14]:
template = """You are a teacher grading a quiz.
You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either Correct or Incorrect.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: Correct or Incorrect here

Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. If the student answers that there is no specific information provided in the context, then the answer is Incorrect. Begin!

QUESTION: {query}
STUDENT ANSWER: {result}
TRUE ANSWER: {answer}
GRADE:

Your response should be as follows:

GRADE: (Correct or Incorrect)
(line break)
JUSTIFICATION: (Without mentioning the student/teacher framing of this prompt, explain why the STUDENT ANSWER is Correct or Incorrect. Use one or two sentences maximum. Keep the answer as concise as possible.)
"""

GRADE_ANSWER_PROMPT = PromptTemplate(input_variables=["query", "result", "answer"], template=template)

In [15]:
from langchain.evaluation import QAEvalChain

eval_prompt = GRADE_ANSWER_PROMPT

eval_chain = QAEvalChain.from_llm(
    llm=gpt35_azure_llm,
    prompt=eval_prompt
)

In [16]:
graded_outputs = eval_chain.evaluate(
    examples=questions_and_answers_GT,
    predictions=questions_and_answers_llm,
    question_key="question",
    prediction_key="result"
)

In [17]:
graded_outputs

[{'results': 'GRADE: Correct\n\nJUSTIFICATION: The student answer accurately states that the purpose of the chief prosecutor for pandemic fraud is to go after the criminals who stole relief money. The additional information provided by the student does not conflict with the true answer.'},
 {'results': 'GRADE: Incorrect\n\nJUSTIFICATION: The student answer is incorrect because the true answer does provide specific information about the projected deficit by the end of this year, stating that it will be less than half of what it was before.'},
 {'results': 'GRADE: Correct\n\nJUSTIFICATION: The student answer accurately explains the significance of the president cutting the deficit by more than one trillion dollars, including the positive effects and implications of such a reduction.'},
 {'results': 'GRADE: Correct\n\nJUSTIFICATION: The student answer accurately explains that the speaker believes capitalism requires competition and that without competition, capitalism can become exploitat

In [18]:
for qa_grade in graded_outputs:
    print(f"{qa_grade['results']}")
    print("--------------------------------------------------\n\n")

GRADE: Correct

JUSTIFICATION: The student answer accurately states that the purpose of the chief prosecutor for pandemic fraud is to go after the criminals who stole relief money. The additional information provided by the student does not conflict with the true answer.
--------------------------------------------------


GRADE: Incorrect

JUSTIFICATION: The student answer is incorrect because the true answer does provide specific information about the projected deficit by the end of this year, stating that it will be less than half of what it was before.
--------------------------------------------------


GRADE: Correct

JUSTIFICATION: The student answer accurately explains the significance of the president cutting the deficit by more than one trillion dollars, including the positive effects and implications of such a reduction.
--------------------------------------------------


GRADE: Correct

JUSTIFICATION: The student answer accurately explains that the speaker believes capital

In [33]:
from langchain.evaluation import load_evaluator, EvaluatorType
from langchain.evaluation import EmbeddingDistance

evaluator = load_evaluator(evaluator=EvaluatorType.PAIRWISE_STRING_DISTANCE,
                           distance_metric=EmbeddingDistance.EUCLIDEAN,
                           embeddings=openai_azure_embeddings,
                           llm=gpt35_azure_llm)

evaluator.evaluate_string_pairs(
    prediction="Seattle is very hot in June", prediction_b="Seattle is cool in June."
)

{'score': 0.12518518518518518}

In [29]:
evaluator = load_evaluator(evaluator=EvaluatorType.EMBEDDING_DISTANCE,
                           embeddings=HuggingFaceEmbeddings(),
                           distance_metric=EmbeddingDistance.COSINE,
                           llm=gpt35_azure_llm)

evaluator.evaluate_strings(prediction="I shall go", reference="I shan't go")

{'score': 0.548644889956817}