In [1]:
import logging
import os
from json import JSONDecodeError
from typing import List

import pandas as pd
from tqdm import tqdm
from langchain import PromptTemplate
from langchain.chains import QAGenerationChain, RetrievalQA
from langchain.chat_models import AzureChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.evaluation import EmbeddingDistance
from langchain.evaluation import load_evaluator, EvaluatorType
from langchain.evaluation.schema import StringEvaluator
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema import BaseRetriever
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language

from src.prompt_templates import MULTI_QA_GPT4_PROMPT_TEMPLATE, MULTI_QA_GPT35_PROMPT_TEMPLATE, GRADE_DOCS_PROMPT_TEMPLATE

INDEX_OF_FIRST_QNA_IN_RESPONSE = 0

In [3]:
root_dir = "../linux-kernel"

# Walk the source tree and load every ``.c`` file as a LangChain document.
docs = []
# Paths that could not be loaded, kept so that skipped files stay visible.
skipped_files = []
for dirpath, dirnames, filenames in os.walk(root_dir):
    # Prune virtual-environment directories in place so os.walk never descends
    # into them (this also excludes files sitting directly inside `.venv`).
    dirnames[:] = [dirname for dirname in dirnames if dirname != ".venv"]
    for file in filenames:
        if not file.endswith(".c"):
            continue
        file_path = os.path.join(dirpath, file)
        try:
            loader = TextLoader(file_path, encoding="utf-8")
            docs.extend(loader.load())
        except Exception as e:
            # Loading stays best-effort (one unreadable file must not abort the
            # walk), but the failure is reported instead of silently dropped.
            skipped_files.append(file_path)
            logging.getLogger(__name__).warning("Skipping %s: %s", file_path, e)
print(f"{len(docs)}")

9


In [4]:
# The separator list returned by this call is not stored anywhere.
RecursiveCharacterTextSplitter.get_separators_for_language(Language.CPP)

# Splitter configured for C/C++ sources: 250-character chunks, 50-character overlap.
chunks_cpp_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.CPP,
    chunk_overlap=50,
    chunk_size=250,
)
chunks = chunks_cpp_splitter.split_documents(documents=docs)
print(len(chunks))

4290


In [5]:
# Coarser C/C++ split of the same documents: 4000-character pieces, 150-character overlap.
documents_cpp_splitter = RecursiveCharacterTextSplitter.from_language(
    Language.CPP,
    chunk_overlap=150,
    chunk_size=4000,
)
splitted_docs = documents_cpp_splitter.split_documents(documents=docs)
print(len(splitted_docs))

258


In [6]:
# Local sentence-transformers embedding model; it is placed on the CUDA device
# and embeddings are left un-normalized.
hf_embeddings = HuggingFaceEmbeddings(
    encode_kwargs=dict(normalize_embeddings=False),
    model_kwargs=dict(device="cuda"),
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings

# The API key is read from the AZURE_OPENAI_API_KEY environment variable so that
# no secret is stored in the notebook; a missing variable fails fast with KeyError.
# The key that used to be hard-coded here should be treated as leaked and rotated.
# The endpoint may be overridden through AZURE_OPENAI_API_BASE.
openai_azure_embeddings = OpenAIEmbeddings(
    openai_api_type="azure",
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    openai_api_base=os.environ.get("AZURE_OPENAI_API_BASE", "https://llm-x-gpt.openai.azure.com/"),
    chunk_size=1
)

In [8]:
# Embed the small chunks into a FAISS index and write the index to a local folder.
hf_embeddings_vector_db = FAISS.from_documents(documents=chunks, embedding=hf_embeddings)
hf_embeddings_vector_db.save_local(folder_path="linux-kernel_embeddings")

In [10]:
# Sample question reused by the following cells, and the chunks the index returns for it.
question = "What is the purpose of the get_user_page in the linux kernel?"
retrieved_chunks = hf_embeddings_vector_db.similarity_search(query=question)

In [11]:
# The API key is read from the AZURE_OPENAI_API_KEY environment variable so that
# no secret is stored in the notebook; a missing variable fails fast with KeyError.
# The key that used to be hard-coded here should be treated as leaked and rotated.
# The endpoint may be overridden through AZURE_OPENAI_API_BASE.
gpt35_azure_llm = AzureChatOpenAI(
    temperature=0,
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
    openai_api_base=os.environ.get("AZURE_OPENAI_API_BASE", "https://llm-x-gpt.openai.azure.com/"),
    deployment_name='LLM-X-GPT35-TURBO',
    openai_api_version="2023-03-15-preview"
)

In [12]:
# The API key is read from the AZURE_OPENAI_GPT4_API_KEY environment variable so
# that no secret is stored in the notebook; a missing variable fails fast with
# KeyError. The key that used to be hard-coded here should be treated as leaked
# and rotated. The endpoint may be overridden through AZURE_OPENAI_GPT4_API_BASE.
gpt4_azure_llm = AzureChatOpenAI(
    temperature=0,
    openai_api_key=os.environ["AZURE_OPENAI_GPT4_API_KEY"],
    openai_api_base=os.environ.get("AZURE_OPENAI_GPT4_API_BASE",
                                   "https://llmx-gpt-canada-east.openai.azure.com/"),
    deployment_name='LLM-X-GPT-4',
    openai_api_version="2023-03-15-preview"
)

In [13]:
# Retrieval-augmented QA chain: the GPT-3.5 deployment answers using documents
# fetched from the FAISS index.
qa_rag_chain = RetrievalQA.from_chain_type(
    retriever=hf_embeddings_vector_db.as_retriever(),
    llm=gpt35_azure_llm,
)

# Run the chain on the sample question; as the last expression of the cell the
# returned dict is displayed.
qa_rag_chain(dict(query=question))

{'query': 'What is the purpose of the get_user_page in the linux kernel?',
 'result': "The purpose of the `get_user_pages` function in the Linux kernel is to walk a process's page tables and obtain a reference to each `struct page` that corresponds to a user address at a given moment. This function is typically used by the kernel's futex code and is used to access user pages directly. It ensures that the pages are accessible and can be used for various operations, such as IO operations or handling faults."}

In [14]:
# Raise the multi-query retriever's logger to INFO (the captured output below
# shows the generated query variants being logged).
logging.basicConfig()
logging.getLogger('langchain.retrievers.multi_query').setLevel(level=logging.INFO)

# Retriever that wraps the FAISS retriever and uses the GPT-3.5 deployment.
retriever_from_llm = MultiQueryRetriever.from_llm(
    llm=gpt35_azure_llm,
    retriever=hf_embeddings_vector_db.as_retriever(),
)

unique_docs = retriever_from_llm.get_relevant_documents(question)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. What is the function of the get_user_page in the linux kernel?', '2. How does the get_user_page function serve its purpose in the linux kernel?', '3. Can you explain the role and significance of the get_user_page in the linux kernel?']


In [15]:
def create_qna_GT_df(docs: List[Document], num_of_qna_for_doc: int) -> pd.DataFrame:
    """Generate a ground-truth question/answer table from source documents.

    The documents are split with a C/C++-aware splitter (4000-character pieces,
    150-character overlap) and each piece is sent to a ``QAGenerationChain``
    backed by the module-level ``gpt4_azure_llm``. Only the entry at
    ``INDEX_OF_FIRST_QNA_IN_RESPONSE`` of each chain response is kept.

    Args:
        docs: Documents to generate questions and answers from.
        num_of_qna_for_doc: Value bound to the prompt variable ``k``
            (presumably the number of QnA pairs requested per piece - confirm
            against ``MULTI_QA_GPT4_PROMPT_TEMPLATE``).

    Returns:
        A DataFrame built from all collected QnA records. Pieces whose response
        is not valid JSON, or whose response is empty, contribute no rows.
    """
    documents_cpp_splitter = RecursiveCharacterTextSplitter.from_language(
        language=Language.CPP, chunk_size=4000, chunk_overlap=150
    )
    splitted_docs = documents_cpp_splitter.split_documents(docs)

    multi_qa_prompt = PromptTemplate.from_template(template=MULTI_QA_GPT4_PROMPT_TEMPLATE,
                                                   partial_variables={"k": num_of_qna_for_doc})
    qa_generation_chain = QAGenerationChain.from_llm(llm=gpt4_azure_llm,
                                                     prompt=multi_qa_prompt,
                                                     text_splitter=documents_cpp_splitter)

    qna_GT = []
    for splitted_doc in tqdm(splitted_docs):
        try:
            responses = qa_generation_chain.run(splitted_doc.page_content)
        except JSONDecodeError:
            print("Failed to generate valid QnA JSON for doc")
            continue

        # An empty response would make the index lookup below raise IndexError
        # and abort the whole (long-running) generation loop.
        if not responses:
            continue

        qna = responses[INDEX_OF_FIRST_QNA_IN_RESPONSE]
        # A single QnA object instead of a list of them would otherwise be
        # extended key-by-key, adding bare strings instead of one record.
        if isinstance(qna, dict):
            qna = [qna]
        qna_GT.extend(qna)

    return pd.DataFrame(qna_GT)

In [42]:
def get_qna_with_chain_answers_df(qa_rag_chain: RetrievalQA, qna_GT_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``qna_GT_df`` with the chain's answers added.

    For every row, ``qa_rag_chain`` is called with ``{"query": <question>}`` and
    the ``"result"`` entry of its output is stored in a new ``chain_answer``
    column. The input frame is not modified.
    """
    def _answer_row(row):
        chain_output = qa_rag_chain({"query": row["question"]})
        return chain_output["result"]

    answered = qna_GT_df.copy()
    answered["chain_answer"] = answered.apply(_answer_row, axis=1)
    return answered

In [43]:
def get_evaluator_score(evaluator: StringEvaluator, qna_with_chain_answer: pd.Series) -> float:
    """Score one QnA row with ``evaluator`` and return the ``"score"`` entry.

    The row's ``chain_answer`` is passed as the prediction, ``answer`` as the
    reference and ``question`` as the input of ``evaluate_strings``.
    """
    evaluation_inputs = {
        "prediction": qna_with_chain_answer["chain_answer"],
        "reference": qna_with_chain_answer["answer"],
        "input": qna_with_chain_answer["question"],
    }
    return evaluator.evaluate_strings(**evaluation_inputs)["score"]

In [45]:
def get_retrieval_score(retriever, qna_with_chain_answer: pd.Series):
    """Grade the documents retrieved for one question against the true answer.

    A QA evaluator is built from ``GRADE_DOCS_PROMPT_TEMPLATE`` and the
    module-level ``gpt35_azure_llm``. The text of the retrieved documents is
    used as the prediction, the row's ``answer`` as the reference and the
    row's ``question`` as the input.

    Args:
        retriever: Object exposing ``get_relevant_documents``.
        qna_with_chain_answer: Row providing the ``question`` and ``answer`` fields.

    Returns:
        The ``"score"`` entry of the evaluator's result.
    """
    GRADE_DOCS_PROMPT = PromptTemplate(input_variables=['result', 'answer', 'query'],
                                       template=GRADE_DOCS_PROMPT_TEMPLATE)
    retrieval_eval_chain = load_evaluator(
        evaluator=EvaluatorType.QA,
        llm=gpt35_azure_llm,
        prompt=GRADE_DOCS_PROMPT
    )

    # NOTE(review): search_type / search_kwargs are normally configured when the
    # retriever is created; whether they have any effect when passed at call
    # time depends on the retriever implementation - confirm.
    retrieved_docs = retriever.get_relevant_documents(query=qna_with_chain_answer["question"],
                                                      search_type="similarity_score_threshold",
                                                      search_kwargs={"k": 2})

    # evaluate_strings takes a string prediction. Passing the Document list
    # itself would put its repr (metadata included) into the grading prompt,
    # so only the documents' text is handed over.
    retrieved_text = "\n\n".join(doc.page_content for doc in retrieved_docs)

    grade = retrieval_eval_chain.evaluate_strings(
        prediction=retrieved_text,
        reference=qna_with_chain_answer["answer"],
        input=qna_with_chain_answer["question"])

    return grade["score"]

In [19]:
def _fix_embedding_distance_evaluator_score(score: float) -> float:
    return round(1 - score, 3)

In [46]:
def get_grades_for_chain_qna(qna_with_chain_answers_df: pd.DataFrame,
                             retriever: BaseRetriever = None) -> pd.DataFrame:
    """Return a copy of the answered QnA frame with one grade column per evaluator.

    Added columns:
        labeled_criteria_grades: score of the labeled-criteria ("correctness") evaluator.
        qa_llm_jugde_grades: score of the QA evaluator.
        embedding_distance_grades: cosine embedding-distance score passed through
            ``_fix_embedding_distance_evaluator_score``.
        retrieval_score: ``get_retrieval_score`` per row; only added when
            ``retriever`` is not None.

    The evaluators use the module-level ``gpt35_azure_llm`` and ``hf_embeddings``.
    """
    correctness_evaluator = load_evaluator(evaluator=EvaluatorType.LABELED_CRITERIA,
                                           criteria="correctness",
                                           llm=gpt35_azure_llm)
    cosine_distance_evaluator = load_evaluator(evaluator=EvaluatorType.EMBEDDING_DISTANCE,
                                               distance_metric=EmbeddingDistance.COSINE,
                                               embeddings=hf_embeddings,
                                               llm=gpt35_azure_llm)
    llm_judge_evaluator = load_evaluator(evaluator=EvaluatorType.QA,
                                         llm=gpt35_azure_llm)

    def _grade_rows(evaluator, postprocess=None):
        # Score every row with `evaluator`, optionally transforming each score.
        def _grade_row(row):
            score = get_evaluator_score(evaluator=evaluator, qna_with_chain_answer=row)
            if postprocess is None:
                return score
            return postprocess(score)

        return qna_with_chain_answers_df.apply(_grade_row, axis=1)

    graded = qna_with_chain_answers_df.copy()
    graded["labeled_criteria_grades"] = _grade_rows(correctness_evaluator)
    graded["qa_llm_jugde_grades"] = _grade_rows(llm_judge_evaluator)
    graded["embedding_distance_grades"] = _grade_rows(
        cosine_distance_evaluator,
        postprocess=_fix_embedding_distance_evaluator_score)

    if retriever is None:
        return graded

    def _retrieval_row(row):
        return get_retrieval_score(retriever=retriever, qna_with_chain_answer=row)

    graded["retrieval_score"] = qna_with_chain_answers_df.apply(_retrieval_row, axis=1)
    return graded

In [47]:
def run_evaluation(qa_rag_chain: RetrievalQA, qna_GT_df: pd.DataFrame) -> pd.DataFrame:
    """Answer every ground-truth question with the chain, then grade the answers.

    The chain's own ``retriever`` attribute is forwarded to the grading step.
    """
    answered_df = get_qna_with_chain_answers_df(qa_rag_chain, qna_GT_df)
    return get_grades_for_chain_qna(
        qna_with_chain_answers_df=answered_df,
        retriever=qa_rag_chain.retriever,
    )

In [50]:
# NOTE(review): `qna_GT_df` is assigned by the `create_qna_GT_df(...)` cell that
# appears further down in this notebook, so that cell has to be executed before
# this one.
run_evaluation(qa_rag_chain=qa_rag_chain, qna_GT_df=qna_GT_df)



Unnamed: 0,question,answer,chain_answer,labeled_criteria_grades,embedding_distance_grades,qa_llm_jugde_grades,retrieval_score
0,What is the purpose of the 'for' loop in the g...,The 'for' loop is used to calculate the factor...,The purpose of the 'for' loop in the given cod...,,0.3103038,1,
1,What does the variable 'fact' represent in the...,The variable 'fact' stores the factorial of th...,"In the given code, the variable 'fact' represe...",1.0,0.1048536,1,1.0
2,What is the purpose of the given code?,The purpose of the given code is to calculate ...,The purpose of the given code is to input the ...,,0.6754299,0,1.0
3,What is the return type of the 'fact' function?,The return type of the 'fact' function is 'int'.,The return type of the 'fact' function is 'int'.,1.0,-2.220446e-16,1,1.0
4,What is the purpose of the code snippet?,To calculate and display the factorial of a nu...,The purpose of the code snippet is to prompt t...,,0.7435595,0,
5,What is the value of 'fact' after the code sni...,The factorial of the number entered by the user,The value of 'fact' cannot be determined witho...,1.0,0.5335894,1,1.0


In [16]:
# Build the ground-truth QnA table from the loaded documents and write it to CSV.
# The captured progress output below shows this run taking roughly half an hour.
qna_GT_df = create_qna_GT_df(num_of_qna_for_doc=3, docs=docs)
qna_GT_df.to_csv(path_or_buf="qna_GT_df.csv")

  9%|▉         | 24/258 [03:34<41:36, 10.67s/it]

Failed to generate valid QnA JSON for doc


 20%|█▉        | 51/258 [07:01<23:50,  6.91s/it]

Failed to generate valid QnA JSON for doc


 35%|███▌      | 91/258 [11:58<24:41,  8.87s/it]

Failed to generate valid QnA JSON for doc


 46%|████▌     | 118/258 [15:14<17:30,  7.50s/it]

Failed to generate valid QnA JSON for doc


 52%|█████▏    | 135/258 [17:29<18:51,  9.20s/it]

Failed to generate valid QnA JSON for doc


 66%|██████▌   | 170/258 [21:48<13:28,  9.19s/it]

Failed to generate valid QnA JSON for doc


 74%|███████▎  | 190/258 [24:36<08:56,  7.88s/it]

Failed to generate valid QnA JSON for doc


 85%|████████▌ | 220/258 [28:38<05:00,  7.90s/it]

Failed to generate valid QnA JSON for doc


 91%|█████████ | 235/258 [30:45<03:22,  8.80s/it]

Failed to generate valid QnA JSON for doc


 91%|█████████▏| 236/258 [30:52<03:02,  8.29s/it]

Failed to generate valid QnA JSON for doc


 92%|█████████▏| 238/258 [31:13<03:03,  9.20s/it]

Failed to generate valid QnA JSON for doc


100%|██████████| 258/258 [33:36<00:00,  7.82s/it]


In [4]:
from langchain.evaluation import QAEvalChain

# Grading prompt: the model is asked to mark a student answer CORRECT or
# INCORRECT given the question ({query}), the student's answer ({result}) and
# the true answer ({answer}).
template = """You are a teacher grading a quiz on the Linux kernel.
You are given a question, the student's answer, and the true answer, and are asked to score the student answer as either CORRECT or INCORRECT.

Example Format:
QUESTION: question here
STUDENT ANSWER: student's answer here
TRUE ANSWER: true answer here
GRADE: CORRECT or INCORRECT here

Grade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin!

QUESTION: {query}
STUDENT ANSWER: {result}
TRUE ANSWER: {answer}
GRADE:"""
PROMPT = PromptTemplate(
    template=template,
    input_variables=["query", "result", "answer"],
)

NameError: name 'QAEvalChain' is not defined