In [None]:
from langsmith import Client

import evaluation

from langsmith.evaluation import evaluate
from langchain import hub
from typing import Dict, Any
from langchain_mistralai import ChatMistralAI
from langchain import hub
import os

In [None]:
# Read the Mistral API key from the environment so the secret is never stored in
# the notebook itself; fail fast with an actionable message when it is missing.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
if not MISTRAL_API_KEY:
    raise RuntimeError(
        "MISTRAL_API_KEY is not set. Export it in your environment "
        "before running this notebook."
    )

In [None]:
# Instantiate the project's MovieRAGWithDeepSearch on the TMDB movie CSV.
# The resulting `rag` object is used by the cells below (rag.rag_function, rag.llm).
# NOTE(review): the CSV path is relative — confirm the kernel's working directory
# is the one that contains `data/`.
from my_rag import MovieRAGWithDeepSearch

rag = MovieRAGWithDeepSearch(csv_path="data/TMDB_all_movies.csv")

In [None]:
# Dataset configuration: the JSON file holding the test queries and the name
# the dataset is referred to by in the evaluation run further down.
PATH_TO_DATASET = "test_queries_mistral.json"
dataset_single_name = "RAG Movie Expert Test Dataset"

# Hand both values to the project's `create_langsmith_dataset` helper.
evaluation.create_langsmith_dataset(
    dataset_name=dataset_single_name,
    json_file=PATH_TO_DATASET,
)

Answer vs Reference Test

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser

# Dedicated grader model: "mistral-large-latest" at temperature 0 with up to
# two retries, authenticated with MISTRAL_API_KEY.
llm_grader = ChatMistralAI(
    model="mistral-large-latest",
    temperature=0,
    max_retries=2,
    api_key=MISTRAL_API_KEY,
)

def rag_predictor(example: Dict[str, Any]) -> Dict[str, str]:
    """Run the RAG pipeline on a single dataset example.

    Args:
        example: Mapping that carries the user question under ``"query"``.

    Returns:
        A dict with one key, ``"answer"``, copied from the pipeline's response.
    """
    user_query = example["query"]
    rag_output = rag.rag_function({"query": user_query})
    return {"answer": rag_output["answer"]}

# Prompt for the LLM judge. The system message defines the grading rubric and the
# required JSON output; the human message supplies the three template variables
# ({question}, {correct_answer}, {student_answer}) filled in by the evaluator.
# The output instruction allows any number in [0, 1] so that it agrees with the
# partial-credit rule (previously it demanded an integer 0 or 1, contradicting it).
manual_grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", """You are a grader assessing the accuracy of a student's answer given a question and a reference answer.

        You will be given:
        1. The Question
        2. The Reference Answer (Ground Truth)
        3. The Student's Answer (Prediction)

        Compare the Student's Answer to the Reference Answer.
        If the student's answer conveys the same meaning as the reference answer, score it as 1.
        If it names different films that are similar to the reference ones, give a score between 0 and 1 depending on how similar the films are.
        If it is incorrect or unrelated, score it as 0.

        Return a JSON object with a single key 'Score' whose value is a number between 0 and 1 (inclusive).
        """),
        ("human", """Question: {question}
        Reference Answer: {correct_answer}
        Student Answer: {student_answer}
        """)
    ]
)

def answer_evaluator(run, example) -> dict:
    """Score one run's answer against the dataset reference with an LLM judge.

    The judge here is the RAG pipeline's own model (``rag.llm``). Its reply is
    parsed as JSON and the value stored under ``"Score"`` is reported, falling
    back to 0 when that key is absent.

    Args:
        run: Evaluated run; the prediction is read from ``run.outputs["answer"]``.
        example: Dataset example providing ``inputs["query"]`` and
            ``outputs["reference_answer"]``.

    Returns:
        A dict with ``"key"`` set to ``"answer_v_reference_score"`` and the
        extracted ``"score"``.
    """
    grading_inputs = {
        "question": example.inputs["query"],
        "correct_answer": example.outputs["reference_answer"],
        "student_answer": run.outputs["answer"],
    }

    # prompt -> judge model -> JSON parser
    grading_chain = manual_grade_prompt | rag.llm | JsonOutputParser()
    verdict = grading_chain.invoke(grading_inputs)

    return {
        "key": "answer_v_reference_score",
        "score": verdict.get("Score", 0),
    }

Comparison with the results of an LLM without RAG

In [None]:
def answer_evaluator(run, example) -> dict:
    """Score one run's answer against the dataset reference with the grader LLM.

    Builds the chain ``manual_grade_prompt | llm_grader | JsonOutputParser()``,
    invokes it with the question, the reference answer and the prediction, and
    reports the value found under ``"Score"`` in the parsed reply.

    Args:
        run: Evaluated run; the prediction is read from ``run.outputs["answer"]``.
        example: Dataset example providing ``inputs["query"]`` and
            ``outputs["reference_answer"]``.

    Returns:
        A dict with ``"key"`` set to ``"answer_v_reference_score"`` and
        ``"score"`` set to the grader's score, or 0 when the reply is not a
        JSON object or has no ``"Score"`` key.
    """
    grading_inputs = {
        "question": example.inputs["query"],
        "correct_answer": example.outputs["reference_answer"],
        "student_answer": run.outputs["answer"],
    }

    # Use the module-level ``llm_grader`` directly. Re-binding it inside the
    # function (``llm_grader = llm_grader``) makes the name local and raises
    # UnboundLocalError before the grader is ever called.
    answer_grader = manual_grade_prompt | llm_grader | JsonOutputParser()
    score_result = answer_grader.invoke(grading_inputs)

    # The parser can yield any JSON value; only an object can carry 'Score'.
    score = score_result.get("Score", 0) if isinstance(score_result, dict) else 0

    return {"key": "answer_v_reference_score", "score": score}



# Baseline "student": open-mistral-7b at temperature 0, answering the bare
# question with no retrieval step in the chain.
student_llm = ChatMistralAI(model="open-mistral-7b", temperature=0, api_key=MISTRAL_API_KEY)

# Chain: fill the question into the prompt, call the model, return plain text.
student_prompt = ChatPromptTemplate.from_template("Answer the question: {question}")
student_chain = student_prompt | student_llm | StrOutputParser()


def target_function(inputs: dict) -> dict:
    """Answer one dataset query with the student chain.

    Args:
        inputs: Example inputs; the question is read from ``inputs["query"]``.

    Returns:
        A dict holding the chain's reply under ``"answer"``.
    """
    chain_payload = {"question": inputs["query"]}
    return {"answer": student_chain.invoke(chain_payload)}


# Run the LangSmith evaluation for the no-RAG baseline: `target_function` is the
# system under test, the dataset is looked up by name, `answer_evaluator` is the
# only evaluator, and the experiment name is prefixed with
# "mistral-no-rag-evaluation".
experiment_results = evaluate(
    target_function,
    data=dataset_single_name,
    evaluators=[answer_evaluator],
    experiment_prefix="mistral-no-rag-evaluation",
)