In [31]:
!pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading rapidfuzz-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.1 MB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m2.6/3.1 MB[0m [31m38.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz
Successfully installed rapidfuzz-3.10.1


First, visit the [LangSmith](https://smith.langchain.com/) web page, register, and create an API key. The free tier is enough.

In [None]:
import os

# LangSmith picks up its configuration from environment variables, so plain
# Python variables alone do not enable tracing. Keep the variables for
# readability and export them to the process environment as well.
LANGCHAIN_TRACING_V2 = True
LANGCHAIN_ENDPOINT = "https://api.smith.langchain.com"
LANGCHAIN_PROJECT = "test"
# Prefer a key already present in the environment over the placeholder so the
# real secret does not have to be written into the notebook.
LANGCHAIN_API_KEY = os.environ.get("LANGCHAIN_API_KEY", "YOUR_API_KEY")

os.environ["LANGCHAIN_TRACING_V2"] = "true" if LANGCHAIN_TRACING_V2 else "false"
os.environ["LANGCHAIN_ENDPOINT"] = LANGCHAIN_ENDPOINT
os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY

## RAG with LangSmith

Let's use the vectorstore we created in Chapter 6:

In [None]:
# Google Cloud settings used by the cells below; replace the placeholders
# with your own values.
location = "us-central1"  # region
bucket_name = "YOUR_BUCKET_NAME_FOR_VECTORSTORE"  # GCS bucket of the vector store
project = "YOUR_PROJECT"  # project id

First, let's get the corresponding Vertex Vector Search index (feel free to change the names):

In [2]:
# NOTE(review): this cell re-assigns project / location / bucket_name with
# concrete sandbox values; use your own project and bucket here.
location = "us-central1"
bucket_name = "kuligin-sandbox1"
project = "kuligin-sandbox1"

In [3]:
from google.cloud import aiplatform

# Names of the resources to look up (feel free to change them).
ENDPOINT_DISPLAY_NAME = "multimodal_example_endpoint"
DEPLOYED_INDEX_ID = "multimodal_example_lc"

# Pick the endpoint by display name. Unlike a `for ... break` loop, this fails
# loudly when nothing matches instead of silently keeping the last endpoint
# listed (or raising NameError when the list is empty).
index_endpoint = next(
  (
    endpoint
    for endpoint in aiplatform.MatchingEngineIndexEndpoint.list()
    if endpoint.display_name == ENDPOINT_DISPLAY_NAME
  ),
  None,
)
if index_endpoint is None:
  raise ValueError(f"No index endpoint named {ENDPOINT_DISPLAY_NAME!r} was found.")

print(index_endpoint.display_name)
index_endpoint_name = index_endpoint.name

# Same lookup for the deployed index on that endpoint, matched by its id.
index = next(
  (
    deployed
    for deployed in index_endpoint.deployed_indexes
    if deployed.id == DEPLOYED_INDEX_ID
  ),
  None,
)
if index is None:
  raise ValueError(
    f"No deployed index with id {DEPLOYED_INDEX_ID!r} on endpoint "
    f"{ENDPOINT_DISPLAY_NAME!r}."
  )

print(index.id)

index_name = index.index

multimodal_example_endpoint
multimodal_example_lc


Now let's create a RAG chain, run it, and visit the LangSmith UI to explore the collected traces:

In [4]:
from langchain_google_vertexai import VectorSearchVectorStore
from langchain_google_vertexai import VertexAIEmbeddings, ChatVertexAI
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser


# Prompt with two slots: the user question and the retrieved context.
prompt = PromptTemplate(
    template="Answer the question\n{question}\ngiven the following context:\n{context}\n.",
    input_variables=["question", "context"],
)

# Vector store built from the endpoint and index resolved in the earlier cell.
vectorstore = VectorSearchVectorStore.from_components(
    embedding=VertexAIEmbeddings(model_name="textembedding-gecko@003"),
    endpoint_id=index_endpoint.name,
    index_id=index_name,
    gcs_bucket_name=bucket_name,
    region=location,
    project_id=project,
    stream_update=True,
)

# Ask the retriever for 10 documents per query (k=10).
retriever = vectorstore.as_retriever(search_kwargs=dict(k=10))

def format_docs(docs):
  """Join retrieved documents into one context string for the prompt.

  Each document is rendered as ``page: <page>`` followed by its content on
  the next line; a missing ``page`` metadata entry is shown as 0.

  Args:
    docs: sequence of objects exposing ``metadata`` (a dict) and
      ``page_content``.

  Returns:
    The newline-joined text, or an empty string when ``docs`` is empty.
  """
  f = "\n".join(
      "page: {0}\n{1}".format(doc.metadata.get("page", 0), doc.page_content)
      for doc in docs
  )
  # Debug aid: show the metadata of the first document. Guarded so that an
  # empty retrieval result no longer raises IndexError.
  if docs:
    print(docs[0].metadata)
  return f


# RAG pipeline: the incoming question is passed through unchanged and is also
# sent to the retriever, whose documents are formatted into the context; both
# feed the prompt, then the chat model, then a plain-string parser.
chain_rag = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | ChatVertexAI(model_name="gemini-pro", temperature=0, max_output_tokens=1024)
    | StrOutputParser()
)

In [5]:
# Run the RAG chain on a sample question; as a bare expression, the returned
# answer is displayed by the notebook.
chain_rag.invoke("What was Alphabet's revenue in 2022?")

{'source': 'gen-app-builder/search/alphabet-investor-pdfs/2022_alphabet_annual_report.pdf', 'page': 66, 'element': 'text'}


"The answer to your question is not directly provided in the text. However, the text does state that Alphabet's revenue in 2022 was $282.8 billion."

Visit Langsmith and explore how the traces look!

## Using LangchainHub

We can now do the same using a RAG prompt from LangChain Hub:

In [None]:
from langchain import hub
# Pull the "rlm/rag-prompt" prompt from the hub and use it in place of the
# hand-written prompt; the retriever, formatter and model stay the same.
prompt_v1 = hub.pull("rlm/rag-prompt")

chain_rag_v1 = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_v1
    | ChatVertexAI(model_name="gemini-pro", temperature=0, max_output_tokens=1024)
    | StrOutputParser()
)
chain_rag_v1.invoke("What was Alphabet's revenue in 2022?")

# Pointwise evaluators

Let's explore the pointwise evaluators available in LangChain and how to use them with Gemini.

### QA evaluators

We'll start with evaluators for Question & Answering:

In [7]:
from langchain.evaluation import load_evaluator
from langchain_google_vertexai import ChatVertexAI
# Model handed to the LangChain evaluators in this section.
llm = ChatVertexAI(model_name="gemini-1.5-pro-001")
evaluator_qa = load_evaluator("qa", llm=llm)

# One example (query plus expected answer) and the prediction to grade.
example = dict(
    query="What is the capital of Germany?",
    answer="Berlin is the capital of Germany.",
)
prediction = dict(result="Berlin.")

results = evaluator_qa.evaluate(examples=[example], predictions=[prediction])
print(results[0])

{'results': 'GRADE: CORRECT \n'}


In [8]:
# The same case as a single record holding query, answer and result together.
inputs = [
    dict(
        query="What is the capital of Germany?",
        answer="Berlin is the capital of Germany.",
        result="Berlin.",
    )
]
evaluator_qa.generate(inputs)

LLMResult(generations=[[ChatGeneration(text='GRADE: CORRECT \n', generation_info={'is_blocked': False, 'safety_ratings': [{'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability_label': 'NEGLIGIBLE', 'blocked': False, 'severity': 'HARM_SEVERITY_NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability_label': 'NEGLIGIBLE', 'blocked': False, 'severity': 'HARM_SEVERITY_NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability_label': 'NEGLIGIBLE', 'blocked': False, 'severity': 'HARM_SEVERITY_NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability_label': 'NEGLIGIBLE', 'blocked': False, 'severity': 'HARM_SEVERITY_NEGLIGIBLE'}], 'usage_metadata': {'prompt_token_count': 159, 'candidates_token_count': 5, 'total_token_count': 164, 'cached_content_token_count': 0}, 'finish_reason': 'STOP', 'avg_logprobs': -0.00013375983107835054, 'logprobs_result': {'top_candidates': [], 'chosen_candidates': []}}, message=AIMessage(content='GRADE: CORRECT \n', addit

In [9]:
# String-based call: prediction, reference and input are passed as keywords.
result = evaluator_qa.evaluate_strings(
    prediction="Berlin.",
    reference="Berlin is the capital of Germany.",
    input="What is the capital of Germany?",
)
print(result)

{'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1}


We can specify which evaluator to use:

In [10]:
# Load the "cot_qa" evaluator and grade the same example with it.
evaluator_cotqa = load_evaluator("cot_qa", llm=llm)

result_cot = evaluator_cotqa.evaluate_strings(
    prediction="Berlin.",
    reference="Berlin is the capital of Germany.",
    input="What is the capital of Germany?",
)
print(result_cot)

{'reasoning': 'EXPLANATION: The student answer matches the capital provided in the context. \n\nGRADE: CORRECT', 'value': 'CORRECT', 'score': 1}


In [11]:
# "context_qa" examples carry a context instead of an expected answer.
evaluator_contextqa = load_evaluator("context_qa", llm=llm)

example = dict(
    query="What is the capital?",
    context="The question is about Germany.",
)
prediction = dict(result="Berlin.")

results = evaluator_contextqa.evaluate(examples=[example], predictions=[prediction])
print(results[0])

{'text': 'GRADE: CORRECT \n'}


In [12]:
# In the string-based API the context is passed through `reference`.
result_context = evaluator_contextqa.evaluate_strings(
    prediction="Berlin.",
    reference="The question is about Germany.",
    input="What is the capital?",
)
print(result_context)

{'reasoning': 'GRADE: CORRECT', 'value': 'CORRECT', 'score': 1}


And we can explore the prompt that is used by the corresponding evaluator:

In [13]:
# Bare expression: the notebook displays the evaluator's prompt template.
evaluator_contextqa.prompt

PromptTemplate(input_variables=['context', 'query', 'result'], input_types={}, partial_variables={}, template="You are a teacher grading a quiz.\nYou are given a question, the context the question is about, and the student's answer. You are asked to score the student's answer as either CORRECT or INCORRECT, based on the context.\n\nExample Format:\nQUESTION: question here\nCONTEXT: context the question is about here\nSTUDENT ANSWER: student's answer here\nGRADE: CORRECT or INCORRECT here\n\nGrade the student answers based ONLY on their factual accuracy. Ignore differences in punctuation and phrasing between the student answer and true answer. It is OK if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements. Begin! \n\nQUESTION: {query}\nCONTEXT: {context}\nSTUDENT ANSWER: {result}\nGRADE:")

### score_string evaluator

Let's look at an evaluator that uses the LLM to assign a score to a prediction string:

In [None]:
evaluator_scorestr = load_evaluator("score_string", llm=llm)

# The plain "score_string" evaluator does not take a reference: a `reference`
# argument is ignored (with a warning), so it is not passed here. Use the
# "labeled_score_string" evaluator to score against a reference answer.
result = evaluator_scorestr.evaluate_strings(
    input="What is the capital of Germany?",
    prediction="Berlin.",
)
print(result["score"])

Let's take a look at the criteria available:

In [15]:
from langchain.evaluation import Criteria

# Print every built-in criterion name, one per line.
print(*(criterion.value for criterion in Criteria), sep="\n")

conciseness
relevance
correctness
coherence
harmfulness
maliciousness
helpfulness
controversiality
misogyny
criminality
insensitivity
depth
creativity
detail


In [16]:
from langchain.evaluation.scoring.eval_chain import resolve_criteria

# Show the mapping resolve_criteria returns for a single criterion name.
print(resolve_criteria("creativity"))

{'creativity': 'Does the submission demonstrate novelty or unique ideas?'}


In [17]:
# Passing None shows the criteria resolve_criteria falls back to.
print(resolve_criteria(None))

{'helpfulness': 'Is the submission helpful, insightful, and appropriate?', 'relevance': 'Is the submission referring to a real quote from the text?', 'correctness': 'Is the submission correct, accurate, and factual?', 'depth': 'Does the submission demonstrate depth of thought?'}


### criteria evaluator

In [None]:
evaluator_cr = load_evaluator("criteria", llm=llm)

# The plain "criteria" evaluator does not take a reference: a `reference`
# argument is ignored (with a warning), so it is not passed here. Use the
# "labeled_criteria" evaluator to judge against a reference answer.
result = evaluator_cr.evaluate_strings(
    input="What is the capital of Germany?",
    prediction="Berlin.",
)
print(result)

In [19]:
# Bare expression: the notebook displays the criteria evaluator's prompt template.
evaluator_cr.prompt

PromptTemplate(input_variables=['input', 'output'], input_types={}, partial_variables={'criteria': 'helpfulness: Is the submission helpful, insightful, and appropriate? If so, respond Y. If not, respond N.'}, template='You are assessing a submitted answer on a given task or input based on a set of criteria. Here is the data:\n[BEGIN DATA]\n***\n[Input]: {input}\n***\n[Submission]: {output}\n***\n[Criteria]: {criteria}\n***\n[END DATA]\nDoes the submission meet the Criteria? First, write out in a step by step manner your reasoning about each criterion to be sure that your conclusion is correct. Avoid simply stating the correct answers at the outset. Then print only the single character "Y" or "N" (without quotes or punctuation) on its own line corresponding to the correct answer of whether the submission meets all criteria. At the end, repeat just the letter again by itself on a new line.')

In [20]:
# Bare expression: the notebook displays the score_string evaluator's prompt template.
evaluator_scorestr.prompt

ChatPromptTemplate(input_variables=['input', 'prediction'], input_types={}, partial_variables={'reference': '', 'criteria': 'For this evaluation, you should primarily consider the following criteria:\nhelpfulness: Is the submission helpful, insightful, and appropriate?\nrelevance: Is the submission referring to a real quote from the text?\ncorrectness: Is the submission correct, accurate, and factual?\ndepth: Does the submission demonstrate depth of thought?\n'}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are a helpful assistant.'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['criteria', 'input', 'prediction'], input_types={}, partial_variables={}, template='[Instruction]\nPlease act as an impartial judge and evaluate the quality of the response provided by an AI assistant to the user question displayed below. {criteria}Begin your evaluation by providing

### Vertex evaluators

Set up the numerical project id:

In [21]:
# Placeholder: replace with your own project id; it is passed as `project_id`
# to the Vertex evaluators in the cells below.
PROJECT_ID = "YOUR_PROJECT_ID"

Let's start with a simple evaluator that computes a score:

In [22]:
from langchain_google_vertexai import VertexStringEvaluator

# BLEU evaluator; references and predictions are paired by position.
evaluator = VertexStringEvaluator(metric="bleu", project_id=PROJECT_ID)

result = evaluator.evaluate(
    examples=[
        {"reference": reference_text}
        for reference_text in (
            "Berlin is the capital of Germany.",
            "London is the capital of Britain.",
            "London is the capital of Britain.",
        )
    ],
    predictions=[
        {"prediction": predicted_text}
        for predicted_text in (
            "The capital of Germany is Berlin.",
            "London is the capital of Britain.",
            "London is a capital of Britain.",
        )
    ],
)
print(result)

[{'score': 0.29071537}, {'score': 1.0}, {'score': 0.488923}]


Now let's compute question-answering relevance using VertexAI Evaluation service:

In [23]:
evaluator_qa = VertexStringEvaluator(
    metric="question_answering_relevance", project_id=PROJECT_ID
)
# The keyword must be spelled `context`; the previous `contex` typo meant the
# passage was never part of the evaluation request.
result = evaluator_qa.evaluate_strings(
    instruction="Which processor does Pixel 8 has?",
    prediction="Qualcomm Snapdragon 765G ",
    context="Google Tensor G3 works with the Titan M2 security chip to protect personal information and make your Pixel more resilient to sophisticated attacks. And now, Face Unlock on Pixel 8 meets the highest Android biometric class, allowing you to access compatible banking and payment apps like Google Wallet.",
)
print(result)

{'score': 1.0, 'explanation': 'STEP 1: Assess relevance: the response does not address the instruction directly. The instruction asks about the Pixel 8 processor, but the response provides an incorrect processor.\nSTEP 2: Score based on the criteria and rubrics: the response is irrelevant to the instruction. Thus, the score is 1.', 'confidence': 1.0}


In [24]:
evaluator_c = VertexStringEvaluator(
    metric="question_answering_correctness", project_id=PROJECT_ID
)
# The keyword must be spelled `context`; the previous `contex` typo meant the
# passage was never part of the evaluation request.
result = evaluator_c.evaluate_strings(
    instruction="Which processor does Pixel 8 has?",
    prediction="Qualcomm Snapdragon 765G ",
    context="Google Tensor G3 works with the Titan M2 security chip to protect personal information and make your Pixel more resilient to sophisticated attacks. And now, Face Unlock on Pixel 8 meets the highest Android biometric class, allowing you to access compatible banking and payment apps like Google Wallet.",
)
print(result)

{'score': 0.0, 'explanation': "The reference does not contain claims regarding the Pixel 8's processor. Therefore, I am not able to assess reference claim alignment. Since the instruction requests information on the Pixel 8 and the response provides information about Qualcomm Snapdragon 765G processor, there is no information to assess the correctness based on the criteria. Thus, I'm defaulting the score to 0 based on the rubric.", 'confidence': 1.0}


In [25]:
evaluator_qa = VertexStringEvaluator(
    metric="question_answering_relevance", project_id=PROJECT_ID
)
# The keyword must be spelled `context`; the previous `contex` typo meant the
# passage was never part of the evaluation request. The commented-out
# alternative arguments were removed as dead code.
result = evaluator_qa.evaluate_strings(
    instruction="Which processor does Pixel 8 has?",
    prediction="Qualcomm Snapdragon 765G",
    context="Google Tensor G3 works with the Titan M2 security chip to protect personal information and make your Pixel more resilient to sophisticated attacks. And now, Face Unlock on Pixel 8 meets the highest Android biometric class, allowing you to access compatible banking and payment apps like Google Wallet.",
)
print(result)

{'score': 1.0, 'explanation': "STEP 1: Assess relevance: the response does not address the instruction.\nThe instruction asks about Pixel 8's processor, but the response provides information about Qualcomm Snapdragon 765G, which is not related to Pixel 8. \nSTEP 2: Score based on the criteria and rubrics.\nAccording to the rubric, a score of 1 is given when the response is completely irrelevant to the instruction. Thus, the response receives a score of 1.", 'confidence': 1.0}


In [26]:
from langchain_google_vertexai import VertexPairWiseStringEvaluator

# Pairwise Vertex evaluator: compares two predictions for the same input.
evaluator_pw = VertexPairWiseStringEvaluator(
    project_id=PROJECT_ID,
    metric="pairwise_question_answering_quality",
)

result = evaluator_pw.evaluate_string_pairs(
    input="What is the capital of Great Britain?",
    instruction="Be concise",
    prediction="London",
    prediction_b="Berlin",
)
print(result)



{'pairwise_choice': 'BASELINE', 'explanation': 'BASELINE response is correct while CANDIDATE response is incorrect. The capital of Great Britain is London.', 'confidence': 1.0}


# Pairwise evaluators

You can also run pairwise evaluators, both with the Vertex AI Evaluation service (as in the previous cell) and with LangChain's own evaluators. Let's look at an example:

In [32]:
from langchain.evaluation import load_evaluator

# LLM-judged comparison of two candidate answers to the same question.
evaluator_ps = load_evaluator("pairwise_string", llm=llm)

result = evaluator_ps.evaluate_string_pairs(
    input="What is the capital of Germany?",
    prediction="Berlin.",
    prediction_b="Berlin is a capital of Germany.",
)



In [33]:
# Print the pairwise evaluation result computed in the previous cell.
print(result)

{'reasoning': 'Both assistants provide the correct answer to the user\'s question, which is that the capital of Germany is Berlin. However, assistant A is more concise and directly answers the question without any extraneous wording. Assistant B\'s response is grammatically incorrect as it states "a capital" instead of "the capital." \n\nOverall, assistant A provides a slightly better response due to its conciseness and grammatically correct structure.\n\n[[A]] \n', 'value': 'A', 'score': 1}


In [None]:
# "pairwise_string_distance" compares the two predictions with a string
# distance (this is what the rapidfuzz package is installed for), so no LLM
# is involved. The previously passed `llm` and `input` arguments were unused
# and are dropped to keep the call honest.
evaluator_ps = load_evaluator("pairwise_string_distance")

result = evaluator_ps.evaluate_string_pairs(
    prediction="Berlin.",
    prediction_b="Berlin is a capital of Germany.",
)

In [35]:
# Bare expression: the notebook displays the value of `result`.
result

{'score': 0.3164362519201229}