In [12]:
### Chatbot Evaluation

import os
from dotenv import load_dotenv
load_dotenv()
# load_dotenv() above has already copied any .env entries into os.environ, so the keys
# only need to be checked here. Writing os.getenv(...) straight back into os.environ
# raises "TypeError: str expected, not NoneType" as soon as one key is missing.
import warnings

for _required_key in ("LANGCHAIN_API_KEY", "OPENAI_API_KEY"):
    if not os.getenv(_required_key):
        warnings.warn(
            f"{_required_key} is not set; add it to your .env file or environment "
            "before running the cells below.",
            stacklevel=2,
        )

# Turn on LangSmith tracing for everything run from this notebook.
os.environ["LANGSMITH_TRACING"] = "true"

In [13]:
# Create datapoints
from langsmith import Client

# Initialize the LangSmith client.
# It picks up its credentials from the environment (the setup cell loads LANGCHAIN_API_KEY from .env).
client = Client()

# Define dataset: these are your test cases
dataset_name = "Chatbots Evaluation"

# Question / reference-answer pairs that the evaluators use as ground truth
chatbot_examples = [
    {
        "inputs": {"question": "What is LangChain?"},
        "outputs": {"answer": "A framework for building LLM applications"},
    },
    {
        "inputs": {"question": "What is LangSmith?"},
        "outputs": {"answer": "A platform for observing and evaluating LLM applications"},
    },
    {
        "inputs": {"question": "What is OpenAI?"},
        "outputs": {"answer": "A company that creates Large Language Models"},
    },
    {
        "inputs": {"question": "What is Google?"},
        "outputs": {"answer": "A technology company known for search"},
    },
    {
        "inputs": {"question": "What is Mistral?"},
        "outputs": {"answer": "A company that creates Large Language Models"},
    },
]

# create_dataset() fails when a dataset with this name already exists, which breaks
# "Restart & Run All". Reuse the existing dataset instead, and upload the examples only
# when the dataset is first created so they are not duplicated on every run.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    created_examples = None
else:
    dataset = client.create_dataset(dataset_name)
    created_examples = client.create_examples(
        dataset_id=dataset.id,
        examples=chatbot_examples,
    )

# Last expression of the cell: the upload summary (nothing is shown when the dataset was reused)
created_examples

{'example_ids': ['89389fae-2c43-4b2b-b37e-d6d7e11f3ede',
  '662e9f01-cac9-4628-8400-19b3c171a2b1',
  '644874cc-1fbf-4932-bcf8-11b66c5f6483',
  'cf56b67e-d961-4a5f-b228-6fd19b0dd577',
  '4b127975-64db-4fd3-a787-bdc28828bb18'],
 'count': 5}

In [14]:
### Define Metrics (LLM As A Judge)

import openai
from langsmith import wrappers

# System message handed to the judge model
eval_instructions = (
    "You are an expert professor specialized in grading students' answers to questions."
)

# OpenAI client wrapped with LangSmith's wrap_openai so its calls are traced.
# Assumes OPENAI_API_KEY is available in the environment.
openai_client = wrappers.wrap_openai(openai.OpenAI())

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """LLM-as-judge evaluator: is the predicted answer correct relative to the reference?

    Args:
        inputs: Dataset example inputs; the question is read from ``"question"``.
        outputs: Output of the target function. The prediction is read from ``"answer"``
            and, when that key is absent, from ``"response"`` (the key ``ls_target`` emits).
        reference_outputs: Dataset example outputs; the ground truth is read from ``"answer"``.

    Returns:
        True when the judge model's verdict is CORRECT, False for anything else.
    """
    # The chatbot target in this notebook returns {"response": ...}. Reading only "answer"
    # made the judge grade the "No answer found" placeholder instead of the real prediction.
    predicted_answer = outputs.get("answer", outputs.get("response", "No answer found"))

    user_content = f"""You are grading the following question:
{inputs.get('question', 'N/A')}

Here is the real answer:
{reference_outputs.get('answer', 'N/A')}

You are grading the following predicted answer:
{predicted_answer}

Respond with CORRECT or INCORRECT:
Grade:
"""
    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": eval_instructions},
            {"role": "user", "content": user_content},
        ],
    )

    # The message content may be None, so guard before normalising it.
    verdict = (completion.choices[0].message.content or "").strip().upper()

    # Tolerate an echoed "Grade:" prefix and punctuation/markdown wrapped around the verdict,
    # while still requiring the verdict itself to be exactly CORRECT.
    if verdict.startswith("GRADE:"):
        verdict = verdict[len("GRADE:"):]
    return verdict.strip(" \t\r\n.!*`'\"") == "CORRECT"

In [15]:
# The predicted answer is read from 'answer' (RAG-style outputs) or, failing that, from
# 'response' (the key the chatbot target `ls_target` returns).
# The reference answer is stored under 'answer' in the dataset.

## Concision - checks whether the actual output is less than 2x the length of the expected result.

def concision(outputs: dict, reference_outputs: dict) -> bool:
    """Check that the predicted answer is shorter than twice the reference answer.

    Args:
        outputs: Output of the target function; the prediction is taken from ``"answer"``
            or, when that key is absent, from ``"response"``.
        reference_outputs: Dataset example outputs; the reference is taken from ``"answer"``.

    Returns:
        True when ``len(prediction) < 2 * len(reference)``. Missing or ``None`` values
        count as empty strings.
    """
    # Reading only "answer" always measured an empty string for the chatbot target
    # (which emits "response"), so every run was scored as concise.
    predicted = outputs.get("answer", outputs.get("response", "")) or ""
    reference = reference_outputs.get("answer", "") or ""

    return len(predicted) < 2 * len(reference)

In [16]:
### Run Evaluations

# Uses `openai_client`, the LangSmith-wrapped OpenAI client created earlier in the notebook.

default_instructions = "Respond to the users question in a short, concise manner (one short sentence)."

def my_app(question: str, model: str = "gpt-4o-mini", instructions: str = default_instructions) -> str:
    """Answer ``question`` with a single chat-completion call.

    Args:
        question: Sent as the user message.
        model: Name of the OpenAI chat model to call.
        instructions: Sent as the system message.

    Returns:
        The content of the first choice in the completion.
    """
    chat_messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": question},
    ]
    completion = openai_client.chat.completions.create(
        model=model,
        temperature=0,
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [17]:
### Call my_app for every datapoint
def ls_target(inputs: dict) -> dict:
    """Target function for the evaluation run, using ``my_app`` with its default model.

    Args:
        inputs: The inputs of one dataset example; must contain the key ``"question"``.
            (Previously annotated as ``str``, although it is indexed like a dict.)

    Returns:
        ``{"response": <text returned by my_app>}``.
    """
    return {"response": my_app(inputs["question"])}

In [18]:
# Needs `client`, `ls_target`, `dataset_name`, `correctness` and `concision` from the cells above.
# NOTE: `dataset_name` and `correctness` are both rebound in the RAG section further down,
# so this cell has to run before those cells.

## Run our evaluation
experiment_results = client.evaluate(
    ls_target,  # system under test
    evaluators=[correctness, concision],  # custom evaluator functions
    data=dataset_name,  # dataset to evaluate against, looked up by name
    experiment_prefix="openai-4o-mini-chatbot",
)

View the evaluation results for experiment: 'openai-4o-mini-chatbot-949fbc11' at:
https://smith.langchain.com/o/31db0815-5803-45ca-b62d-697f0171635a/datasets/8e8960c1-6378-4567-bd7c-f0bd03ccbb27/compare?selectedSessions=411cfbfc-123c-4f9b-a315-ca3832afe32e




5it [00:09,  2.00s/it]


In [19]:
### Call my_app for every datapoint (gpt-4-turbo variant)
def ls_target(inputs: dict) -> dict:
    """Target function for the evaluation run, using ``my_app`` with ``gpt-4-turbo``.

    Args:
        inputs: The inputs of one dataset example; must contain the key ``"question"``.
            (Previously annotated as ``str``, although it is indexed like a dict.)

    Returns:
        ``{"response": <text returned by my_app>}``.
    """
    return {"response": my_app(inputs["question"], model="gpt-4-turbo")}

In [20]:
# Needs `client`, `ls_target`, `dataset_name`, `correctness` and `concision` from the cells above.
# NOTE: `dataset_name` and `correctness` are both rebound in the RAG section further down,
# so this cell has to run before those cells.

## Run our evaluation
experiment_results = client.evaluate(
    ls_target,  # system under test (the gpt-4-turbo variant defined just above)
    evaluators=[correctness, concision],  # custom evaluator functions
    data=dataset_name,  # dataset to evaluate against, looked up by name
    experiment_prefix="openai-4-turbo-chatbot",
)

View the evaluation results for experiment: 'openai-4-turbo-chatbot-3165821b' at:
https://smith.langchain.com/o/31db0815-5803-45ca-b62d-697f0171635a/datasets/8e8960c1-6378-4567-bd7c-f0bd03ccbb27/compare?selectedSessions=68e370ca-c2c4-4216-bf42-3447cdc38cb5




5it [00:10,  2.18s/it]


In [21]:
### Evaluation For RAG

## RAG
from langchain_community.document_loaders import WebBaseLoader
# NOTE: InMemoryVectorStore (from langchain_core) keeps every embedding in process memory and
# persists nothing, which is fine for a small demo like this one.
# For larger or persistent corpora, use a dedicated vector store such as Chroma or FAISS.
from langchain_core.vectorstores import InMemoryVectorStore 
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# List of URLs to load documents from
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
]

# Load documents from the URLs
# Each loader returns a list of documents, so the result is flattened into one list
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Initialize a text splitter with specified chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250,
    chunk_overlap=0,
)

# Split the documents into chunks
doc_splits = text_splitter.split_documents(docs_list)

# Add the document chunks to the "vector store" using OpenAIEmbeddings
vectorstore = InMemoryVectorStore.from_documents(
    documents=doc_splits,
    embedding=OpenAIEmbeddings(),
)

# With langchain we can easily turn any vector store into a retrieval component.
# The number of results has to be passed through `search_kwargs`: a bare `k=6` keyword is
# not forwarded to the similarity search, so the retriever would keep its default k.
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [22]:
# Sanity check: run one query through the retriever and display the documents it returns.
retriever.invoke("what is agents")

[Document(id='c20aaaba-3db7-406d-a46d-f219ff3d3023', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\n\nPlanning\n\nSubgoal and decomposition: The agent breaks down large tasks into smaller, manageable subgoals, enabling efficient handling of complex tasks.\nReflection and refinement: The agent can do self-criticism and self-reflection over past actions, learn from mistakes and refine them for future steps, 

In [23]:
from langchain.chat_models import init_chat_model
# temperature=0 matches `my_app` and the grader models, so differences between experiments
# come from the system under test rather than from sampling noise.
llm = init_chat_model("openai:gpt-4o-mini", temperature=0)
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x0000016AE7727E10>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x0000016AE77D5FD0>, root_client=<openai.OpenAI object at 0x0000016AE7727BB0>, root_async_client=<openai.AsyncOpenAI object at 0x0000016AE78616D0>, model_name='gpt-4o-mini', model_kwargs={}, openai_api_key=SecretStr('**********'))

In [24]:
from langsmith import traceable
# Assuming retriever and llm are defined in the scope

## Decorate with traceable()
@traceable()
def rag_bot(question: str) -> dict:
    """Answer ``question`` from retrieved context: retrieve, build the prompt, generate.

    Args:
        question: Used both as the retrieval query and as the user message.

    Returns:
        A dict with ``"answer"`` (the content of the model reply) and ``"documents"``
        (the retrieved documents whose text was placed in the system prompt).
    """
    # Step 1: retrieve context and merge the chunk texts into one space-separated string
    retrieved = retriever.invoke(question)
    context = " ".join([chunk.page_content for chunk in retrieved])

    # Step 2: system prompt = fixed guidance followed by the retrieved text
    system_prompt = f"""You are a helpful assistant who is good at analyzing source information and answering questions.
Use the following source documents to answer the user's questions.
If you don't know the answer, just say that you don't know.
Use three sentences maximum and keep the answer concise.

Documents:
{context}"""

    # Step 3: generate the reply
    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    reply = llm.invoke(chat)

    # The documents are returned next to the answer so evaluators can inspect the context
    return {"answer": reply.content, "documents": retrieved}

In [25]:
# Sanity check: call the RAG pipeline once; the cell displays the returned dict with its
# "answer" and "documents" entries.
rag_bot("What is agents")

{'answer': 'Agents refer to autonomous entities powered by large language models (LLMs) that can perform tasks, make decisions, and interact within a specific environment. They utilize components like planning, memory, and reflection to enhance their behavior and problem-solving capabilities. Generative agents, for instance, simulate human-like interactions in environments inspired by games like The Sims.',
 'documents': [Document(id='c20aaaba-3db7-406d-a46d-f219ff3d3023', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview\nIn 

In [26]:
### Dataset

from langsmith import Client

# Initialize the LangSmith client
client = Client()

# Define the examples for the dataset
examples = [
    {
        "inputs": {"question": "How does the ReAct agent use self-reflection? "},
        "outputs": {"answer": "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs."},
    },
    {
        "inputs": {"question": "What are the types of biases that can arise with few-shot prompting?"},
        "outputs": {"answer": "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias."},
    },
    {
        "inputs": {"question": "What are five types of adversarial attacks?"},
        "outputs": {"answer": "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming."},
    },
]

### Create the dataset and examples in LangSmith
dataset_name = "RAG Test Evaluation"

# create_dataset() fails when a dataset with this name already exists, which breaks
# "Restart & Run All". Reuse the existing dataset instead, and upload the examples only
# when the dataset is first created so they are not duplicated on every run.
if client.has_dataset(dataset_name=dataset_name):
    dataset = client.read_dataset(dataset_name=dataset_name)
    created_examples = None
else:
    dataset = client.create_dataset(dataset_name=dataset_name)
    created_examples = client.create_examples(
        dataset_id=dataset.id,
        examples=examples,
    )

# Last expression of the cell: the upload summary (nothing is shown when the dataset was reused)
created_examples

{'example_ids': ['309046a2-617b-438e-898c-f95c1c9b8318',
  'ceb5d801-b45d-4ac6-a76a-e57dd46b397a',
  'ab408aac-c1a1-4306-9b7a-e33be1c46b1e'],
 'count': 3}

In [None]:
### Evaluators or Metrics
# 1. Correctness: Response vs reference answer
#    - Goal: Measure how similar/correct the RAG chain answer is relative to a ground-truth answer.
#    - Mode: Requires a ground-truth (reference) answer supplied through a dataset.
#    - Evaluator: Use an LLM-as-judge to assess answer correctness.

In [27]:
from typing_extensions import Annotated, TypedDict
from langchain_openai import ChatOpenAI

## Correctness Output Schema
# ----------------------------------------------------------------------
class CorrectnessGrade(TypedDict):
    """Grade output schema for the correctness evaluator."""

    # This TypedDict is handed to with_structured_output() below, so the docstring and the
    # annotation strings are part of the schema; treat them as runtime text, not comments.
    #
    # Note that the order in which the fields are defined is the order in which the model will
    # generate them. It is useful to put the explanation before the verdict because it forces
    # the model to think through its final response before generating it:
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    # Boolean verdict; this is the value the `correctness` evaluator returns.
    correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."]

# ----------------------------------------------------------------------
## Correctness Prompt Instructions
# System prompt for the correctness grader. The evaluator passes this text to the model
# as the system message, so any edit to it changes grading behavior.
# ----------------------------------------------------------------------
correctness_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. 
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# ----------------------------------------------------------------------
## Grader LLM Definition
# gpt-4o-mini at temperature 0, configured for structured output with the CorrectnessGrade
# schema (method="json_schema", strict=True).
# NOTE(review): the relevance, groundedness and retrieval-relevance graders in this notebook
# use gpt-4o instead; confirm the smaller model is intentional here.
# ----------------------------------------------------------------------
grader_llm = ChatOpenAI(model="gpt-4o-mini", temperature=0).with_structured_output(
    CorrectnessGrade,
    method="json_schema",
    strict=True
)

# ----------------------------------------------------------------------
## Evaluator Function
# ----------------------------------------------------------------------
def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """Judge whether the RAG answer agrees with the reference answer.

    NOTE: this definition replaces the `correctness` evaluator defined earlier in the
    notebook for the chatbot evaluation.

    Args:
        inputs: Dataset example inputs; ``"question"`` is read (``'N/A'`` when missing).
        outputs: Target output; ``"answer"`` is read (``'N/A'`` when missing).
        reference_outputs: Dataset example outputs; ``"answer"`` is read (``'N/A'`` when missing).

    Returns:
        The ``"correct"`` entry of the grade returned by ``grader_llm``.
    """
    question = inputs.get('question', 'N/A')
    ground_truth = reference_outputs.get('answer', 'N/A')
    student_answer = outputs.get('answer', 'N/A')

    # One labelled line per item, with no indentation leaking into the prompt
    grading_request = "\n".join([
        f"QUESTION: {question}",
        f"GROUND TRUTH ANSWER: {ground_truth}",
        f"STUDENT ANSWER: {student_answer}",
    ])

    messages = [
        {"role": "system", "content": correctness_instructions},
        {"role": "user", "content": grading_request},
    ]
    verdict = grader_llm.invoke(messages)

    return verdict["correct"]

In [29]:
### Relevance: Response vs input
from typing_extensions import Annotated, TypedDict
from langchain_openai import ChatOpenAI
# Assuming other necessary imports like the RAGState structure are present

## Grade output schema
class RelevanceGrade(TypedDict):
    """Structured output for the relevance evaluator score."""
    # This TypedDict is handed to with_structured_output() below, so the docstring and the
    # annotation strings are part of the schema; treat them as runtime text, not comments.
    # The explanation is declared before the verdict, as in the other grade schemas.
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    # Boolean verdict; this is the value the `relevance` evaluator returns.
    relevant: Annotated[bool, ..., "Provide the score on whether the answer addresses the question"]

# ----------------------------------------------------------------------
## Grade prompt
# System prompt for the relevance grader. The evaluator passes this text to the model as
# the system message, so any edit to it changes grading behavior.
relevance_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION and a STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
(2) Ensure the STUDENT ANSWER helps to answer the QUESTION

Relevance:
A relevance value of True means that the student's answer meets all of the criteria.
A relevance value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# ----------------------------------------------------------------------
## Grader LLM
# gpt-4o at temperature 0, configured for structured output with the RelevanceGrade schema
# (method="json_schema", strict=True).
relevance_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    RelevanceGrade, 
    method="json_schema", 
    strict=True
)

# ----------------------------------------------------------------------
## Evaluator
def relevance(inputs: dict, outputs: dict) -> bool:
    """Judge whether the generated answer addresses the question.

    Args:
        inputs: Dataset example inputs; ``"question"`` is read (``'N/A'`` when missing).
        outputs: Target output; ``"answer"`` is read (``'N/A'`` when missing).

    Returns:
        The ``"relevant"`` entry of the grade returned by ``relevance_llm``.
    """
    grading_request = "QUESTION: {}\nSTUDENT ANSWER: {}".format(
        inputs.get('question', 'N/A'),
        outputs.get('answer', 'N/A'),
    )

    messages = [
        {"role": "system", "content": relevance_instructions},
        {"role": "user", "content": grading_request},
    ]
    verdict = relevance_llm.invoke(messages)

    return verdict["relevant"]

In [30]:
### Groundedness: Response vs retrieved docs

from typing_extensions import Annotated, TypedDict
from langchain_openai import ChatOpenAI
# Assuming other necessary imports are present

# ----------------------------------------------------------------------
## Grade output schema
class GroundedGrade(TypedDict):
    """Structured output for the groundedness evaluator score."""
    # This TypedDict is handed to with_structured_output() below, so the docstring and the
    # annotation strings are part of the schema the grader model has to fill in.
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    # The description used to ask whether the answer "hallucinates", which is the opposite of
    # what a True `grounded` value means in the prompt. It now states the polarity explicitly,
    # in the same "True if ..., False otherwise" form as the other grade schemas.
    grounded: Annotated[bool, ..., "True if the answer is grounded in the documents (no hallucinated information), False otherwise"]

# ----------------------------------------------------------------------
## Grade prompt
# System prompt for the groundedness grader. The evaluator passes this text to the model as
# the system message, so any edit to it changes grading behavior.
grounded_instructions = """You are a teacher grading a quiz. 

You will be given FACTS and a STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is grounded in the FACTS. 
(2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Grounded:
A grounded value of True means that the student's answer meets all of the criteria.
A grounded value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# ----------------------------------------------------------------------
## Grader LLM
# gpt-4o at temperature 0, configured for structured output with the GroundedGrade schema
# (method="json_schema", strict=True).
grounded_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    GroundedGrade, 
    method="json_schema", 
    strict=True
)

# ----------------------------------------------------------------------
## Evaluator
def groundedness(inputs: dict, outputs: dict) -> bool:
    """Judge whether the generated answer is supported by the retrieved documents.

    Args:
        inputs: Dataset example inputs; accepted but not read by this evaluator.
        outputs: Target output; ``"documents"`` (default: empty list) supplies the facts and
            ``"answer"`` (default: ``"N/A"``) the text being graded.

    Returns:
        The ``"grounded"`` entry of the grade returned by ``grounded_llm``.
    """
    retrieved = outputs.get("documents", [])
    # The page contents of the documents, separated by blank lines, form the FACTS section
    facts = "\n\n".join([document.page_content for document in retrieved])

    grading_request = "FACTS: {}\nSTUDENT ANSWER: {}".format(
        facts,
        outputs.get("answer", "N/A"),
    )

    messages = [
        {"role": "system", "content": grounded_instructions},
        {"role": "user", "content": grading_request},
    ]
    verdict = grounded_llm.invoke(messages)

    return verdict["grounded"]

In [32]:
### Retrieval Relevance: Retrieved docs vs input

from typing_extensions import Annotated, TypedDict
from langchain_openai import ChatOpenAI
# Assuming other necessary imports are present

# ----------------------------------------------------------------------
# Grade output schema
class RetrievalRelevanceGrade(TypedDict):
    """Structured output for the retrieval relevance evaluator score."""
    # This TypedDict is handed to with_structured_output() below, so the docstring and the
    # annotation strings are part of the schema; treat them as runtime text, not comments.
    # The explanation is declared before the verdict, as in the other grade schemas.
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    # Boolean verdict; this is the value the `retrieval_relevance` evaluator returns.
    relevant: Annotated[bool, ..., "True if the retrieved documents are relevant to the question, False otherwise"]

# ----------------------------------------------------------------------
# Grade prompt
# System prompt for the retrieval-relevance grader. The evaluator passes this text to the
# model as the system message, so any edit to it changes grading behavior.
# NOTE(review): "You goal" in criterion (1) looks like a typo for "Your goal"; it is left
# unchanged here because the text is sent to the model.
retrieval_relevance_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION and a set of FACTS provided by the student. 

Here is the grade criteria to follow:
(1) You goal is to identify FACTS that are completely unrelated to the QUESTION
(2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
(3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met

Relevance:
A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant.
A relevance value of False means that the FACTS are completely unrelated to the QUESTION.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# ----------------------------------------------------------------------
# Grader LLM
# gpt-4o at temperature 0, configured for structured output with the RetrievalRelevanceGrade
# schema (method="json_schema", strict=True).
retrieval_relevance_llm = ChatOpenAI(model="gpt-4o", temperature=0).with_structured_output(
    RetrievalRelevanceGrade, 
    method="json_schema", 
    strict=True
)

# ----------------------------------------------------------------------
# Evaluator
def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
    """Judge whether the retrieved documents relate to the question.

    Args:
        inputs: Dataset example inputs; ``"question"`` is read (``"N/A"`` when missing).
        outputs: Target output; ``"documents"`` (default: empty list) supplies the facts.

    Returns:
        The ``"relevant"`` entry of the grade returned by ``retrieval_relevance_llm``.
    """
    retrieved = outputs.get("documents", [])
    # The page contents of the documents, separated by blank lines, form the FACTS section
    facts = "\n\n".join([document.page_content for document in retrieved])

    grading_request = "FACTS: {}\nQUESTION: {}".format(
        facts,
        inputs.get("question", "N/A"),
    )

    messages = [
        {"role": "system", "content": retrieval_relevance_instructions},
        {"role": "user", "content": grading_request},
    ]
    verdict = retrieval_relevance_llm.invoke(messages)

    return verdict["relevant"]

In [34]:
### Run the evaluation

# Assuming 'rag_bot' (the RAG function), 'client' (LangSmith client), 
# 'dataset_name', and all four evaluators are defined in the scope.

def target(inputs: dict) -> dict:
    """Adapter between a dataset example and ``rag_bot``.

    Args:
        inputs: The inputs of one dataset example; must contain the key ``"question"``.

    Returns:
        Whatever ``rag_bot`` returns for that question.
    """
    # rag_bot takes the bare question string rather than the whole inputs dict
    question = inputs["question"]
    return rag_bot(question)

# Run all four RAG evaluators against the dataset named by `dataset_name`.
experiment_results = client.evaluate(
    target,  # Your AI system wrapped in a target function
    data=dataset_name,
    evaluators=[correctness, groundedness, relevance, retrieval_relevance],
    experiment_prefix="rag-doc-relevance",
    # Describe the system that is actually under test: `rag_bot` answers with gpt-4o-mini.
    # The previous value ("LCEL context, gpt-4-0125-preview") named a model this notebook
    # never calls, which mislabels the experiment in the LangSmith UI.
    metadata={"version": "rag_bot, gpt-4o-mini"},
)

# Explore results locally as a dataframe if you have pandas installed
experiment_results.to_pandas()

View the evaluation results for experiment: 'rag-doc-relevance-e236e8c1' at:
https://smith.langchain.com/o/31db0815-5803-45ca-b62d-697f0171635a/datasets/9a68ec3c-49d4-4279-8792-45eead90804c/compare?selectedSessions=cdac4b48-24d6-425e-8164-909331554fa2




3it [00:38, 12.76s/it]


Unnamed: 0,inputs.question,outputs.answer,outputs.documents,error,reference.answer,feedback.correctness,feedback.groundedness,feedback.relevance,feedback.retrieval_relevance,execution_time,example_id,id
0,How does the ReAct agent use self-reflection?,The ReAct agent uses self-reflection by incorp...,[page_content='Self-reflection is a vital aspe...,,"ReAct integrates reasoning and acting, perform...",True,False,True,False,3.23474,309046a2-617b-438e-898c-f95c1c9b8318,70043cc0-b43a-4218-b199-8b902a1e3503
1,What are five types of adversarial attacks?,The five types of adversarial attacks are:\n\n...,[page_content='Black-box attacks assume that a...,,Five types of adversarial attacks are (1) Toke...,True,True,True,True,1.712367,ab408aac-c1a1-4306-9b7a-e33be1c46b1e,9db49274-8835-425b-aa20-865cf3782f93
2,What are the types of biases that can arise wi...,The types of biases that can arise with few-sh...,[page_content='Zero-shot and few-shot learning...,,The biases that can arise with few-shot prompt...,True,True,True,True,2.891409,ceb5d801-b45d-4ac6-a76a-e57dd46b397a,6b9805be-a626-40df-86ae-b3174ddb9e9a
