In [None]:
%pip install --upgrade --quiet pip setuptools wheel matplotlib seaborn
%pip install --upgrade --quiet  langchain langchain-openai faiss-cpu tiktoken crate 'crate[sqlalchemy]' pandas jq 
%pip install --use-pep517 --quiet python-dotenv
%pip install --quiet langchain-google-genai
%pip install -qU langchain-anthropic

# Evaluate RAG search with CrateDB vector, fulltext, fusion and OpenAI

## Setup environment variables

In [1]:
import os

from dotenv import load_dotenv

# Load environment variables via python-dotenv; later cells read the
# CRATEDB_* connection settings from os.environ.
load_dotenv()

True

## Set up embeddings

In [2]:
from langchain_openai import OpenAIEmbeddings

# Embedding model reused by the vector store and by the rationale-similarity step.
embeddings = OpenAIEmbeddings()
# Embed a one-character probe and show the length of the resulting vector.
len(embeddings.embed_query("a"))

1536

In [3]:
# Build the crate:// connection URL from environment variables; a missing
# variable raises KeyError right here.
conn_url = "crate://{user}:{password}@{server}".format(
    user=os.environ["CRATEDB_USER"],
    password=os.environ["CRATEDB_PASS"],
    server=os.environ["CRATEDB_SERVER"],
)
# NOTE(review): echoing the URL renders the password into the saved cell output.
conn_url

'crate://crate:@localhost:4200'

In [34]:
# open file
from langchain_community.document_loaders import JSONLoader, DirectoryLoader


def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the record's URL and title into the document metadata.

    Args:
        record: one JSON record; its "url" and "title" keys are read
            (missing keys yield None).
        metadata: metadata dict to enrich; it is mutated in place.

    Returns:
        The same ``metadata`` object, with "source_url" and "source_title"
        set, and an already-present "source" key replaced by the URL.
    """
    url = record.get("url")
    metadata.update(source_url=url, source_title=record.get("title"))

    # Only replace "source" when it already exists; never introduce it.
    if "source" in metadata:
        metadata["source"] = url

    return metadata


# Load every file matching everything-*.json from the working directory. Each
# file is handed to JSONLoader, configured with the jq expression ".[]", the
# "html" key as document content, and metadata_func to enrich the metadata.
loader = DirectoryLoader(
    './',
    glob="everything-*.json",
    loader_cls=JSONLoader,
    loader_kwargs={
        "jq_schema": ".[]",
        "text_content": False,
        "content_key": "html",
        "metadata_func": metadata_func,
    }
)

# Load all matching documents into a list.
data = loader.load()
# data[:1]

In [35]:
# split documents
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking configuration: chunks of up to 1000 characters (measured with len)
# and a 50-character overlap; the separators are literal strings, not regular
# expressions.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
    ],
    chunk_size=1000,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

# Split the loaded documents into the chunks that get embedded and indexed.
docs_splits = text_splitter.split_documents(data)
# docs_splits[:2]

## RAG search: indexing pipeline

In [36]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_anthropic import ChatAnthropic 


In [37]:
from rag.vectorstore.crate import CrateVectorStore

# Embed and index all chunks into CrateDB each time this cell runs.
vectorstore = CrateVectorStore.from_documents(
    # To skip re-indexing when the data is already in the database, pass
    # documents=[] here instead of docs_splits.
    # NOTE(review): drop_if_exists=True below presumably discards the existing
    # data first, which would defeat that shortcut — confirm before relying on it.
    documents=docs_splits,
    embedding=embeddings,
    database_kwargs={
        "database_uri": conn_url,
    },
    vectorstore_kwargs={
       "drop_if_exists" : True,
    },
)
vectorstore

<rag.vectorstore.crate.CrateVectorStore at 0x333733350>

In [38]:
from langchain.retrievers import EnsembleRetriever

# Two retrievers over the same store that differ only in the requested search
# mode ("knn" vs "fulltext").
# NOTE(review): the key is spelled "algorith"; confirm CrateVectorStore reads
# exactly this key — if it expects "algorithm", both retrievers would fall back
# to the same default search and the A/B comparison would be meaningless.
retriever_a = vectorstore.as_retriever(
    search_kwargs={'k': 10, 'fetch_k': 100, "algorith": "knn"}
)

retriever_b = vectorstore.as_retriever(
    search_kwargs={'k': 10, 'fetch_k': 100, "algorith": "fulltext"}
)

# Equal-weight ensemble of the two retrievers above.
retriever_c = EnsembleRetriever(
    retrievers=[
        retriever_a,
        retriever_b
    ],
    weights=[0.5, 0.5],
)

In [39]:
import json

In [41]:

# Prompt shared by all RAG chains: the model is told to answer only from the
# supplied context and to reference sources with links.
# NOTE(review): the template hard-codes "today" as a fixed date, so answers to
# recency questions will drift as the notebook ages.
template = """Answer the question based only on the context, if possible use links to reference the source, use markdown, avoid answering when the context and question are not related.

today date is 2024 April 3rd

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# One chat model per provider. The RAG chains use only model_openai; all three
# are handed to load_evaluator later in the notebook.
model_openai = ChatOpenAI()
model_geminai = ChatGoogleGenerativeAI(model="gemini-pro")
model_cloude = ChatAnthropic(model="claude-3-opus-20240229", temperature=0.1)

def format_docs(docs):
    """Serialise retrieved documents into the JSON string used as prompt context.

    Args:
        docs: iterable of objects exposing ``page_content`` and a ``metadata``
            mapping.

    Returns:
        A JSON array string with one ``{"text", "source"}`` object per
        document; ``source`` is null when the metadata has no "source" key.
    """
    # No debugger hook here: a breakpoint() would halt every chain invocation
    # waiting for an interactive debugger.
    return json.dumps([{"text": d.page_content, "source": d.metadata.get('source')} for d in docs])


def _build_rag_chain(retriever):
    """Compose retriever -> JSON context -> prompt -> OpenAI chat model -> string."""
    inputs = {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    return inputs | prompt | model_openai | StrOutputParser()


# Identical pipelines that differ only in the retriever feeding the context.
chain_a = _build_rag_chain(retriever_a)
chain_b = _build_rag_chain(retriever_b)
chain_c = _build_rag_chain(retriever_c)

# Other questions tried during development:
#   "How to limit permissions?"
#   " How AWS marketplace works, and why I cannot see deployment in my account?"
#   "What are edge regions and how to use them?"
#   "How to use BLOB store in CrateDB? and what are the benefits?"
result = chain_a.invoke("Write me example of using blobs?")
result


'To use BLOBs in CrateDB, you would need to follow these steps:\n\n1. **Creating a BLOB Table**: Before adding a BLOB to CrateDB, you must create a BLOB table. This can be done using the Crate Shell (CraSh) by issuing an SQL statement like this:\n   ```sh\n   crash -c "create blob table myblobs clustered into 3 shards with (number_of_replicas=1)"\n   ```\n   Source: [CrateDB Documentation](https://cratedb.com/docs/crate/reference/en/5.6/general/blobs.html)\n\n2. **Uploading BLOBs**: To upload a BLOB, you need to know the SHA1 hash of the BLOB upfront. You can compute the hash using a command like this in Python:\n   ```sh\n   python -c \'import hashlib;print(hashlib.sha1("contents".encode("utf-8")).hexdigest())\'\n   ```\n   Once you have the hash, you can upload the BLOB by issuing a PUT request. \n   Source: [CrateDB Blog](https://cratedb.com/blog/using-crate-as-a-blobstore)\n\n3. **Listing & Querying BLOBs**: To list all BLOBs inside a BLOB table, you can use a SELECT statement like

In [42]:
# Call the Anthropic model directly (no retriever, no prompt template);
# presumably a quick check that the model is reachable.
model_cloude.invoke("Write me poem")

AIMessage(content="Here's a poem for you:\n\nIn the tapestry of life, threads intertwine,\nWeaving moments of joy, love, and sometimes pain.\nEach stitch a memory, a story to tell,\nOf laughter, tears, and dreams that dwell.\n\nColors blend and patterns form,\nCreating a masterpiece, vibrant and warm.\nThrough the ups and downs, the twists and turns,\nThe fabric of existence forever churns.\n\nYet in the midst of chaos and strife,\nThere's beauty to be found, a zest for life.\nIn the simple things, the everyday,\nA smile, a hug, a word to say.\n\nSo let us cherish the moments we hold dear,\nAnd face the future without fear.\nFor in this grand design, we play a part,\nStitching our stories, with love in our heart.", response_metadata={'id': 'msg_01DojgM1ickYARzXhEsCexcb', 'model': 'claude-3-opus-20240229', 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 10, 'output_tokens': 196}}, id='run-238434b4-9123-4319-99be-a211693669b9-0')

In [43]:
from IPython.display import display, Markdown

# Render the chain's answer as Markdown rather than as a raw string repr.
display(Markdown(result))

To use BLOBs in CrateDB, you would need to follow these steps:

1. **Creating a BLOB Table**: Before adding a BLOB to CrateDB, you must create a BLOB table. This can be done using the Crate Shell (CraSh) by issuing an SQL statement like this:
   ```sh
   crash -c "create blob table myblobs clustered into 3 shards with (number_of_replicas=1)"
   ```
   Source: [CrateDB Documentation](https://cratedb.com/docs/crate/reference/en/5.6/general/blobs.html)

2. **Uploading BLOBs**: To upload a BLOB, you need to know the SHA1 hash of the BLOB upfront. You can compute the hash using a command like this in Python:
   ```sh
   python -c 'import hashlib;print(hashlib.sha1("contents".encode("utf-8")).hexdigest())'
   ```
   Once you have the hash, you can upload the BLOB by issuing a PUT request. 
   Source: [CrateDB Blog](https://cratedb.com/blog/using-crate-as-a-blobstore)

3. **Listing & Querying BLOBs**: To list all BLOBs inside a BLOB table, you can use a SELECT statement like this:
   ```sh
   crash -c "select digest, last_modified from blob.myblobs"
   ```
   This allows you to see the digests and modification timestamps of the BLOBs stored in the table.
   Source: [CrateDB Blog](https://cratedb.com/blog/using-crate-as-a-blobstore)

## Evaluation

In [44]:
# Questions used for the evaluation run.
questions = [
    "Write me example of using blobs?",
    "How to use BLOB store in CrateDB? and what are the benefits?",
    "How to limit permissions?",
    "How AWS marketplace works, and why I cannot see deployment in my account?",
    "What are edge regions and how to use them?",
    "What are recent blog posts about CrateDB?",
    "Write me example python code to use CrateDB?",
    "Write me example golang code to use CrateDB?",
    "create RAG search with CrateDB and OpenAI?",
    "how to alter table and add fulltext index?",
    "how to alter table and add vector type field that allows for KNN search?",
    "create table with fields ID, name, vector, and index vector field for KNN search?",
    "What are limits and limitations of CrateDB?",
    "What are the benefits of using CrateDB?",
    "Does index creation block write operations?",
    "Does crate supports conditional indices",
    "How to create ID field that is autoincremented?",
    "how to create analysers for fulltext search?",
    "give me information about password and admin",
    "Shared file system implementation of the BlobStoreRepository",
    "Is Cloud UI opensource?",
    "How to do fusion search and connect vector search with fulltext search",
    # The trailing comma matters: without it Python concatenates this literal
    # with the next one into a single, garbled question.
    # NOTE(review): "MATH" is possibly a typo for "MATCH" — confirm before rewording.
    "How to MATH fulltext ",
    "Benchmarks of CrateDB",
]

In [45]:
from langchain.evaluation import load_evaluator

# from langchain.evaluation.criteria import Criteria
# 
# criteria = [
#     Criteria.COHERENCE,
#     Criteria.RELEVANCE,
#     Criteria.DETAIL,
#     Criteria.HARMFULNESS,
#     Criteria.HELPFULNESS,
# ]

# Custom evaluation criteria (name -> description). The same dict is passed to
# every pairwise evaluator and later serialised into the rationale-extraction
# prompt.
criteria2 = {
    "runnable": "Software engineer who is looking for information can copy and paste the code and it should work",
    "relevance": "The answer is relevant to the question",
    "grounded": "The answer links to the source of the information",
    "coherence": "The answer is coherent and makes sense",
    "detail": "The answer is detailed and provides enough information",
    "helpfulness": "The answer is helpful and provides value",
    "direct": "The answer directly answers the question",
    "code": "The answer contains code that can be copied and pasted",
    "reference": "The answer contains reference to the source in form of URL",
}

# One "pairwise_string" evaluator per judge model, all sharing the criteria above.
evaluator_openai = load_evaluator("pairwise_string", criteria=criteria2, llm=model_openai)
evaluator_gemini = load_evaluator("pairwise_string", criteria=criteria2, llm=model_geminai)
evaluator_cloude = load_evaluator("pairwise_string", criteria=criteria2, llm=model_cloude)


This chain was only tested with GPT-4. Performance may be significantly worse with other models.
This chain was only tested with GPT-4. Performance may be significantly worse with other models.
This chain was only tested with GPT-4. Performance may be significantly worse with other models.


In [46]:
# Single-question trial: answer with chain_a and chain_b, then have the
# Claude-backed evaluator compare the two answers.
question = "Write me example of using blobs?"
prediction = chain_a.invoke(question)
prediction_b = chain_b.invoke(question)

# `ev` is indexed by 'reasoning' in a later cell.
ev = evaluator_cloude.evaluate_string_pairs(
    prediction=prediction,
    prediction_b=prediction_b,
    input=question
)



In [47]:
ev

{'reasoning': "In comparing the two responses, both provide relevant information on how to use BLOBs with CrateDB. However, Assistant A's response is more comprehensive and helpful for a software engineer looking to implement this functionality.\n\nAssistant A provides a clear, step-by-step example of creating a BLOB table using the Crate Shell, including the specific command to use. They also provide a runnable example of uploading a BLOB using a PUT request, including the curl command and the URL structure. Additionally, they include a direct link to the official CrateDB documentation for further reference.\n\nWhile Assistant B also provides the command to create a BLOB table, they do not provide a complete example of uploading a BLOB. Instead, they suggest using a Python one-liner to get the SHA1 hash, which, although helpful, is not as directly relevant to the question of using BLOBs with CrateDB. They mention that you can list, query, download, and delete BLOBs but do not provide 

In [48]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List


# Schema for a single rationale extracted from an evaluator's reasoning text:
# the rationale itself, the criterion category it relates to, and a score.
class InferredRational(BaseModel):
    rationale: str = Field(description="rational of the labeler")
    category: str = Field(description="category used for evaluation")
    # Score range is stated only in the description; nothing enforces it here.
    score: float = Field(
        description="Score -1.5 to 1.5. Where -1.5 is below expectations in the rational category, and 1.5 is above expectations in the rational category.")


# Top-level schema handed to the JSON output parser: a wrapper holding the
# extracted rationales under the "list" key (the key later cells index by).
class RationalList(BaseModel):
    list: List[InferredRational] = Field(description="list of rationals used to asses quality by labeler")


# JSON parser configured with the RationalList schema; it also supplies the
# format instructions embedded in the prompt below.
parser_ret = JsonOutputParser(pydantic_object=RationalList)

# Prompt for turning free-text evaluator reasoning into structured rationales.
# The criteria and the format instructions are fixed at definition time; only
# {reasoning} is supplied per call.
prompt_reg = PromptTemplate(
    template="Extract labeler rational following criterias {criterias}.\n{format_instructions}\n{reasoning}\n",
    input_variables=["reasoning"],
    partial_variables={
        "format_instructions": parser_ret.get_format_instructions(),
        "criterias": json.dumps(criteria2, indent=2)
    },
)

# prompt -> OpenAI chat model -> JSON parser
chain_reasoning_extract = prompt_reg | model_openai | parser_ret


def extract_reasoning(reasoning) -> "dict | None":
    """Extract structured rationales from an evaluator's free-text reasoning.

    Args:
        reasoning: reasoning text produced by a pairwise evaluator.

    Returns:
        Whatever ``chain_reasoning_extract`` returns (a dict carrying a "list"
        key in the run shown in this notebook), or ``None`` when the chain or
        the JSON parsing fails.
    """
    import logging

    try:
        return chain_reasoning_extract.invoke({"reasoning": reasoning})
    except Exception:
        # Best effort: one failed extraction must not abort the whole batch.
        # Only Exception is caught so KeyboardInterrupt / SystemExit still
        # stop the notebook, and the failure is logged instead of vanishing.
        logging.getLogger(__name__).warning("Rationale extraction failed", exc_info=True)
        return None

In [49]:
extract_reasoning(ev['reasoning'])

{'list': [{'rationale': 'Provides clear, step-by-step example of creating a BLOB table using Crate Shell',
   'category': 'detail',
   'score': 1.2},
  {'rationale': 'Includes runnable example of uploading a BLOB using PUT request',
   'category': 'runnable',
   'score': 1.4},
  {'rationale': 'Directly links to official CrateDB documentation for further reference',
   'category': 'reference',
   'score': 1.3},
  {'rationale': 'Answer is detailed, coherent, and directly answers the question with runnable code examples',
   'category': 'helpfulness',
   'score': 1.5}]}

In [50]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List


# Schema for one categorised question: the question text plus its tags.
# build_example later reads these as dict keys ("question", "tags").
class TaggedQuestion(BaseModel):
    question: str = Field(description="the original question")
    tags: List[str] = Field(description="the list of tags that best categorize the question")


# Top-level schema handed to the JSON output parser: a wrapper holding the
# categorised questions under the "list" key.
class TaggedQuestionList(BaseModel):
    list: List[TaggedQuestion] = Field(description="list of categorized questions")


# JSON parser configured with the TaggedQuestionList schema; it also supplies
# the format instructions embedded in the prompt below.
parser_cat = JsonOutputParser(pydantic_object=TaggedQuestionList)

# {questions} is a real per-call input variable; only the parser's format
# instructions are fixed at definition time. (Declaring "reasoning" as the
# input and baking the global question list into the partials made the
# function argument cosmetic.)
prompt_cat = PromptTemplate(
    template="Categorize question {questions}.\n{format_instructions}\n\n",
    input_variables=["questions"],
    partial_variables={
        "format_instructions": parser_cat.get_format_instructions(),
    },
)

# prompt -> OpenAI chat model -> JSON parser
chain_tag_question = prompt_cat | model_openai | parser_cat


def categorise_questions(questions: List[str]) -> "dict | None":
    """Ask the model to tag each question with categories.

    Args:
        questions: question strings to categorise; they are serialised as
            indented JSON before being placed into the prompt.

    Returns:
        Whatever ``chain_tag_question`` returns (a dict carrying a "list" key
        in the run shown in this notebook), or ``None`` when the chain or the
        JSON parsing fails.
    """
    import logging

    try:
        return chain_tag_question.invoke({"questions": json.dumps(questions, indent=2)})
    except Exception:
        # Only Exception is caught so KeyboardInterrupt / SystemExit still
        # stop the notebook; the failure is logged instead of vanishing.
        logging.getLogger(__name__).warning("Question categorisation failed", exc_info=True)
        return None

In [51]:
categorised_questions = categorise_questions(questions)
categorised_questions

{'list': [{'question': 'Write me example of using blobs?',
   'tags': ['database', 'blob']},
  {'question': 'How to use BLOB store in CrateDB? and what are the benefits?',
   'tags': ['database', 'blob', 'CrateDB']},
  {'question': 'How to limit permissions?', 'tags': ['security']},
  {'question': 'How AWS marketplace works, and why I cannot see deployment in my account?',
   'tags': ['AWS', 'deployment']},
  {'question': 'What are edge regions and how to use them?',
   'tags': ['edge computing']},
  {'question': 'What are recent blog posts about CrateDB?',
   'tags': ['CrateDB', 'blog']},
  {'question': 'Write me example python code to use CrateDB?',
   'tags': ['CrateDB', 'python']},
  {'question': 'Write me example golang code to use CrateDB?',
   'tags': ['CrateDB', 'golang']},
  {'question': 'create RAG search with CrateDB and OpenAI?',
   'tags': ['CrateDB', 'search']},
  {'question': 'how to alter table and add fulltext index?',
   'tags': ['database', 'indexing']},
  {'question

In [52]:
from typing import List
# setup LLM Comparator
import llm_comparator as llc
import concurrent.futures

def safe_evaluator(i, evaluators, prediction, prediction_b, input_text):
    """Run one pairwise evaluation round without letting a failure abort the batch.

    Even rounds present the answers as (prediction, prediction_b); odd rounds
    swap them. The evaluator for the round is ``evaluators[i % 3]``.

    Args:
        i: round index; selects both the evaluator and the presentation order.
        evaluators: sequence of objects exposing ``evaluate_string_pairs``.
        prediction: answer from variant A.
        prediction_b: answer from variant B.
        input_text: the question both answers respond to.

    Returns:
        The evaluator's result, or on any ``Exception`` a placeholder dict with
        a neutral score of 0.5, reasoning 'Evaluation failed' and value 'Error'.
    """
    is_flipped = i % 2 == 1
    first, second = (prediction_b, prediction) if is_flipped else (prediction, prediction_b)
    try:
        return evaluators[i % 3].evaluate_string_pairs(
            prediction=first,
            prediction_b=second,
            input=input_text
        )
    except Exception:
        # 0.5 is the midpoint of the 0..1 score scale, so a failed round
        # contributes 0 once the caller centres it with (score - 0.5) * 3.
        # A placeholder of 0 would instead count as a maximal preference.
        return {'score': 0.5, 'reasoning': 'Evaluation failed', 'value': 'Error'}
    
def build_example(index: int, question: TaggedQuestion, chain_a, chain_b, raters=3):
    """Answer one question with both chains and score the pair with LLM judges.

    Args:
        index: position of the example in the evaluation set.
        question: mapping with "question" (text) and "tags" keys.
        chain_a: chain producing answer A.
        chain_b: chain producing answer B.
        raters: number of rater pairs; ``raters * 2`` evaluation rounds are
            run, every odd round with the answers swapped.

    Returns:
        An ``llc.Example`` carrying both answers, the per-round rater scores
        and their mean as the overall score.
    """
    question_text = question['question']
    prediction = chain_a.invoke(question_text)
    prediction_b = chain_b.invoke(question_text)

    evaluators = [evaluator_openai, evaluator_cloude,
                  evaluator_openai, evaluator_cloude]

    def run_round(round_no):
        return safe_evaluator(round_no, evaluators, prediction, prediction_b, question_text)

    # Rounds are independent, so run them concurrently.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        evaluations = list(pool.map(run_round, range(raters * 2)))

    individual_rater_scores: List[llc.RaterScore] = []
    for round_no, verdict in enumerate(evaluations):
        is_flipped = round_no % 2 == 1
        # Map the evaluator's 0..1 score onto -1.5..1.5.
        centred = (verdict['score'] - 0.5) * 3
        individual_rater_scores.append(
            llc.RaterScore(
                is_flipped=is_flipped,
                # Flipped rounds saw the answers swapped, so negate the score.
                score=-centred if is_flipped else centred,
                rationale=verdict['reasoning'],
                rating_label=verdict['value']
            )
        )

    avg_score = sum([r.score for r in individual_rater_scores]) / len(individual_rater_scores)

    return llc.Example(
        index=index,
        input_text=question_text,
        tags=question["tags"],
        output_text_a=prediction,
        output_text_b=prediction_b,
        score=avg_score,
        custom_fields={},
        individual_rater_scores=individual_rater_scores,
    )


# Dataset-level metadata for the LLM Comparator export; no custom fields are declared.
metadata = llc.Metadata(
    source_path="Compare RAG search with CrateDB vector, fulltext, fusion and OpenAI",
    custom_fields_schema=[]
)

# Display names of the two variants being compared.
models: List[llc.Model] = [
    llc.Model(
        name="A",
    ),
    llc.Model(
        name="B",
    ),
]

# Build one example per categorised question, comparing chain_a against
# chain_b. Questions are processed one after another; each call invokes both
# chains and runs the evaluation rounds.
examples: List[llc.Example] = [build_example(i, x, chain_a, chain_b) for i, x in
                               enumerate(categorised_questions['list'])]




In [53]:
# Fixed set of rationale clusters. Their titles are embedded in a later cell
# and compared against each extracted rationale via measure_similarities.
rationale_clusters: List[llc.RationaleCluster] = [
    llc.RationaleCluster(
        title="provide more details",
    ),
    llc.RationaleCluster(
        title="is more engaging",
    ),
    llc.RationaleCluster(
        title="directly answers the question",
    ),
    llc.RationaleCluster(
        title="has more references",
    ),
    llc.RationaleCluster(
        title="has code examples",
    ),
]



In [54]:
from numpy import dot
from numpy.linalg import norm


def measure_similarities(rational_cluster_emb: List[List[float]], rational_emb: List[float]) -> List[float]:
    """Cosine similarity between one rationale embedding and each cluster embedding.

    Args:
        rational_cluster_emb: one embedding vector per rationale cluster.
        rational_emb: embedding vector of a single rationale.

    Returns:
        A flat list with one similarity value per cluster, in input order
        (empty when no cluster embeddings are given).
    """
    b = rational_emb
    # The rationale's norm is the same for every cluster; compute it once.
    norm_b = norm(b)
    return [
        dot(a, b) / (norm(a) * norm_b) for a in rational_cluster_emb
    ]



In [55]:
# Enhance each example with rationale_list.
# The cluster-title embeddings do not depend on the example, so embed them once
# instead of once per example.
rationale_clusters_emb_list = embeddings.embed_documents([r.title for r in rationale_clusters])

for example in examples:
    # Extract structured rationales from every rater's reasoning concurrently.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        extracted_list = list(executor.map(
            lambda rater_score: extract_reasoning(rater_score.rationale),
            example.individual_rater_scores
        ))

    # Collect the rationales of all raters. Assigning rationale_list inside the
    # loop would keep only the last successful rater's rationales.
    collected_rationales = []
    for extracted in extracted_list:
        # Skip failed extractions (None) and results without any rationale.
        if extracted and 'list' in extracted and len(extracted['list']) > 0:
            collected_rationales.extend(
                llc.Rational(
                    rationale=r['rationale'],
                    paraphrased_rationales=[],
                    similarities=measure_similarities(rationale_clusters_emb_list,
                                                      embeddings.embed_query(r['rationale'])),
                ) for r in extracted['list']
            )

    # Leave the example untouched when nothing could be extracted.
    if collected_rationales:
        example.rationale_list = collected_rationales


In [56]:
# Bundle metadata, model names, examples and rationale clusters into the
# LLM Comparator data object.
llcset = llc.LLMComparatorData(
    metadata=metadata,
    models=models,
    examples=examples,
    rationale_clusters=rationale_clusters,
)

# Write the JSON as UTF-8 explicitly so the file does not depend on the
# platform's default text encoding.
with open("eval.json", "w", encoding="utf-8") as f:
    f.write(llcset.to_json())

# Analysis of evaluation results
- Which variant was preferred more often?
- What is the probability that people presented with variant A rather than variant B will be happier?
- Which prediction variant performed better on average?
- What is the distribution of scores for each prediction variant?