# Compare embeddings performance

We use different approaches to create embeddings from the same texts and compare their performance.

## Dependencies

To run the required vector database locally, use this command:  
`docker run --name 04-compare-embeddings-demo-vectordb -p 6333:6333 -p 6334:6334 -d qdrant/qdrant`

In [1]:
%pip install chromadb
%pip install langchain
%pip install langchain-community
%pip install langchain-chroma
%pip install langchain-huggingface
%pip install langchain-openai
%pip install pickleshare
%pip install qdrant-client
%pip install tabulate

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting qdrant-client
  Downloading qdrant_client-1.13.3-py3-none-any.whl.metadata (10 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant-client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<5.0dev,>=4.21.6 (from grpcio-tools>=1.41.0->qdrant-client)
  Downloading protobuf-4.25.6-cp37-abi3-macosx_10_9_universal2.whl.metadata (541 bytes)
Collecting h2<5,>=3 (from httpx[http2]>=0.20.0->qdrant-client)
  Downloading h2-4.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting hyperframe<7,>=6.1 

## Configuration:

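Please select the models to use: `llm_model` is used for the LLM transformations and `embeddings_model` for creating the embeddings. Note that the cells below only implement the `openai` source; `hf` is not wired up.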

In [2]:
# Provider selection. NOTE(review): in the cells shown in this notebook,
# llm_source is never read (the LLM is always created with ChatOpenAI) and
# embedding_source is only handled for the value "openai".
llm_source = "openai" # "openai", or "hf" for Hugging Face
embedding_source = "openai" # "openai", or "hf" for Hugging Face

# Chat model used for the question/answer/summary transformations; the name is
# also part of the transform cache file names and the collection names.
llm_model = "gpt-4o"
# Sampling temperature passed to the chat model.
temperature = 0

# Embedding model name; also part of the collection names.
embeddings_model = "text-embedding-3-large"

# Directory that is searched recursively for *.md files to index.
markdown_documents_path = "../../../../tt-readme"

# True: skip loading the markdown files and read ./cache/split_documents.pickle instead.
use_cached_documents = False
# True: read the transformed documents from the cache directory instead of calling the LLM.
use_cached_transforms = False
# True: embed all document variants and write them to their collections.
reindex_documents = True

## Test different approaches of indexing

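For each document chunk, this notebook will
- generate three questions that the chunk answers,
- generate a short explanation of what the chunk answers, and
- generate a short summary of the chunk.

Each variant, as well as the unmodified chunk, is then embedded and stored in its own collection so that the search results can be compared.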

## Load and split markdown contents of the TT Readme


In [3]:
if use_cached_documents:
    print("Skipping loading documents from markdown files")
else:
    from langchain.document_loaders import DirectoryLoader, TextLoader
    from langchain.text_splitter import MarkdownHeaderTextSplitter

    # Load every markdown file below the configured directory as plain text.
    readme_documents = DirectoryLoader(
        markdown_documents_path,
        glob="**/*.md",
        loader_cls=TextLoader,
    ).load()

    # Split each file into sections at first- and second-level headers.
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]
    splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

    split_documents = []
    for doc in readme_documents:
        # split_text always returns a list of Documents, so no special case
        # for a single (non-list) result is needed.
        for section in splitter.split_text(doc.page_content):
            # Carry the file-level metadata over to each section.
            section.metadata.update(doc.metadata)
            split_documents.append(section)

    # For brevity, reduce amount of entries to a few only
    # split_documents = split_documents[50:60]

    # Give every section a 1-based index so that search hits can be traced
    # back to it, and keep the untransformed text in the metadata.
    for index, doc in enumerate(split_documents, start=1):
        doc.metadata["index"] = index
        doc.metadata["original_content"] = doc.page_content

### Persist the data to files or load cached files

In [4]:
import os
import pickle

split_documents_cache_file = "./cache/split_documents.pickle"

if use_cached_documents:
    print("Loading documents from file")
    # NOTE: pickle.load can execute arbitrary code contained in the file.
    # Only load cache files that were written by this notebook.
    with open(split_documents_cache_file, "rb") as f:
        split_documents = pickle.load(f)
else:
    print("Writing documents to file")
    # The cache directory does not exist on a fresh checkout; create it so
    # that opening the file for writing does not fail.
    os.makedirs(os.path.dirname(split_documents_cache_file), exist_ok=True)
    with open(split_documents_cache_file, "wb") as f:
        pickle.dump(split_documents, f)

Writing documents to file


## Massage content into new embedding documents

In [5]:
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Chat model shared by all transformation chains.
llm = ChatOpenAI(temperature=temperature, model=llm_model)


def build_chain(prompt):
    """Create an LLMChain for the given prompt text.

    Args:
        prompt: Prompt template string; it must contain the placeholder
            ``{input}``, which is filled with the document text.

    Returns:
        An ``LLMChain`` that combines the shared ``llm`` with the prompt.
    """
    prompt_template = PromptTemplate(template=prompt, input_variables=["input"])
    return LLMChain(prompt=prompt_template, llm=llm)


# One chain per document variant. The prompts are German because the indexed
# documents and the test queries are German.
question_chain = build_chain(
    "Formuliere drei verschiedene deutsche Fragen, die der folgende Text beantwortet: {input}"
)
answer_chain = build_chain(
    "Erkläre in zwei bis drei deutschen Sätzen, was der folgende Text beantwortet: {input}"
)
summarize_chain = build_chain(
    "Erstelle eine kurze deutsche Zusammenfassung des folgenden Textes: {input}"
)

  return LLMChain(llm=llm, prompt=PromptTemplate(input_variables=["input"], template=prompt))


In [6]:
import copy
import os


def transform_documents(chain, file):
    """Run every split document through ``chain`` and cache the result.

    When ``use_cached_transforms`` is set, the documents are read from the
    cache file instead and the chain is not called.

    Args:
        chain: Chain created by ``build_chain``; it is invoked with the
            document text as ``input``.
        file: Short name of the variant (e.g. "questions"); used in the log
            output and in the cache file name.

    Returns:
        A list of document copies whose ``page_content`` is the chain output
        and whose metadata ``original_content`` holds the untransformed text.
    """
    cache_file = f"cache/{llm_model}_{file}_documents.pickle"

    if use_cached_transforms:
        print(f"Loading cached file {file}")
        # NOTE: pickle.load can execute arbitrary code contained in the file.
        # Only load cache files that were written by this notebook.
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    # Create the cache directory up front: failing on a missing directory
    # only after all LLM calls would throw the paid-for results away.
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)

    # Work on copies so that the untransformed documents stay intact.
    result = copy.deepcopy(split_documents)
    for doc in result:
        print(f"Transforming {file} document {doc.metadata['index']} with model {llm_model}")
        doc.metadata["original_content"] = doc.page_content
        # chain.run is deprecated; invoke returns a dict from which the
        # chain's first output key holds the generated text.
        output = chain.invoke({"input": doc.page_content})
        doc.page_content = output[chain.output_keys[0]]

    print(f"Writing {file} documents from model {llm_model} to file")
    with open(cache_file, "wb") as f:
        pickle.dump(result, f)
    return result

# Create (or load from cache) the three transformed variants of the documents.
question_documents = transform_documents(chain=question_chain, file="questions")
answer_documents = transform_documents(chain=answer_chain, file="answers")
summary_documents = transform_documents(chain=summarize_chain, file="summaries")

Transforming questions document 1 with model gpt-4o


  doc.page_content = chain.run(doc.page_content)


Transforming questions document 2 with model gpt-4o
Transforming questions document 3 with model gpt-4o
Transforming questions document 4 with model gpt-4o
Transforming questions document 5 with model gpt-4o
Transforming questions document 6 with model gpt-4o
Transforming questions document 7 with model gpt-4o
Transforming questions document 8 with model gpt-4o
Transforming questions document 9 with model gpt-4o
Transforming questions document 10 with model gpt-4o
Transforming questions document 11 with model gpt-4o
Transforming questions document 12 with model gpt-4o
Transforming questions document 13 with model gpt-4o
Transforming questions document 14 with model gpt-4o
Transforming questions document 15 with model gpt-4o
Transforming questions document 16 with model gpt-4o
Transforming questions document 17 with model gpt-4o
Transforming questions document 18 with model gpt-4o
Transforming questions document 19 with model gpt-4o
Transforming questions document 20 with model gpt-4o
T

## Prepare Embeddings model

In [7]:
from langchain_openai import OpenAIEmbeddings

# Reset first so that a stale model from an earlier run is never reused.
embeddings = None

if embedding_source == "openai":
    embeddings = OpenAIEmbeddings(model=embeddings_model)
else:
    # Fail here with a clear message instead of passing None to the vector
    # store, which would only fail later with an obscure error.
    raise ValueError(
        f"Unsupported embedding_source {embedding_source!r}: "
        "only 'openai' is implemented in this notebook"
    )

## Prepare store

In [8]:
from langchain.vectorstores import Qdrant

def store(documents, collection_name):
    """Embed ``documents`` and write them to a collection of the local Qdrant.

    ``force_recreate=True`` is passed to ``Qdrant.from_documents``
    (presumably so that an existing collection of the same name is rebuilt).

    Args:
        documents: Documents to embed with the global ``embeddings`` model.
        collection_name: Name of the target collection.
    """
    qdrant_options = {
        "url": "http://localhost:6333",
        "embedding": embeddings,
        "collection_name": collection_name,
        "force_recreate": True,
    }
    Qdrant.from_documents(documents, **qdrant_options)

# Collection names consist of the embedding model, the LLM model and a
# one-letter suffix for the stored variant:
#   p = unmodified ("pure") documents, q = questions, a = answers, s = summaries
pure_collection = f"{embeddings_model}-{llm_model}-p"
question_collection = f"{embeddings_model}-{llm_model}-q"
answer_collection = f"{embeddings_model}-{llm_model}-a"
summary_collection = f"{embeddings_model}-{llm_model}-s"

# All collection names in one list.
collections = [pure_collection, question_collection, answer_collection, summary_collection]

## Create embeddings and store them in different collections

In [9]:
if reindex_documents:
    # One collection per document variant, starting with the unmodified texts.
    store(documents=split_documents, collection_name=pure_collection)
    store(documents=question_documents, collection_name=question_collection)
    store(documents=answer_documents, collection_name=answer_collection)
    store(documents=summary_documents, collection_name=summary_collection)

## Search with a query in the different indexes

You can look at the Qdrant collections and data at http://localhost:6333/dashboard

In [10]:
# German test queries. Rough English translations:
#   1. "What do I do if I have missed my last train?"
#   2. "After how many years can I replace my notebook?"
#   3. "What is MITOD?"
queries = [
    "Was mache ich, wenn ich meinen letzten Zug verpasst habe?",
    "Nach wie vielen Jahren kann ich mein Notebook erneuern?",
    "Was ist MITOD?",
]

In [11]:
from qdrant_client import QdrantClient

# Client for the locally running Qdrant instance.
client = QdrantClient("http://localhost:6333")


def search(collection, query):
    """Search one collection for the documents most similar to ``query``.

    Args:
        collection: Name of the Qdrant collection to search.
        query: Query text; it is embedded with the global ``embeddings`` model.

    Returns:
        A list of ``(document, score)`` tuples as returned by the vector store.
    """
    vector_store = Qdrant(client, collection, embeddings)
    # Use the public API rather than the underscore-prefixed internal method.
    return vector_store.similarity_search_with_relevance_scores(query)

collections = [pure_collection, question_collection, answer_collection, summary_collection]

# First row is the header: the collection column followed by one column per query.
result_table = [["Collection"] + queries]

for collection in collections:
    cells = []
    for query in queries:
        print(f"Searching {collection} for {query}")
        hits = search(collection, query)

        # One line per hit: "<document index> - <score>".
        hit_lines = [f"{document.metadata['index']} - {score}" for document, score in hits]
        cells.append("\n".join(hit_lines))

    result_table.append([collection, *cells])

Searching text-embedding-3-large-gpt-4o-p for Was mache ich, wenn ich meinen letzten Zug verpasst habe?


  return Qdrant(client, collection, embeddings)._similarity_search_with_relevance_scores(query)


Searching text-embedding-3-large-gpt-4o-p for Nach wie vielen Jahren kann ich mein Notebook erneuern?
Searching text-embedding-3-large-gpt-4o-p for Was ist MITOD?
Searching text-embedding-3-large-gpt-4o-q for Was mache ich, wenn ich meinen letzten Zug verpasst habe?
Searching text-embedding-3-large-gpt-4o-q for Nach wie vielen Jahren kann ich mein Notebook erneuern?
Searching text-embedding-3-large-gpt-4o-q for Was ist MITOD?
Searching text-embedding-3-large-gpt-4o-a for Was mache ich, wenn ich meinen letzten Zug verpasst habe?
Searching text-embedding-3-large-gpt-4o-a for Nach wie vielen Jahren kann ich mein Notebook erneuern?
Searching text-embedding-3-large-gpt-4o-a for Was ist MITOD?
Searching text-embedding-3-large-gpt-4o-s for Was mache ich, wenn ich meinen letzten Zug verpasst habe?
Searching text-embedding-3-large-gpt-4o-s for Nach wie vielen Jahren kann ich mein Notebook erneuern?
Searching text-embedding-3-large-gpt-4o-s for Was ist MITOD?


In [12]:
from tabulate import tabulate

# Render the comparison as a text grid; the first row holds the column headers.
table_text = tabulate(result_table, headers="firstrow", tablefmt="grid")
print(table_text)

+---------------------------------+-------------------------------------------------------------+-----------------------------------------------------------+------------------+
| Collection                      | Was mache ich, wenn ich meinen letzten Zug verpasst habe?   | Nach wie vielen Jahren kann ich mein Notebook erneuern?   | Was ist MITOD?   |
| text-embedding-3-large-gpt-4o-p | 198 - 0.5888042                                             | 100 - 0.36935392                                          | 140 - 0.51360524 |
|                                 | 200 - 0.45595053                                            | 160 - 0.36125535                                          | 139 - 0.45612168 |
|                                 | 197 - 0.37155625                                            | 159 - 0.3516118                                           | 41 - 0.3039658   |
|                                 | 199 - 0.36761343                                            | 99 - 0.33544755  

# To check a result, set `found_index` in the following cell to an index from the table above and run it

In [None]:
found_index = 156


def find_document(documents, index):
    """Return the first document whose metadata "index" equals ``index``.

    Returns None when no document matches, so that a missing index can never
    be confused with the hit of a previous lookup.
    """
    return next((doc for doc in documents if doc.metadata.get("index") == index), None)


# Show the original document together with its metadata.
found_document = find_document(split_documents, found_index)
if found_document is None:
    print(f"No document with index {found_index} found\n\n")
else:
    print(f"{found_document.page_content}\n\n")
    print(f"{found_document.metadata}\n\n")

# Show the transformed variants of the same document.
for label, documents in (
    ("Questions", question_documents),
    ("Answers", answer_documents),
    ("Summary", summary_documents),
):
    variant_document = find_document(documents, found_index)
    if variant_document is None:
        print(f"{label}: no document with index {found_index} found\n\n")
    else:
        print(f"{label}: {variant_document.page_content}\n\n")
