# Pipeline Testing
This file will be steps that will be required to convert uploaded files to MM vector embeddings using voyage-multimodal-3.

## Imports
Importing necessary files and getting important env vars below, as well as creating references to all necessary databases and OpenAI

In [17]:
%%capture

import os, PIL
import numpy as np
from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient, models
from qdrant_client.models import Distance
from pymongo import MongoClient
from voyageai import Client
from langchain_experimental.text_splitter import SemanticChunker
from langchain_voyageai.embeddings import VoyageAIEmbeddings
import docx2txt

Just separating imports and env variables/initializations.

In [6]:
# load env
load_dotenv()
MONGO_USER = os.getenv("MONGO_USER")
MONGO_PWD = os.getenv("MONGO_PWD")
QDRANT_API_URL = os.getenv("QDRANT_API_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_KEY")
VOYAGE_API_KEY = os.getenv("VOYAGE_KEY")

VOYAGE_BATCH_SIZE = 10

uri = f"mongodb+srv://{MONGO_USER}:{MONGO_PWD}@passionaibot.4dwr2me.mongodb.net/?retryWrites=true&w=majority&appName=PassionAIBot"
# Create a new client and connect to the server
mongo_client = MongoClient(uri)

# Send a ping to confirm a successful connection
try:
    mongo_client.admin.command('ping')
    print("Successfully pinged MongoDB deployment.")
except Exception as e:
    print(e)

# connect to API database
_db = mongo_client['PassionAIDB_API']

# create reference for user database
_tests = _db['tests']
_users = _db['users']
_groups = _db['groups']
_api_users = _db['api_users']
_access_tokens = _db['access_tokens']

# initialize Qdrant Vector DB
_qdrant_client = QdrantClient(
    url=QDRANT_API_URL, 
    api_key=QDRANT_API_KEY,
)

print(_qdrant_client.get_collections())

# create OpenAI client
_openai_client = OpenAI(api_key=OPENAI_API_KEY)

# get voyage AI
_voyage_client = Client(api_key=VOYAGE_API_KEY)

# create semantic chunker instance with voyage AI
_text_splitter = SemanticChunker(
    embeddings=VoyageAIEmbeddings(
        batch_size=VOYAGE_BATCH_SIZE,
        model="voyage-2",
        voyage_api_key=VOYAGE_API_KEY
    ),
    breakpoint_threshold_type="gradient",
    breakpoint_threshold_amount=85.0,
    min_chunk_size=300
)

Successfully pinged MongoDB deployment.
collections=[]


## Processing Files
Taking uploaded files, converting to screenshot format.
Needs to convert:
- Text into screenshots
- Images into screenshots without too much data
- Audio should be transcribed & chunked, embed text and/or audio chunks
- Video should have key frames sampled, transcribed audio

### Processing Text
I need to find a more effective way to chunk text documents. I will first use VoyageAI's multimodal on some text documents to test.
This step is not necessary in the final sequence, it is just to test that VoyageAI embeddings can be generated.

In [None]:
# example inputs
inputs = [
    {
        "content":
        [
            {
                "type": "text",
                "text": "A kitten is a juvenile cat. After being born, kittens display primary altriciality and are fully dependent on their mothers for survival. They normally do not open their eyes for seven to ten days. After about two weeks, kittens develop quickly and begin to explore the world outside their nest. After a further three to four weeks, they begin to eat solid food and grow baby teeth. Domestic kittens are highly social animals and usually enjoy human companionship."
            }
        ]
    }
]
inputs_2 = [
    {
        "content":
        [
            {
                "type": "text",
                "text": "The cat (Felis catus), also referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal. It is the only domesticated species of the family Felidae. Advances in archaeology and genetics have shown that the domestication of the cat occurred in the Near East around 7500 BC. It is commonly kept as a pet and farm cat, but also ranges freely as a feral cat avoiding human contact. It is valued by humans for companionship and its ability to kill vermin. Its retractable claws are adapted to killing small prey species such as mice and rats. It has a strong, flexible body, quick reflexes, and sharp teeth, and its night vision and sense of smell are well developed. It is a social species, but a solitary hunter and a crepuscular predator."
            }
        ]
    },
    {
        "content":
        [
            {
                "type": "text",
                "text": "What is a cat?"
            }
        ]
    }
]
inputs_3 = [["The cat (Felis catus), also referred to as the domestic cat or house cat, is a small domesticated carnivorous mammal."], ["What is a cat?"], ["Thus"], ["That"], ["Bob"]]

image_input = [
    {
        "content":
        [
            {
                "type": "image_url",
                "image_url": "https://d2zp5xs5cp8zlg.cloudfront.net/image-86754-800.jpg"
            }
        ]
    }
]

# vectorize inputs
text_res = _voyage_client.multimodal_embed(
    inputs=inputs_3, 
    model="voyage-multimodal-3",
    input_type="document"
)
# text_res_2 = _voyage_client.multimodal_embed(inputs_2, model="voyage-multimodal-3")
# image_res = _voyage_client.multimodal_embed(image_input, model="voyage-multimodal-3")

Testing the resulting embeddings.

In [None]:
def get_data(voyager_output):
    print(f"Embeddings: {voyager_output.embeddings}\nImage pixels: {voyager_output.image_pixels}\nText tokens: {voyager_output.text_tokens}\nTotal tokens: {voyager_output.total_tokens}")

get_data(text_res)
print(len(text_res.embeddings))

def cosine(vec1, vec2):
    vec1 = np.array(vec1)
    vec2 = np.array(vec2)
    return np.dot(vec1, vec2)

# print(cosine(text_res.embeddings[0], text_res_2.embeddings[0]))
# print(cosine(text_res.embeddings[0], image_res.embeddings[0]))
# print(cosine(text_res_2.embeddings[0], image_res.embeddings[0]))
    

Embeddings: [[-0.00732421875, 0.00616455078125, 0.044921875, 0.0185546875, -0.025390625, 0.0242919921875, -0.031982421875, -0.004150390625, 0.0172119140625, -0.009521484375, -0.01806640625, 0.00482177734375, 0.005889892578125, 0.00159454345703125, 0.0341796875, 0.036376953125, 0.0002422332763671875, 0.0216064453125, -0.007537841796875, -0.0184326171875, -0.01470947265625, -0.0052490234375, -0.01171875, 0.0263671875, -0.01556396484375, -0.04296875, 0.024169921875, 0.01458740234375, 0.0306396484375, 0.048828125, 0.007232666015625, 0.01373291015625, -0.034912109375, -0.01416015625, -0.010986328125, -0.0272216796875, -0.006500244140625, -0.023193359375, 0.013427734375, 0.010986328125, -0.00970458984375, -0.01202392578125, -0.01031494140625, -0.0277099609375, 0.09326171875, -0.01348876953125, 0.037841796875, -0.044677734375, -0.005218505859375, 0.0267333984375, 0.029541015625, -0.03857421875, 0.004180908203125, -0.0498046875, 0.019287109375, -0.009521484375, -0.028564453125, 0.0279541015625

### Splitting documents
I will be using LangChain's SemanticChunker.
We will read a text file with minimal formatting, run it through the SemanticChunker, and then process them into dictionaries containing the embedding and associated text content.

In [156]:
files = []
path = "embedding_docs\passions"
for file in os.listdir(path):
    if (file.endswith(".docx")):
        files.append(file)

text_contents = []
for file in files:
    text = docx2txt.process(f"{path}\{file}")
    text_contents.append(text)

docs = _text_splitter.create_documents(text_contents)

# with open(f"{path}\test.txt", "r", encoding="utf-8") as f:
#     text = f.read()
#     docs = _text_splitter.create_documents([text])
    

In [None]:
# get only text chunks formatted for embedding model
chunks = [[doc.page_content] for doc in docs if doc.page_content]
chunks
# t = [len(c[0]) for c in chunks]
# t

We now create embeddings for each of the documents and format them to be stored in our Pinecone database.


In [None]:
# get only text chunks formatted for embedding model
chunks = [[doc.page_content] for doc in docs if doc.page_content]

# get multimodal embeddings
page_res = _voyage_client.multimodal_embed(
    inputs=chunks, 
    model="voyage-multimodal-3",
    input_type="document"
)

# extract embeddings
total_tokens = page_res.total_tokens
docs = list()
for i in range(len(chunks)):
    docs.append({
        "embeddings": page_res.embeddings[i],
        "text": chunks[i]
    })
print(f"{total_tokens} tokens used for {len(docs)} embeddings.")
print(docs)




2708 tokens used for 18 embeddings.
[{'embeddings': [0.00537109375, -0.01422119140625, -0.002899169921875, -0.0142822265625, 0.00347900390625, 0.0308837890625, -0.03955078125, -0.033935546875, -0.00958251953125, -0.01483154296875, -0.007049560546875, 0.0169677734375, 0.0274658203125, -0.00762939453125, 0.01495361328125, -0.0036773681640625, 0.026611328125, -0.0191650390625, 0.0673828125, 0.048583984375, 0.0169677734375, 0.0106201171875, 0.00146484375, -0.000370025634765625, -0.01470947265625, 0.06787109375, -0.0361328125, -0.04736328125, -0.05126953125, -0.00390625, 0.018310546875, -0.0654296875, 0.049072265625, -0.00457763671875, -0.001678466796875, 0.01275634765625, 0.0038604736328125, -0.00994873046875, -0.0177001953125, 0.04345703125, -0.033935546875, 0.00836181640625, -0.0220947265625, -0.0098876953125, -0.037841796875, -0.080078125, -0.051025390625, 0.0284423828125, -0.0079345703125, 0.048828125, 0.0185546875, -0.00933837890625, -0.005859375, -0.06396484375, -0.029052734375, 0.00

In [None]:
vectors = []
for embed in data:
    count += 1
    print(f"Count: {count}")

    embedding_response = pinecone_client.embeddings.create(input=embed, model='text-embedding-3-small', dimensions=1536)
    embedding_response_dict = embedding_response.model_dump()
    
    embedding = embedding_response_dict['data'][0]['embedding']
    token_usage = embedding_response_dict['usage']['total_tokens']

    vectors.append({
        'id': str(count),
        'values': embedding,
        'metadata': {
            'token_ct': token_usage,
            'text': embed,
            'group': 'individual_comparisons'
        }
    })

# upsert vectors
_pai_index.upsert(vectors=vectors)

## Embedding Pages

Using the `voyage-multimodal-3` embedding model, we can set up a function which will create a dense embedding, as well as using the `Qdrant/bm25` sparse embedding model to create a sparse embedding. Using Qdrant's `models.PointStruct`, we can create a template that contains the two vectors as well as a `payload`, which will include identifying information about the original file.

In [None]:
# for base64 encoding
import base64
from io import BytesIO

# import local BM-25 embedding model
from fastembed import SparseTextEmbedding

bm25_embed = SparseTextEmbedding(model_name="Qdrant/bm25")

def img_to_bytes(img: Image, format: str = "PNG") -> str:
    """Takes a Pillow image and converts it into a base64-encoded string

    Args:
        img (Image): The image to convert
        format (str, Optional): The format of the image. Defaults to "PNG"

    Returns:
        str: The base64-encoded string
    """
    buffer = BytesIO()
    img.save(buffer, format=format)
    img_str = base64.b64encode(buffer.getvalue())

    return img_str

def create_embedding_dense(img: Image = None, text: str = None):
    """Create a dense vector with multimodal data

    Args:
        img (Image, optional): The image to create the embedding with. Defaults to None.
        text (str, optional): The text to create the embedding with. Defaults to None.
    """
    if not (img or text):
        raise ValueError("You must provide either an image, text, or both!")

    inputs = []

    # append proper input based on what was specified
    if img:
        inputs.append(
            {
                "content":
                [
                    {
                        "type": "image_base64",
                        "image": img_to_bytes(img=img) # format: default PNG
                    }
                ]
            }
        )
    if text:
        inputs.append(
            {
                "content":
                [
                    {
                        "type": "text",
                        "text": text
                    }
                ]
            }
        )

    embedding = _voyage_client.multimodal_embed(inputs, model="voyage-multimodal-3")

def create_embedding_sparse(text: str):
    """Create a BM25 sparse embedding

    Args:
        text (str): The text to create an embedding for

    Returns:
        SparseEmbedding: the resulting BM25 embedding
    """
    embedding = bm25_embed.embed(text)
    return list(embedding)

## Qdrant Database Setup

This section will handle the set up of the Qdrant database with a hybrid retrieval using both BM-25 sparse vector ranking and a standard dense embedding vector created from Voyage AI

In [24]:
COLLECTION_NAME = "passionai-db"

if not _qdrant_client.collection_exists(COLLECTION_NAME):
    _qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config={
            "dense": models.VectorParams(size=1024, distance=Distance.COSINE)
        },
        sparse_vectors_config={
            "bm25": models.SparseVectorParams()
        }
    )

### Example Query
An example query that leverages a hybrid search with both BM-25 and embedding dense vectors.

In [None]:
result = _qdrant_client.query_points(
    collection_name=COLLECTION_NAME,
    prefetch=[
        models.Prefetch(
            query=models.SparseVector(), # sparse vector created from embedding
            using="bm25",
            limit=20,
        ),
        models.Prefetch(
            query=[], # dense vector
            using="dense",
            limit=20,
        )
    ],
    query=models.FusionQuery(fusion=models.Fusion.RRF), # query using reciprocal rank fusion
)

NameError: name 'Prefetch' is not defined