In [None]:
import os
from models.file import File, FileType
from service.embedding import EmbeddingService
from termcolor import colored

# Pinecone settings are read from the environment. A variable that is not
# set falls back to an empty string instead of raising.
PINECONE_INDEX = os.environ.get("PINECONE_INDEX", "")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_HOST = os.environ.get("PINECONE_HOST", "")

# Sample input for the cells below: a PDF referenced by URL (hosted on arXiv).
file = File(url="https://arxiv.org/pdf/2210.03629.pdf", type=FileType.pdf)
# Credentials passed to EmbeddingService as `vector_credentials`.
# Keep this a plain dict: a trailing comma after the closing brace would
# silently turn the value into a one-element tuple `({...},)`.
vector_credentials = {
    "type": "pinecone",
    "config": {
        "api_key": PINECONE_API_KEY,
        "host": PINECONE_HOST,
    },
}

# Embedding service used by the cells below, built from the single sample
# file, the index name read from the environment, and the vector credentials.
embedding_service = EmbeddingService(
    files=[file],
    index_name=PINECONE_INDEX,
    vector_credentials=vector_credentials,
)

In [None]:
# Partition the sample file using the "auto" strategy.
# `_partition_file` is a private method (leading underscore) called directly
# here for inspection. The top-level `await` relies on the notebook kernel's
# async support and is not valid in a plain Python script.
elements = await embedding_service._partition_file(file, strategy="auto")


In [None]:
# Print the concrete class of every partitioned element, one per line, to see
# which kinds of element the partitioning step produced.
for partitioned in elements:
    print(type(partitioned))
    # print(f"Text: {partitioned.text}")


In [None]:
# Run chunk generation with the "auto" strategy. The result is iterated in the
# next cell, where each item's `.content` attribute is read.
docs = await embedding_service.generate_chunks(strategy="auto")

In [None]:
# Keep only the `content` of each generated chunk, in the original order.
texts = [chunk_doc.content for chunk_doc in docs]

In [None]:
# Palette that is cycled through so neighbouring chunks print in different
# colours, which makes the chunk boundaries visible in the output.
colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan', 'white']

for i, chunk in enumerate(texts):
    # Wrap around to the start of the palette once it is exhausted.
    print(colored(chunk, colors[i % len(colors)]))

# Every chunk followed by a single space, so the result ends with a trailing
# space whenever there is at least one chunk.
concatenated_document = "".join(chunk + " " for chunk in texts)

# print("\nConcatenated Document:\n", concatenated_document)