In [1]:
# Warning control: install a blanket "ignore" filter so no Python warnings are
# shown by the cells that follow.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported below -- confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [2]:
import nltk
# Fetch the 'punkt' and 'punkt_tab' NLTK packages; when they are already
# installed the call only reports that they are up to date.
nltk.download(['punkt', 'punkt_tab'])

[nltk_data] Downloading package punkt to /home/work/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/work/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
from IPython.display import JSON

import json

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements, elements_to_json
from unstructured.partition.text import partition_text
from unstructured.partition.docx import partition_docx
from unstructured.partition.doc import partition_doc
from unstructured.partition.image import partition_image

from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title


In [4]:
import logging
import sys
import os

import qdrant_client
from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.fastembed import FastEmbedEmbedding
from llama_index.core import Settings

import hashlib

In [5]:
from qdrant_client import QdrantClient, models

In [6]:
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [7]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings


In [8]:
import openai

In [9]:
# Qdrant connection settings.
# The API key is read from the environment so the secret never has to be typed
# into (and saved with) the notebook. Both values fall back to the previous
# literals, so behaviour is unchanged when the variables are not set.
qdrant_api_key = os.environ.get("QDRANT_API_KEY", "")
qdrant_api_url = os.environ.get(
    "QDRANT_URL",
    "https://113d73aa-ce94-4871-8c26-bd7208c08a88.europe-west3-0.gcp.cloud.qdrant.io",
)

# Shared client used by createQdrantEmbeddings below.
client = QdrantClient(
    qdrant_api_url,
    api_key=qdrant_api_key,
)

Normalize Docs using unstructured

In [10]:
def createPartition(file_path, file_type):
    """Partition a file and return its elements as plain dictionaries.

    Args:
        file_path: Path of the file to partition.
        file_type: Accepted for interface compatibility but not used; the
            partitioner is always chosen from the lower-cased extension of
            ``file_path``.

    Returns:
        A list with one ``to_dict()`` result per element produced by
        ``createPartitionByFileType``.
    """
    _, suffix = os.path.splitext(file_path)
    return [
        part.to_dict()
        for part in createPartitionByFileType(file_path, suffix.lower())
    ]
 
        
def createPartitionByFileType(file_path, file_type):
    match file_type:
        case '.pdf':
            return partition_pdf(file_path)
        case '.doc':
            return partition_doc(file_path)
        case '.docx':
            return partition_docx(file_path)
        case '.jpeg' | '.png' | '.jpeg':
            return createImagePartition(file_path, False)
        case '.txt':
            return partition_text(file_path)
        case _:
            return partition(file_path)

def createImagePartition(file_path, isOcrEnabled):
    """Partition an image file, optionally requesting the OCR-only strategy.

    Args:
        file_path: Path of the image to partition.
        isOcrEnabled: When truthy, ``strategy="ocr_only"`` is passed to
            ``partition_image``; otherwise no strategy argument is passed.

    Returns:
        The result of ``partition_image``.
    """
    extra_options = {"strategy": "ocr_only"} if isOcrEnabled else {}
    return partition_image(file_path, **extra_options)
    


In [11]:
def normalizePDF(document_dict, partition_labels, chapter):
    """Map the ids of matching ``Title`` elements to ``chapter``.

    Args:
        document_dict: Iterable of element dictionaries carrying ``"text"``,
            ``"type"`` and ``"element_id"`` keys.
        partition_labels: Texts to look for.
        chapter: Value stored for every matching element id.

    Returns:
        A dict ``{element_id: chapter}`` with one entry per element whose text
        equals one of ``partition_labels`` and whose type is ``"Title"``.
    """
    return {
        entry["element_id"]: chapter
        for entry in document_dict
        if any(
            entry["text"] == wanted and entry["type"] == "Title"
            for wanted in partition_labels
        )
    }

Chunk

In [12]:
def createChunk(file_element, combine_text_under_n_chars=100, max_characters=3000):
    """Chunk partitioned elements with ``chunk_by_title``.

    Args:
        file_element: The elements to chunk; passed straight to
            ``chunk_by_title``.
        combine_text_under_n_chars: Forwarded to ``chunk_by_title``. Defaults
            to 100, the value that was previously hard-coded.
        max_characters: Forwarded to ``chunk_by_title``. Defaults to 3000, the
            value that was previously hard-coded.

    Returns:
        The chunks returned by ``chunk_by_title``.
    """
    return chunk_by_title(
        file_element,
        combine_text_under_n_chars=combine_text_under_n_chars,
        max_characters=max_characters,
    )

# Add metadata to chunks
def processChunk(chunks):
    """Convert chunk elements into ``Document`` objects with enriched metadata.

    For every chunk the metadata dict (``element.metadata.to_dict()``) is
    adjusted as follows:
      * the ``"languages"`` entry is removed when present;
      * ``"source"`` is set to the ``"filename"`` entry when one exists;
      * ``"hash_id"`` is set to ``createHash(element.id)``.

    Args:
        chunks: Iterable of elements exposing ``metadata.to_dict()``, ``text``
            and ``id``.

    Returns:
        A list with one ``Document`` per chunk, in input order.
    """
    documents = []
    for element in chunks:
        metadata = element.metadata.to_dict()
        # pop() instead of del: metadata without a "languages" entry must not
        # abort the whole batch with a KeyError.
        metadata.pop("languages", None)
        # Same reasoning for "filename": only mirror it into "source" when the
        # element actually carries one.
        filename = metadata.get("filename")
        if filename is not None:
            metadata["source"] = filename
        metadata["hash_id"] = createHash(element.id)
        documents.append(Document(page_content=element.text, metadata=metadata))
    return documents

def createHash(id):
    """Return the SHA-256 hex digest of ``id`` encoded as UTF-8.

    Args:
        id: The string to hash.

    Returns:
        A 64-character lowercase hexadecimal string.
    """
    # Feed the data through the constructor: ``hash.update()`` returns None,
    # so chaining ``.hexdigest()`` onto it raises AttributeError.
    return hashlib.sha256(id.encode('utf-8')).hexdigest()

Create Embeddings

In [13]:
# Short alias -> OpenAI embedding model identifier.
openai_embedding_models = dict(
    small="text-embedding-3-small",
    ada="text-embedding-ada-002",
)

In [14]:
# Send INFO-level (and above) log records to stdout so they appear in the cell
# output. ``force=True`` replaces any handlers already on the root logger, so
# exactly one stdout handler exists no matter how often this cell is re-run.
# Adding a second StreamHandler on top of basicConfig's own handler printed
# every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

In [15]:
def createQdrantEmbeddings(documents, collection_name):
    """Index ``documents`` into a Qdrant collection and return the index.

    Uses the module-level ``client`` to build a ``QdrantVectorStore`` for
    ``collection_name``, wraps it in a ``StorageContext`` and calls
    ``VectorStoreIndex.from_documents``.

    Args:
        documents: Documents handed to ``VectorStoreIndex.from_documents``.
            NOTE(review): ``processChunk`` in this notebook builds
            ``langchain_core`` ``Document`` objects while the index class
            comes from ``llama_index`` -- confirm the two are compatible.
        collection_name: Name of the Qdrant collection to write to.

    Returns:
        The ``VectorStoreIndex`` that was built. Previously the index was
        created and then discarded, so callers could not query it.
    """
    vector_store = QdrantVectorStore(client=client, collection_name=collection_name)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
    )
    return index

In [16]:
# def folderReader(folder_path):
#     return SimpleDirectoryReader(folder_path).load_data()

# def buildQdrantVectoreStore(documents):
#     vector_store = QdrantVectorStore(client=client, collection_name="paul_graham")
#     storage_context = StorageContext.from_defaults(vector_store=vector_store)
#     index = VectorStoreIndex.from_documents(
#         documents,
#         storage_context=storage_context,
#     )