In [None]:
import hashlib
import json
from typing import List

from langchain_core.documents import Document
from langchain_core.messages import HumanMessage
from langchain_ollama import OllamaLLM
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf

In [None]:
def partition_documents(file_path):
    """Partition a PDF into elements using the ``hi_res`` strategy.

    Args:
        file_path: Path of the PDF to partition; forwarded as ``filename``.

    Returns:
        The sized collection of elements returned by ``partition_pdf``. The
        call enables table-structure inference and requests that ``Image``
        blocks be extracted into the element payload.
    """
    print(f"Partitioning document: {file_path}")

    # Keep every extraction setting in one place so it is easy to scan/tune.
    pdf_options = {
        "filename": file_path,
        "strategy": "hi_res",
        "infer_table_structure": True,
        "extract_image_block_types": ["Image"],
        "extract_image_block_to_payload": True,
    }
    parsed_elements = partition_pdf(**pdf_options)

    print(f"Partitioned {len(parsed_elements)} elements.")
    return parsed_elements

In [None]:
# Partition the sample PDF once; later cells reuse `elements`.
elements = partition_documents("docs/history.pdf")
print(f"First 5 elements: {elements[:5]}")

In [None]:
# Inspect the dict form of a single partitioned element. The index 6 is
# hard-coded, so this assumes the document produced at least seven elements.
elements[6].to_dict()

In [None]:
def create_chunk_by_title(elements):
    """Group partitioned elements into chunks with ``chunk_by_title``.

    Args:
        elements: The partitioned document elements to chunk.

    Returns:
        The sized collection of chunks produced by ``chunk_by_title`` using
        ``max_characters=3000``, ``new_after_n_chars=2400`` and
        ``combine_text_under_n_chars=500``.
    """
    print("Creating chunks by title...")

    # Size limits passed straight through to the chunker.
    chunking_limits = dict(
        max_characters=3000,
        new_after_n_chars=2400,
        combine_text_under_n_chars=500,
    )
    title_chunks = chunk_by_title(elements, **chunking_limits)

    print(f"Created {len(title_chunks)} chunks.")
    return title_chunks

In [None]:
# Chunk the partitioned elements; `chunks` feeds the summarisation step below.
chunks = create_chunk_by_title(elements)

In [None]:
# Look at the original elements behind a single chunk. The index 5 is
# hard-coded, so this assumes at least six chunks were created.
chunks[5].metadata.orig_elements

In [None]:
def separate_content_types(chunk):
    """Split a chunk into its text, table HTML and base64 image payloads.

    Args:
        chunk: Object exposing ``text`` and, optionally,
            ``metadata.orig_elements`` (the elements the chunk was built
            from). Elements are classified by class name: ``Table`` and
            ``Image`` are collected, everything else is ignored.

    Returns:
        dict with keys:
            ``"text"``   - the chunk's text,
            ``"tables"`` - one entry per table: its ``text_as_html`` when
                           available, otherwise the table's plain text,
            ``"images"`` - base64 payloads of images that actually carry one,
            ``"types"``  - distinct content types in first-seen order,
                           always starting with ``"text"``.

    The input elements are never modified.
    """
    print("Separating content types in chunk...")
    content_data = {
        "text": chunk.text,
        "tables": [],
        "images": [],
        "types": ["text"],
    }

    chunk_metadata = getattr(chunk, "metadata", None)
    # `or ()` also covers a metadata object that reports the field as None.
    orig_elements = getattr(chunk_metadata, "orig_elements", None) or ()

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, "metadata", None)

        if element_type == "Table":
            # Test the value, not just attribute presence: metadata objects may
            # report an unset field as None, which must fall back to plain text.
            table_html = getattr(element_metadata, "text_as_html", None) or element.text
            content_data["tables"].append(table_html)
            content_data["types"].append("table")
        elif element_type == "Image":
            image_base64 = getattr(element_metadata, "image_base64", None)
            # Skip images without an extracted payload instead of storing None.
            if image_base64:
                content_data["images"].append(image_base64)
                content_data["types"].append("image")

    # dict.fromkeys de-duplicates while keeping a deterministic order.
    content_data["types"] = list(dict.fromkeys(content_data["types"]))
    return content_data

In [None]:
def _build_summary_prompt(text: str, tables: List[str]) -> str:
    """Assemble the prompt: text, optional tables, then the task exactly once."""
    sections = [
        "You are creating a searchable description for document content retrieval.",
        "",
        "CONTENT TO ANALYZE:",
        "TEXT CONTENT:",
        text,
        "",
    ]

    if tables:
        sections.append("TABLES:")
        for table_number, table in enumerate(tables, start=1):
            sections.append(f"TABLE {table_number}:\n{table}\n")

    # The task goes after all content and outside the table loop, so it shows
    # up once whether the chunk has many tables, one table, or only images.
    sections.append(
        "YOUR TASK:\n"
        "Generate a comprehensive, searchable description that covers:\n"
        "\n"
        "1. Key facts, numbers, and data points from text and tables\n"
        "2. Main topics and concepts discussed.\n"
        "3. Questions this content could answer.\n"
        "4. Visual content analysis (charts, diagrams, patterns in images)\n"
        "5. Alternative search terms users might use\n"
        "\n"
        "Make it detailed and searchable - prioritize findability over brevity.\n"
        "\n"
        "SEARCHABLE DESCRIPTION:\n"
    )
    return "\n".join(sections)


def create_ai_enhanced_summary(
    text: str,
    tables: List[str],
    images: List[str],
    model: str = "llama3.2:latest",
) -> str:
    """Ask an Ollama model for a searchable description of mixed content.

    Args:
        text: Plain text of the chunk.
        tables: Table contents (HTML or text), listed in the prompt in order.
        images: Base64-encoded images, attached as ``image_url`` data URIs.
        model: Ollama model name; defaults to the previously hard-coded one.

    Returns:
        The model's response. If anything in the LLM call fails, a fallback
        made of the first 300 characters of ``text`` plus markers with the
        number of tables/images is returned instead; the error is printed,
        never raised.
    """
    print("Creating AI-enhanced summary...")

    try:
        llm = OllamaLLM(model=model)

        message_content = [
            {"type": "text", "text": _build_summary_prompt(text, tables)}
        ]
        # NOTE(review): OllamaLLM is a text-completion wrapper; confirm that
        # these image parts reach the model as images (a vision-capable chat
        # model may be required) rather than being flattened into the prompt.
        message_content.extend(
            {
                "type": "image_url",
                "image_url": f"data:image/jpeg;base64,{image_base64}",
            }
            for image_base64 in images
        )

        message = HumanMessage(content=message_content)
        return llm.invoke([message])
    except Exception as e:
        # Deliberate best-effort: summarisation must not abort the pipeline.
        print(f"Error in AI summary generation: {e}")
        summary = f"{text[:300]}..."
        if tables:
            summary += f" [Contains {len(tables)} table(s)]"
        if images:
            summary += f" [Contains {len(images)} image(s)]"
        return summary

In [None]:
def summarise_chunks(chunks):
    """Turn chunks into LangChain ``Document`` objects ready for indexing.

    For every chunk the content is split with ``separate_content_types``.
    Chunks that contain tables or images get an AI-generated description as
    ``page_content``; text-only chunks (and chunks whose AI summary fails)
    keep their raw text. The raw text, table HTML and base64 images are
    stored as a JSON string under the ``"original_content"`` metadata key.

    Args:
        chunks: Sized collection of chunks to process.

    Returns:
        A list with one ``Document`` per chunk, in input order.
    """
    print("Summarising chunks...")
    chunk_count = len(chunks)
    documents = []

    for position, chunk in enumerate(chunks, start=1):
        print(f"Summarising chunk {position}/{chunk_count}...")

        parts = separate_content_types(chunk)

        print(f"Types found: {parts['types']}")
        tables, images = parts["tables"], parts["images"]
        print(f"Tables: {len(tables)}, Images: {len(images)}")

        if not (tables or images):
            print("No tables/images found, using original text.")
            page_content = parts["text"]
        else:
            print("Creating AI summary for chunk with tables/images...")
            try:
                page_content = create_ai_enhanced_summary(parts["text"], tables, images)
                print("Enhanced AI summary created successfully.")
                # Only a short preview, to keep the cell output readable.
                print(f"Enhanced summary: {page_content[:200]}...")
            except Exception as e:
                print(f"Error creating AI summary: {e}")
                page_content = parts["text"]

        original_content = json.dumps({
            "raw_text": parts["text"],
            "tables_html": tables,
            "images_base64": images,
        })
        documents.append(
            Document(
                page_content=page_content,
                metadata={"original_content": original_content},
            )
        )

    return documents

In [None]:
# Build one LangChain Document per chunk; chunks with tables/images go
# through the AI summary path, the rest keep their raw text.
processed_chunks = summarise_chunks(chunks)

In [None]:
# Show a bounded preview instead of the whole list: each Document's metadata
# carries the raw text, table HTML and base64 images as JSON, so displaying
# every Document in full would flood the cell output.
[doc.page_content[:200] for doc in processed_chunks[:5]]

In [None]:

from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

def get_embedding_function():
    """Create the embedding model used to index documents.

    Returns:
        A new ``OllamaEmbeddings`` instance configured with the
        ``nomic-embed-text`` model.
    """
    return OllamaEmbeddings(model="nomic-embed-text")

def create_vector_store(chunks, persist_directory="db/chroma_db"):
    """Embed documents and persist them in a Chroma vector store.

    Args:
        chunks: Iterable of documents exposing ``page_content``.
        persist_directory: Directory where Chroma stores the collection.

    Returns:
        The Chroma vector store, or ``None`` when there is nothing to index.

    Each document gets a deterministic id derived from its position and
    content, so re-running the cell with the same documents and persist
    directory is meant to overwrite the existing entries rather than append
    duplicates.
    """
    print("Creating vector store...")

    # Materialise first so generators are handled and emptiness is reliable.
    documents = list(chunks) if chunks is not None else []
    if not documents:
        print("No chunks to create vector store.")
        return None

    embedding_model = get_embedding_function()

    # The position is part of the id because two chunks may share the same
    # text, and ids must be unique within one insert.
    ids = [
        hashlib.sha256(f"{position}:{doc.page_content}".encode("utf-8")).hexdigest()
        for position, doc in enumerate(documents)
    ]

    vector_store = Chroma.from_documents(
        documents=documents,
        embedding=embedding_model,
        ids=ids,
        persist_directory=persist_directory,
        collection_metadata={"hnsw:space": "cosine"},
    )

    print(f"Vector store created and saved to {persist_directory}.")
    return vector_store

In [None]:
# Embed the processed documents and persist them under the default
# "db/chroma_db" directory.
create_vector_store(processed_chunks)