In [1]:
import os
from typing import Optional

import torch
from dotenv import load_dotenv
from duckdb import duckdb, DuckDBPyConnection
from duckdb.typing import DuckDBPyType
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.core.node_parser import SentenceSplitter

load_dotenv()

hf_token = os.getenv("HF_TOKEN")
db_path = '../data/llama.db'


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ARRAY_TYPE = DuckDBPyType(list[float])
CHUNK_SIZE = 1024
CHUNK_OVERLAP = 200
BATCH_SIZE = 16

In [37]:
def close_db(conn):
    if conn is not None:
        conn.close()
    else:
        print('You must provide the connection to the database')

def open_db(db_path: str = None, in_memory: Optional[bool] = False) -> DuckDBPyConnection:
  if db_path is None or in_memory:
    return duckdb.connect(':memory:')
  return duckdb.connect(db_path)


def load_extension(conn: DuckDBPyConnection, extension: str = 'vss') -> DuckDBPyConnection:
  try:
    conn.install_extension(extension)
    conn.load_extension(extension)
    conn.execute("SET GLOBAL hnsw_enable_experimental_persistence = true;")
  except Exception as e:
    print(f"Error loading extension: {e}")
  return conn


def initialize_schema(conn: DuckDBPyConnection) -> None:
  # Create documents table
  conn.execute(
    f"""
        CREATE TABLE IF NOT EXISTS documents (
            id INT PRIMARY KEY,
            file TEXT,
            text TEXT
        );
    """
    )

  conn.execute("""
    CREATE TABLE IF NOT EXISTS chunks (
    id INT PRIMARY KEY,
    doc_id INT,
    chunk_text TEXT,
    chunk_index INT,
    FOREIGN KEY(doc_id) REFERENCES documents(id)
    );
  """)

  # Create embeddings table
  conn.execute(
    f"""
        CREATE TABLE IF NOT EXISTS embeddings (
            chunk_id INT,
            embedding {ARRAY_TYPE},
            FOREIGN KEY (chunk_id) REFERENCES chunks(id)
        );
    """
    )

def setup_node_parser() -> SentenceSplitter:

    return SentenceSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        separator=" ",
        paragraph_separator="\n\n"
    )

def process_documents(documents: list[Document], db: DuckDBPyConnection, node_parser: SentenceSplitter) -> None:

    for i, doc in enumerate(documents):
        db.execute(
            """
            INSERT INTO documents (id, file, text)
            VALUES (?, ?, ?)""",
            [i, doc.metadata.get('file_path', ''), doc.text]
        )

        nodes = node_parser.get_nodes_from_documents([doc])

        for chunk_idx, node in enumerate(nodes):
            # The hacky id incrementer here should be replaced with a sequence
            db.execute(
                """
                INSERT INTO chunks (id, doc_id, chunk_text, chunk_index)
                VALUES (( SELECT COALESCE(MAX(id), -1) + 1 FROM chunks), ?, ?, ?)""",
                [i, node.text, chunk_idx]
            )

def setup_embedding_model() -> HuggingFaceEmbedding:
  return HuggingFaceEmbedding(
    model_name='BAAI/bge-small-en-v1.5',
      device='cuda' if torch.cuda.is_available() else 'cpu',
  )

def create_embeddings(db: DuckDBPyConnection, embedding_model: HuggingFaceEmbedding) -> None:

    chunks = db.execute("""
    SELECT id, chunk_text FROM chunks""").fetchall()

    for idx, chunk in enumerate(chunks):
        chunk_id, chunk_text = chunk
        embedding = embedding_model.get_text_embedding(chunk_text)

        db.execute("""
        INSERT INTO embeddings (chunk_id, embedding)
        VALUES (?, ?)""",
                   [chunk_id, embedding])

def query_documentation(
        query_text: str,
        db: DuckDBPyConnection,
        embed_model: HuggingFaceEmbedding,
        top_k: int = 5,
) -> list[dict]:

    query_embedding = embed_model.get_text_embedding(query_text)

    results = db.execute("""
      WITH top_matches AS (
        SELECT
          chunk_id,
          array_inner_product(
          embedding::FLOAT[384], ?::FLOAT[384]) as similarity,
          FROM embeddings
          ORDER BY similarity DESC
          LIMIT ?
      )
      SELECT c.id as chunk_id,
      c.chunk_text,
      c.chunk_index,
      d.file,
      d.text as full_doc_text,
      m.similarity
      from top_matches m
      JOIN chunks c on c.id = m.chunk_id
      JOIN documents d on d.id = c.doc_id
      ORDER BY m.similarity DESC
    """, [query_embedding, top_k]).fetchall()

    return [
      {
          "chunk_id": row[0],
          "chunk_text": row[1],
          "chunk_index": row[2],
          "file": row[3],
          "document_text": row[4],
          "similarity": row[5]

      }
                  for row in results
    ]


In [10]:
db = open_db(db_path, in_memory=False)

load_extension(db, extension='vss')
initialize_schema(db)

node_parser = setup_node_parser()
embed_model = setup_embedding_model()


In [12]:
required_extensions = ['.md']


reader = SimpleDirectoryReader('/Users/ada.sh/Developments/Projects/TechnicalDocsAssistant/tests/test_data/textual', required_exts=required_extensions, recursive=True)
documents = reader.load_data()


In [14]:
process_documents(documents, db=db, node_parser=node_parser)

In [16]:
db.sql("""
SELECT * FROM documents;""").show()


┌───────┬──────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [33]:
create_embeddings(db, embed_model)


In [38]:
results = query_documentation(
    "How do I create a new app in Textualize?",
    top_k=5,
    db=db,
    embed_model=embed_model,
)

for r in results:
    print(f"\nSimilarity: {r['similarity']:.4f}")
    print(f"File: {r['file']}")
    print(f"Chunk {r['chunk_index']}:")
    print(f"Text: {r['chunk_text']}")
    print("---")


Similarity: 0.7984
File: /Users/ada.sh/Developments/Projects/TechnicalDocsAssistant/tests/test_data/textual/api/app.md
Chunk 0:
Text: ---
title: "textual.app"
---

::: textual.app
    options:
        filters:
          - "!^_"
          - "^__init__$"
---

Similarity: 0.7772
File: /Users/ada.sh/Developments/Projects/TechnicalDocsAssistant/tests/test_data/textual/guide/app.md
Chunk 0:
Text: # App Basics

In this chapter we will cover how to use Textual's App class to create an application. Just enough to get you up to speed. We will go into more detail in the following chapters.

## The App class

The first step in building a Textual app is to import the [App][textual.app.App] class and create a subclass. Let's look at the simplest app class:

```python
--8<-- "docs/examples/app/simple01.py"
```


### The run method

To run an app we create an instance and call [run()][textual.app.App.run].

```python hl_lines="8-10" title="simple02.py"
--8<-- "docs/examples/app/simple02.py"
```

Apps

In [12]:
db.close()