In [1]:

import os
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import torch
from dotenv import load_dotenv
from duckdb import duckdb, DuckDBPyConnection
from duckdb.typing import DuckDBPyType
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

load_dotenv()

hf_token = os.getenv("HF_TOKEN")
db_path = '../data/llama.db'


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# if Path(db_path).exists():
#     os.remove(db_path)

ARRAY_TYPE = DuckDBPyType(list[float])
EMBEDDING_DIMENSIONS = 4096
BATCH_SIZE = 16

In [3]:
def open_db(db_path: str = None, in_memory: Optional[bool] = False) -> DuckDBPyConnection:
  if db_path is None or in_memory:
    return duckdb.connect(':memory:')
  return duckdb.connect(db_path)


def load_extension(conn: DuckDBPyConnection, extension: str = 'vss') -> DuckDBPyConnection:
  try:
    conn.install_extension(extension)
    conn.load_extension(extension)
    conn.execute("SET GLOBAL hnsw_enable_experimental_persistence = true;")
  except Exception as e:
    print(f"Error loading extension: {e}")
  return conn


def initialize_schema(conn: DuckDBPyConnection) -> None:
  # Create documents table
  conn.execute(
    f"""
        CREATE TABLE IF NOT EXISTS documents (
            id INT PRIMARY KEY,
            file TEXT,
            text TEXT
        );
    """
    )

  # Create embeddings table
  conn.execute(
    f"""
        CREATE TABLE IF NOT EXISTS embeddings (
            doc_id INT,
            embedding {ARRAY_TYPE},
            FOREIGN KEY (doc_id) REFERENCES documents(id)
        );
    """
    )


def setup_embedding_model() -> HuggingFaceEmbedding:
  return HuggingFaceEmbedding(
    model_name='BAAI/bge-small-en-v1.5',
      device='cuda' if torch.cuda.is_available() else 'cpu',
      # embedding_dimensions=EMBEDDING_DIMENSIONS,
  )


In [4]:
db = open_db(db_path, in_memory=False)

load_extension(db, extension='vss')
initialize_schema(db)


In [5]:
required_extensions = ['.md']


# reader = SimpleDirectoryReader('/Users/ada.sh/Developments/Projects/TechnicalDocsAssistant/tests/test_data/textual', required_exts=required_extensions, recursive=True)
# documents = reader.load_data()

@dataclass
class Document:
  text: str
  metadata: dict = None


doc1 = Document(
  text="""In the Loge, also known as At the Opera, is an 1878 Impressionist painting by the American artist Mary Cassatt. The oil-on-canvas work depicts a bourgeois woman in a box at an opera house looking through her opera glasses, while a man in the background looks at her from a different box. The woman's costume and fan identify her upper-class status. Art historians see the painting as a commentary on the role of gender, looking, and power in the social spaces of the 19th century. The painting is now at the Museum of Fine Arts, Boston, which also holds a preliminary drawing for the work.
        """,
  metadata={'file_path': '/Users/ada.sh/Developments/Projects/TechnicalDocsAssistant/tests/test_data/doc_1.md'}
  )

doc2 = Document(
    text="""Sunflowers (original title, in French: Tournesols) is the title of two series of still life paintings by the Dutch painter Vincent van Gogh. The first series, executed in Paris in 1887, depicts the flowers lying on the ground, while the second set, made a year later in Arles, shows a bouquet of sunflowers in a vase. In the artist's mind, both sets were linked by the name of his friend Paul Gauguin, who acquired two of the Paris versions. About eight months later, van Gogh hoped to welcome and impress Gauguin again with Sunflowers, now part of the painted Décoration for the Yellow House that he prepared for the guestroom of his home in Arles, where Gauguin was supposed to stay. """,
    metadata={'file_path': '/Users/ada.sh/Developments/Projects/TechnicalDocsAssistant/tests/test_data/doc_2.md'}
)

doc3 = Document(
    text="""
    On March 30, 1987, Japanese insurance magnate Yasuo Goto paid the equivalent of US$39,921,750 for van Gogh's Still Life: Vase with Fifteen Sunflowers at auction at Christie's London, at the time a record-setting amount for a work of art.[21] The price was over three times the previous record of about $12 million paid for Andrea Mantegna's Adoration of the Magi in 1985. The record was broken a few months later with the purchase of another van Gogh, Irises, by Alan Bond for $53.9 million at Sotheby's, New York on November 11, 1987.[citation needed]

While it is uncertain whether Yasuo Goto bought the painting himself or on behalf of his company, the Yasuda Fire and Marine Insurance Company of Japan, the painting currently resides at Seiji Togo Yasuda Memorial Museum of Modern Art in Tokyo. After the purchase, a controversy arose whether this is a genuine van Gogh or an Émile Schuffenecker forgery.[citation needed]
Provenances

Two Paris versions van Gogh exchanged with Gauguin in December 1887 or January 1888, were both sold to Ambroise Vollard: one in January 1895 and the other in April 1896. The first canvas resided for a short time with Félix Roux, but was reacquired by Vollard and sold to Degas, then from his estate to Rosenberg, then to Hahnloser and bequested to the Kunstmuseum Bern. The second was acquired by the Dutch collector Hoogendijk at the sale of his collection by Kann, who ceded the painting to Richard Bühler and then via Thannhauser to the Metropolitan Museum in New York.[citation needed]

Two of van Gogh's Sunflowers paintings never left the artist's estate: the study for one of the Paris versions (F377) and the repetition of fourth version (F458). Both are in the possession of the Vincent van Gogh Foundation, established 1962 by Vincent Willem van Gogh, the artist's nephew, and on permanent loan to the Van Gogh Museum, Amsterdam.[citation needed]

Five other versions are recorded in the van Gogh estate papers:[22]

    the final Paris version (F.452) in the artist's estate was sold 1909 via C. M. van Gogh, The Hague (J. H. de Bois) to Kröller-Müller
    (F457) sold 1894 to Émile Schuffenecker. (Tokyo version).
    (F456) sold 1905 via Paul Cassirer to Hugo von Tschudi. (Munich version).
    (F459) sold 1908 C. M. van Gogh (J. H. de Bois), The Hague to Fritz Meyer-Fierz, Zürich (destroyed by U.S. air raid in Japan on 6 August 1945).
    (F454) sold 1924 via Ernest Brown & Phillips (The Leicester Galleries) to the Tate Gallery; since on permanent loan to the National Gallery, London. (London version).""",
    metadata={
        'file_path': '/Users/ada.sh/Developments/Projects/TechnicalDocsAssistant/tests/test_data/doc_3.md'
    }
)

mock_docs = [doc1, doc2, doc3]

documents = mock_docs

# df: pd.DataFrame = pd.DataFrame(schema={
#     'id': int,
#     'file': str,
#     'text': str,
# })
# files = glob.glob('/Users/ada.sh/Developments/Projects/TechnicalDocsAssistant/tests/test_data/textual/**/*.md', recursive=True)
# for i, file in enumerate(files):
#   try:
#       with open(file, 'r', encoding="utf-8") as f:
#           text = f.read()
#           documents.append(text)
#           fd = pd.DataFrame({
#               'id': i,
#               'file': file,
#               'text': text
#           })
#           df.extend(fd)
#   except Exception as e:
#       print(e)


In [6]:
for i, doc in enumerate(documents):
  db.execute(
    """
        INSERT INTO documents (id, file, text) VALUES (?, ?, ?)
        """, [i, doc.metadata.get('file_path', ''), doc.text]
    )

In [8]:
db.sql("""
SELECT * FROM documents;""").show()


┌───────┬─────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [9]:
embed_model = setup_embedding_model()

In [11]:
for doc in db.execute("SELECT id, text FROM documents; ").fetchall():
    doc_id, text = doc
    embedding = embed_model.get_text_embedding(text)

    db.execute("""
    INSERT INTO embeddings (doc_id, embedding) VALUES (?, ?)""", [doc_id, embedding])

In [None]:
db.execute("""""")