In [37]:
import os
import time

from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector

In [19]:
# Read the whole novel into a list of Documents (one per file). The encoding
# is passed explicitly so decoding does not depend on the platform's default
# locale; the text contains non-ASCII punctuation such as ’ and —.
loader = TextLoader('moby_dick.txt', encoding='utf-8')
documents = loader.load()
print(len(documents))

1


In [20]:
# Number of characters in the first loaded document's text.
len(documents[0].page_content)

1238223

In [21]:
# Cut the loaded document(s) into overlapping chunks targeting roughly
# `chunk_size` characters each, then report how many chunks came out.
chunk_size = 1000
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=80,  # characters shared between neighbouring chunks
)
texts = text_splitter.split_documents(documents)
print(f'{len(texts)} chunks')
texts[0]  # display the first chunk as a sanity check

1770 chunks


Document(page_content='The Project Gutenberg eBook of Moby Dick; Or, The Whale\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: Moby Dick; Or, The Whale\n\nAuthor: Herman Melville\n\nRelease date: July 1, 2001 [eBook #2701]\n                Most recently updated: August 18, 2021\n\nLanguage: English\n\nCredits: Daniel Lazarus, Jonesey, and David Widger\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK MOBY DICK; OR, THE WHALE ***\n\n\n\n\nMOBY-DICK;\n\nor, THE WHALE.\n\nBy Herman Melville\n\n\n\nCONTENTS\n\nETYMOLOGY.\n\nEXTRACTS (Supplied by a Sub-Sub-Librarian).\n\nCHAPTER 1. Lo

In [27]:
# Embedding client backed by an Ollama server at the given base_url.
# optionally self host ollama and use http://localhost:11434
embedding = OllamaEmbeddings(
    model='mxbai-embed-large:latest',
    base_url='http://host.docker.internal:31415',
)
# Embed one throwaway sentence; the length of the result is the vector size.
temp = embedding.embed_query('This is a quick text')
len(temp)

2048

In [56]:
# Rough throughput check: embed the first N_texts chunks, time it, and
# extrapolate linearly to the full list of chunks.
N_texts = 10
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the (slow) embedding call is running.
start_time = time.perf_counter()
embedding.embed_documents([t.page_content for t in texts[:N_texts]])
end_time = time.perf_counter()
time_taken = end_time - start_time
print(f'Took {time_taken} seconds to process {N_texts} of chunk_size {chunk_size} chars.')
print(f'Therefore would take ca. {(time_taken * (len(texts) / N_texts) / 60)} minutes to finish all')

Took 197.8019278049469 seconds to process 10 of chunk_size 1000 chars.
Therefore would take ca. 583.5156870245934 minutes to finish all


In [59]:
# Database URL for the pgvector-enabled Postgres instance. It is read from the
# PGVECTOR_CONNECTION environment variable so real credentials never have to
# be committed with the notebook; the fallback is the original local
# development default and should not be used outside a throwaway setup.
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://postgres:password@db:5432/vector_db",
)
collection_name = "test_docs"

# Vector store for the small smoke-test collection; documents added to it are
# embedded with `embedding`.
vectorstore = PGVector(
    embeddings=embedding,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)
# Toy corpus for a first smoke test of the vector store: ten one-sentence
# documents, each tagged with an integer id, a location and a topic.
# Row layout: (id, location, topic, text).
_doc_rows = [
    (1, "pond", "animals", "there are cats in the pond"),
    (2, "pond", "animals", "ducks are also found in the pond"),
    (3, "market", "food", "fresh apples are available at the market"),
    (4, "market", "food", "the market also sells fresh oranges"),
    (5, "museum", "art", "the new art exhibit is fascinating"),
    (6, "museum", "art", "a sculpture exhibit is also at the museum"),
    (7, "Main Street", "food", "a new coffee shop opened on Main Street"),
    (8, "library", "reading", "the book club meets at the library"),
    (9, "library", "reading", "the library hosts a weekly story time for kids"),
    (
        10,
        "community center",
        "classes",
        "a cooking class for beginners is offered at the community center",
    ),
]
docs = [
    Document(
        page_content=text,
        metadata={"id": doc_id, "location": location, "topic": topic},
    )
    for doc_id, location, topic, text in _doc_rows
]

In [60]:
# Store the sample documents, reusing each one's metadata "id" as its
# storage id.
doc_ids = [doc.metadata["id"] for doc in docs]
vectorstore.add_documents(docs, ids=doc_ids)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [61]:
# Ask for the 10 stored documents closest to the query "art".
vectorstore.similarity_search(query="art", k=10)

[Document(page_content='a sculpture exhibit is also at the museum', metadata={'id': 6, 'topic': 'art', 'location': 'museum'}),
 Document(page_content='the new art exhibit is fascinating', metadata={'id': 5, 'topic': 'art', 'location': 'museum'}),
 Document(page_content='ducks are also found in the pond', metadata={'id': 2, 'topic': 'animals', 'location': 'pond'}),
 Document(page_content='there are cats in the pond', metadata={'id': 1, 'topic': 'animals', 'location': 'pond'}),
 Document(page_content='the library hosts a weekly story time for kids', metadata={'id': 9, 'topic': 'reading', 'location': 'library'}),
 Document(page_content='fresh apples are available at the market', metadata={'id': 3, 'topic': 'food', 'location': 'market'}),
 Document(page_content='the book club meets at the library', metadata={'id': 8, 'topic': 'reading', 'location': 'library'}),
 Document(page_content='the market also sells fresh oranges', metadata={'id': 4, 'topic': 'food', 'location': 'market'}),
 Documen

In [62]:
# Separate collection for the full novel. Note that `vectorstore` is rebound
# here, so later cells that use it query the "moby" collection.
vectorstore = PGVector(
    embeddings=embedding,
    collection_name="moby",
    connection=connection,
    use_jsonb=True,
)

# Embedding every chunk is slow (see the timing estimate earlier in the
# notebook), so ingest in small batches instead of one giant call: each
# add_documents call stores its own batch, which means an interrupted run
# keeps the batches that already finished instead of losing all the work.
MOBY_BATCH_SIZE = 50
moby_ids = []
for batch_start in range(0, len(texts), MOBY_BATCH_SIZE):
    batch = texts[batch_start:batch_start + MOBY_BATCH_SIZE]
    moby_ids.extend(vectorstore.add_documents(batch))
    print(f'Stored {len(moby_ids)}/{len(texts)} chunks')

# Show only the count; the full list of generated ids is kept in `moby_ids`.
len(moby_ids)

KeyboardInterrupt: 

In [55]:
# Ask whichever collection `vectorstore` currently points at for the 10
# documents closest to the query "vision".
vectorstore.similarity_search(query="vision", k=10)

[Document(page_content='CHAPTER 118. The Quadrant.\n\nCHAPTER 119. The Candles.\n\nCHAPTER 120. The Deck Towards the End of the First Night Watch.\n\nCHAPTER 121. Midnight.—The Forecastle Bulwarks.\n\nCHAPTER 122. Midnight Aloft.—Thunder and Lightning.\n\nCHAPTER 123. The Musket.\n\nCHAPTER 124. The Needle.\n\nCHAPTER 125. The Log and Line.\n\nCHAPTER 126. The Life-Buoy.\n\nCHAPTER 127. The Deck.\n\nCHAPTER 128. The Pequod Meets The Rachel.\n\nCHAPTER 129. The Cabin.\n\nCHAPTER 130. The Hat.\n\nCHAPTER 131. The Pequod Meets The Delight.\n\nCHAPTER 132. The Symphony.\n\nCHAPTER 133. The Chase—First Day.\n\nCHAPTER 134. The Chase—Second Day.\n\nCHAPTER 135. The Chase.—Third Day.\n\nEpilogue\n\n\n\n\nOriginal Transcriber’s Notes:', metadata={'source': 'moby_dick.txt'}),
 Document(page_content='EXTRACTS. (Supplied by a Sub-Sub-Librarian).\n\n\n\n  It will be seen that this mere painstaking burrower and grub-worm of\n  a poor devil of a Sub-Sub appears to have gone through the long\n  Vatic