In [2]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector
from langchain_core.documents import Document
import time

In [27]:
loader = TextLoader('moby_dick.txt')
documents = loader.load()
print(len(documents))

1


In [28]:
len(documents[0].page_content)

1238223

In [29]:
chunk_size = 1000
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=80)
texts = text_splitter.split_documents(documents)
print(f'{len(texts)} chunks')
texts[0]

1770 chunks


Document(page_content='The Project Gutenberg eBook of Moby Dick; Or, The Whale\n    \nThis ebook is for the use of anyone anywhere in the United States and\nmost other parts of the world at no cost and with almost no restrictions\nwhatsoever. You may copy it, give it away or re-use it under the terms\nof the Project Gutenberg License included with this ebook or online\nat www.gutenberg.org. If you are not located in the United States,\nyou will have to check the laws of the country where you are located\nbefore using this eBook.\n\nTitle: Moby Dick; Or, The Whale\n\nAuthor: Herman Melville\n\nRelease date: July 1, 2001 [eBook #2701]\n                Most recently updated: August 18, 2021\n\nLanguage: English\n\nCredits: Daniel Lazarus, Jonesey, and David Widger\n\n\n*** START OF THE PROJECT GUTENBERG EBOOK MOBY DICK; OR, THE WHALE ***\n\n\n\n\nMOBY-DICK;\n\nor, THE WHALE.\n\nBy Herman Melville\n\n\n\nCONTENTS\n\nETYMOLOGY.\n\nEXTRACTS (Supplied by a Sub-Sub-Librarian).\n\nCHAPTER 1. Lo

In [30]:
embedding = OllamaEmbeddings(model='mxbai-embed-large:latest') # optionally self host ollama and use http://localhost:11434
temp = embedding.embed_query('This is a quick text')
len(temp)

1024

In [31]:
N_texts = 10
start_time = time.time()
embedding.embed_documents([t.page_content for t in texts[:N_texts]])
end_time = time.time()
time_taken = end_time-start_time
print(f'Took {time_taken} seconds to process {N_texts} of chunk_size {chunk_size} chars.')
print(f'Therefore would take ca. {(time_taken * (len(texts) / N_texts) / 60)} minutes to finish all')

Took 1.1735339164733887 seconds to process 10 of chunk_size 1000 chars.
Therefore would take ca. 3.4619250535964965 minutes to finish all


In [33]:
connection = "postgresql+psycopg://postgres:password@db:5432/test" # connection address is localhost (for connecting on machine) and "db" for inside docker's network
collection_name = "test_docs"

vectorstore = PGVector(
    embeddings=embedding,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)
docs = [
    Document(
        page_content="there are cats in the pond",
        metadata={"id": 1, "location": "pond", "topic": "animals"},
    ),
    Document(
        page_content="ducks are also found in the pond",
        metadata={"id": 2, "location": "pond", "topic": "animals"},
    ),
    Document(
        page_content="fresh apples are available at the market",
        metadata={"id": 3, "location": "market", "topic": "food"},
    ),
    Document(
        page_content="the market also sells fresh oranges",
        metadata={"id": 4, "location": "market", "topic": "food"},
    ),
    Document(
        page_content="the new art exhibit is fascinating",
        metadata={"id": 5, "location": "museum", "topic": "art"},
    ),
    Document(
        page_content="a sculpture exhibit is also at the museum",
        metadata={"id": 6, "location": "museum", "topic": "art"},
    ),
    Document(
        page_content="a new coffee shop opened on Main Street",
        metadata={"id": 7, "location": "Main Street", "topic": "food"},
    ),
    Document(
        page_content="the book club meets at the library",
        metadata={"id": 8, "location": "library", "topic": "reading"},
    ),
    Document(
        page_content="the library hosts a weekly story time for kids",
        metadata={"id": 9, "location": "library", "topic": "reading"},
    ),
    Document(
        page_content="a cooking class for beginners is offered at the community center",
        metadata={"id": 10, "location": "community center", "topic": "classes"},
    ),
]

In [34]:
vectorstore.add_documents(docs, ids=[doc.metadata["id"] for doc in docs])

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [35]:
vectorstore.similarity_search("art", k=10)

[Document(page_content='the new art exhibit is fascinating', metadata={'id': 5, 'topic': 'art', 'location': 'museum'}),
 Document(page_content='a sculpture exhibit is also at the museum', metadata={'id': 6, 'topic': 'art', 'location': 'museum'}),
 Document(page_content='a new coffee shop opened on Main Street', metadata={'id': 7, 'topic': 'food', 'location': 'Main Street'}),
 Document(page_content='ducks are also found in the pond', metadata={'id': 2, 'topic': 'animals', 'location': 'pond'}),
 Document(page_content='fresh apples are available at the market', metadata={'id': 3, 'topic': 'food', 'location': 'market'}),
 Document(page_content='a cooking class for beginners is offered at the community center', metadata={'id': 10, 'topic': 'classes', 'location': 'community center'}),
 Document(page_content='the book club meets at the library', metadata={'id': 8, 'topic': 'reading', 'location': 'library'}),
 Document(page_content='there are cats in the pond', metadata={'id': 1, 'topic': 'ani

In [36]:
vectorstore = PGVector(
    embeddings=embedding,
    collection_name="moby",
    connection=connection,
    use_jsonb=True,
)
vectorstore.add_documents([t for t in texts])

['04b537d0-9fa5-40ca-aadf-66bd449488b0',
 '03c7a22a-d9d5-4098-b4e5-3ea28edf6b23',
 '579f095b-362d-4eb3-816d-16a581e55918',
 '3d022c1e-b91d-45da-b526-540689797e3b',
 'a31c624f-daab-41f4-a24a-3c2b1ce712fc',
 '969f552a-3184-4aa5-ae4d-27668feb8f66',
 'e5fcdcd9-a343-4956-bd4b-33c94eee5a9d',
 '0c4ade45-e999-4899-9002-b15b640a598c',
 'a5e95a18-f824-4c53-a598-e914171d4312',
 'b4d7c974-e838-4c38-92bb-c46db4a7c8e9',
 '25840d46-5e44-46c5-af28-7b23e9ee7b04',
 '907ec3a9-d395-4a79-88ea-80581a52d972',
 'b7cff25b-b1c0-4909-a7c7-65b4b7676254',
 'f70bd60e-4504-404a-affb-d692cf8b8f4e',
 'be19f707-b1db-44c2-970a-dd01f19af642',
 'e866373a-fe6a-425c-8419-5009c39844bf',
 '59e89d59-7df2-4829-8f96-2ce70e8752c4',
 'b5c51db9-8272-46fd-83d8-2908906fd5ec',
 '244c8eae-03b1-4380-86d6-edd2387934ba',
 '7b7ff5ab-a19b-4e55-b67a-e308d740fe70',
 '0538f30b-5496-4381-81c0-f4b427243984',
 'af55510a-7ce3-4112-9da9-d6e2b3569cac',
 'aa192714-be38-4124-8723-3b2dbb06542b',
 '60d42e65-3f70-4770-a9b7-aa68a04dc242',
 'e3bd17fd-793c-

In [5]:
connection = 'postgresql+psycopg://postgres:password@db:5432/llama'
embedding = OllamaEmbeddings(model='llama3:latest', base_url='http://host.docker.internal:31415')
collection_name = 'mind'

vectorstore = PGVector(
    embeddings=embedding,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

# en: How best to support a friend who is depressed?
# sv: Hur stöder man bäst en vän som är deprimerad?
for row in vectorstore.similarity_search('Research on combatting depression in patients with medical history', k=5):
    print()
    print(row.page_content)
    print(row.metadata)



initiated at later times in the evening (a late chronotype) is associated with depression in both
adult and adolescent populations [46]. A longitudinal study of adolescents found that late
chronotype is associated with future risk of diagnosis of depression and increases in depressive
symptoms, but the relationship seems bi-directional [47]. Several possible mechanisms may
explain this relationship, including societal factors such as early school start times, that prevent
late chronotypes from sleeping enough and thus increase their risk for adverse outcomes. Also,
psychological factors (i.e., rumination in the evening), biological factors (i.e., gene variations
associated with both circadian rhythm and mood), and light exposure (which may be reduced
in depressed individuals and has an impact on the circadian rhythm) could explain the rela-
tionship between chronotype and depression [48].
Study aims and goals
{'page': 3, 'title': 'Associations between sleep habits, quality, chronotype