This version uses Milvus through Docker Compose so you must have Docker installed to run this notebook (Milvus is spun up via `docker compose up -d` as shown in the block below)

In [None]:
# ! pip install -qU pymilvus langchain sentence-transformers tiktoken octoai-sdk openai
# docker-compose up -d

In [67]:
from dotenv import load_dotenv
import os

load_dotenv()
os.environ["OCTOAI_TOKEN"] = os.getenv("OCTOAI_API_TOKEN")

In [68]:
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint
llm = OctoAIEndpoint(
        model="mixtral-8x7b-instruct-fp16",
        max_tokens=200,
        presence_penalty=0,
        temperature=0.1,
        top_p=0.9,
    )

                model was transferred to model_kwargs.
                Please confirm that model is what you intended.


In [69]:
from langchain_community.embeddings import OctoAIEmbeddings
from langchain_community.vectorstores import Milvus

In [70]:
embeddings = OctoAIEmbeddings(endpoint_url="https://text.octoai.run/v1/embeddings")

In [71]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [54]:
# files = os.listdir("../data")

In [125]:
files = ['2_sanitized.txt']

In [126]:
file_texts = []

In [127]:
for file in files:
    with open(f"../task_data/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=256, chunk_overlap=64, 
    )
    texts = text_splitter.split_text(file_text)
    
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text, 
                metadata={"doc_title": file.split(".")[0], "chunk_num": i}))
        
    with open(f"../task_data/output_{file.split('.')[0]}.txt", 'w') as output_file:
        for document in file_texts:
            output_file.write(f"Title: {document.metadata['doc_title']}, Chunk: {document.metadata['chunk_num']}\n")
            output_file.write(f"{document.page_content}\n")
            output_file.write("--------\n")

In [129]:
# For the first run
# 
# print(file_texts)
# vector_store = Milvus.from_documents(
#     file_texts,
#     embedding=embeddings,
#     connection_args={"host": "localhost", "port": 19530},
#     collection_name="tasks"
# )

# if you already have the data you need stored in Milvus
vector_store = Milvus(
    embedding_function=embeddings,
    connection_args={"host": "localhost", "port": 19530},
    collection_name="tasks"
)

In [130]:
retriever = vector_store.as_retriever()

In [131]:
from langchain.prompts import ChatPromptTemplate
template="""You will receive data in following format: date isfinished content. anwer the question based on these data. For example, 2023-04-02 Y Kayaking on the river means I plan to kayak on 2023-04-02 and I make it (since it is Y) 
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [132]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [136]:
response = chain.invoke("what is the data you receive, how long it is")
print(response)

 The data you receive is a list of documents, where each document is a series of strings in a specific format. This format consists of a date, an 'N' or 'Y' indicating whether the activity was completed or not, and a description of the activity. The length of this data is 5 documents, each with 11 such strings.


In [107]:
response

' I\'m sorry, I don\'t see any connection between your statement "I also kyaka in 4/4, 4/5..." and the provided context. Could you please clarify or rephrase your question?'