# Store and retrieval

Make sure your Hugging Face API key is configured so that the embedding model can be downloaded (see README.md).

## 1. Load and split the documents

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Load the source text. The encoding is passed explicitly so that any
# non-ASCII characters are decoded the same way on every platform instead of
# depending on the operating system's default encoding.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
loader = TextLoader("./content/max-und-moritz.txt", encoding="utf-8")
documents = loader.load()

# Split the loaded text into overlapping chunks (chunk_size=2500,
# chunk_overlap=200) so that each piece can be embedded and retrieved on its own.
text_splitter = CharacterTextSplitter(chunk_size=2500, chunk_overlap=200)
docs = text_splitter.split_documents(documents)

## 2. Store the documents in the vector database

In [None]:
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

# Prepare the embedding model that turns text into vectors.
# NOTE(review): the multilingual-e5 model card recommends "query: " /
# "passage: " prefixes on the input texts — confirm whether they are needed here.
embedding_function = SentenceTransformerEmbeddings(model_name='intfloat/multilingual-e5-large')

# Open (or create) the Chroma store persisted under ./chromadb/.
db = Chroma(embedding_function=embedding_function, persist_directory='./chromadb/')

# Give every chunk a deterministic id (source name + position). Without
# explicit ids each run of this cell gets fresh random ids, so re-running it
# keeps appending duplicate chunks to the persisted store; with stable ids the
# existing entries are overwritten instead.
chunk_ids = [
    f"{doc.metadata.get('source', 'document')}-{index}"
    for index, doc in enumerate(docs)
]
db.add_documents(docs, ids=chunk_ids)

## 3. Query the documents

In [None]:
# Question to look up in the vector store. Swap in one of the commented-out
# alternatives below to try a different query.
query = 'Wie heisst der Lehrer?'
#query = 'Who is the sensei?'
#query = 'Was machen die Hühner unmittelbar vor ihrem Tod?'
#query = 'Wie viele Hühner hatte Witwe Bolte?'
#query = 'Was machen Max und Moritz mit dem Schneider und wie heisst er?'

results = db.similarity_search_with_score(query)

# Each hit is a (document, score) pair; print the score followed by the text.
for document, score in results:
    print(f'Score: {score}')
    print(f'Text:\n{document.page_content}\n\n')

## 4. RAG

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Expose the vector store as a retriever that fetches chunks for a question.
retriever = db.as_retriever()

# Prompt that restricts the model to answering from the retrieved context.
# NOTE(review): "folgenenden" is a typo for "folgenden"; the prompt text is
# left unchanged here so the string sent to the model stays the same.
template = """Beantworte die Frage nur aufgrund der folgenenden Informationen:
{context}

Frage: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# Chat model used to generate the answer ('gpt-3.5-turbo' is an alternative).
# The previous dead assignment that was immediately overwritten was removed.
model_name = 'gpt-4-1106-preview'
model = ChatOpenAI(model_name=model_name)


def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the list of Document objects would be interpolated into
    the prompt as its Python repr (including metadata and escaped newlines).
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# Pipeline: retrieve context for the question, fill the prompt, call the
# model, and reduce the model output to a plain string.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke(query)