In [None]:
import os

from langchain_community.document_loaders import UnstructuredFileLoader
from langchain_ollama import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OllamaEmbeddings
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import utils as chromautils

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.chains import RetrievalQA

# Sentence-transformers model used to embed the documents.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Ollama model tag for the chat LLM. The model must already be available
# locally, e.g. pulled beforehand with `ollama pull llama3.2`.
model = "llama3.2"

In [None]:
# Question to put to the model (plain string: there is nothing to interpolate).
question = 'When did God forsake us?'

In [None]:
# Directory that is searched recursively for the .txt files to index.
# Set this to a real path before running the notebook. For other formats
# (e.g. .csv) a different document loader may be a better fit.
text_files_directory = ""

In [None]:
def list_text_files(directory):
    """Recursively collect the paths of all ``.txt`` files below ``directory``.

    Args:
        directory: Root directory to walk.

    Returns:
        List of file paths (``root`` joined with the file name), in a
        deterministic order: directories are visited alphabetically and the
        files inside each directory are listed alphabetically. The list is
        empty when nothing matches or the directory cannot be walked.
    """
    file_list = []
    for root, dirs, files in os.walk(directory):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirs.sort()
        for name in sorted(files):
            # Match on the suffix only: a substring test would also pick up
            # names such as "notes.txt.bak".
            if name.endswith('.txt'):
                file_list.append(os.path.join(root, name))
    return file_list

# Collect every .txt file found under the configured directory
txt_files = list_text_files(text_files_directory)
num_files = len(txt_files)
print(txt_files)


In [None]:
# Load the text files into LangChain documents (embedding happens later,
# when the vector store is built from these documents)
def load_and_embed_files(file_paths):
    """Load each file with ``UnstructuredFileLoader`` and return the documents.

    Despite the name, no embedding is computed here; the function only loads
    the files and passes the loaded documents through
    ``chromautils.filter_complex_metadata``.

    Args:
        file_paths: Iterable of paths to the files to load.

    Returns:
        A single flat list with the filtered documents of all files, in the
        order of ``file_paths``. Empty when ``file_paths`` is empty.
    """
    documents = []
    for file_path in file_paths:
        loader = UnstructuredFileLoader(file_path, mode="elements")
        # Filter only the newly loaded documents. Re-filtering the whole
        # accumulated list on every iteration did quadratic work for the
        # same final result.
        documents.extend(chromautils.filter_complex_metadata(loader.load()))
    return documents

# Embedding function handed to the Chroma vector store further down
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
# Load every discovered text file into documents
documents = load_and_embed_files(txt_files)

In [None]:
# Directory in which the Chroma vector store is persisted (in SQLite format).
# Set this to a real path before running the cells that use it.
# NOTE(review): how Chroma treats an empty string here is not obvious — confirm.
v_path_vector_store = ''

In [17]:
# Build the vector store from the loaded documents, embedding them with
# `embedding_model` and persisting to `v_path_vector_store`.
# NOTE(review): re-running this cell presumably adds the same documents to
# the persisted store again — confirm, and clear the directory first if so.
vectorstore = Chroma.from_documents(documents=documents, embedding=embedding_model, persist_directory=v_path_vector_store)

In [None]:
# Re-open the vector store from the persist directory, using the same
# embedding function it was built with
chroma_db = Chroma(persist_directory=v_path_vector_store, embedding_function=embedding_model)
type(chroma_db)  # cell output: the class of the loaded store

In [None]:
# Chat model accessed through Ollama, configured with a low temperature and
# an explicit num_ctx (context size) value.
llm = ChatOllama(model=model, temperature=0.2, num_ctx=50000, verbose=False)
print(f"Loaded LLM model {llm.model}")

In [None]:
# Retriever that returns the 2 best-matching documents per query
retriever = chroma_db.as_retriever(search_kwargs={"k": 2})
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

# Ask the notebook-level `question` instead of repeating the literal here,
# so changing the question in one place changes it everywhere
result = qa_chain.invoke({"query": question})
# Index directly: a missing "result" key should fail loudly, not print None
print(result["result"])

In [None]:
# Persona instruction and user question, joined into one prompt string
v_instruct = 'You are an existentialist that is convinced you were born without purpose.'
# Reuse the notebook-level `question` rather than duplicating the literal
v_prompt = question
combined_prompt = v_instruct + "\n" + v_prompt

In [None]:
# Send the combined prompt straight to the chat model (no retrieval).
# `invoke` is the supported entry point and accepts a plain string; calling
# the model object directly is the legacy interface.
response = llm.invoke(combined_prompt)

In [None]:
# Print only the generated text. Chat models return a message object whose
# text lives in `.content`; fall back to the raw object if there is none.
print(getattr(response, "content", response))