In [2]:
import os

from langchain_unstructured import UnstructuredLoader
from langchain_ollama import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.embeddings import OllamaEmbeddings
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import utils as chromautils

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.chains import RetrievalQA

from torch import cuda
from gc import collect

import shutil

# Hugging Face embedding model handed to HuggingFaceEmbeddings below.
# Other candidates (kept for reference):
#   "sentence-transformers/all-MiniLM-L6-v2"
#   "ibm-granite/granite-embedding-125m-english"
embedding_model_name = "BAAI/bge-large-en-v1.5"

# Model name passed to ChatOllama further down.
model = "llama3.2"

In [29]:
# Housekeeping: run Python's garbage collector (gc.collect), then call
# torch.cuda.empty_cache().
# Presumably used to release memory left over from earlier runs — confirm.
collect()
cuda.empty_cache()

In [None]:
# Root directory that is walked recursively for .txt source documents.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
text_files_directory = "/home/spinnaker/py_dev/ai_training/rag_db/nissan"

def list_text_files(directory):
    """Recursively collect the paths of all ``.txt`` files under ``directory``.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of path strings, one per file whose name ends in ``.txt``.
        The list is empty when no such file is found or when ``directory``
        does not exist.
    """
    file_list = []
    for root, _dirs, files in os.walk(directory):
        for name in files:
            # Match on the extension only; a substring test would also pick
            # up names such as "notes.txt.bak".
            if name.endswith('.txt'):
                file_list.append(os.path.join(root, name))
    return file_list

# Collect every .txt file under the configured directory, then print the
# paths and how many were found
txt_files = list_text_files(text_files_directory)
print(txt_files)
print(len(txt_files))

In [None]:
# Load the content of the text files and prepare it for the vector store
def load_and_embed_files(file_paths):
    """Load each file with UnstructuredLoader and return the filtered documents.

    Despite the name, no embedding happens here: the function only loads and
    splits each file and passes the result through ``filter_complex_metadata``.

    Args:
        file_paths: Iterable of file paths to load.

    Returns:
        A flat list with the documents of all files, in input order. Empty
        when ``file_paths`` is empty.
    """
    documents = []
    for file_path in file_paths:
        loader = UnstructuredLoader(file_path, mode="elements")
        # Filter only the documents that were just loaded. Re-filtering the
        # whole accumulated list on every iteration repeats the work for all
        # earlier files and grows quadratically with the number of files.
        documents.extend(chromautils.filter_complex_metadata(loader.load_and_split()))
    return documents
# Load every discovered text file and print how many documents were produced
documents = load_and_embed_files(txt_files)
print(len(documents))

In [30]:
# Empty both lists in place.
# NOTE(review): the cells below still read `documents` (chunking and
# Chroma.from_documents); running this cell before them leaves them with an
# empty list. Presumably meant as manual cleanup after vectorization — confirm.
documents.clear()
txt_files.clear()

In [None]:
# Break a list into consecutive sublists so each one can be vectorized in its own loop pass.
def split_into_chunks(lst, chunk_size):
    """Return ``lst`` cut into consecutive slices of at most ``chunk_size`` items.

    The final slice holds whatever remains and may be shorter than
    ``chunk_size``; an empty ``lst`` produces an empty list.
    """
    return [lst[start:start + chunk_size] for start in range(0, len(lst), chunk_size)]
# Cut the documents into batches of at most 40000 items and print the number
# of batches (the vector-store cell notes a max embedding batch size of 41666)
documents_list = split_into_chunks(documents, 40000)
print(len(documents_list))

In [None]:
# Build a new Chroma vector store from the loaded documents / logs and
# persist it under v_path_vector_store.
# Max batch size for embedding is 41666
v_path_vector_store = '/home/spinnaker/py_dev/ai_training/rag_db/nissan_db'
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=v_path_vector_store,
)

In [None]:
# Add documents to existing vector store, one batch at a time
# NOTE(review): this cell targets 'cyber_db' while every other cell in this
# notebook uses 'nissan_db' — confirm the destination before running.
if documents_list:
    # The store path, embedding model and Chroma handle are identical for
    # every batch, so they are created once instead of once per iteration.
    # The guard keeps the cell a no-op when there are no batches.
    v_path_vector_store = '/home/spinnaker/py_dev/ai_training/rag_db/cyber_db'
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    vector_store = Chroma(embedding_function=embedding_model, persist_directory=v_path_vector_store)
    for i in documents_list:
        vector_store.add_documents(documents=i)
        print('Vector store done')

In [3]:
# Properly delete old vector store to reuse same disk space
# NOTE(review): this is the same directory the creation cell writes to and the
# load cell reads from; run it only when the store should really be discarded.
v_path_vector_store = '/home/spinnaker/py_dev/ai_training/rag_db/nissan_db'
# Remove the directory only when it exists, so the cell can be re-run (or run
# on a machine without the store) without raising FileNotFoundError.
if os.path.isdir(v_path_vector_store):
    shutil.rmtree(v_path_vector_store)

In [None]:
# load vectorstore from disk
# Opens the persisted store at v_path_vector_store, using the same
# embedding_model_name as the cell that builds the store.
v_path_vector_store = '/home/spinnaker/py_dev/ai_training/rag_db/nissan_db'
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
chroma_db = Chroma(persist_directory=v_path_vector_store, embedding_function=embedding_model)
# Last expression of the cell: displays the type of the loaded object
type(chroma_db)

In [None]:
# Chat model served through Ollama; it is passed to the RetrievalQA chain below.
# llm_model = Ollama(model=model, verbose=False)  # Disable verbose for batch processing
# NOTE(review): keep_alive=0 presumably asks Ollama to unload the model right
# after each request — confirm against the Ollama documentation.
llm = ChatOllama(
    model=model,
    temperature=0,
    num_ctx=4096,
    repeat_last_n=10000,
    verbose=False,
    keep_alive=0,
)
print(f"Loaded LLM model {llm.model}")

In [None]:
# Retriever over the loaded store; "k" is the number of documents to retrieve per query.
# (Plain string key: the former f-string prefix had no placeholders and did nothing.)
retriever = chroma_db.as_retriever(search_kwargs={"k": 20})
# Question-answering chain that combines the retriever with the LLM
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

# Use the 'invoke' method to handle the query
result = qa_chain.invoke({"query": 'You have been loaded with the 2022 Nissan Frontier Owners Manual in a RAG database. Using this information, write a summary of anything mentioned about maintenance schedules.'})
# The chain returns a mapping; the generated answer is stored under 'result'
print(result.get('result'))