In [1]:
import os

from langchain_unstructured import UnstructuredLoader
from langchain_ollama import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.embeddings import OllamaEmbeddings
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import utils as chromautils
from chromadb.utils.batch_utils import create_batches

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.chains import RetrievalQA

import shutil

#embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
#embedding_model_name = "ibm-granite/granite-embedding-125m-english"
embedding_model_name = 'BAAI/bge-large-en-v1.5'
model = 'llama3.2'

In [2]:
text_files_directory = "/home/spinnaker/py_dev/rag_db/source_files/20250421"

def list_text_files(directory):
    file_list = []
    for (root, dirs, file) in os.walk(directory):
        for f in file:
            if '.txt' in f:
                file_list.append(f'{root}/{f}')
    return file_list

# Specify the directory to search for PDF files
txt_files = list_text_files(text_files_directory)
print(txt_files)
print(len(txt_files))

['/home/spinnaker/py_dev/rag_db/source_files/20250421/security_cisa-warns-of-increased-breach-risks-following-oracle-cloud-leak.txt', '/home/spinnaker/py_dev/rag_db/source_files/20250421/CVE-2025-1093.txt', '/home/spinnaker/py_dev/rag_db/source_files/20250421/CVE-2025-3826.txt', '/home/spinnaker/py_dev/rag_db/source_files/20250421/security_cisa-extends-funding-to-ensure-no-lapse-in-critical-cve-services_#comments.txt', '/home/spinnaker/py_dev/rag_db/source_files/20250421/CVE-2025-3830.txt', '/home/spinnaker/py_dev/rag_db/source_files/20250421/CVE-2021-4455.txt', '/home/spinnaker/py_dev/rag_db/source_files/20250421/security_cisa-tags-sonicwall-vpn-flaw-as-actively-exploited-in-attacks_#comments.txt', '/home/spinnaker/py_dev/rag_db/source_files/20250421/CVE-2025-3820.txt', '/home/spinnaker/py_dev/rag_db/source_files/20250421/security_fbi-scammers-pose-as-fbi-ic3-employees-to-help-recover-lost-funds.txt', '/home/spinnaker/py_dev/rag_db/source_files/20250421/security_sonicwall-sma-vpn-devi

In [3]:
# Load and embed the content of the log files
def load_and_embed_files(file_paths):
    documents = []
    for file_path in file_paths:
        loader = UnstructuredLoader(file_path, mode="elements")
        documents.extend(loader.load_and_split())
        documents = chromautils.filter_complex_metadata(documents)
    return documents
documents = load_and_embed_files(txt_files)

In [4]:
# Split documents list into sublists to be fed in a loop for vectorization.
def split_into_chunks(lst, chunk_size):
    chunks = []
    for i in range(0, len(lst), chunk_size):
        chunks.append(lst[i:i + chunk_size])
    return chunks
documents_list = split_into_chunks(documents, 500)

In [None]:
# create the vector store from the documents / logs you provided
v_path_vector_store = '/home/spinnaker/py_dev/rag_db/test9'
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
vectorstore = Chroma.from_documents(documents=documents, embedding=embedding_model, persist_directory=v_path_vector_store)

In [None]:
# Add documents to existing vector store
for i in documents_list:
    v_path_vector_store = '/home/spinnaker/py_dev/rag_db/test9'
    embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
    vector_store = Chroma(embedding_function=embedding_model, persist_directory=v_path_vector_store)
    vector_store.add_documents(documents=i)

In [None]:
# Properly delete old vector store to reuse same disk space
v_path_vector_store = '/home/spinnaker/py_dev/rag_db/test9'
shutil.rmtree(v_path_vector_store)

In [9]:
# load vectorstore from disk
v_path_vector_store = '/home/spinnaker/py_dev/rag_db/test9'
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
chroma_db = Chroma(persist_directory=v_path_vector_store, embedding_function=embedding_model)
type(chroma_db)

INFO: Use pytorch device_name: cuda
INFO: Load pretrained SentenceTransformer: BAAI/bge-large-en-v1.5


langchain_chroma.vectorstores.Chroma

In [None]:
# llm_model = Ollama(model=model, verbose=False)  # Disable verbose for batch processing
llm = ChatOllama(model=model, temperature=0, num_ctx=30000, repeat_last_n=10000, verbose=False, keep_alive=0)
print(f"Loaded LLM model {llm.model}")

Loaded LLM model llama3.2


In [None]:
retriever = chroma_db.as_retriever(search_kwargs={f"k": 20})  # Use the number of documents to retrieve
qa_chain = RetrievalQA.from_chain_type(llm,retriever=retriever,)

# Use the 'invoke' method to handle the query
result = qa_chain.invoke({"query": 'You are a cybersecurity analyst and have been loaded with new data in a RAG database with information on cybersecurity from 2024 and 2025. Review ransomware attacks and malicious cyber activity that took place in April 2025 using the newly loaded context database. Use data from all sources at your disposal. Include CISA advisories, Bleeping Computer, The Register, and Tenable. After you have reviewed, write an intelligence report on the matter and provide specific dates if available. Provide specific analysis and historical context for any threat actors discussed. Be as exhaustive and lengthy with your report as this will be read by upper management. Include a sources section with specific dates at the end of the report.'})
print(result.get('result'))

INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
