In [75]:
import os

from langchain_unstructured import UnstructuredLoader
from langchain_ollama import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OllamaEmbeddings
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import utils as chromautils

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.chains import RetrievalQA

# Name of the chat model handed to ChatOllama further down.
model = "llama3.2"

# Embedding model name handed to HuggingFaceEmbeddings further down.
# Previously tried alternative: "sentence-transformers/all-MiniLM-L6-v2".
embedding_model_name = "ibm-granite/granite-embedding-125m-english"

In [67]:

# Root directory that is walked recursively to find the text files to index.
# NOTE(review): hard-coded absolute local path; adjust when running on another machine.
text_files_directory = "/home/spinnaker/py_dev/wayback"

In [None]:
def list_text_files(directory, extension=".txt"):
    """Recursively collect the paths of files whose names end with ``extension``.

    Args:
        directory: Root directory to walk. A directory that does not exist
            yields an empty result, because ``os.walk`` produces nothing for it.
        extension: File-name suffix to match (case-sensitive). Defaults to
            ``".txt"``.

    Returns:
        A sorted list of paths, each built by joining the walked directory
        with the matching file name.
    """
    file_list = []
    for root, _dirs, files in os.walk(directory):
        for name in files:
            # Match the real suffix: a substring test would also pick up
            # names such as "notes.txt.bak".
            if name.endswith(extension):
                file_list.append(os.path.join(root, name))
    # os.walk yields entries in arbitrary order; sort so reruns are reproducible.
    return sorted(file_list)

# Gather the text files found under text_files_directory.
txt_files = list_text_files(text_files_directory)
num_files = len(txt_files)  # number of files found; not referenced again in the cells shown here
print(txt_files)

In [None]:
# Load the content of the text files into LangChain documents
def load_and_embed_files(file_paths):
    """Load every file into split documents and sanitize their metadata.

    Despite the name, no embedding is computed here; the documents are only
    loaded and split. Embedding happens when they are handed to the vector
    store.

    Args:
        file_paths: Iterable of file paths, each passed to ``UnstructuredLoader``.

    Returns:
        The list returned by ``chromautils.filter_complex_metadata`` for all
        loaded documents (an empty list when ``file_paths`` is empty).
    """
    documents = []
    for file_path in file_paths:
        loader = UnstructuredLoader(file_path, mode="elements")
        documents.extend(loader.load_and_split())
    # Filter once after everything is loaded. Doing this inside the loop
    # re-scanned the whole accumulated list for every file (quadratic work)
    # without changing the final result.
    return chromautils.filter_complex_metadata(documents)

# Build the embedding model once; the same instance is passed to Chroma below,
# both when the vector store is created and when it is reloaded from disk.
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
# Load the discovered text files into documents (nothing is embedded at this step).
documents = load_and_embed_files(txt_files)

In [70]:
# Directory in which the Chroma vector store is persisted (in SQLite format).
# NOTE(review): this path sits inside text_files_directory, so the file search
# above also walks the vector store directory; confirm that is intended.
v_path_vector_store = '/home/spinnaker/py_dev/wayback/test4'

In [None]:
# Embed the loaded documents and write the resulting collection to the
# persist directory.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=v_path_vector_store,
)

In [None]:
# Reopen the persisted vector store, passing the same embedding model that
# was used when it was created.
chroma_db = Chroma(
    embedding_function=embedding_model,
    persist_directory=v_path_vector_store,
)
type(chroma_db)

In [None]:
# Chat model used to answer questions; `model` comes from the configuration cell.
llm = ChatOllama(
    model=model,
    temperature=0.2,
    num_ctx=20000,
    verbose=False,  # verbose disabled (original note: for batch processing)
)
print(f"Loaded LLM model {llm.model}")

In [None]:
# Ask the vector store for 5 documents per query (search_kwargs "k").
retriever = chroma_db.as_retriever(search_kwargs={"k": 5})
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

# The prompt lives in a named variable so the invoke call stays readable.
report_query = 'Review cyber attacks and malicious cyber activity that took place 2025 using the newly loaded context database. Use data from all sources at your disposal. After you have reviewed, write an intelligence report on the matter and provide specific dates if available. Provide specific analysis for any threat actors mentioned. Include a sources section at the end of the report.'

# Run the chain through 'invoke' and print the text stored under the 'result' key.
result = qa_chain.invoke({"query": report_query})
print(result.get('result'))