In [34]:
import os

from langchain_unstructured import UnstructuredLoader
from langchain_ollama import ChatOllama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OllamaEmbeddings
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import utils as chromautils

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.chains import RetrievalQA

# Sentence-transformers model used to embed the document chunks
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Name of the chat model passed to ChatOllama further down
model = "gemma3:12b"

In [19]:

# Root directory that is walked recursively for .txt files
text_files_directory = '/home/spinnaker/py_dev/wayback'

In [40]:
def list_text_files(directory):
    """Recursively collect the paths of all ``.txt`` files under ``directory``.

    Args:
        directory: Root directory to walk with ``os.walk``.

    Returns:
        list[str]: One path per matching file, formed by joining the walked
        directory with the file name, in ``os.walk`` order. Empty when no
        ``.txt`` file is found.
    """
    file_list = []
    for root, _dirs, files in os.walk(directory):
        for name in files:
            # Match on the extension only: a substring test would also pick
            # up names such as "notes.txt.bak" or "my.txt_export.csv".
            if name.endswith('.txt'):
                file_list.append(os.path.join(root, name))
    return file_list

# Collect every .txt file found under the configured directory
txt_files = list_text_files(text_files_directory)
num_files = len(txt_files)
print(txt_files)

['/home/spinnaker/py_dev/wayback/error_trap.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250314.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250112.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250228.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250211.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250225.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250304.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20241226.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250311.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250224.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250109.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250305.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250107.txt', '/home/spinnaker/py_dev/wayback/the_register/the_register_20250124.txt', '

In [41]:
# Load the text files and split them into documents for the vector store
def load_and_embed_files(file_paths):
    """Load every file in ``file_paths`` and return the resulting documents.

    Each path is read with ``UnstructuredLoader`` and split through
    ``load_and_split()``. The combined list is then passed once through
    ``filter_complex_metadata``. Despite the name, nothing is embedded here;
    the function only returns documents.

    Args:
        file_paths: Iterable of file paths to load.

    Returns:
        list: The filtered documents of all files, in input order. An empty
        list when nothing was loaded.
    """
    documents = []
    for file_path in file_paths:
        loader = UnstructuredLoader(file_path, mode="elements")
        documents.extend(loader.load_and_split())
    if not documents:
        # Nothing was loaded, so there is no metadata to filter.
        return documents
    # Filter a single time after loading: doing it inside the loop re-scans
    # every previously loaded document on each iteration.
    return chromautils.filter_complex_metadata(documents)

# Embedding model, configured by name via embedding_model_name
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)
# Load and split every discovered text file into documents
documents = load_and_embed_files(txt_files)

INFO: Use pytorch device_name: cuda
INFO: Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2


In [42]:
# Directory in which the Chroma vector store is persisted (SQLite format)
v_path_vector_store = "/home/spinnaker/py_dev/wayback/test2"

In [43]:
# Build the vector store from the loaded documents, using the persist directory
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    persist_directory=v_path_vector_store,
)

In [44]:
# Reopen the vector store from its persist directory
chroma_db = Chroma(
    embedding_function=embedding_model,
    persist_directory=v_path_vector_store,
)
type(chroma_db)

langchain_chroma.vectorstores.Chroma

In [None]:
# Chat model served through Ollama; verbose is disabled for batch processing
llm = ChatOllama(
    model=model,
    temperature=0.2,
    num_ctx=15000,
    verbose=False,
)
print(f"Loaded LLM model {llm.model}")

Loaded LLM model gemma3:12b


In [39]:
# Retriever over the persisted store; "k" is the number of documents to retrieve per query
retriever = chroma_db.as_retriever(search_kwargs={"k": 3})
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

# Use the 'invoke' method to handle the query
result = qa_chain.invoke({"query": 'Discuss cyber attacks and activity that took place 2025 using the newly loaded context database. Analyze any cyber threat actors mentioned.'})
print(result.get('result'))

INFO: HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Okay, here's a summary of cyber attacks and activity in 2025, based on the provided context from "The Hacker News" archive.

**Overview of Cyber Activity in 2025 (as reported by The Hacker News)**

The provided context highlights two significant cyber events in 2025:

1.  **Vo1d Botnet Infection of Android TVs:** This is a major ongoing campaign.
    *   **What Happened:** A botnet malware dubbed "Vo1d" has infected Android TV devices across 226 countries and regions. As of February 25, 2025, the botnet reached a peak of 1,590,299 infected devices on January 19, 2025. India has seen a notable surge in infection rates.
    *   **Technical Details:** The malware has evolved to enhance stealth, resilience, and antidetection capabilities. It uses RSA encryption for network communication (preventing command-and-control takeover), unique downloaders with XXTEA encryption, and RSA-protected keys, making analysis difficult. It was initially documented by Doctor Web in September 2024.
    *   *

In [28]:
# Prompt pieces: a role instruction followed by the actual request
v_instruct = 'You are a cybersecurity intelligence analyst learning new information about cyber topics in 2025.'
v_prompt = 'Summarize content from Hacker News in 2025.'
# Instruction and request are separated by a single newline
combined_prompt = "\n".join((v_instruct, v_prompt))

In [None]:
# Use the LLM to process the combined prompt.
# Chat models are run through invoke(); calling the model object directly is a
# deprecated path that expects a list of messages rather than a plain string.
response = llm.invoke(combined_prompt)

In [None]:
# Print the generated text; fall back to the raw object if it has no .content
print(getattr(response, "content", response))