# Vector DB Playground

In [1]:
# Enable the autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# Set the LOGGING_LEVEL environment variable for this kernel.
# NOTE(review): presumably read by the hbit package when it configures logging — confirm.
%env LOGGING_LEVEL=INFO

env: LOGGING_LEVEL=INFO


In [None]:
import json

from langchain.text_splitter import CharacterTextSplitter
from pydantic import TypeAdapter

from hbit import bootstrap, core, dto, enums, settings, types
from hbit.corpus import datasets, loader

In [3]:
# Build the service registry, selecting the SQL device and patch extractor
# types, the AI summary service type and OpenAI as the model provider.
registry = bootstrap.create_services(
    device_extractor_type=enums.DeviceExtractorType.SQL,
    patch_extractor_type=enums.PatchExtractorType.SQL,
    summary_service_type=enums.SummaryServiceType.AI,
    model_provider=enums.ModelProvider.OPEN_AI,
)
# Look up the embedding model registered under types.EmbeddingModel.
embedding_model = registry.get_service(types.EmbeddingModel)
# Dataset built from the configured security-papers path.
# NOTE(review): `dataset` is not referenced by any cell visible here — confirm a later cell needs it.
dataset = datasets.SecurityDataset(settings.SECURITY_PAPERS_PATH)
# Vector service; only its `.client` attribute is handed to the papers service below.
db = core.VectorService()
# Character-based text splitter; chunk size and overlap come from project settings.
splitter = CharacterTextSplitter(
    chunk_size=settings.CHUNK_SIZE,
    chunk_overlap=settings.CHUNK_OVERLAP,
)
# Papers service wired with the vector client, the embedding model and the
# splitter; the following cells use it to save and query papers.
security_papers = loader.SecurityPapersService(
    db=db.client, embeddings_model=embedding_model, splitter=splitter
)

In [4]:
# Five hand-written sample security-paper abstracts. Each one is wrapped in a
# dto.SecurityPaper tagged with the PDF category, in the order listed here.
sample_texts = (
    "In contemporary enterprise environments, the process of threat modeling plays a pivotal role in identifying potential vulnerabilities prior to system deployment. By leveraging methodologies such as STRIDE and DREAD, security analysts can methodically assess attack surfaces and prioritize mitigation strategies. This proactive approach reduces the likelihood of zero-day exploits by encouraging secure design principles early in the software development lifecycle. Additionally, integration with DevSecOps pipelines ensures that threat modeling is not a one-time event, but an iterative process that evolves alongside system updates and feature expansions.",
    "Recent studies have highlighted the susceptibility of machine learning-based intrusion detection systems to adversarial examples. Attackers can subtly perturb benign inputs to evade detection or manipulate feature vectors to mimic normal behavior. These evasion techniques challenge the reliability of IDS models trained on static datasets. To counteract this, researchers are exploring adversarial training, ensemble models, and online learning methods that adapt to evolving attack vectors. Ensuring robustness in these models remains a significant challenge, particularly in high-throughput, low-latency environments such as industrial control systems.",
    "Federated learning has emerged as a promising paradigm for decentralized model training without compromising data privacy. In cybersecurity contexts, this allows institutions to collaboratively improve threat detection models without sharing raw data, which is often sensitive or regulated. However, federated approaches introduce new attack vectors such as model poisoning and gradient inversion. Countermeasures including differential privacy, secure aggregation protocols, and anomaly detection mechanisms are being actively developed to preserve both privacy and model integrity. The balance between accuracy, efficiency, and privacy continues to drive ongoing research in this domain.",
    "Ransomware threats have evolved beyond opportunistic attacks to incorporate sophisticated lateral movement tactics within hybrid on-premise/cloud environments. Attackers often exploit misconfigured identity federation protocols, credential reuse, and insecure API endpoints to propagate across organizational units. Simulation of such attack chains using frameworks like MITRE ATT&CK and CALDERA provides defenders with actionable insights into potential blind spots. Effective containment strategies now rely on micro-segmentation, identity-based access controls, and automated response mechanisms that isolate compromised assets in real-time.",
    "The advent of quantum computing presents a looming threat to widely used asymmetric cryptographic algorithms such as RSA and ECC. In anticipation, post-quantum cryptography (PQC) schemes are being standardized by bodies such as NIST, focusing on lattice-based, hash-based, and multivariate polynomial-based algorithms. Early adoption of hybrid cryptographic frameworks enables gradual migration while maintaining backward compatibility. However, implementing PQC in constrained environments such as IoT devices poses performance challenges. Careful evaluation of key sizes, computational overhead, and integration complexity is necessary to ensure practical deployment.",
)
sps = [
    dto.SecurityPaper(text=sample_text, category=enums.SecurityPaperCategory.PDF)
    for sample_text in sample_texts
]

In [4]:
# Hand the sample papers to the papers service for saving.
# NOTE(review): presumably this chunks, embeds and writes them to the vector DB,
# so re-running this cell may store duplicates — confirm whether save_texts is idempotent.
security_papers.save_texts(sps)

In [18]:
# Free-text search phrase, paired with a default-constructed filter object.
search_query = "Quantum-Resistant Cryptographic Schemes for Secure Communication"
search_filters = dto.SecurityPaperQuery()

# Run the search against the vector DB through the papers service.
query_responses = security_papers.query_vector_db(
    query=search_query,
    query_filters=search_filters,
)

# Adapter used to serialize the list of response objects in the next cell.
type_adapter = TypeAdapter(list[dto.SecurityPaperResponse])

In [None]:
# Serialize the query responses and pretty-print them as JSON.
# mode="json" asks pydantic for JSON-native values only (for example, enum
# members are emitted as their values), so the json.dumps call below cannot
# raise TypeError if a response field holds a type json does not know.
response_dict = type_adapter.dump_python(query_responses, mode="json")
response_text = json.dumps(response_dict, indent=4)
# print() rather than a bare expression: the repr of the string would show
# escaped newlines instead of the indented layout.
print(response_text)

[
    {
        "security_paper": {
            "text": "The advent of quantum computing presents a looming threat to widely used asymmetric cryptographic algorithms such as RSA and ECC. In anticipation, post-quantum cryptography (PQC) schemes are being standardized by bodies such as NIST, focusing on lattice-based, hash-based, and multivariate polynomial-based algorithms. Early adoption of hybrid cryptographic frameworks enables gradual migration while maintaining backward compatibility. However, implementing PQC in constrained environments such as IoT devices poses performance challenges. Careful evaluation of key sizes, computational overhead, and integration complexity is necessary to ensure practical deployment.",
            "category": "pdf"
        },
        "distance": 0.5777119398117065
    },
    {
        "security_paper": {
            "text": "Federated learning has emerged as a promising paradigm for decentralized model training without compromising data privacy. In cyb