# Lab-1-2 name

## Lab-1-2 description

## Infrastructure

In [1]:
!lsb_release -a

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 24.04.4 LTS
Release:	24.04
Codename:	noble


In [2]:
!python --version

Python 3.12.3


In [4]:
!pip freeze | grep -E "langchain|chromadb|sentence-transformers|requests|pandas|pyyaml|jq"

chromadb==1.5.1
langchain==1.2.10
langchain-chroma==1.1.0
langchain-classic==1.0.1
langchain-community==0.4.1
langchain-core==1.2.15
langchain-huggingface==1.2.0
langchain-ollama==1.0.1
langchain-text-splitters==1.1.1
requests==2.32.5
requests-oauthlib==2.0.0
requests-toolbelt==1.0.0
sentence-transformers==5.2.2


In [5]:
!ollama --version

]11;?\[6nollama version is 0.17.0


In [6]:
!ollama list

]11;?\[6nNAME             ID              SIZE      MODIFIED     
qwen3:4b         359d7dd4bcda    2.5 GB    2 hours ago     
llama3:latest    365c0bd3c000    4.7 GB    2 hours ago     
phi3:latest      4f2222927938    2.2 GB    22 hours ago    
qwen:0.5b        b5dc5e784f2a    394 MB    24 hours ago    


## Load data from MITRE repository

In [26]:
!git clone https://github.com/mitre-atlas/atlas-data.git

fatal: destination path 'atlas-data' already exists and is not an empty directory.


In [8]:
!tree -L 1 atlas-data

/bin/bash: line 1: tree: command not found


In [9]:
import yaml
import json
import os
from pathlib import Path

In [10]:
def load_atlas_data(data_dir="atlas-data/data"):
    techniques = []
    file_path = os.path.join(data_dir, "techniques.yaml")
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = yaml.safe_load(f)
            for obj in data:
                tech = {
                    "id": obj.get("id"),
                    "name": obj.get("name"),
                    "description": obj.get("description", ""),
                    "url": f"https://atlas.mitre.org/techniques/{obj.get('id')}"
                }
                # Добавляем внешние ссылки для контекста, если есть
                ext_refs = obj.get("external_references", [])
                if ext_refs:
                    tech['external_id'] = ext_refs[0].get('external_id', '')
                techniques.append(tech)
    except Exception as e:
        print(f"Error reading {file}: {e}")
    return techniques


In [11]:
data = load_atlas_data()
print(f"Загружено техник: {len(data)}")
# Сохраняем в простой JSON для удобства
with open("atlas_processed.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)


Загружено техник: 155


# VectorDB creation

In [15]:
from langchain_community.document_loaders import JSONLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

from langchain_huggingface import HuggingFaceEmbeddings

# 1. Загрузка данных

In [16]:
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["name"] = record.get("name")
    metadata["id"] = record.get("id")
    metadata["url"] = record.get("url")
    # Основной текст для поиска: Название + Описание
    return metadata

loader = JSONLoader(
    file_path="./atlas_processed.json",
    jq_schema=".[]",
    content_key="description",
    metadata_func=metadata_func
)
documents = loader.load()


# 2. Разбиение на чанки (если описания длинные)

In [18]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
texts = text_splitter.split_documents(documents)

## 3. Инициализация эмбеддингов (локальная модель)

In [19]:
# model_name можно заменить на 'all-mpnet-base-v2' для лучшего качества
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

## 4. Создание и сохранение векторной БД

In [20]:
vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./atlas_chroma_db"
)

## 5. LLM agent

In [22]:
import os
from langchain_classic.chains import RetrievalQA
from langchain_core.prompts import PromptTemplate

In [23]:
# Импорт LLM
from langchain_ollama import OllamaLLM
llm = OllamaLLM(model="llama3", temperature=0.3)

In [24]:
def setup_agent():
    # Подключение к созданной БД
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = Chroma(
        persist_directory="./atlas_chroma_db",
        embedding_function=embeddings
    )
    
    # Настройка Retrieval QA цепи
    # Важно указать промпт, чтобы модель опиралась только на контекст
    template = """Используй только следующий контекст для ответа на вопрос. 
    Если ответа нет в контексте, скажи, что не знаешь, не выдумывай.
    Контекст: {context}
    
    Вопрос: {question}
    Ответ:"""
    
    PROMPT = PromptTemplate(
        template=template, input_variables=["context", "question"]
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}), # Искать топ-3 совпадения
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )
    return qa_chain

def query_agent(agent, question):
    print(f"\nВопрос: {question}")
    result = agent.invoke({"query": question})
    print(f"Ответ: {result['result']}")
    print("\nИсточники:")
    for doc in result['source_documents']:
        print(f"- {doc.metadata.get('name')} ({doc.metadata.get('id')})")
        print(f"  URL: {doc.metadata.get('url')}")

In [25]:
agent = setup_agent()
while True:
    q = input("\nВведите вопрос (или 'exit' для выхода): ")
    if q.lower() == 'exit':
        break
    query_agent(agent, q)


Введите вопрос (или 'exit' для выхода):  What is jailbrake?



Вопрос: What is jailbrake?
Ответ: According to the provided context, "jailbreaking" refers to a process of modifying or manipulating a Large Language Model (LLM) in such a way that it bypasses any controls, restrictions, or guardrails placed on it, allowing an adversary to use the LLM in unintended ways.

Источники:
- LLM Jailbreak (AML.T0054)
  URL: https://atlas.mitre.org/techniques/AML.T0054
- Generative AI (AML.T0016.002)
  URL: https://atlas.mitre.org/techniques/AML.T0016.002
- LLM Prompt Self-Replication (AML.T0061)
  URL: https://atlas.mitre.org/techniques/AML.T0061



Введите вопрос (или 'exit' для выхода):  exit
