# 1. data process

In [1]:
import langchain
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

dir(langchain.document_loaders)

['Any',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__getattr__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_old_to_new_name',
 'is_interactive_env',

In [2]:
import iocextract
def get_ioc_dict(content):
    result = {
        'md5':[],
        'sha1':[],
        'sha256':[],
        'sha512':[],
        'url':[],
        'ipv4':[],
        'ipv6':[]
    }
    for x in iocextract.extract_md5_hashes(content):
        result['md5'].append(x.lower())
    for x in iocextract.extract_sha1_hashes(content):
        result['sha1'].append(x.lower())
    for x in iocextract.extract_sha256_hashes(content):
        result['sha256'].append(x.lower())
    for x in iocextract.extract_sha512_hashes(content):
        result['sha512'].append(x.lower())
    # refang=True to remove common obfuscation methods from IOCs
    for x in iocextract.extract_urls(content, refang=True):
        result['url'].append(x)
    for x in iocextract.extract_ipv4s(content, refang=True):
        result['ipv4'].append(x)
    for x in iocextract.extract_ipv6s(content):
        result['ipv6'].append(x)

    result = {k:list(set(result[k])) for k in result}
    return result


In [13]:
from langchain_community.document_loaders import TextLoader
import os


chunk_size = 800
chunk_overlap = 50
 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

all_splits = []
f1 = '../sandbox/corpus/'
for f2 in os.listdir(f1):
    f3 = os.path.join(f1,f2)
    tld = TextLoader(f3)
    doc = tld.load()
    txt = doc[0].page_content
    sp = text_splitter.split_documents(doc)
    new_sp = []
    md5 = get_ioc_dict(txt)['md5'][0]
    for s in sp:
        s.page_content = 'the sample md5 is ' + md5 + ' . ' + s.page_content
        new_sp.append( s )
    all_splits+=new_sp
    

print(len(all_splits))


67


In [14]:
all_splits[:3]

[Document(page_content='the sample md5 is 1c4989acfd741d3d1612b3495e087a2c . The sample filename is 8f7e73a29ab9ecd808316acd1a68efabd1c994a7295a9db6cc12a25a849362b7, md5 is 1c4989acfd741d3d1612b3495e087a2c, platform is windows, duration of the analysis is 124 seconds, machine name is win7_office2013, machine manager is KVM, and the report score is 1.8 points.\n\nThe sample hits the following YARA rules:\n- The sample hits yara rule "embedded_win_api", description is "A non-Windows executable contains win32 API functions names", and the hit content is [\'U2hlbGxFeGVjdXRl\']\n- The sample hits yara rule "powershell", description is "(no description)", and the hit content is [\'UG93ZXJTaGVsbA==\', \'cG93ZXJzaGVsbA==\']', metadata={'source': '../sandbox/corpus/1c4989acfd741d3d1612b3495e087a2c.json.txt'}),
 Document(page_content='the sample md5 is 1c4989acfd741d3d1612b3495e087a2c . The sample produces the following behavior during analysis:\n- create_guard_page, the risk level is suspicious

# 2. embedding model

In [15]:
from langchain.embeddings import HuggingFaceBgeEmbeddings
embeddings = HuggingFaceBgeEmbeddings(model_name='../../embedding/bge/bge-m3/')

# 3. vector database

In [16]:
from langchain.vectorstores.milvus import Milvus

URI = 'http://localhost:19530'
connection_args = { 'uri': URI }

vector_store = Milvus(
    embedding_function=embeddings,
    connection_args=connection_args,
    collection_name='sandboxllm',
    drop_old=True,
).from_documents(
    all_splits,
    embedding=embeddings,
    collection_name='sandboxllm',
    connection_args=connection_args,
)

# 4. similarity search

In [33]:
query = "tell me the behaviors of sample with md5 1c4989acfd741d3d1612b3495e087a2c  ?"
docs = vector_store.similarity_search(query)

print(len(docs))

4


In [34]:
docs

[Document(page_content='the sample md5 is 1c4989acfd741d3d1612b3495e087a2c . The sample produces the following behavior during analysis:\n- create_guard_page, the risk level is suspicious, the description is: Contains functionality to create guard pages, often used to hinder reverse engineering and debugging. \n- stealth_window, the risk level is suspicious, the description is: A process created a hidden window. \n- long_command_line, the risk level is suspicious, the description is: Very long command line found. \n- martian_command_process, the risk level is high-risk, the description is: A command shell or script process was created by an unexpected parent process.', metadata={'source': '../sandbox/corpus/1c4989acfd741d3d1612b3495e087a2c.json.txt', 'pk': 448797199821148500}),
 Document(page_content='the sample md5 is f1a0396293aff911bcfe68f8d2ab0ed6 . The sample produces the following behavior during analysis:\n- antivm_queries_computername, the risk level is normal, the description 

In [35]:
vec = embeddings.embed_documents([query])

In [36]:
vec

[[-0.04586115851998329,
  -0.025394607335329056,
  -0.06772847473621368,
  -0.031218864023685455,
  -0.0009917306015267968,
  -0.023123877122998238,
  -0.005319113377481699,
  0.03208904340863228,
  -0.00031463924096897244,
  -0.008848409168422222,
  -0.04946018382906914,
  -0.0007517632911913097,
  -0.020836567506194115,
  -0.013475681655108929,
  -0.015474663116037846,
  -0.022439617663621902,
  -0.06095745041966438,
  -0.040922295302152634,
  -0.043244123458862305,
  0.006075705401599407,
  -0.02892078086733818,
  0.010174601338803768,
  -0.023769773542881012,
  0.032728347927331924,
  0.013088103383779526,
  0.0027719156350940466,
  -0.04060860723257065,
  -0.03623069450259209,
  0.021164782345294952,
  0.032998424023389816,
  0.009894764050841331,
  0.0012183497892692685,
  0.0025130833964794874,
  -0.001616357360035181,
  0.009625246748328209,
  0.015675699338316917,
  -0.019903011620044708,
  -0.009467706084251404,
  -0.037594497203826904,
  -0.007241596933454275,
  0.0250716246

In [37]:
docsv = vector_store.similarity_search_by_vector(vec[0])
print(len(docsv))

4


In [38]:
docsv

[Document(page_content='the sample md5 is 1c4989acfd741d3d1612b3495e087a2c . The sample produces the following behavior during analysis:\n- create_guard_page, the risk level is suspicious, the description is: Contains functionality to create guard pages, often used to hinder reverse engineering and debugging. \n- stealth_window, the risk level is suspicious, the description is: A process created a hidden window. \n- long_command_line, the risk level is suspicious, the description is: Very long command line found. \n- martian_command_process, the risk level is high-risk, the description is: A command shell or script process was created by an unexpected parent process.', metadata={'source': '../sandbox/corpus/1c4989acfd741d3d1612b3495e087a2c.json.txt', 'pk': 448797199821148500}),
 Document(page_content='the sample md5 is f1a0396293aff911bcfe68f8d2ab0ed6 . The sample produces the following behavior during analysis:\n- antivm_queries_computername, the risk level is normal, the description 

# 5. RAG Workflow

In [39]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

llm = ChatOpenAI(
    streaming=True,
    verbose=True,
    openai_api_key='none',
    openai_api_base='http://x.x.x.x:8000/v1',
    model_name='sandboxllm-gpt'
)


  warn_deprecated(


In [41]:
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough


retriever = vector_store.as_retriever()

template = """Below context is the sandbox analysis result. Please answer the question at the end based on the context. 
{context}
Question: {question}
Answer:"""
rag_prompt = PromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
)

print(rag_chain.invoke("Breifly summary the behaviors for sample with md5 1c4989acfd741d3d1612b3495e087a2c?"))


content='The sample with md5 1c4989acfd741d3d1612b3495e087a2c exhibits several suspicious behaviors during analysis. These include creating guard pages, which may hinder reverse engineering and debugging; creating a hidden window, indicating stealthiness; using a very long command line, which could potentially hide malicious activity; and most notably, creating a command shell or script process through an unexpected parent process, which is considered high-risk as it may indicate attempted malicious code execution.' response_metadata={'finish_reason': 'stop'} id='run-05bcd495-8641-4e6f-b10c-e5bf474eb498-0'
