In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, LongformerTokenizerFast
from tqdm import tqdm

In [2]:
# Absolute path of the current working directory (assumed to be the notebooks
# folder); the project root is its parent and the data folder sits beneath it.
NOTEBOOKS_DIR = Path('.').resolve()
PROJECT_DIR = NOTEBOOKS_DIR.parent
DATA_DIR = PROJECT_DIR.joinpath('data')

In [3]:
import sys

# Put the project root on the import path so the `from src import ...` cell
# can resolve. Guarded so that re-running this cell does not keep appending
# the same entry to sys.path.
project_root = str(PROJECT_DIR)
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
from src import parsers
from src.extractor import VirusTotalFeatureExtractor

In [5]:
# Report directories, one sub-folder per class; the JSON reports read by
# VirusTotalFeatureExtractor further down live here.
REPORTS_DIR = DATA_DIR / 'reports'
MALWARE_REPORTS_DIR = REPORTS_DIR / 'malware'
BENIGN_REPORTS_DIR = REPORTS_DIR / 'benign'

# Executable (PE) directories, mirroring the per-class layout of the reports.
PE_DIR = DATA_DIR / 'pe'
MALWARE_EXE_DIR = PE_DIR / 'malware'
# Fixed: this previously pointed at the 'malware' folder (copy-paste slip).
BENIGN_EXE_DIR = PE_DIR / 'benign'

# Seed reused by every split and shuffle in this notebook.
RANDOM_STATE = 741

In [6]:
# Reports are later opened as `<hash>.json`, so only JSON files are listed;
# any stray file in the folder would otherwise yield a hash with no report.
# Sorting makes the listing, and therefore the seeded train/valid/test split,
# independent of the order in which the filesystem enumerates entries.
malware_reports = sorted(path.stem for path in MALWARE_REPORTS_DIR.glob('*.json'))
benign_reports = sorted(path.stem for path in BENIGN_REPORTS_DIR.glob('*.json'))

In [7]:
# One row per report hash; the scalar label columns are broadcast to every row.
df_malware = pd.DataFrame(
    dict(HASH=malware_reports, LABEL='malware', LABEL_ID=1)
)

df_benign = pd.DataFrame(
    dict(HASH=benign_reports, LABEL='benign', LABEL_ID=0)
)

In [8]:
# Stack both classes into one frame. ignore_index=True renumbers the rows
# 0..n-1; without it each class keeps its own 0-based index, so every label
# would occur twice and any label-based lookup would be ambiguous.
df = pd.concat([df_malware, df_benign], ignore_index=True)

In [9]:
# Sanity check: (n_rows, n_columns) of the combined frame.
df.shape

(6124, 3)

In [10]:
# Two-stage stratified split: 20% of the reports are held out for testing,
# then 10% of the remainder becomes the validation set (about 72/8/20 overall).
split_options = dict(random_state=RANDOM_STATE, shuffle=True)

df_train, df_test = train_test_split(
    df, test_size=0.2, stratify=df['LABEL_ID'], **split_options
)

df_train, df_valid = train_test_split(
    df_train, test_size=0.1, stratify=df_train['LABEL_ID'], **split_options
)

In [11]:
# Tokenizer used only to measure chunk lengths when splitting report texts.
tokenizer_checkpoint = 'kazzand/ru-longformer-tiny-16384'
tokenizer = LongformerTokenizerFast.from_pretrained(tokenizer_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Chunk length is counted with the tokenizer; consecutive chunks overlap by half.
# NOTE(review): the chunk size equals the 16384 limit reported in the tokenizer
# warning during the train loop -- confirm that special tokens added at encoding
# time cannot push a chunk over the model limit.
max_chunk_tokens = 16_384
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=max_chunk_tokens,
    chunk_overlap=max_chunk_tokens // 2,
)

In [13]:
def extract_texts(extractor) -> list[str]:
    """Gather the textual features of a single report.

    Each parser from ``parsers`` is instantiated and applied to the matching
    attribute of ``extractor``; whatever its ``transform`` returns is appended
    to one flat list, always in the order of the table below.

    Args:
        extractor: object exposing the report sections as attributes
            (a ``VirusTotalFeatureExtractor`` in this notebook).

    Returns:
        The concatenated items produced by all parsers.
    """
    # (parser class, extractor attribute) pairs; the order fixes the output order.
    sections = (
        (parsers.MagicParser, 'magic'),
        (parsers.TypeTagParser, 'type_tag'),
        (parsers.TypeTagsParser, 'type_tags'),
        (parsers.DetectitEasyParser, 'detectiteasy'),
        (parsers.TypeExtensionParser, 'type_extension'),
        (parsers.ImportListParser, 'import_list'),
        (parsers.MitreAttackTechniquesParser, 'mitre_attack_techniques'),
        (parsers.SignatureMatchesParser, 'signature_matches'),
        (parsers.CommandExecutionsParser, 'command_executions'),
        (parsers.ProcessesTreeParser, 'processes_tree'),
        (parsers.ProcessesInjectedParser, 'processes_injected'),
        (parsers.ProcessesCreatedParser, 'processes_created'),
        (parsers.ProcessesTerminatedParser, 'processes_terminated'),
        (parsers.FilesOpenedParser, 'files_opened'),
        (parsers.FilesCopiedParser, 'files_copied'),
        (parsers.FilesDroppedParser, 'files_dropped'),
        (parsers.FilesWrittenParser, 'files_written'),
        (parsers.FilesAttributeChangedParser, 'files_attribute_changed'),
        (parsers.MutexesOpenedParser, 'mutexes_opened'),
        (parsers.MutexesCreatedParser, 'mutexes_created'),
        (parsers.ModulesLoadedParser, 'modules_loaded'),
        (parsers.RegistryKeysOpenedParser, 'registry_keys_opened'),
        (parsers.RegistryKeysSetParser, 'registry_keys_set'),
        (parsers.RegistryKeysDeletedParser, 'registry_keys_deleted'),
        (parsers.IpTrafficParser, 'ip_traffic'),
        (parsers.DNSLookupsParser, 'dns_lookups'),
        (parsers.ServicesStartedParser, 'services_started'),
        (parsers.ServicesOpenedParser, 'services_opened'),
        (parsers.CallsHighlightedParser, 'calls_highlighted'),
        (parsers.HTTPConversationsParser, 'http_conversations'),
        (parsers.SignalsHookedParser, 'signals_hooked'),
        (parsers.WindowsSearchedParser, 'windows_searched'),
    )

    collected = []
    for parser_cls, attribute in sections:
        collected.extend(parser_cls().transform(getattr(extractor, attribute)))
    return collected

In [14]:
# Per class label: the text chunks and, aligned index-by-index, the hash of the
# report each chunk was cut from.
container_hash_text = {
    label: {'text': [], 'hash': []}
    for label in ('malware', 'benign')
}
report_dirs = {'malware': MALWARE_REPORTS_DIR, 'benign': BENIGN_REPORTS_DIR}

for report in tqdm(df_train.itertuples(index=False), total=len(df_train)):
    if report.LABEL not in report_dirs:
        raise NotImplementedError()

    report_file = report_dirs[report.LABEL] / f'{report.HASH}.json'
    features = VirusTotalFeatureExtractor.from_json(report_file)
    # All parsed texts of a report are joined into one document, then split
    # into chunks by the token-based splitter.
    pieces = text_splitter.split_text('\n'.join(extract_texts(features)))

    bucket = container_hash_text[report.LABEL]
    bucket['text'].extend(pieces)
    bucket['hash'].extend([report.HASH] * len(pieces))


  4%|████▋                                                                                                         | 186/4409 [00:13<07:11,  9.80it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (20205 > 16384). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4409/4409 [04:57<00:00, 14.80it/s]


In [15]:
# One row per text chunk, keeping the hash of the originating report.
malware_chunks = container_hash_text['malware']
benign_chunks = container_hash_text['benign']

df_train_malware_chunks = pd.DataFrame(
    dict(
        HASH=malware_chunks['hash'],
        TEXT=malware_chunks['text'],
        LABEL='malware',
        LABEL_ID=1,
    )
)

df_train_benign_chunks = pd.DataFrame(
    dict(
        HASH=benign_chunks['hash'],
        TEXT=benign_chunks['text'],
        LABEL='benign',
        LABEL_ID=0,
    )
)

In [16]:
# Stack the malware and benign chunk rows into a single training frame.
df_train_chunks = pd.concat((df_train_malware_chunks, df_train_benign_chunks), axis=0)

In [17]:
# Shuffle the chunk rows once and renumber the index. The former loop shuffled
# ten times with the same seed, which only re-applied the identical permutation
# and added cost without adding any randomness.
df_train_chunks = df_train_chunks.sample(frac=1, random_state=RANDOM_STATE, ignore_index=True)

In [18]:
# Per class label: the text chunks and, aligned index-by-index, the hash of the
# report each chunk was cut from.
container_hash_text = {
    'malware': {
        'text': [],
        'hash': [],
    },
    'benign': {
        'text': [],
        'hash': [],
    }
}

# Fixed: the rows were previously read from df_train while only the loop length
# came from df_valid, so the "validation" chunks were actually training reports.
# Iterating the validation frame directly keeps rows and length in sync.
for row in tqdm(df_valid.itertuples(index=False), desc='VALID', total=len(df_valid)):
    if row.LABEL == 'malware':
        report_path = MALWARE_REPORTS_DIR
    elif row.LABEL == 'benign':
        report_path = BENIGN_REPORTS_DIR
    else:
        raise NotImplementedError()

    extractor = VirusTotalFeatureExtractor.from_json(report_path / f'{row.HASH}.json')
    # All parsed texts of a report are joined into one document, then split
    # into chunks by the token-based splitter.
    corpus = '\n'.join(extract_texts(extractor))
    chunks = text_splitter.split_text(corpus)

    container_hash_text[row.LABEL]['text'].extend(chunks)
    container_hash_text[row.LABEL]['hash'].extend([row.HASH] * len(chunks))

VALID: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 490/490 [00:31<00:00, 15.36it/s]


In [19]:
# One row per text chunk, keeping the hash of the originating report.
malware_chunks = container_hash_text['malware']
benign_chunks = container_hash_text['benign']

df_valid_malware_chunks = pd.DataFrame(
    dict(
        HASH=malware_chunks['hash'],
        TEXT=malware_chunks['text'],
        LABEL='malware',
        LABEL_ID=1,
    )
)

df_valid_benign_chunks = pd.DataFrame(
    dict(
        HASH=benign_chunks['hash'],
        TEXT=benign_chunks['text'],
        LABEL='benign',
        LABEL_ID=0,
    )
)

In [20]:
# Stack the malware and benign chunk rows into a single validation frame.
df_valid_chunks = pd.concat((df_valid_malware_chunks, df_valid_benign_chunks), axis=0)

In [21]:
# Shuffle the chunk rows once and renumber the index. The former loop shuffled
# ten times with the same seed, which only re-applied the identical permutation
# and added cost without adding any randomness.
df_valid_chunks = df_valid_chunks.sample(frac=1, random_state=RANDOM_STATE, ignore_index=True)

In [22]:
# Per class label: the text chunks and, aligned index-by-index, the hash of the
# report each chunk was cut from.
container_hash_text = {
    'malware': {
        'text': [],
        'hash': [],
    },
    'benign': {
        'text': [],
        'hash': [],
    }
}

# Fixed: the rows were previously read from df_train while only the loop length
# came from df_test, so the "test" chunks were actually training reports.
# Iterating the test frame directly keeps rows and length in sync.
for row in tqdm(df_test.itertuples(index=False), desc='TEST', total=len(df_test)):
    if row.LABEL == 'malware':
        report_path = MALWARE_REPORTS_DIR
    elif row.LABEL == 'benign':
        report_path = BENIGN_REPORTS_DIR
    else:
        raise NotImplementedError()

    extractor = VirusTotalFeatureExtractor.from_json(report_path / f'{row.HASH}.json')
    # All parsed texts of a report are joined into one document, then split
    # into chunks by the token-based splitter.
    corpus = '\n'.join(extract_texts(extractor))
    chunks = text_splitter.split_text(corpus)

    container_hash_text[row.LABEL]['text'].extend(chunks)
    container_hash_text[row.LABEL]['hash'].extend([row.HASH] * len(chunks))

TEST: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1225/1225 [01:27<00:00, 13.93it/s]


In [23]:
# One row per text chunk, keeping the hash of the originating report.
malware_chunks = container_hash_text['malware']
benign_chunks = container_hash_text['benign']

df_test_malware_chunks = pd.DataFrame(
    dict(
        HASH=malware_chunks['hash'],
        TEXT=malware_chunks['text'],
        LABEL='malware',
        LABEL_ID=1,
    )
)

df_test_benign_chunks = pd.DataFrame(
    dict(
        HASH=benign_chunks['hash'],
        TEXT=benign_chunks['text'],
        LABEL='benign',
        LABEL_ID=0,
    )
)

In [24]:
# Stack the malware and benign chunk rows into a single test frame.
df_test_chunks = pd.concat((df_test_malware_chunks, df_test_benign_chunks), axis=0)

In [25]:
# Shuffle the chunk rows once and renumber the index. The former loop shuffled
# ten times with the same seed, which only re-applied the identical permutation
# and added cost without adding any randomness.
df_test_chunks = df_test_chunks.sample(frac=1, random_state=RANDOM_STATE, ignore_index=True)

In [26]:
# Persist the three chunk datasets as parquet files in the data directory.
chunk_outputs = (
    (df_train_chunks, 'df_train_chunks.parquet'),
    (df_valid_chunks, 'df_valid_chunks.parquet'),
    (df_test_chunks, 'df_test_chunks.parquet'),
)
for frame, file_name in chunk_outputs:
    frame.to_parquet(DATA_DIR / file_name)