In [41]:
import json
from abc import ABC
from pathlib import Path

import pandas as pd
import torch
from torch import nn
from torch.utils.data import (
    DataLoader,
    Dataset,
)
from transformers import (
    BertTokenizer,
)

In [7]:
# Directory layout, resolved relative to the notebook's working directory.
NOTEBOOKS_DIR = Path().resolve()
PROJECT_DIR = NOTEBOOKS_DIR.parent
DATA_DIR = PROJECT_DIR / 'data'

# JSON reports, one sub-directory per class.
REPORTS_DIR = DATA_DIR / 'reports'
MALWARE_REPORTS_DIR = REPORTS_DIR / 'malware'
BENIGN_REPORTS_DIR = REPORTS_DIR / 'benign'

# Executables, one sub-directory per class (mirrors the reports layout).
EXE_DIR = DATA_DIR / 'exe'
MALWARE_EXE_DIR = EXE_DIR / 'malware'
# Benign samples must resolve to their own directory, not to the malware one.
BENIGN_EXE_DIR = EXE_DIR / 'benign'

# Seed for reproducible runs.
RANDOM_STATE = 741

# Tokenizer keyword options; presumably unpacked into the BERT tokenizer call
# (not used in the cells visible here -- confirm at the call site).
TOKENIZER_OPTIONS = {
    'add_special_tokens': True,
    'max_length': 224,
    'padding': 'max_length',
    'return_token_type_ids': False,
    'return_attention_mask': False,
    'truncation': True,
    'return_tensors': 'pt',
}

In [6]:
# Read the train / validation / test splits stored as parquet under DATA_DIR.
df_train, df_valid, df_test = (
    pd.read_parquet(DATA_DIR / parquet_name)
    for parquet_name in ('df_train.parquet', 'df_valid.parquet', 'df_test.parquet')
)

In [None]:
class MalwareDetectionDataset(Dataset, ABC):
    """Dataset that turns executables listed in a DataFrame into image-like tensors.

    For each row the first ``224 * 224`` bytes of ``<HASH>.exe`` are read,
    zero-padded when the file is shorter, reshaped to ``224 x 224`` and
    repeated over three channels.

    The DataFrame must provide the columns ``HASH``, ``LABEL``
    (``'malware'`` or ``'benign'``) and ``LABEL_ID``.
    """

    # Side length of the square byte image built from each executable.
    _IMAGE_SIDE = 224

    def __init__(
            self,
            df: pd.DataFrame,
            bert_tokenizer: str = 'bert-base-uncased',
    ):
        """Store the sample table and load the BERT tokenizer.

        Args:
            df: Sample table, one row per executable.
            bert_tokenizer: Name or path passed to
                ``BertTokenizer.from_pretrained``. The tokenizer is stored on
                the instance but is not used by ``__getitem__``.
        """
        super().__init__()
        self._df = df
        self._tokenizer = BertTokenizer.from_pretrained(bert_tokenizer)

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self._df)

    def __getitem__(self, item):
        """Build the sample at positional index ``item``.

        Returns:
            dict with
              * ``'static'``: ``torch.short`` tensor of shape ``(3, 224, 224)``
                holding the leading bytes of the executable,
              * ``'label_id'``: scalar ``torch.long`` tensor from ``LABEL_ID``.

        Raises:
            NotImplementedError: if ``LABEL`` is neither ``'malware'`` nor
                ``'benign'``.
        """
        row = self._df.iloc[item]

        if row.LABEL == 'malware':
            exe_dir = MALWARE_EXE_DIR
        elif row.LABEL == 'benign':
            exe_dir = BENIGN_EXE_DIR
        else:
            raise NotImplementedError(f'Unsupported label: {row.LABEL!r}')
        file_path = exe_dir / f'{row.HASH}.exe'

        n_bytes = self._IMAGE_SIDE * self._IMAGE_SIDE
        with file_path.open('rb') as _file:
            # read() returns immutable bytes; copy into a bytearray so the
            # buffer can be padded in place.
            data = bytearray(_file.read(n_bytes))

        # Zero-pad files shorter than one full image.
        data.extend(bytes(n_bytes - len(data)))

        # Interpret the buffer as unsigned bytes, then widen to int16; the
        # dtype conversion copies, so the tensor does not alias ``data``.
        img = (
            torch.frombuffer(data, dtype=torch.uint8)
            .to(torch.short)
            .reshape(self._IMAGE_SIDE, self._IMAGE_SIDE)
        )

        # to rgb: replicate the single channel three times
        img = torch.stack([img, img, img])

        return {
            'static': img,
            'label_id': torch.tensor(row.LABEL_ID, dtype=torch.long),
        }


In [54]:
# Show the hash and the label of the second training row, one per line.
print(df_train.iloc[1].HASH, df_train.iloc[1].LABEL, sep='\n')

0624469d0b2ecdef04549cea8cfe1300009c0b2fbdce70b78aa7f66d00220d0b
malware


In [55]:
# Load one malware report for manual inspection. The file is decoded as UTF-8
# explicitly so parsing does not depend on the platform's default encoding.
with open(
        MALWARE_REPORTS_DIR / 'cda1646156e1514310a0e7a3f4e1d3889d136000a8bc21e9ae8729c2339d15e7.json',
        'r',
        encoding='utf-8',
) as file:
    data = json.load(file)

In [56]:
# Display the parsed report; the whole nested dict is rendered as the cell output.
data

{'files': {'data': {'attributes': {'type_description': 'Win32 EXE',
    'tlsh': 'T1C4840152FC534D7AD199B6F5098A5622173DCD8A1B0D40071AFC38A4B9732BBCD8B1EE',
    'vhash': '035046751d757025z30048nz5fz',
    'type_tags': ['executable', 'windows', 'win32', 'pe', 'peexe'],
    'creation_date': 1332087689,
    'names': ['cda1646156e1514310a0e7a3f4e1d3889d136000a8bc21e9ae8729c2339d15e7.exe',
     'c977ca5f506ae887d16aca79845b0838010f6ec3.exe'],
    'last_modification_date': 1679309327,
    'type_tag': 'peexe',
    'times_submitted': 3,
    'total_votes': {'harmless': 0, 'malicious': 0},
    'size': 393216,
    'popular_threat_classification': {'suggested_threat_label': 'trojan.emotetu/byynvrfi',
     'popular_threat_category': [{'count': 24, 'value': 'trojan'},
      {'count': 10, 'value': 'fakeav'}],
     'popular_threat_name': [{'count': 7, 'value': 'emotetu'},
      {'count': 6, 'value': 'byynvrfi'},
      {'count': 2, 'value': 'obfuscator'}]},
    'authentihash': '5b63cc470dca2fa8d07eda731