In [12]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, LongformerTokenizerFast
from tqdm import tqdm

In [13]:
# Filesystem layout, anchored at the directory the kernel was started in.
NOTEBOOKS_DIR = Path('.').resolve()
PROJECT_DIR = NOTEBOOKS_DIR.parents[0]
DATA_DIR = PROJECT_DIR.joinpath('data')
DATASET_DIR = DATA_DIR.joinpath('pe-machine-learning-dataset')
REPORTS_DIR = DATASET_DIR.joinpath('reports')

# Seed shared by every randomized step in this notebook.
RANDOM_STATE = 741

In [14]:
import sys

# Put the project root on the import path so modules living under it can be
# imported from this notebook. The membership check keeps the cell idempotent:
# re-running it does not pile up duplicate entries in sys.path.
_project_root = str(PROJECT_DIR)
if _project_root not in sys.path:
    sys.path.append(_project_root)

In [15]:
# Load the labeled dataset from the data directory.
labeled_path = DATA_DIR / 'labeled_df.parquet'
df = pd.read_parquet(labeled_path)

In [5]:
# Quick sanity check: (rows, columns) of the loaded frame.
df.shape

(36988, 7)

In [17]:
# Class balance: number of samples per label value.
df['label'].value_counts()

label
malware    19856
benign     17132
Name: count, dtype: int64

In [6]:
# Options common to both splits: fixed seed, shuffled rows.
split_options = dict(random_state=RANDOM_STATE, shuffle=True)

# Step 1: hold out 20% of all rows as the test set, stratified by label.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    **split_options,
)

# Step 2: carve 10% of the remaining rows out as the validation set,
# again stratified by label.
df_train, df_valid = train_test_split(
    df_train_full,
    test_size=0.1,
    stratify=df_train_full['label'],
    **split_options,
)

In [7]:
# Replace the shuffled row labels left over from the split with a fresh
# 0..n-1 index on every subset; the old index is discarded.
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)




In [None]:
# Persist each split next to the source data, one Parquet file per subset.
for split_filename, split_frame in (
    ('df_train.parquet', df_train),
    ('df_valid.parquet', df_valid),
    ('df_test.parquet', df_test),
):
    split_frame.to_parquet(DATA_DIR / split_filename)

In [8]:
# Hugging Face tokenizer handed to the text splitter in the next cell.
# Alternative kept for reference:
#   LongformerTokenizerFast.from_pretrained('allenai/longformer-base-4096')
tokenizer_checkpoint = 'kazzand/ru-longformer-tiny-16384'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [9]:
# Splitter whose length function is backed by `tokenizer`; chunk size and
# overlap are passed through unchanged to the splitter constructor.
splitter_options = {
    'chunk_size': 10_240,
    'chunk_overlap': 4_096,
}
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    **splitter_options,
)

In [10]:
from detectify.extractors import VirusTotalFeatureExtractor

In [11]:
def _build_chunk_frame(samples):
    """Expand every sample of ``samples`` into one row per text chunk.

    For each row, the JSON report ``REPORTS_DIR / '<sha256>.json'`` is loaded
    with ``VirusTotalFeatureExtractor``, the strings returned by
    ``extract_all(error=None)`` are joined with newlines, and the resulting
    text is split with the module-level ``text_splitter``.

    Parameters
    ----------
    samples : pandas.DataFrame
        Must expose the columns ``sha256``, ``filename``, ``label`` and
        ``label_id``.

    Returns
    -------
    pandas.DataFrame
        Columns ``FILENAME``, ``HASH``, ``TEXT``, ``LABEL``, ``LABEL_ID``;
        one row per chunk, with the sample-level values repeated on each of
        its chunks. A sample whose text yields no chunks contributes no rows.
    """
    columns = {
        'FILENAME': [],
        'HASH': [],
        'TEXT': [],
        'LABEL': [],
        'LABEL_ID': [],
    }

    # itertuples avoids building a Series per row; the row index is not needed.
    for sample in tqdm(samples.itertuples(index=False), total=len(samples)):
        report_path = REPORTS_DIR / f'{sample.sha256}.json'

        extractor = VirusTotalFeatureExtractor.from_json(report_path)
        # NOTE(review): the meaning of `error=None` is defined by the
        # extractor and is not visible from this notebook; confirm there.
        corpus = '\n'.join(extractor.extract_all(error=None))
        chunks = text_splitter.split_text(corpus)

        # Repeat the sample-level metadata once per chunk so that all
        # columns stay aligned with TEXT.
        n_chunks = len(chunks)
        columns['FILENAME'].extend([sample.filename] * n_chunks)
        columns['HASH'].extend([sample.sha256] * n_chunks)
        columns['TEXT'].extend(chunks)
        columns['LABEL'].extend([sample.label] * n_chunks)
        columns['LABEL_ID'].extend([sample.label_id] * n_chunks)

    return pd.DataFrame(columns)


# Build and persist the chunk-level dataset for every split. The result is
# written straight to disk instead of being bound to the loop variable.
for _df, _filename in (
    (df_train, 'df_train_chunks.parquet'),
    (df_valid, 'df_valid_chunks.parquet'),
    (df_test, 'df_test_chunks.parquet'),
):
    _build_chunk_frame(_df).to_parquet(DATA_DIR / _filename)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 26631/26631 [14:03<00:00, 31.58it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 2959/2959 [01:25<00:00, 34.61it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 7398/7398 [03:28<00:00, 35.40it/s]
