In [1]:
import os
import sys
import lief
import time
import struct

import numpy as np
import pickle

import threading
import subprocess
from multiprocessing import Manager, Pool

from tqdm import tqdm

In [2]:
datapath = "../dataset/test"

black_path = []
black_list = []
for parent, dirnames, filenames in os.walk(datapath):
    if "/miner" in parent:
        for filename in filenames:
            fp = os.path.join(parent, filename)
            black_path.append(fp)
            black_list.append(filename)
white_path = []
white_list = []
for parent, dirnames, filenames in os.walk(datapath):
    if "/not_miner" in parent:
        for filename in filenames:
            fp = os.path.join(parent, filename)
            white_path.append(fp)
            white_list.append(filename)
            
print("Found {0} miner samples.".format(len(black_path)))
print("Found {0} not miner samples.".format(len(white_path)))

with open("../models/black_path.pkl", "wb") as f:
    pickle.dump(black_path, f)
with open("../models/black_list.pkl", "wb") as f:
    pickle.dump(black_list, f)
with open("../models/white_path.pkl", "wb") as f:
    pickle.dump(white_path, f)
with open("../models/white_list.pkl", "wb") as f:
    pickle.dump(white_list, f)

test_path = []
for parent, dirnames, filenames in os.walk(datapath):
    for filename in filenames:
        fp = os.path.join(parent, filename)
        test_path.append(fp)

hash_list = [os.path.split(sp)[-1] for sp in test_path]
test_fixed_path = [os.path.join("../tmp", sp) for sp in hash_list]

test_num = len(hash_list)
print("Found {0} samples in total.".format(test_num))

with open("../models/hash_list.pkl", "wb") as f:
    pickle.dump(hash_list, f)

Found 10 miner samples.
Found 10 not miner samples.
Found 20 samples in total.


In [8]:
test_path

['../dataset/test/miner/00a8b0ff6c1a48a69f85657e57d9ed99',
 '../dataset/test/miner/00aaa7dbbd6cf67691a84ea2070fb80a',
 '../dataset/test/miner/00aebe8df3665fd3c9bffb27ed96091b',
 '../dataset/test/miner/00b4d53492887224c5ec460baa0c27c8',
 '../dataset/test/miner/00c0a0f05261d54d4d8d806b4ba888ee',
 '../dataset/test/miner/00ccce26144cb59a8b308c4ba1d5537a',
 '../dataset/test/miner/0a03d7673610d7f8c689213c483d885d',
 '../dataset/test/miner/0a1d16db82b4fc57a99e05f39e915daf',
 '../dataset/test/miner/0a2e1a50b710e21f3d501bde2cea7c9a',
 '../dataset/test/miner/0a4e987dc1c8641dfb7e69de460398d2',
 '../dataset/test/not_miner/000a8c7ef0700888b67c38e08c9cd245',
 '../dataset/test/not_miner/000af59aaf025c9b18dc8f94c45cb381',
 '../dataset/test/not_miner/000cbfd14958df9314a49953129a6ce6',
 '../dataset/test/not_miner/00a0f24620e0227126a062a2d5fba373',
 '../dataset/test/not_miner/00a8869cc1be6a443b4d05e1eaed86a9',
 '../dataset/test/not_miner/00a972a6b45f757a79fe26a01bc4a46d',
 '../dataset/test/not_miner/00ad

In [5]:
emp = threading.Semaphore(value=12)

# ---------------------文件头修复------------------------
def fix_header(fp, ha):
    with open(fp, 'rb') as f:
        data = f.read()
    e_lfnew = data[0x3C: 0x40]
    offset = int.from_bytes(e_lfnew, byteorder='little', signed=True)
    new_data = b"MZ" + data[2:offset] + b"PE\0\0" + data[offset+4:]

    new_path = "../tmp/{0}".format(ha)
    with open(new_path, 'wb') as f:
        f.write(new_data)
    emp.release()

# 修复MZ和PE头
os.system("rm -rf ../tmp")
os.makedirs("../tmp")
table = []
with tqdm(total=test_num, ncols=80, desc="fix") as pbar:
    for fp, ha in zip(test_path, hash_list):
        emp.acquire()
        t = threading.Thread(target=fix_header, args=(fp, ha), daemon=True)
        t.start()
        table.append(t)
        pbar.update(1)
for t in table:
    t.join()

fix: 100%|██████████████████████████████████████| 20/20 [00:00<00:00, 28.75it/s]


In [6]:
from raw_features import ByteHistogram, ByteEntropyHistogram, PEFeatureExtractor

# ---------------------直方图特征------------------------
def histogram_feature(sample_path):
    with open(sample_path, "rb") as f:
        data = f.read()
    file_size = len(data)
    Histogram = ByteHistogram().raw_features(data, None)
    Byte_Entropy = ByteEntropyHistogram().raw_features(data, None)

    Sum = 0
    for i in range(len(Byte_Entropy)):
        Sum += Byte_Entropy[i]

    Histogram = np.array(Histogram) / file_size
    Byte_Entropy = np.array(Byte_Entropy) / Sum

    feature = np.concatenate((Histogram, Byte_Entropy), axis=-1)
    feature = list(feature)
    path = sample_path.replace("tmp", "histogram") + ".txt"
    with open(path, 'w') as f:
        for i in feature:
            f.write("{}\n".format(str(i)))

In [7]:
# 直方图特征
os.system("rm -rf ../histogram")
os.makedirs("../histogram")
start_time = time.time()
with Pool(12) as pool:
    for fp in test_fixed_path:
        pool.apply_async(func=histogram_feature, args=(fp, ))
    pool.close()
    pool.join()
end_time = time.time()
print("hostogram: {0:.2f}s".format(end_time - start_time))
start_time = end_time

hostogram: 0.38s


In [9]:
# test for histogram, display the feature

from raw_features import ByteHistogram, ByteEntropyHistogram, PEFeatureExtractor

sample_path = test_path[0]
with open(sample_path, "rb") as f:
    data = f.read()
file_size = len(data)
file_size

1320448

In [13]:
Histogram = ByteHistogram().raw_features(data, None)
# function: calculate the occurence number of every byte data (0-255)
print(len(Histogram))
print(Histogram)

256
[507704, 5296, 4740, 3584, 5303, 3777, 3649, 4018, 6030, 5502, 2758, 3537, 4121, 3101, 2769, 5633, 7907, 2744, 2868, 2483, 3030, 3017, 2817, 2672, 3681, 2627, 2707, 2822, 3046, 2654, 5551, 2782, 4193, 2526, 2548, 2799, 4394, 2515, 2562, 2690, 3215, 2702, 2546, 3187, 2875, 2875, 2694, 2645, 3414, 3083, 3198, 4213, 3571, 3126, 3331, 3237, 3615, 3172, 3280, 3565, 3391, 3925, 3471, 3777, 3343, 3123, 2653, 2595, 3730, 5118, 3335, 2913, 3331, 3032, 2579, 2591, 3213, 4105, 2996, 2679, 3807, 3124, 2628, 3160, 3137, 3850, 3442, 3199, 2908, 3049, 2649, 2730, 3058, 3381, 2974, 2848, 3149, 2966, 2758, 2708, 3203, 3389, 3740, 2732, 3292, 2803, 3279, 2569, 2871, 2963, 2951, 3186, 3076, 2516, 3214, 3075, 4608, 4505, 2933, 2900, 3199, 2802, 2772, 2587, 2979, 2812, 2720, 2844, 3441, 3244, 3079, 5160, 3384, 4334, 2678, 2618, 2989, 5220, 2995, 9857, 2639, 4074, 2636, 2476, 2934, 2592, 2459, 2625, 2775, 2846, 2843, 2504, 2976, 2621, 2568, 2556, 2790, 2683, 2712, 2685, 2707, 2912, 2610, 2733, 2683, 251

In [14]:
Byte_Entropy = ByteEntropyHistogram().raw_features(data, None)
print(len(Byte_Entropy))
print(Byte_Entropy)

256
[897024, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5842, 19, 19, 24, 23, 24, 22, 19, 13, 13, 22, 18, 33, 13, 23, 17, 1938, 5, 7, 14, 11, 4, 7, 6, 4, 6, 10, 5, 12, 5, 9, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1784, 19, 62, 12, 15, 21, 16, 12, 25, 9, 20, 14, 9, 14, 10, 6, 6508, 445, 176, 84, 59, 55, 63, 92, 207, 57, 106, 71, 52, 66, 97, 54, 9509, 293, 403, 267, 201, 187, 218, 120, 112, 114, 128, 134, 188, 140, 130, 144, 14774, 590, 1259, 341, 327, 245, 260, 334, 330, 278, 337, 374, 264, 260, 254, 253, 1318, 39, 21, 34, 25, 24, 38, 151, 235, 23, 21, 23, 23, 23, 18, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14069, 3241, 857, 4711, 944, 733, 883, 1322, 1820, 596, 624, 706, 676, 713, 746, 2175, 23512, 5405, 2014, 11002, 2325, 2101, 2181, 1839, 2053, 1338, 1383, 1399, 1530, 1421, 1523, 2462, 16143, 4850, 1411, 1710, 1578, 1223, 1404, 1549, 2081, 1420, 1340, 1388, 1456, 1147, 1019, 1241, 10122, 2271, 1393, 2342, 2658, 1187, 1163, 1702, 4927, 1019, 800, 1549, 4114, 

In [16]:
Sum = 0
for i in range(len(Byte_Entropy)):
    Sum += Byte_Entropy[i]

# normalization
Histogram = np.array(Histogram) / file_size
Byte_Entropy = np.array(Byte_Entropy) / Sum

feature = np.concatenate((Histogram, Byte_Entropy), axis=-1)
feature = list(feature)

#print(feature)

In [87]:
from raw_features import PEFeatureExtractor

pe = PEFeatureExtractor()

# ---------------------PE静态特征------------------------

pe_raw_vectors = Manager().list([0] * test_num)

def get_pe_raw_vector(idx, fp, res_default):
    res = res_default
    try:
        with open(fp, 'rb') as f:
            raw_data = f.read()
        res = pe.feature_vector(raw_data)
    except Exception:
        pass
    pe_raw_vectors[idx] = res

In [88]:
# PE静态特征
os.system("rm -rf ../pe_raw")
os.makedirs("../pe_raw")
res_default = np.zeros(shape=(967,), dtype=np.float32)
with Pool(12) as pool:
    for i, fp in enumerate(test_fixed_path):
        pool.apply_async(func=get_pe_raw_vector, args=(i, fp, res_default))
    pool.close()
    pool.join()
with open("../pe_raw/pe_raw_vectors.pkl", "wb") as f:
    print(len(pe_raw_vectors))
    pickle.dump(list(pe_raw_vectors), f)
end_time = time.time()
print("pe raw: {0:.2f}s".format(end_time - start_time))
start_time = end_time

20
pe raw: 19.45s


In [19]:
test_fixed_path[0]

'../tmp/00a8b0ff6c1a48a69f85657e57d9ed99'

In [21]:
for i, fp in enumerate(test_fixed_path):
    print(i, fp)
    break

0 ../tmp/00a8b0ff6c1a48a69f85657e57d9ed99


In [23]:
# test for pe static features, and display them

res_default = np.zeros(shape=(967,), dtype=np.float32)
idx = 0
fp = test_fixed_path[0]
res = res_default
try:
    with open(fp, 'rb') as f:
        raw_data = f.read()
    res = pe.feature_vector(raw_data)
except Exception:
    pass
pe_raw_vectors[idx] = res
print(len(pe_raw_vectors[idx]))
print(pe_raw_vectors[idx])

967
[ 3.84490728e-01  4.01075976e-03  3.58969066e-03  2.71423021e-03
  4.01606131e-03  2.86039291e-03  2.76345597e-03  3.04290676e-03
  4.56663175e-03  4.16676747e-03  2.08868505e-03  2.67863623e-03
  3.12091038e-03  2.34844536e-03  2.09701550e-03  4.26597660e-03
  5.98811917e-03  2.07808265e-03  2.17199023e-03  1.88042235e-03
  2.29467568e-03  2.28483067e-03  2.13336688e-03  2.02355557e-03
  2.78769014e-03  1.98947638e-03  2.05006171e-03  2.13715341e-03
  2.30679289e-03  2.00992380e-03  4.20387648e-03  2.10686075e-03
  3.17543745e-03  1.91298709e-03  1.92964810e-03  2.11973512e-03
  3.32765840e-03  1.90465664e-03  1.94025063e-03  2.03718734e-03
  2.43477966e-03  2.04627519e-03  1.92813342e-03  2.41357484e-03
  2.17729132e-03  2.17729132e-03  2.04021670e-03  2.00310815e-03
  2.58548604e-03  2.33481359e-03  2.42190529e-03  3.19058378e-03
  2.70438520e-03  2.36737845e-03  2.52262875e-03  2.45144079e-03
  2.73770723e-03  2.40221503e-03  2.48400541e-03  2.69984128e-03
  2.56806775e-03  2.9

In [28]:
# test for five features

from raw_features import ByteHistogram, ByteEntropyHistogram, PEFeatureExtractor
from raw_features import GeneralFileInfo, HeaderFileInfo, ExportsInfo, SectionInfo

features = [
    ByteHistogram(), ByteEntropyHistogram(), GeneralFileInfo(),
    HeaderFileInfo(), ExportsInfo(), SectionInfo()
]
print(features) # Generate 6 objects

[histogram(256), byteentropy(256), general(10), header(62), exports(128), section(255)]


In [57]:
# test for pe.feature_vector and pe.raw_features

fp = test_fixed_path[0]
with open(fp, 'rb') as f:
    raw_data = f.read()

bytez = raw_data
lief_binary = lief.PE.parse(list(bytez))
print(lief_binary)

Dos Header
Magic:                        5a4d
Used Bytes In The LastPage:   90
File Size In Pages:           3
Number Of Relocation:         0
Header Size In Paragraphs:    4
Minimum Extra Paragraphs:     0
Maximum Extra Paragraphs:     ffff
Initial Relative SS:          0
Initial SP:                   b8
Checksum:                     0
Initial IP:                   0
Initial Relative CS:          0
Address Of Relocation Table:  40
Overlay Number:               0
OEM id:                       0
OEM info:                     0
Address Of New Exe Header:    130

Rich Header
Key: f6401d48
  - ID: 0x1020 Build ID: 0x64eb Count: 1
  - ID: 0x1000 Build ID: 0x64eb Count: 1
  - ID: 0x1040 Build ID: 0x64eb Count: 5
  - ID: 0x1090 Build ID: 0x64eb Count: 59
  - ID: 0x0000 Build ID: 0x0000 Count: 2
  - ID: 0x1050 Build ID: 0x64eb Count: 16
  - ID: 0x1000 Build ID: 0x0000 Count: 409
  - ID: 0x9300 Build ID: 0x7809 Count: 23
  - ID: 0x8300 Build ID: 0x7809 Count: 2
  - ID: 0x1040 Build ID: 0x64d2 C

In [None]:
# test for features

In [38]:
fe = features[0]

print(len(fe.raw_features(bytez, lief_binary)))
print({fe.name: fe.raw_features(bytez, lief_binary)})

256
{'histogram': [507700, 5296, 4740, 3584, 5303, 3777, 3649, 4018, 6030, 5502, 2758, 3537, 4121, 3101, 2769, 5633, 7907, 2744, 2868, 2483, 3030, 3017, 2817, 2672, 3681, 2627, 2707, 2822, 3046, 2654, 5551, 2782, 4193, 2526, 2548, 2799, 4394, 2515, 2562, 2690, 3215, 2702, 2546, 3187, 2875, 2875, 2694, 2645, 3414, 3083, 3198, 4213, 3571, 3126, 3331, 3237, 3615, 3172, 3280, 3565, 3391, 3925, 3471, 3777, 3343, 3123, 2653, 2595, 3730, 5119, 3335, 2913, 3331, 3032, 2579, 2591, 3213, 4106, 2996, 2679, 3808, 3124, 2628, 3160, 3137, 3850, 3442, 3199, 2908, 3049, 2650, 2730, 3058, 3381, 2974, 2848, 3149, 2966, 2758, 2708, 3203, 3389, 3740, 2732, 3292, 2803, 3279, 2569, 2871, 2963, 2951, 3186, 3076, 2516, 3214, 3075, 4608, 4505, 2933, 2900, 3199, 2802, 2772, 2587, 2979, 2812, 2720, 2844, 3441, 3244, 3079, 5160, 3384, 4334, 2678, 2618, 2989, 5220, 2995, 9857, 2639, 4074, 2636, 2476, 2934, 2592, 2459, 2625, 2775, 2846, 2843, 2504, 2976, 2621, 2568, 2556, 2790, 2683, 2712, 2685, 2707, 2912, 2610, 2

In [37]:
fe = features[2]

print(len(fe.raw_features(bytez, lief_binary)))
print({fe.name: fe.raw_features(bytez, lief_binary)})

10
{'general': {'size': 1320448, 'vsize': 1335296, 'has_debug': 1, 'exports': 1, 'imports': 0, 'has_relocations': 1, 'has_resources': 0, 'has_signature': 0, 'has_tls': 1, 'symbols': 0}}


In [39]:
fe = features[3]

print(len(fe.raw_features(bytez, lief_binary)))
print({fe.name: fe.raw_features(bytez, lief_binary)})

2
{'header': {'coff': {'timestamp': 1514479391, 'machine': 'I386', 'characteristics': ['CHARA_32BIT_MACHINE', 'DEBUG_STRIPPED', 'EXECUTABLE_IMAGE', 'DLL', 'LINE_NUMS_STRIPPED', 'LOCAL_SYMS_STRIPPED']}, 'optional': {'subsystem': 'WINDOWS_GUI', 'dll_characteristics': ['DYNAMIC_BASE', 'NX_COMPAT'], 'magic': 'PE32', 'major_image_version': 0, 'minor_image_version': 0, 'major_linker_version': 13, 'minor_linker_version': 25, 'major_operating_system_version': 5, 'minor_operating_system_version': 1, 'major_subsystem_version': 5, 'minor_subsystem_version': 1, 'sizeof_code': 502784, 'sizeof_headers': 1024, 'sizeof_heap_commit': 24576}}}


In [40]:
fe = features[4]

print(len(fe.raw_features(bytez, lief_binary)))
print({fe.name: fe.raw_features(bytez, lief_binary)})

1
{'exports': ['DllInstall']}


In [41]:
fe = features[5]

print(len(fe.raw_features(bytez, lief_binary)))
print({fe.name: fe.raw_features(bytez, lief_binary)})

2
{'section': {'entry': '.text', 'sections': [{'name': '.text', 'size': 502784, 'entropy': 7.8772955321013445, 'vsize': 502730, 'props': ['CNT_CODE', 'MEM_EXECUTE', 'MEM_READ']}, {'name': '.rdata', 'size': 118272, 'entropy': 7.359519722739576, 'vsize': 118258, 'props': ['CNT_INITIALIZED_DATA', 'MEM_READ']}, {'name': '.data', 'size': 675840, 'entropy': 3.2998353275291605, 'vsize': 683104, 'props': ['CNT_INITIALIZED_DATA', 'MEM_READ', 'MEM_WRITE']}, {'name': '.reloc', 'size': 22528, 'entropy': 5.653888730136691, 'vsize': 22392, 'props': ['CNT_INITIALIZED_DATA', 'MEM_DISCARDABLE', 'MEM_READ']}]}}


In [58]:
import re
import lief
import hashlib
import numpy as np
from sklearn.feature_extraction import FeatureHasher

features_new = {"sha256": hashlib.sha256(bytez).hexdigest()} # actually useless
features_new.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in features})
print(len(features_new))
print(features_new)

7
{'sha256': '677d0b02a531db3a31408cd5ef638a9b5f3ac1dd04b40aaa3c5d4391196f0783', 'histogram': [507700, 5296, 4740, 3584, 5303, 3777, 3649, 4018, 6030, 5502, 2758, 3537, 4121, 3101, 2769, 5633, 7907, 2744, 2868, 2483, 3030, 3017, 2817, 2672, 3681, 2627, 2707, 2822, 3046, 2654, 5551, 2782, 4193, 2526, 2548, 2799, 4394, 2515, 2562, 2690, 3215, 2702, 2546, 3187, 2875, 2875, 2694, 2645, 3414, 3083, 3198, 4213, 3571, 3126, 3331, 3237, 3615, 3172, 3280, 3565, 3391, 3925, 3471, 3777, 3343, 3123, 2653, 2595, 3730, 5119, 3335, 2913, 3331, 3032, 2579, 2591, 3213, 4106, 2996, 2679, 3808, 3124, 2628, 3160, 3137, 3850, 3442, 3199, 2908, 3049, 2650, 2730, 3058, 3381, 2974, 2848, 3149, 2966, 2758, 2708, 3203, 3389, 3740, 2732, 3292, 2803, 3279, 2569, 2871, 2963, 2951, 3186, 3076, 2516, 3214, 3075, 4608, 4505, 2933, 2900, 3199, 2802, 2772, 2587, 2979, 2812, 2720, 2844, 3441, 3244, 3079, 5160, 3384, 4334, 2678, 2618, 2989, 5220, 2995, 9857, 2639, 4074, 2636, 2476, 2934, 2592, 2459, 2625, 2775, 2846, 284

In [51]:
# test process_raw_features

fe = features[0]
raw_obj = features_new

print(raw_obj[fe.name])
print(len(fe.process_raw_features(raw_obj[fe.name])))
print(fe.process_raw_features(raw_obj[fe.name])) # normalization

[507700, 5296, 4740, 3584, 5303, 3777, 3649, 4018, 6030, 5502, 2758, 3537, 4121, 3101, 2769, 5633, 7907, 2744, 2868, 2483, 3030, 3017, 2817, 2672, 3681, 2627, 2707, 2822, 3046, 2654, 5551, 2782, 4193, 2526, 2548, 2799, 4394, 2515, 2562, 2690, 3215, 2702, 2546, 3187, 2875, 2875, 2694, 2645, 3414, 3083, 3198, 4213, 3571, 3126, 3331, 3237, 3615, 3172, 3280, 3565, 3391, 3925, 3471, 3777, 3343, 3123, 2653, 2595, 3730, 5119, 3335, 2913, 3331, 3032, 2579, 2591, 3213, 4106, 2996, 2679, 3808, 3124, 2628, 3160, 3137, 3850, 3442, 3199, 2908, 3049, 2650, 2730, 3058, 3381, 2974, 2848, 3149, 2966, 2758, 2708, 3203, 3389, 3740, 2732, 3292, 2803, 3279, 2569, 2871, 2963, 2951, 3186, 3076, 2516, 3214, 3075, 4608, 4505, 2933, 2900, 3199, 2802, 2772, 2587, 2979, 2812, 2720, 2844, 3441, 3244, 3079, 5160, 3384, 4334, 2678, 2618, 2989, 5220, 2995, 9857, 2639, 4074, 2636, 2476, 2934, 2592, 2459, 2625, 2775, 2846, 2843, 2504, 2976, 2621, 2568, 2556, 2790, 2683, 2712, 2685, 2707, 2912, 2610, 2733, 2683, 2519, 2

In [52]:
fe = features[2]
raw_obj = features_new

print(raw_obj[fe.name])
print(len(fe.process_raw_features(raw_obj[fe.name])))
print(fe.process_raw_features(raw_obj[fe.name])) # normalization

{'size': 1320448, 'vsize': 1335296, 'has_debug': 1, 'exports': 1, 'imports': 0, 'has_relocations': 1, 'has_resources': 0, 'has_signature': 0, 'has_tls': 1, 'symbols': 0}
10
[1.320448e+06 1.335296e+06 1.000000e+00 1.000000e+00 0.000000e+00
 1.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00]


In [54]:
fe = features[3]
raw_obj = features_new

print(len(raw_obj[fe.name]))
print(raw_obj[fe.name])
print(len(fe.process_raw_features(raw_obj[fe.name])))
print(fe.process_raw_features(raw_obj[fe.name])) # normalization

2
{'coff': {'timestamp': 1514479391, 'machine': 'I386', 'characteristics': ['CHARA_32BIT_MACHINE', 'DEBUG_STRIPPED', 'EXECUTABLE_IMAGE', 'DLL', 'LINE_NUMS_STRIPPED', 'LOCAL_SYMS_STRIPPED']}, 'optional': {'subsystem': 'WINDOWS_GUI', 'dll_characteristics': ['DYNAMIC_BASE', 'NX_COMPAT'], 'magic': 'PE32', 'major_image_version': 0, 'minor_image_version': 0, 'major_linker_version': 13, 'minor_linker_version': 25, 'major_operating_system_version': 5, 'minor_operating_system_version': 1, 'major_subsystem_version': 5, 'minor_subsystem_version': 1, 'sizeof_code': 502784, 'sizeof_headers': 1024, 'sizeof_heap_commit': 24576}}
62
[ 1.5144794e+09  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  1.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00 -1.0000000e+00
  0.0000000e+00 -1.0000000e+00  1.0000000e+00 -1.0000000e+00
 -1.0000000e+00  1.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.00000

In [55]:
fe = features[4]
raw_obj = features_new

print(len(raw_obj[fe.name]))
print(raw_obj[fe.name])
print(len(fe.process_raw_features(raw_obj[fe.name])))
print(fe.process_raw_features(raw_obj[fe.name])) # normalization

1
['DllInstall']
128
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


In [56]:
fe = features[5]
raw_obj = features_new

print(len(raw_obj[fe.name]))
print(raw_obj[fe.name])
print(len(fe.process_raw_features(raw_obj[fe.name])))
print(fe.process_raw_features(raw_obj[fe.name])) # normalization

2
{'entry': '.text', 'sections': [{'name': '.text', 'size': 502784, 'entropy': 7.8772955321013445, 'vsize': 502730, 'props': ['CNT_CODE', 'MEM_EXECUTE', 'MEM_READ']}, {'name': '.rdata', 'size': 118272, 'entropy': 7.359519722739576, 'vsize': 118258, 'props': ['CNT_INITIALIZED_DATA', 'MEM_READ']}, {'name': '.data', 'size': 675840, 'entropy': 3.2998353275291605, 'vsize': 683104, 'props': ['CNT_INITIALIZED_DATA', 'MEM_READ', 'MEM_WRITE']}, {'name': '.reloc', 'size': 22528, 'entropy': 5.653888730136691, 'vsize': 22392, 'props': ['CNT_INITIALIZED_DATA', 'MEM_DISCARDABLE', 'MEM_READ']}]}
255
[ 4.0000000e+00  0.0000000e+00  0.0000000e+00  1.0000000e+00
  1.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00 -2.2528000e+04  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  1.1827200e+05
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e

In [None]:
# [histogram(256), byteentropy(256), general(10), header(62), exports(128), section(255)]
# 256+256+10+62+128+255=967

In [68]:
# test for FeatureHasher, use SectionInfo for example
# in class SectionInfo, func: process_raw_features

fe = features[5]
raw_obj = {fe.name: fe.raw_features(bytez, lief_binary)}[fe.name]
print(raw_obj)

{'entry': '.text', 'sections': [{'name': '.text', 'size': 502784, 'entropy': 7.8772955321013445, 'vsize': 502730, 'props': ['CNT_CODE', 'MEM_EXECUTE', 'MEM_READ']}, {'name': '.rdata', 'size': 118272, 'entropy': 7.359519722739576, 'vsize': 118258, 'props': ['CNT_INITIALIZED_DATA', 'MEM_READ']}, {'name': '.data', 'size': 675840, 'entropy': 3.2998353275291605, 'vsize': 683104, 'props': ['CNT_INITIALIZED_DATA', 'MEM_READ', 'MEM_WRITE']}, {'name': '.reloc', 'size': 22528, 'entropy': 5.653888730136691, 'vsize': 22392, 'props': ['CNT_INITIALIZED_DATA', 'MEM_DISCARDABLE', 'MEM_READ']}]}


In [69]:
sections = raw_obj['sections']

# 第一部分：基础统计特征（5维）
general = [
    len(sections),                     # 总节区数量
    sum(1 for s in sections if s['size'] == 0),  # 空尺寸节区数量
    sum(1 for s in sections if s['name'] == ""),  # 无名节区数量 
    sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),  # 可读可执行节区
    sum(1 for s in sections if 'MEM_WRITE' in s['props'])  # 可写节区
]
print(general)

[4, 0, 0, 1, 1]


In [64]:
# 第二部分：哈希技巧转换（每个FeatureHasher生成50维）
# 节区尺寸哈希（名称+尺寸）
section_sizes = [(s['name'], s['size']) for s in sections]
section_sizes_hashed = FeatureHasher(50, input_type="pair").transform([section_sizes]).toarray()[0]
print(section_sizes)
print(section_sizes_hashed)

[('.text', 502784), ('.rdata', 118272), ('.data', 675840), ('.reloc', 22528)]
[      0.       0.       0.       0.       0.  -22528.       0.       0.
       0.       0.       0.       0.       0.       0.  118272.       0.
       0.       0.       0.       0.       0.       0.       0.       0.
       0.       0.       0.       0.       0.       0.       0.       0.
 -675840.       0.       0.       0.       0.       0.       0.       0.
       0. -502784.       0.       0.       0.       0.       0.       0.
       0.       0.]


In [65]:
# 节区熵值哈希（名称+熵）
section_entropy = [(s['name'], s['entropy']) for s in sections]
section_entropy_hashed = FeatureHasher(50, input_type="pair").transform([section_entropy]).toarray()[0]
print(section_entropy)
print(section_entropy_hashed)

[('.text', 7.8772955321013445), ('.rdata', 7.359519722739576), ('.data', 3.2998353275291605), ('.reloc', 5.653888730136691)]
[ 0.          0.          0.          0.          0.         -5.65388873
  0.          0.          0.          0.          0.          0.
  0.          0.          7.35951972  0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.         -3.29983533  0.          0.          0.
  0.          0.          0.          0.          0.         -7.87729553
  0.          0.          0.          0.          0.          0.
  0.          0.        ]


In [66]:
# 节区虚拟大小哈希（名称+虚拟大小）
section_vsize = [(s['name'], s['vsize']) for s in sections]
section_vsize_hashed = FeatureHasher(50, input_type="pair").transform([section_vsize]).toarray()[0]
print(section_vsize)
print(section_vsize_hashed)

[('.text', 502730), ('.rdata', 118258), ('.data', 683104), ('.reloc', 22392)]
[      0.       0.       0.       0.       0.  -22392.       0.       0.
       0.       0.       0.       0.       0.       0.  118258.       0.
       0.       0.       0.       0.       0.       0.       0.       0.
       0.       0.       0.       0.       0.       0.       0.       0.
 -683104.       0.       0.       0.       0.       0.       0.       0.
       0. -502730.       0.       0.       0.       0.       0.       0.
       0.       0.]


In [70]:
# 入口节名称哈希
entry_name_hashed = FeatureHasher(50, input_type="string").transform([raw_obj['entry']]).toarray()[0]
print(raw_obj['entry'])
print(entry_name_hashed)

.text
[ 0.  0.  0. -1.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0. -2.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]


In [71]:
# 入口节属性哈希
characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
characteristics_hashed = FeatureHasher(50, input_type="string").transform([characteristics]).toarray()[0]
print(characteristics)
print(characteristics_hashed)

['CNT_CODE', 'MEM_EXECUTE', 'MEM_READ']
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. -1.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [73]:
# 第三部分：特征拼接（5 + 50*5 = 255维）
result = np.hstack([
    general,                     # 5
    section_sizes_hashed,       # 50
    section_entropy_hashed,     # 50
    section_vsize_hashed,       # 50
    entry_name_hashed,          # 50
    characteristics_hashed     # 50
]).astype(np.float32)
print(result)

[ 4.0000000e+00  0.0000000e+00  0.0000000e+00  1.0000000e+00
  1.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00 -2.2528000e+04  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  1.1827200e+05
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00 -6.7584000e+05  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00 -5.0278400e+05  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
 -5.6538887e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00
  0.0000000e+00  0.00000

In [77]:
print(len(pe_raw_vectors))

20


In [130]:
from feature_engineering import Feature_engineering

fn = Feature_engineering()

# ---------------------特征工程------------------------

feature_engineering_features = Manager().list([0] * test_num)

def get_fn(idx, fp):
    with open(fp, 'rb') as f:
        data = f.read()
    res = fn.get_feature_engineering(data)
    feature_engineering_features[idx] = res
    print(len(res))
    print(res)

In [131]:
# 特征工程
os.system("rm -rf ../feature_engineering")
os.makedirs("../feature_engineering")
with Pool(12) as pool:
    for i, fp in enumerate(test_fixed_path):
        pool.apply_async(func=get_fn, args=(i, fp))
    pool.close()
    pool.join()
end_time = time.time()
print("feature engineering: {0:.2f}s".format(end_time - start_time))

with open("../feature_engineering/feature_engineering_features.pkl", 'wb') as f:
    pickle.dump(list(feature_engineering_features), f)

feature engineering: 2909.40s


In [132]:
# test get_feature_engineering
from collections import ChainMap

from feature_engineering import Feature_engineering
fn = Feature_engineering()

fp = test_fixed_path[0]
with open(fp, 'rb') as f:
    sample_data = f.read()

In [121]:
# test get_section_information

tmp_section = fn.get_section_infomation(sample_data)
section_keys = ["size_R", "size_W", "size_X", "entr_R", "entr_W", "entr_X"]
for k in section_keys:
    file_size = tmp_section['file_size']
    tmp = tmp_section[k]
    tmp_section["{}_weight".format(k)] = tmp / file_size
print(len(tmp_section))
print(tmp_section)

16
{'entry': 5, 'size_R': 329856.0, 'size_W': 675840.0, 'size_X': 502784.0, 'entr_R': 6.047634828126693, 'entr_W': 3.2998353275291605, 'entr_X': 7.8772955321013445, 'rsrc_num': 0, 'section_num': 4, 'file_size': 1320448, 'size_R_weight': 0.24980612640558356, 'size_W_weight': 0.5118262892594029, 'size_X_weight': 0.3807677394338891, 'entr_R_weight': 4.579987116589743e-06, 'entr_W_weight': 2.499027093478244e-06, 'entr_X_weight': 5.96562343394162e-06}


In [122]:
tmp_match = fn.string_match(sample_data)
print(len(tmp_match))
print(tmp_match)

26
{'btc_count': 0, 'btc_mean': 0, 'ltc_count': 1, 'ltc_mean': 26.0, 'xmr_count': 0, 'xmr_mean': 0, 'paths_count': 2, 'paths_mean': 4.0, 'regs_count': 1, 'regs_mean': 3.0, 'urls_count': 0, 'urls_mean': 0, 'ips_count': 1, 'ips_mean': 7.0, 'mz_count': 11, 'mz_mean': 2.0, 'pe_count': 19, 'pe_mean': 2.0, 'pool_count': 7, 'pool_mean': 4.0, 'cpu_count': 10, 'cpu_mean': 3.0, 'gpu_count': 0, 'gpu_mean': 0, 'coin_count': 0, 'coin_mean': 0}


In [123]:
tmp_yara = fn.yara_match(sample_data)
print(len(tmp_yara))
print(tmp_yara)

2
{'packer_count': 1, 'yargen_count': 1}


In [124]:
tmp_count = fn.string_count(sample_data)
print(len(tmp_count))
print(tmp_count)

5
{'av_count': 9, 'dbg_count': 0, 'pool_name_count': 14, 'algorithm_name_count': 2, 'coin_name_count': 7}


In [125]:
tmp_opcode = fn.opcodes(sample_data)
print(len(tmp_opcode))
print(tmp_opcode)

7
{'opcode_min': 3, 'opcode_max': 950, 'opcode_sum': 17497, 'opcode_mean': 52.86102719033233, 'opcode_var': 5681.738994715272, 'opcode_count': 331, 'opcode_uniq': 179}


In [141]:
res_dict = ChainMap(tmp_section, tmp_match, tmp_yara, tmp_count, tmp_opcode)
res = [res_dict[key] for key in fn.feature_keys]
print(res)

[5, 329856.0, 675840.0, 502784.0, 6.047634828126693, 3.2998353275291605, 7.8772955321013445, 0, 4, 1320448, 0.24980612640558356, 0.5118262892594029, 0.3807677394338891, 4.579987116589743e-06, 2.499027093478244e-06, 5.96562343394162e-06, 0, 0, 1, 26.0, 0, 0, 2, 4.0, 1, 3.0, 0, 0, 1, 7.0, 11, 2.0, 19, 2.0, 7, 4.0, 10, 3.0, 0, 0, 0, 0, 1, 1, 9, 0, 14, 2, 7, 3, 950, 17497, 52.86102719033233, 5681.738994715272, 331, 179]
