# 系统演示

In [1]:
# List the contents of the current working directory by running the Windows
# `dir` command through IPython's `!` shell escape.
!dir

 驱动器 F 中的卷是 新加卷
 卷的序列号是 A8C1-0BCD

 F:\course\gp\Packer-Classifier 的目录

2019/05/27  17:49    <DIR>          .
2019/05/27  17:49    <DIR>          ..
2019/05/27  13:11    <DIR>          .ipynb_checkpoints
2019/05/10  22:07    <DIR>          .vscode
2019/05/27  17:03    <DIR>          Datasets
2019/05/27  16:56    <DIR>          experiments
2019/05/27  17:12    <DIR>          gadgets
2019/05/27  17:18            12,509 img_train.py
2019/05/18  00:28            15,774 ins_train.py
2019/05/27  17:19            17,872 main_train.py
2019/05/27  17:12    <DIR>          my_models
2019/05/27  13:42    <DIR>          my_sandbox
2019/05/12  22:07    <DIR>          pin
2019/05/27  17:49            18,847 presentation.ipynb
2019/05/27  13:46    <DIR>          pre_data
2019/04/20  09:13               407 README.md
2019/05/27  17:20    <DIR>          __pycache__
               5 个文件         65,409 字节
              12 个目录 64,239,796,224 可用字节


## 相关参数

In [2]:
from pathlib import Path

In [3]:

# Settings used throughout this notebook to locate the saved experiment
# artifacts and to rebuild the model.
config = dict(
    seed=519,
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    # Experiment directory, resolved against the current working directory.
    save_dir=Path.cwd().joinpath(
        "experiments",
        "main",
        "1558276575_74f6ab40-7a43-11e9-8b2b-0242ac1c0002",
    ),
    # ODEnet
    input_dim=3,
    state_dim=64,
    tol=5e-5,
    # GRU
    cutoff=25,
    num_layers=1,
    embedding_dim=100,
    kernels=[1, 3],
    num_filters=100,
    rnn_hidden_dim=64,
    hidden_dim=36,
    dropout_p=0.5,
    bidirectional=False,
    # Hyperparameters; the three-element list is [train, validation, test].
    state_size=[0.7, 0.15, 0.15],
    batch_size=26,
    num_epochs=50,
    early_stopping_criteria=5,
    learning_rate=1e-5,
)

## 特征提取准备

### Sandbox类

* vmrun_path：vmrun.exe路径
* vmx_path：虚拟机.vmx路径
* vm_snapshot：虚拟机快照名（本例中为 real）
* vm_user：虚拟机用户名
* vm_pass：虚拟机密码
* script_path：虚拟机内Python脚本路径
* python_path：虚拟机内Python路径
* malware_path：虚拟机内暂存样本路径
* timeout：运行脚本超时时间

In [4]:
from my_sandbox.get_features import Sandbox

In [5]:
# Configure the VMware-backed sandbox used for feature extraction.
# NOTE(review): the VM credentials and the machine-specific absolute paths
# are hard-coded in this cell.
sb = Sandbox(
    # vmrun.exe and the virtual machine's .vmx file
    vmrun_path=r"E:\VMware\vmrun.exe",
    vmx_path=r"D:\虚拟机\Win10\Windows 10 x64.vmx",
    # Snapshot name and the VM login
    vm_snapshot="real",
    vm_user="msi",
    vm_pass="123456",
    # Locations inside the VM: analysis script, Python interpreter, sample staging directory
    script_path=r"C:\Users\msi\Desktop\my_sandbox_script.py",
    python_path=r"C:\Users\msi\AppData\Local\Programs\Python\Python35\python.exe",
    malware_path=r"C:\Malware",
    # Timeout for running the script
    timeout=10,
)

## 预处理准备

In [6]:
from pre_data.preprocess import Preproce

In [7]:
# Create the preprocessing helper; its preprocess() method is applied to the
# extracted features further down in this notebook.
pp = Preproce()

## 加载向量器

In [8]:
import json
from Datasets.datasets import Vectorizer

In [9]:
# Locate the serialized vectorizer inside the experiment directory and
# rebuild it from its JSON representation.
vectorizer_filepath = config['save_dir'].joinpath(config['vectorizer_file'])
vectorizer = Vectorizer.from_serializable(
    json.loads(vectorizer_filepath.read_text())
)

In [10]:
# Show each vocabulary held by the vectorizer, one per line.
for vocab_attr in ("image_vocab", "ins_word_vocab", "ins_char_vocab", "packer_vocab"):
    print(getattr(vectorizer, vocab_attr))

<IMG_SequenceVocabulary(train_means: [128.47567749023438, 93.32111358642578, 38.726646423339844], train_stds: [59.69920349121094, 89.16869354248047, 61.76454162597656]>
<INS_SequenceVocabulary(size=106)>
<INS_SequenceVocabulary(size=34)>
<Vocabulary(size=7)>


## 加载模型

In [11]:
from main_train import MainModel
import torch

In [12]:
# Rebuild the network from the notebook configuration. The embedding table
# sizes, the output dimension and the padding indices are taken from the
# loaded vectorizer's vocabularies.
model = MainModel(
    input_dim=config["input_dim"],
    state_dim=config["state_dim"],
    tol=config["tol"],
    embedding_dim=config["embedding_dim"],
    num_word_embeddings=len(vectorizer.ins_word_vocab),
    num_char_embeddings=len(vectorizer.ins_char_vocab),
    kernels=config["kernels"],
    num_input_channels=config["embedding_dim"],
    num_output_channels=config["num_filters"],
    rnn_hidden_dim=config["rnn_hidden_dim"],
    hidden_dim=config["hidden_dim"],
    output_dim=len(vectorizer.packer_vocab),
    num_layers=config["num_layers"],
    bidirectional=config["bidirectional"],
    dropout_p=config["dropout_p"],
    word_padding_idx=vectorizer.ins_word_vocab.mask_index,
    char_padding_idx=vectorizer.ins_char_vocab.mask_index)

# map_location="cpu" remaps every stored tensor onto the CPU while loading, so
# a checkpoint that was saved from CUDA tensors can still be restored on a
# machine without a GPU. This notebook runs the model on the CPU throughout.
model.load_state_dict(
    torch.load(config['save_dir'] / config['model_state_file'],
               map_location="cpu"))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [13]:
# Keep the model on the CPU for this demo.
model = model.cpu()
# NOTE(review): this prints the repr of the bound `named_modules` method, which
# embeds the module tree (see the recorded output); the method is not called.
print(model.named_modules)

<bound method Module.named_modules of MainModel(
  (img_layer): IngModel(
    (downsampling_layers): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1))
      (1): GroupNorm(32, 64, eps=1e-05, affine=True)
      (2): ReLU(inplace)
      (3): Conv2d(64, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
      (4): GroupNorm(32, 64, eps=1e-05, affine=True)
      (5): ReLU(inplace)
      (6): Conv2d(64, 64, kernel_size=(4, 4), stride=(2, 2), padding=(1, 1))
    )
    (feature_layers): ODEBlock(
      (odefunc): ODEfunc(
        (norm1): GroupNorm(32, 64, eps=1e-05, affine=True)
        (relu): ReLU(inplace)
        (conv1): Conv2d(65, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (norm2): GroupNorm(32, 64, eps=1e-05, affine=True)
        (conv2): Conv2d(65, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (norm3): GroupNorm(32, 64, eps=1e-05, affine=True)
      )
    )
    (fc_layers): Sequential(
      (0): GroupNorm(32, 64, eps=1e-0

## 预测类

In [14]:
class Inference(object):
    """Single-sample prediction wrapper around a trained model and its vectorizer.

    Both prediction methods vectorize the raw ``(image, ins)`` pair with
    ``vectorizer.vectorize``, add a batch dimension of one, and call the model
    with ``device="cpu"`` and ``apply_softmax=True``.
    """

    def __init__(self, model, vectorizer):
        """Store the model and the vectorizer used by the prediction methods."""
        self.model = model
        self.vectorizer = vectorizer

    def _forward(self, image, ins):
        """Vectorize one sample and run the model on it as a batch of one.

        Args:
            image: raw image input accepted by ``vectorizer.vectorize``.
            ins: raw instruction input accepted by ``vectorizer.vectorize``.

        Returns:
            The ``(attn_scores, y_pred)`` pair returned by the model.
        """
        # Vectorize
        image_vector, ins_word_vector, ins_char_vector, ins_length = \
            self.vectorizer.vectorize(image, ins)

        # Add a leading batch dimension of size 1 to every input.
        image_vector = torch.FloatTensor(image_vector).unsqueeze(0)
        ins_word_vector = torch.tensor(ins_word_vector).unsqueeze(0)
        ins_char_vector = torch.tensor(ins_char_vector).unsqueeze(0)
        ins_length = torch.tensor([ins_length]).long()

        # Predict: evaluation mode, and no autograd graph is needed for inference.
        self.model.eval()
        with torch.no_grad():
            return self.model(x_img=image_vector,
                              x_word=ins_word_vector,
                              x_char=ins_char_vector,
                              x_lengths=ins_length,
                              device="cpu",
                              apply_softmax=True)

    def predict_packer(self, image, ins):
        """Predict the most probable packer for one sample.

        Returns:
            dict with keys ``'packer'`` (label looked up in the vectorizer's
            packer vocabulary), ``'probability'`` (Python float) and
            ``'attn_scores'`` (the first value returned by the model).
        """
        attn_scores, y_pred = self._forward(image, ins)

        # Take the class with the highest probability.
        y_prob, indices = y_pred.max(dim=1)
        index = indices.item()

        # Predicted packer type. Uses the instance's own vectorizer rather than
        # a module-level global, so the class works outside this notebook.
        packer = self.vectorizer.packer_vocab.lookup_index(index)
        probability = y_prob.item()
        return {'packer': packer, 'probability': probability,
                'attn_scores': attn_scores}

    def predict_top_k(self, image, ins, k):
        """Return the ``k`` most probable packers for one sample.

        Args:
            image: raw image input accepted by ``vectorizer.vectorize``.
            ins: raw instruction input accepted by ``vectorizer.vectorize``.
            k: number of classes to return (passed to ``torch.topk``).

        Returns:
            list of ``{'packer': label, 'probability': float}`` dicts in the
            order produced by ``torch.topk`` (highest probability first).
        """
        _, y_pred = self._forward(image, ins)

        # Top k; the batch holds one sample, so take row 0 and convert to plain
        # Python scalars, matching what predict_packer returns.
        y_prob, indices = torch.topk(y_pred, k=k)
        probabilities = y_prob[0].tolist()
        indices = indices[0].tolist()

        # Results
        results = []
        for probability, index in zip(probabilities, indices):
            packer = self.vectorizer.packer_vocab.lookup_index(index)
            results.append({'packer': packer, 'probability': probability})

        return results

# 开始

## 输入

In [15]:
# Path of the executable to classify (hard-coded for this demo); it is handed
# to sb.get_features() in the feature-extraction step.
input_path = r"C:\Users\msi\Desktop\aspack变形.exe"

## 特征提取

In [16]:
# Pass the path of the file to inspect to the sandbox and get its features back.
features = sb.get_features(input_path)

--> revertToSnapshot
--> start
--> copyFileFromHostToGuest
--> runProgramInGuest
--> copyFileFromGuestToHost
--> stop
--> get features
--> completed!


In [17]:
# Peek at the first line of the extracted features.
features.partition("\n")[0]

'0x0044d001 9c --> pushfd '

## 预处理

In [18]:
# Run the preprocessing step on the extracted features; the result is indexed
# by 'image' and 'ins' in the cells that follow.
data_dict = pp.preprocess(features)

In [19]:
# Take the first image entry and the first instruction entry of the preprocessed data.
img, ins = data_dict['image'][0], data_dict['ins'][0]

## 分类

In [20]:
# Classify the sample and report the single most probable packer.
inference = Inference(vectorizer=vectorizer, model=model)
prediction = inference.predict_packer(
    image=data_dict['image'][0],
    ins=data_dict['ins'][0],
)
print("{} → {} (p={:0.2f})".format(
    input_path, prediction['packer'], prediction['probability']))

C:\Users\msi\Desktop\aspack变形.exe → ASPack (p=1.00)


## 输出

In [21]:
# Top-k: request as many entries as there are packer classes, then list each
# class with its predicted probability.
top_k = inference.predict_top_k(image=img, ins=ins,
                                k=len(vectorizer.packer_vocab))
print("{}: ".format(input_path))
for entry in top_k:
    print("{} (p={:0.2f})".format(entry['packer'], entry['probability']))

C:\Users\msi\Desktop\aspack变形.exe: 
ASPack (p=1.00)
PeSpin (p=0.00)
Molebox (p=0.00)
Normal (p=0.00)
NsPack (p=0.00)
PECompact (p=0.00)
UPX (p=0.00)
