# Emulation Reports Preprocessing

## Imports

### Standard Library Imports

In [13]:
from typing import Iterable
from pathlib import Path
from collections import Counter
import sys
import numpy as np
import pandas as pd
import re
import logging
import pickle

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

### Repo Imports

In [2]:
# Location of the repository root, expressed relative to the current working
# directory of the notebook.
REPO_ROOT = Path("../../")
# Extend the module search path with the repo root so that the repo-local
# packages imported right below can be resolved.
sys.path.append(REPO_ROOT.as_posix())

from preprocessing.reports import report_to_apiseq
from preprocessing.array import rawseq2array
from utils.functions import flatten

## API Call Sequences
### Data Import

In [3]:
# Regular expression for a SHA-256 hex digest: 64 lowercase hexadecimal
# characters. The pattern carries no anchors of its own, so whether the whole
# string must match depends on the `re` function it is used with.
SHA256_PATTERN = r'[a-f0-9]{64}'

def get_name_stem(file_path) -> str:
    """
    Return the file name stripped of its directory and of every extension.
        e.g. "path/to/file/stem.ext" -> "stem"

    Everything from the first dot of the file name onwards is discarded, so
    "archive.tar.gz" yields "archive".

    Args:
        file_path (str|Path): A path to a file, or just a file name.

    Returns:
        str: The part of the file name that precedes its first dot.
    """
    # partition() splits on the first "." only; the head is all we need.
    stem, _, _ = Path(file_path).name.partition(".")
    return stem


def get_api_sequences(emulation_dataset_path: Path, 
                      skip_clean=False) -> pd.DataFrame:
    """
    Extract the sequence of API calls and associated family class, given the 
    base path to the emulation dataset folder containing the JSON-formatted 
    reports. Each family class should be named `report_<FAMILY_CLASS>`, where
    <FAMILY_CLASS> could be e.g. "ransomware" or "clean". Furthermore, each
    emulation report should be named as `<SHA256>.json`, where <SHA256> is the
    digest of the corresponding PE binary.

    Family folders and the reports inside them are visited in sorted order,
    so the row order of the result does not depend on the filesystem.

    Args:
        emulation_dataset_path (Path): base path to the emulation dataset. 
        skip_clean (bool, optional): flag for skipping the extraction of API
            calls by benignware. Defaults to False.

    Returns:
        pd.DataFrame: Table containing the extracted information structured as:
            - pe_hash (str): the sha256 digest of the PE binary;
            - family (str): the family class of the executable;
            - api_sequence (list[str]): the sequence of API calls. 

    Raises:
        AssertionError: if `emulation_dataset_path` does not exist, or if a
            report's declared sequence length ("api.seq.len") disagrees with
            the length of its "api.seq" entry.
    """
    logger = logging.getLogger("get_api_sequences")

    assert emulation_dataset_path.exists(), (
        f"Emulation dataset path '{emulation_dataset_path}' does not exist")
    api_sequences = []

    # Iterate over every malware family folder
    # (includes benignware in "report_clean").
    # glob() yields entries in arbitrary order, hence the explicit sort.
    for family_reports_dir in sorted(emulation_dataset_path.glob("report_*")):
        family_name = family_reports_dir.name.split("_")[-1]
        if skip_clean and family_name == "clean":
            # Skip benignware reports
            logger.info("Skipping benignware ...")
            continue

        logger.info(f"Getting family '{family_name}' ...")

        # Iterate over all JSON files in the family folder
        # (files with extension `.err` are empty)
        for report_path in sorted(family_reports_dir.glob("*.json")):
            malware_hash = get_name_stem(report_path)
            # fullmatch: the stem must be a SHA256 digest in its entirety.
            # re.match would only anchor the start and accept trailing junk.
            if re.fullmatch(SHA256_PATTERN, malware_hash) is None:
                # If the filename is not SHA256 it's not an emulation report
                logger.warning(f"Skipping non-SHA256 '{report_path}' ...")
                continue

            report_features = report_to_apiseq(report_path)
            api_sequence = report_features["api.seq"]
            # Sanity check: the report's own length field must agree with
            # the sequence it carries.
            assert report_features["api.seq.len"] == len(api_sequence)

            api_sequences.append((malware_hash, family_name, api_sequence))

    return pd.DataFrame(api_sequences, columns=["pe_hash", "family", 
                                                "api_sequence"])


In [4]:
EMULATION_DATASET_PATH = REPO_ROOT / "data/emulation.dataset"
PRELOADED_APIS_PATH = Path("./api_sequences.pickle")

# The extracted table is cached as a pickle next to the notebook: build it
# from the raw reports only when no cache exists yet, otherwise reload it.
# NOTE: unpickling executes arbitrary code; only load a cache file that this
# notebook produced itself.
reports_df = None
if not PRELOADED_APIS_PATH.exists():
    reports_df = get_api_sequences(EMULATION_DATASET_PATH)
    reports_df.to_pickle(PRELOADED_APIS_PATH)
else:
    reports_df = pd.read_pickle(PRELOADED_APIS_PATH)

# Show the table itself, then the number of records per family class.
display(reports_df)
display(reports_df["family"].value_counts())

Unnamed: 0,pe_hash,family,api_sequence
0,0009064322cdc719a82317553b805cbbc64230a9212d3b...,backdoor,"[kernel32.getsystemtimeasfiletime, kernel32.ge..."
1,001180134ff5ae8227e97f5eb8411e5c8ef599e2500eb3...,backdoor,"[kernel32.virtualalloc, kernel32.virtualfree, ..."
2,00166c0bab8daf5076c4c4fb5319a178cc8c6744820a3c...,backdoor,"[kernel32.getsystemtimeasfiletime, kernel32.ge..."
3,0024c4ef15fd7ecc3642669b9c79f0f42679858d134e2d...,backdoor,"[crtdll.__getmainargs, kernel32.getcommandline..."
4,002a878755f82b7e2e04fabc845438718df3cfab044292...,backdoor,"[kernel32.getsystemtimeasfiletime, kernel32.ge..."
...,...,...,...
90851,ffd226d3ba1864f81b46ba0ce92ff9b01c03f8bc2b07e9...,trojan,"[kernel32.loadlibrarya, kernel32.getprocaddres..."
90852,ffd674f0b9db3f930d6c8c8451f9d4745a1f7a7d16e692...,trojan,"[kernel32.getsystemtimeasfiletime, kernel32.ge..."
90853,ffe16bbe7704df47f50f38471301ef065d1ee1b855bfba...,trojan,"[kernel32.getversion, kernel32.heapcreate, ker..."
90854,fff0ddbb6cbe9c91bf0fc9bb8b22744d648685eb16dc52...,trojan,"[crtdll.__getmainargs, kernel32.getcommandline..."


clean         25291
trojan        12780
dropper       11150
backdoor      11062
ransomware     9627
rat            9479
coinminer      6893
keylogger      4574
Name: family, dtype: int64

In [5]:
# Some of these samples belong to more than one family
# Some of these samples belong to more than one family: flag every record
# whose hash occurs more than once (keep=False marks all occurrences, not
# just the repeats) and group equal hashes together by sorting.
shares_hash = reports_df.duplicated("pe_hash", keep=False)
multi_family_reports = reports_df[shares_hash].sort_values("pe_hash")

# One row per distinct hash gives the number of affected samples.
distinct_multi_family = multi_family_reports.drop_duplicates("pe_hash")
print("Number of records with more than one family:",
      len(distinct_multi_family))

# Largest number of records (i.e. family folders) sharing a single hash.
records_per_hash = multi_family_reports.value_counts("pe_hash")
print("Highest amount of family classes per unique sample:",
      records_per_hash.max())

display(multi_family_reports)

Number of records with more than one family: 831
Highest amount of family classes per unique sample: 2


Unnamed: 0,pe_hash,family,api_sequence
78094,0044bcd77b1d93a45ba883f5b0083ae27b25dccd082ad6...,trojan,"[gdiplus.gdiplusstartup, user32.getdesktopwind..."
43264,0044bcd77b1d93a45ba883f5b0083ae27b25dccd082ad6...,dropper,"[gdiplus.gdiplusstartup, user32.getdesktopwind..."
15,0062c46371328306ac7aed15be7a1d2157827099df6d5b...,backdoor,"[crtdll.__getmainargs, kernel32.getcommandline..."
78100,0062c46371328306ac7aed15be7a1d2157827099df6d5b...,trojan,"[crtdll.__getmainargs, kernel32.getcommandline..."
78101,006e18ee15c89e54fba783106e93e30d611de889d7031b...,trojan,"[user32.loadicona, user32.loadcursora, user32...."
...,...,...,...
54362,ff23b2ed2a4378cdcf84b5904962dd56b5e3d2000d427d...,dropper,"[kernel32.gettickcount, kernel32.getmodulehand..."
11038,ff87adc14e0af5b56819d2a6edef0d3fa6671bb2cd1034...,backdoor,"[crtdll.__getmainargs, kernel32.getcommandline..."
90834,ff87adc14e0af5b56819d2a6edef0d3fa6671bb2cd1034...,trojan,"[crtdll.__getmainargs, kernel32.getcommandline..."
11044,ffa0e971ccd8e2204412340608bd91f4aef6ba847151ed...,backdoor,"[crtdll.__getmainargs, kernel32.getcommandline..."


#### Preprocessing

We encode the input vector *X*, consisting of the API call sequences:
1. Get the list of API sequences and count which APIs are the most frequent.
2. Retain only the $V$ most frequently occurring APIs, where $V$ is the vocabulary size.
    - Default is $V=150$, but the published paper selects $V=600$.
3. Assign a numeric label to each API.
    - The lower the number, the more frequently that API is used.
    - The higher the number, the more rarely that API is encountered.
    - Numbering starts at 2 because the two lowest values are reserved. I still need to fully understand this:
        - 0 -> padding;
        - 1 -> rare APIs (i.e. those that are not among the $V$ most frequent ones ..?)
        - So then why aren't the least frequent APIs encoded with lower numbers?
4. Define a padding length
    - The repo's default value is 150.
    - Based on the model scheme in the published paper, the padding length seems to be 96.
    - But the emulation reports stop after recording 500 API calls.
    - **So which one is it?**

In [6]:
# Number of most frequently occurring API names that receive their own
# integer code in `preprocess_api_sequences`.
API_VOCABULARY_SIZE = 600
# Length value handed to `rawseq2array` for every encoded sequence.
PADDING_LENGTH = 96

def preprocess_api_sequences(api_sequences: np.ndarray,
                             vocabulary_size=None,
                             padding_length=None) -> np.ndarray:
    """
    Encodes sequences of API calls to integer values based on a vocabulary 
    size (V) and a padding length.

    The V most frequent API names across the whole dataset are ranked by
    frequency and mapped to the codes 2..V+1, the most frequent API getting
    the lowest code. If the dataset contains fewer than V distinct APIs, only
    that many codes are assigned. The actual encoding and padding of each
    sequence is delegated to `rawseq2array`, which is expected to use:

    0      -> Reserved for padding.
    1      -> For APIs rarer than V most frequently occurring APIs.

    (NOTE(review): the meaning of 0 and 1 is implemented by `rawseq2array`,
    which is not visible here -- confirm against that helper.)

    Args:
        api_sequences (np.ndarray): Dataset where each entry contains the 
            sequence of API calls made by one sample.
        vocabulary_size (int, optional): Number of most frequent APIs that
            get their own code. Defaults to the module-level
            `API_VOCABULARY_SIZE`, read at call time.
        padding_length (int, optional): Length passed to `rawseq2array` for
            each sequence. Defaults to the module-level `PADDING_LENGTH`,
            read at call time.

    Returns:
        np.ndarray: The encoded sequences stacked row-wise, one row per
            sample, with vocabulary codes within [2, V+1].
    """
    # Resolve the defaults lazily so that re-assigning the module-level
    # constants still takes effect without redefining this function.
    if vocabulary_size is None:
        vocabulary_size = API_VOCABULARY_SIZE
    if padding_length is None:
        padding_length = PADDING_LENGTH

    # Retain only the V most occurring APIs, where V is the vocabulary size
    api_counter = Counter(flatten(api_sequences))
    apis_preserved = [
        api for api, _ in api_counter.most_common(vocabulary_size)]

    # Encode each API with a numeric value; codes start at 2 because the
    # two lowest values are reserved (see docstring).
    api_map = {api: code for code, api in enumerate(apis_preserved, start=2)}

    return np.vstack([rawseq2array(x, api_map, padding_length) 
                      for x in api_sequences])
    

def labels_list_to_map(labels: Iterable) -> dict:
    """
    Build a lookup table from each label to its position in `labels`.

    If a label occurs more than once, its last position wins.
    """
    return {label: position for position, label in enumerate(labels)}


def get_labels_list(labelled_data: Iterable) -> list:
    """
    Returns the distinct labels found in the labelled data, sorted, with the
    'clean' label (when present) moved to the first position.
    """
    # Deduplicate, then sort so the result is the same on every run
    ordered = sorted(set(labelled_data))

    if "clean" not in ordered:
        return ordered

    # "clean" always goes first; the other labels keep their sorted order
    others = [label for label in ordered if label != "clean"]
    return ["clean"] + others

    
def preprocess_labels(labelled_data: Iterable) -> np.ndarray:
    """
    Encodes every label of the dataset as an integer.

    The integer of a label is its position in the list returned by
    `get_labels_list`, so 'clean' (when present) is encoded as 0.
    """
    # Map each distinct family class to its numeric code
    label_to_code = labels_list_to_map(get_labels_list(labelled_data))

    encoded = [label_to_code[label] for label in labelled_data]
    return np.array(encoded)

    

In [15]:
PREPROCESSED_PATH = Path("./preprocessed_api_sequences.pickle")

# Encode the inputs (API call sequences) and the targets (family classes)
X = preprocess_api_sequences(reports_df["api_sequence"].values)
y = preprocess_labels(reports_df["family"].values)

# Every encoded sequence must have exactly one label
assert len(X) == len(y)

# Persist inputs and labels together as a single (X, y) tuple
full_dataset = (X, y)

with PREPROCESSED_PATH.open("wb") as preprocessed_file:
    pickle.dump(full_dataset, preprocessed_file)