# Emulation Reports Preprocessing

## Imports

### Standard Library Imports

In [1]:
from typing import Iterable
from pathlib import Path
from collections import Counter
import sys
import numpy as np
import pandas as pd
import re
import logging
import pickle

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from sklearn.preprocessing import MultiLabelBinarizer

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

### Repo Imports

In [2]:
REPO_ROOT = Path("../../")
sys.path.append(REPO_ROOT.as_posix())

from preprocessing.reports import report_to_apiseq
from preprocessing.array import rawseq2array
from utils.functions import flatten

## API Call Sequences
### Data Import

In [3]:
SHA256_PATTERN = r'[a-f0-9]{64}'

def get_name_stem(file_path) -> str:
    """
    Get the base name stem from a file path.
        e.g. "path/to/file/stem.ext" -> "stem"

    Args:
        file_path (str|Path): The path, or the file name.

    Returns:
        str: The base name stem, which is the file name excluding the path
            and the extension.
    """
    base_name = Path(file_path).name
    return base_name.split(".")[0]


def get_api_sequences(emulation_dataset_path: Path, 
                      skip_clean=False) -> pd.DataFrame:
    """
    Extract the sequence of API calls and associated family class, given the 
    base path to the emulation dataset folder containing the JSON-formatted 
    reports. Each family class should be named `report_<FAMILY_CLASS>`, where
    <FAMILY_CLASS> could be e.g. "ransomware" or "clean". Furthermore, each
    emulation report should be named as `<SHA256>.json`, where <SHA256> is the
    digest of the corresponding PE binary.

    Args:
        emulation_dataset_path (Path): base path to the emulation dataset. 
        skip_clean (bool, optional): flag for skipping the extraction of API
            calls by benignware. Defaults to False.

    Returns:
        pd.DataFrame: Table containing the extracted information structured as:
            - pe_hash (str): the sha256 digest of the PE binary;
            - family (str): the family class of the executable;
            - api_sequence (list[str]): the sequence of API calls. 
    """
    logger = logging.getLogger("get_api_sequences")

    assert emulation_dataset_path.exists()
    api_sequences = []
    
    # Iterate over every malware family folder
    # (includes benignware in "report_clean")
    for family_reports_dir in emulation_dataset_path.glob("report_*"):
        family_name = family_reports_dir.name.split("_")[-1]
        if skip_clean and family_name == "clean":
            # Skip benignware reports
            logger.info(f"Skipping benignware ...")
            continue
        
        logger.info(f"Getting family '{family_name}' ...")
        
        # Iterate over all JSON files in the family folder
        # (files with extension `.err` are empty)
        for report_path in family_reports_dir.glob("*.json"):
            malware_hash = get_name_stem(report_path)
            if re.match(SHA256_PATTERN, malware_hash) is None:
                # If the filename is not SHA256 it's not an emulation report
                logger.warning(f"Skipping non-SHA256 '{report_path}' ...")
                continue
            
            report_features = report_to_apiseq(report_path)
            api_sequence = tuple(report_features["api.seq"])
            assert report_features["api.seq.len"] == len(api_sequence)

            api_sequences.append((malware_hash, family_name, api_sequence))
        
    return pd.DataFrame(api_sequences, columns=["pe_hash", "family", 
                                                "api_sequence"])


In [4]:
EMULATION_DATASET_PATH = REPO_ROOT.joinpath("data/emulation.dataset")
PRELOADED_APIS_PATH = Path("./api_sequences.pickle")

reports_df = None
if PRELOADED_APIS_PATH.exists():
    reports_df = pd.read_pickle(PRELOADED_APIS_PATH)
else:
    reports_df = get_api_sequences(EMULATION_DATASET_PATH)
    reports_df.to_pickle(PRELOADED_APIS_PATH)

# Account for multiple classes
reports_df = reports_df.groupby(
    ["pe_hash", "api_sequence"])["family"].apply(tuple).reset_index()

display(reports_df)
display(reports_df["family"].value_counts())

Unnamed: 0,pe_hash,api_sequence,family
0,00004fa66d2c7f7bd21bedcc4c6db127684ef4ee2725c6...,"(kernel32.getsystemtimeasfiletime, kernel32.ge...","(clean,)"
1,0001d0c1054136e470d7742aef8bab28af29c9187f6103...,"(msvcrt.__set_app_type, msvcrt.__p__fmode, msv...","(dropper,)"
2,0002229c96f323e973c59f4f891bd529027a520846b131...,"(kernel32.getsystemtimeasfiletime, kernel32.ge...","(keylogger,)"
3,00029544a86c16fdc6f6d2e78a3931e830f0c93eb39fef...,"(kernel32.getmodulehandlew, kernel32.setthread...","(clean,)"
4,00029a5f91accf32eaa4aab3c661e115424ad5336164fc...,"(kernel32.getmodulehandlea, user32.getkeyboard...","(trojan,)"
...,...,...,...
90020,fffc3b392021a16b621e4aacf10a1483bd940f9befb077...,"(kernel32.getsystemtimeasfiletime, kernel32.ge...","(coinminer,)"
90021,fffdd200ba06ae0bc51ee77f91d7ccf830ae1703ccec21...,"(kernel32.getsystemtimeasfiletime, kernel32.ge...","(clean,)"
90022,fffdef1712ef3713d1895390a0115319eac9afb69a73fd...,"(kernel32.getsystemtimeasfiletime, kernel32.ge...","(coinminer,)"
90023,fffecf791b7337eae1f02531f34d41ea95fbaaffdaa9a8...,"(kernel32.getsystemtimeasfiletime, kernel32.ge...","(dropper,)"


family
(clean,)              25291
(trojan,)             11961
(dropper,)            10888
(backdoor,)           10494
(ransomware,)          9627
(rat,)                 9467
(coinminer,)           6893
(keylogger,)           4573
(backdoor, trojan)      567
(dropper, trojan)       252
(dropper, rat)           10
(backdoor, rat)           1
(keylogger, rat)          1
Name: count, dtype: int64

#### Preprocessing

We encode the input vector *X*, consisting of the API call sequences:
1. Get the list of API sequences and count which APIs are the most frequent.
2. Retain only the $V$ most occurring APIs, where $V$ is the vocabulary size.
    - Default is $V=150$, but published paper selects $V=600$.
3. Assign a numeric label to each API.
    - The lower the number, the most frequent the use of that API.
    - The higher the number, the more that API is encountered rarely.
    - Counter starts at 2. I still need to fully understand this:
        - 0 -> padding;
        - 1 -> rare APIs (e.g. rarer than $V$ ..?)
        - So then why aren't the least frequent APIs encoded with lower numbers?
4. Define a padding length
    - The repo's default value is 150.
    - Based on the model scheme in the published paper padding seems to be 96.
    - But the emulation reports stop after recording 500 API calls.
    - **So which one is it?**

In [26]:
API_VOCABULARY_SIZE = 600
PADDING_LENGTH = 150

def preprocess_api_sequences(api_sequences: np.ndarray) -> np.ndarray:
    """
    Encodes sequences of API calls to integer values based on a globally-defined 
    vocabulary size (V) and padding length.
    
    0      -> Reserved for padding.
    1      -> For APIs rarer than V most frequently occurring APIs.
    2..V+2 -> APIs ranked based on their frequency of occurrence, with the most
                  frequent APIs being encoded with lower values.

    Args:
        api_sequences (np.ndarray): Dataset as bi-dimensional array where each 
            row contains the sequence of API calls made by one sample.

    Returns:
        np.ndarray: The API sequences encoded as integers within [0, V+2].
    """
      
    # Retain only the V most occurring APIs, where V is the vocabulary size  
    api_counter = Counter(flatten(api_sequences))
    apis_preserved = [
        x[0] for x in api_counter.most_common(API_VOCABULARY_SIZE)]
    
    # Encode each API with numeric value
    api_map = dict(zip(apis_preserved, range(2, API_VOCABULARY_SIZE+2)))
    
    return np.vstack([rawseq2array(x, api_map, PADDING_LENGTH) 
                      for x in api_sequences])
    

def get_labels_map(labels: Iterable) -> dict:
    """Converts series of unique labels to map where label -> index"""
    return dict([(label, index) for index, label in enumerate(labels)])


def get_labels_list(labelled_data: Iterable) -> list:
    """
    Removes duplicates from labelled data and returns a list with the 'clean' 
    label in the first place
    """
    labels = list(set(flatten(labelled_data)))
    
    # Sort list to get consistent results
    labels = sorted(labels)
    
    # I like having the "clean" label as the first in the list
    if "clean" in labels:
        labels.remove("clean")
        labels.insert(0, "clean")
        
    return labels


def encode_labels(labelled_data: Iterable, labels_map: dict) -> list:
    labels_encoded = []
    for label_tuple in labelled_data:
        labels_encoded.append(labels_map[label_tuple[0]])
        
    return np.ndarray(labels_encoded)

    
def preprocess_labels(labelled_data: Iterable) -> np.ndarray:
    """Encodes dataset of labels to integer values"""
    labels = get_labels_list(labelled_data)
    
    # Encode all family classes with numeric values
    labels_map = get_labels_map(labels)
    return encode_labels(labelled_data, labels_map)

    

In [27]:
PREPROCESSED_PATH = Path("./encoded_api_sequences.pickle")

X = preprocess_api_sequences(reports_df["api_sequence"].values)
y = preprocess_labels(reports_df["family"].values)

assert len(X) == len(y)

full_dataset = tuple([X, y])

with open(PREPROCESSED_PATH, "wb") as preprocessed_file:
    pickle.dump(full_dataset, preprocessed_file)

ValueError: maximum supported dimension for an ndarray is 32, found 90025

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# Combine training and testing data for visualization
X_combined_vect = np.vstack((X_train, X_test))
# y_combined = np.hstack([
#     [family[0] for family in y_train], 
#     [family[0] for family in y_test]
# ])
y_combined = np.concatenate()

# Perform t-SNE
tsne = TSNE(n_components=2, random_state=42, n_iter=2000)
X_embedded = tsne.fit_transform(X_combined_vect)

# Prepare the labels for coloring
mlb = MultiLabelBinarizer()
y_combined_bin = mlb.fit_transform(y_combined)




TypeError: 'numpy.int64' object is not iterable

In [38]:
def decode_labels(labelled_data: Iterable, labels_decoder: dict) -> np.ndarray:
    labels_decoded = []
    for labels in labelled_data:
        for label in labels:
            labels_decoded.append(np.array([labels_decoder[label]]))
            
    return np.array(labels_decoded)

In [1]:
labels_map = get_labels_map(get_labels_list(reports_df["family"].values))
labels_decoder = {value: key for (key, value) in labels_map.items()}


primary_labels = [labels_decoder[labels[0]] for labels in y_combined]
label_to_color = {label: idx for idx, label in enumerate(set(primary_labels))}

# Map labels to colors
colors = [label_to_color[label] for label in primary_labels]

# Plot the t-SNE results
plt.figure(figsize=(12, 8))
scatter = plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=colors, 
                      cmap='tab10', alpha=0.7)

# Create a legend
handles, _ = scatter.legend_elements(num=len(label_to_color))
plt.legend(handles, label_to_color.keys(), title="Malware Families", 
           bbox_to_anchor=(1.05, 1), loc='upper left')

plt.title('t-SNE Visualization of Enumeration-Embedded API Call Sequences')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.show()

NameError: name 'y_combined' is not defined