# Import Useful Data

## Data Paths Setup

In [None]:
from pathlib import Path
import sys

# Repository root, expressed relative to the notebook's working directory.
# It is appended to sys.path (presumably so that project-local packages can be
# imported further down — TODO confirm).
ROOT_DIR = Path("../..")
sys.path.append(ROOT_DIR.as_posix())
DATA_DIR = ROOT_DIR / "data"

# Emulation dataset; it must already exist on disk.
EMULATED_DATA_DIR = DATA_DIR / "emulation.dataset"
assert EMULATED_DATA_DIR.exists()

# File-path dataset: one text file with benign paths, one with malicious paths.
PATHS_DATA_DIR = DATA_DIR / "path.dataset"
assert PATHS_DATA_DIR.exists()
PATHS_BENIGN_TXT = PATHS_DATA_DIR / "dataset_benign_win10.txt"
PATHS_MALICIOUS_TXT = PATHS_DATA_DIR / "dataset_malicious_augmented.txt"
assert PATHS_BENIGN_TXT.exists() and PATHS_MALICIOUS_TXT.exists()

In [None]:

# NOTE(review): these statements repeat the path-dataset setup performed at the
# end of the previous cell; this cell looks redundant.
PATHS_DATA_DIR = DATA_DIR / "path.dataset"
assert PATHS_DATA_DIR.exists()
PATHS_BENIGN_TXT = PATHS_DATA_DIR / "dataset_benign_win10.txt"
PATHS_MALICIOUS_TXT = PATHS_DATA_DIR / "dataset_malicious_augmented.txt"
assert PATHS_BENIGN_TXT.exists() and PATHS_MALICIOUS_TXT.exists()

## Importing the Datasets

In [2]:
import pandas as pd
import numpy as np
from notebooks.misc.utils import get_api_sequences

# Registry of datasets, keyed by data type. Each entry is an (X, y) pair and
# stays (None, None) until the corresponding data has been loaded.
datasets = {data_type: (None, None) for data_type in ("paths", "emulation")}

### Importing Paths

In [42]:
def _read_paths(txt_file):
    """Return the lines of ``txt_file`` with the trailing newline removed.

    ``readlines()`` keeps the line terminator on every entry, which would leave
    a stray "\\n" at the end of each path string. Only the newline is stripped,
    so any other trailing whitespace that belongs to a path is preserved.
    """
    with open(txt_file, "r") as file:
        return [line.rstrip("\n") for line in file]


benign_paths = _read_paths(PATHS_BENIGN_TXT)
malicious_paths = _read_paths(PATHS_MALICIOUS_TXT)

# The reader always returns a list, so check for emptiness instead of None.
assert benign_paths and malicious_paths, "one of the path datasets is empty"

# Benign paths come first, followed by the malicious ones; the label list
# follows the same order (False = benign, True = malicious).
all_paths = benign_paths + malicious_paths
paths_is_malicious = [False] * len(benign_paths) + [True] * len(malicious_paths)
assert len(all_paths) == len(paths_is_malicious)

print("Overall paths amount:\t", len(all_paths))
print("Benign paths amount:\t", len(benign_paths))
print("Malicious paths amount:\t", len(malicious_paths))

datasets["paths"] = (all_paths, paths_is_malicious)

Overall paths amount:	 346689
Benign paths amount:	 122409
Malicious paths amount:	 224280


### Importing Emulation Reports

In [4]:
# Cache file for the parsed API sequences, stored next to the notebook.
PRELOADED_APIS_PATH = Path("./api_sequences.pickle")

if not PRELOADED_APIS_PATH.exists():
    # No cache yet: build the frame from the emulation dataset and store it.
    reports_df = get_api_sequences(EMULATED_DATA_DIR)
    reports_df.to_pickle(PRELOADED_APIS_PATH)
else:
    # NOTE(review): unpickling can execute arbitrary code; only load a cache
    # file that was produced by this notebook.
    reports_df = pd.read_pickle(PRELOADED_APIS_PATH)

# keep=False discards every row whose "pe_hash" occurs more than once,
# rather than keeping one representative of each duplicate group.
reports_df = (
    reports_df
    .drop_duplicates("pe_hash", keep=False)
    .reset_index(drop=True)
)

# X: the API sequences; y: True for every sample whose family is not "clean".
emulation_is_malicious = (reports_df["family"] != "clean").values
datasets["emulation"] = (reports_df["api_sequence"].values,
                         emulation_is_malicious)

In [5]:
# Display the (X, y) pair registered for the emulation dataset.
datasets["emulation"]

(array([('kernel32.getsystemtimeasfiletime', 'kernel32.getcurrentprocessid', 'kernel32.getcurrentthreadid', 'kernel32.gettickcount', 'kernel32.queryperformancecounter', 'kernel32.getstartupinfow', 'kernel32.heapsetinformation', 'kernel32.heapcreate', 'kernel32.getmodulehandlew', 'kernel32.getprocaddress', 'kernel32.getprocaddress', 'kernel32.getprocaddress', 'kernel32.getprocaddress', 'kernel32.tlsalloc', 'kernel32.tlssetvalue', 'kernel32.encodepointer', 'kernel32.encodepointer', 'kernel32.encodepointer', 'kernel32.encodepointer', 'kernel32.encodepointer', 'kernel32.encodepointer', 'kernel32.initializecriticalsectionandspincount', 'kernel32.initializecriticalsectionandspincount', 'kernel32.initializecriticalsectionandspincount', 'kernel32.initializecriticalsectionandspincount', 'kernel32.initializecriticalsectionandspincount', 'kernel32.initializecriticalsectionandspincount', 'kernel32.initializecriticalsectionandspincount', 'kernel32.initializecriticalsectionandspincount', 'kernel32.i

# Training ML Models

## Paths

In [50]:
from preprocessing.text import normalize_path
from preprocessing.array import pad_array, byte_filter, remap


# Default length handed to pad_array for every encoded path vector.
PADDING_LENGTH = 150


def path_to_vector(path, padding_length=PADDING_LENGTH) -> np.ndarray:
    """Encode a single path as a vector of byte values.

    The path is normalised with ``normalize_path``, encoded as UTF-8
    (encoding errors are ignored), converted to an integer array of byte
    values, and finally passed to ``pad_array``.

    Args:
        path: Path string to encode.
        padding_length: Length argument forwarded to ``pad_array``.

    Returns:
        The array returned by ``pad_array``.
    """
    encoded = normalize_path(path).encode("utf-8", "ignore")
    byte_values = np.array([*encoded], dtype=int)
    return pad_array(byte_values, padding_length)

def preprocess_paths(paths, padding_length=PADDING_LENGTH) -> np.ndarray:
    """Encode every path and stack the resulting vectors row by row.

    Args:
        paths: Iterable of path strings.
        padding_length: Length argument forwarded to ``path_to_vector``.

    Returns:
        The ``np.vstack`` of the per-path vectors, in input order.
    """
    return np.vstack([path_to_vector(path, padding_length) for path in paths])

### Preprocessing

#### Paths Normalisation and Encoding
Firstly, it is necessary to **normalise** a path. This means substituting: 
1. drive letters with [drive] 
2. network hosts with [net] 
3. arbitrary, non-default usernames with [user] 
4. environment variables with their full-path equivalents

Then, the resulting string undergoes UTF-8 **encoding** — ignoring errors.

Finally, the byte array is padded or truncated to the predefined length (default
is 150).

In [36]:
# Encode the raw path strings. `all_paths` is used rather than
# datasets["paths"][0] because a later cell replaces that entry with the
# remapped vectors, which would make this cell fail when it is re-run.
preprocessed_paths = preprocess_paths(all_paths)
preprocessed_paths

array([[ 91, 100, 114, ...,   0,   0,   0],
       [ 91, 100, 114, ...,   0,   0,   0],
       [ 91, 100, 114, ...,   0,   0,   0],
       ...,
       [ 91, 100, 114, ...,   0,   0,   0],
       [ 91, 100, 114, ...,   0,   0,   0],
       [ 91, 100, 114, ...,   0,   0,   0]])

#### Byte Filtering
The N most frequently occurring bytes are kept in each byte vector; every other
byte is filtered out. By default, N is 150.

Any character that is not among the 150 most common characters is replaced
with the integer value '1'.

The authors select the N+1 most common characters, because the padding label
(i.e. '0') is likely going to be the most common.

In [68]:
from collections import Counter


# Number of distinct byte values to keep.
N_KEEP_BYTES = 150

# Count how often each byte value occurs across all encoded paths.
byte_counter = Counter(preprocessed_paths.flatten())

# One extra entry is requested because the padding value 0 is expected to be
# among the most common ones. Each row is (byte value, count); only the byte
# values in the first column are kept.
top_byte_counts = np.array(byte_counter.most_common(N_KEEP_BYTES + 1))
keep_bytes = top_byte_counts[:, 0]

filtered_paths = byte_filter(preprocessed_paths, keep_bytes)

#### Remapping Bytes
The bytes are remapped for embedding purposes. This involves sorting each byte
appearing in the filtered dataset, adding the label values '0' and '1', and
enumerating them.

In [90]:
def get_bytes_map(used_bytes) -> dict[int, int]:
    """Map every used byte value to a dense, deterministic index.

    The label values 0 and 1 are always included. Values are sorted before
    being enumerated, so the mapping does not depend on set iteration order
    and 0 and 1 keep the indices 0 and 1 for non-negative byte values.

    Args:
        used_bytes: Iterable of integer byte values (Python or NumPy integers).

    Returns:
        Dictionary ``{byte_value: index}`` with plain ``int`` keys, where the
        index is the rank of the byte value in ascending order.
    """
    byte_values = {0, 1}.union(int(byte_value) for byte_value in used_bytes)
    return {byte_value: index
            for index, byte_value
            in enumerate(sorted(byte_values))}

    

In [None]:
# Build the byte value -> index lookup from the kept bytes.
bytes_map = get_bytes_map(keep_bytes)

# Apply the lookup to the filtered path vectors.
remapped_paths = remap(filtered_paths, bytes_map)

array([[25, 32, 46, ...,  0,  0,  0],
       [25, 32, 46, ...,  0,  0,  0],
       [25, 32, 46, ...,  0,  0,  0],
       ...,
       [25, 32, 46, ...,  0,  0,  0],
       [25, 32, 46, ...,  0,  0,  0],
       [25, 32, 46, ...,  0,  0,  0]])

In [94]:
# Sanity check: there must be exactly one label per remapped path vector.
assert len(remapped_paths) == len(datasets["paths"][1])

# Replace the first element of the "paths" entry with the remapped vectors;
# the labels (second element) are carried over unchanged.
datasets["paths"] = (remapped_paths, datasets["paths"][1])

#### Path Normalisation Demo

In [93]:
# Take the raw path string from `all_paths`: by the time this cell runs in
# notebook order, datasets["paths"][0] already holds the remapped vectors.
example_path = all_paths[-1]
example_path_norm = normalize_path(example_path)
example_path_vect = path_to_vector(example_path)
# Reducing the keep bytes for demonstration purposes
# Had to make a copy because byte_filter applies changes to the input vector
example_path_filt = byte_filter(example_path_vect.copy(), keep_bytes[:15])
example_path_rmap = remap(example_path_filt, get_bytes_map(keep_bytes[:15]))


print("Example path:\n\t", example_path)
print("Normalised path:\n\t", example_path_norm)
print("Encoded path:\n", example_path_vect)
print("Filtered path:\n", example_path_filt)
print("Remapped path:\n", example_path_rmap)


Example path:
	 C:\Users\myuser\AppData\Roaming\\Microsoft\Windows\Start Menu\VkKeUfTH.iso

Normalised path:
	 [drive]\users\[user]\appdata\roaming[net]\windows\start menu\vkkeufth.iso
Encoded path:
 [ 91 100 114 105 118 101  93  92 117 115 101 114 115  92  91 117 115 101
 114  93  92  97 112 112 100  97 116  97  92 114 111  97 109 105 110 103
  91 110 101 116  93  92 119 105 110 100 111 119 115  92 115 116  97 114
 116  32 109 101 110 117  92 118 107 107 101 117 102 116 104  46 105 115
 111   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0]
Filtered path:
 [  1 100 114 105   1 101   1  92 117 115 101 114 115  92   1 117 115 101
 114   1  92  97   1   1 100  97 116  97  92 114 111  97 109 105 110   1
   1 110 101 116   1  92 119

### Model Setup and Training

In [104]:
import torch
from torch import nn, optim
from models import Filepath

# Embedding dimension handed to the model and number of training epochs.
EMBEDDING_SIZE = 64
N_EPOCHS = 50

# Features: a copy of the remapped path vectors.
X = remapped_paths.copy()
# Labels: booleans converted to 0/1 integers. `np.int` was removed in
# NumPy 1.24; an explicit 64-bit dtype also matches torch.LongTensor.
y = np.array(datasets["paths"][1], dtype=np.int64)

In [105]:
# Wrap features and labels as long tensors and serve them in shuffled
# mini-batches of 1024 samples.
features_tensor = torch.LongTensor(X)
labels_tensor = torch.LongTensor(y)
train_dataset = torch.utils.data.TensorDataset(features_tensor, labels_tensor)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=1024,
    shuffle=True,
)

In [106]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

# Path model built from the kept bytes, with the configured embedding size.
quovadis_path = Filepath(keep_bytes, device, embedding_dim=EMBEDDING_SIZE)

# Adam without weight decay, optimising a cross-entropy objective.
optimizer = optim.Adam(
    quovadis_path.model.parameters(),
    lr=1e-3,
    weight_decay=0,
)
loss_function = nn.CrossEntropyLoss()

quovadis_path.fit(N_EPOCHS, optimizer, loss_function, train_loader)

        [!] Tue Nov  5 17:17:11 2024: Dumped results:
                model: 1730827031-model.torch
                train loss list: 1730827031-train_losses.pickle
                train metrics : 1730827031-train_metrics.pickle
                duration: 1730827031-duration.pickle
