# Import Libraries

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import time
import os
import pefile
import io
import pickle

import colorama
from colorama import Fore, Style, Back

# Initialise colorama with autoreset enabled for the coloured console output below.
colorama.init(autoreset=True)

from collections import Counter

# Global plot style and pandas display options for the notebook.
sns.set_style("whitegrid")
pd.set_option("display.max_columns", None)  # no limit on displayed columns
pd.set_option("display.max_colwidth", None)  # no limit on displayed column width
pd.set_option("display.float_format", lambda x: "%.2f" % x)  # floats shown with two decimals

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from mlxtend.plotting import plot_confusion_matrix
from scikitplot.metrics import plot_roc_curve

from IPython.display import Markdown

def bold(string):
    """Display ``string`` as bold Markdown in the notebook output.

    Args:
        string: Text to wrap in ``**`` markers and render.
    """
    # ``display`` is only available implicitly inside an IPython session;
    # importing it explicitly keeps this helper working when the notebook is
    # exported and executed as a plain Python script.
    from IPython.display import display

    display(Markdown("**" + string + "**"))

# Project-relative output location and a fixed seed for NumPy's global RNG.
PROJECT_ROOT_DIR = ".."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")  # directory used by save_fig
np.random.seed(42)

def save_fig(title):
    """Save the current matplotlib figure as ``<IMAGES_PATH>/<title>.png``.

    Args:
        title: File name (without extension) for the saved image.
    """
    # savefig does not create missing directories, so make sure the target
    # folder exists before writing.
    os.makedirs(IMAGES_PATH, exist_ok=True)
    path = os.path.join(IMAGES_PATH, title + ".png")
    plt.tight_layout()
    plt.savefig(path, format="png", dpi=300)

# Suppress three warning categories for the rest of the notebook.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Load Data

In [None]:
# Load the dataset CSV and preview its first rows.
data = pd.read_csv("../dataset/file_pe.csv")
data.head()

In [None]:
# Working frame: the raw data without the columns that are not used as features.
df = data.drop(
    columns=[
        "Name",
        "SuspiciousImportFunctions",
        "SuspiciousNameSection",
        "DirectoryEntryImportSize",
    ]
)

In [None]:
# (rows, columns) of the working frame after dropping the unused columns.
df.shape

In [None]:
def df_stats(data):
    """Print a structural summary of the DataFrame ``data``.

    The sections are printed in this order: shape, column dtypes, missing
    values per column, number of duplicated rows, and the memory-usage
    figure taken from the text produced by ``data.info``.

    Args:
        data: DataFrame to summarise.
    """

    def banner(label):
        # Centre the label inside a 50-character run of '#' and show it bold.
        bold("**" + label.center(50, "#") + "**")

    banner(" SHAPE ")
    print(f"ROWS: {data.shape[0]}")
    print(f"COLUMNS: {data.shape[1]}")

    banner(" TYPES ")
    print(data.dtypes)

    banner(" MISSING VALUES ")
    print(data.isnull().sum())

    banner(" DUPLICATED VALUES ")
    print(f"NUMBER OF DUPLICATED VALUES: {data.duplicated().sum()}")

    banner(" MEMORY USAGE ")
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    # Take the second-to-last line of the info() text and keep the part
    # between its first and second colon.
    info_lines = info_buffer.getvalue().split("\n")
    memory_usage = info_lines[-2].split(":")[1].strip()
    print(f"Memory Usage: {memory_usage}")

In [None]:
# Print the summary for the working frame.
df_stats(df)

# EDA

In [None]:
target = "Malware"

# Feature columns split by dtype; the target column is excluded from both lists.
numerical_variables = [
    col
    for col in df.columns
    if col != target and pd.api.types.is_numeric_dtype(df[col])
]
# The dtype test is parenthesised so that ``col != target`` applies to both
# alternatives ("and" binds tighter than "or").
categorical_variables = [
    col
    for col in df.columns
    if col != target
    and (pd.api.types.is_categorical_dtype(df[col]) or df[col].dtype == "O")
]

In [None]:
# List the numerical feature columns.
bold("NUMERICAL VARIABLES")
print(numerical_variables)

In [None]:
# List the categorical feature columns.
bold("CATEGORICAL VARIABLES")
print(categorical_variables)

# TARGET VALUE DISTRIBUTION

In [None]:
def plot_count(df, col, title):
    """Plot the value counts of ``df[col]`` as a donut chart and a bar chart.

    The left panel is a two-layer pie with a white centre circle; the right
    panel is a horizontal bar chart annotated with the raw counts. The figure
    is saved through ``save_fig`` and then shown.

    Args:
        df: DataFrame holding the column to summarise.
        col: Name of the column whose distinct values are counted.
        title: Figure-level title.
    """
    fig, ax = plt.subplots(1, 2, figsize=(18, 6))
    plt.subplots_adjust(wspace=0.2)

    values = df[col].value_counts()
    N = len(values)

    outer_pie = values
    inner_pie = values / N  # same counts, scaled by the number of categories

    # Outer ring: category labels; the last wedge is pulled out further.
    ax[0].pie(
        outer_pie,
        labels=values.index.tolist(),
        startangle=90,
        frame=False,
        radius=1.3,
        explode=([0.05] * (N - 1) + [0.3]),
        wedgeprops={"linewidth": 1, "edgecolor": "white"},
        textprops={"fontsize": 12, "weight": "bold"},
    )

    # Inner ring: percentage annotations.
    ax[0].pie(
        inner_pie,
        radius=1,
        startangle=90,
        autopct="%1.f%%",
        explode=([0.1] * (N - 1) + [0.3]),
        pctdistance=0.8,
        textprops={"size": 13, "weight": "bold", "color": "white"},
    )

    # White disc in the middle turns the pie into a donut.
    center_circle = plt.Circle((0, 0), 0.7, color="black", fc="white", linewidth=0)
    ax[0].add_artist(center_circle)

    # Draw on the right-hand axes explicitly instead of relying on whichever
    # axes happens to be current.
    sns.barplot(x=values, y=values.index.tolist(), orient="horizontal", ax=ax[1])

    for i, v in enumerate(values):
        ax[1].text(v, i, str(v), color="black", fontweight="bold", fontsize=13)

    plt.setp(ax[1].get_yticklabels(), fontweight="bold")
    plt.setp(ax[1].get_xticklabels(), fontweight="bold")
    # Bars are horizontal: counts run along x, categories along y.
    ax[1].set_xlabel("count", fontweight="bold", color="black")
    ax[1].set_ylabel(col, fontweight="bold", color="black")

    fig.suptitle(f"{title}", fontsize=18, fontweight="bold")
    plt.tight_layout()
    save_fig("target_value_distribution")
    plt.show()

In [None]:
# Distribution of the target column.
plot_count(df, col="Malware", title="Malware Distribution")

In [None]:
def plot_num(df, columns):
    """Draw a histogram with KDE for each column on the current figure.

    The plots are laid out two per row. Each panel marks the column mean
    (dashed red) and median (solid green) and shows a legend for both lines.
    The figure is saved through ``save_fig`` and then shown.

    Args:
        df: DataFrame containing the columns to plot.
        columns: Names of the numerical columns to plot.
    """
    # Two panels per row, rounded up, so an even count has no empty row.
    n_rows = (len(columns) + 1) // 2

    for i, column in enumerate(columns):
        plt.subplot(n_rows, 2, i + 1)
        sns.histplot(x=column, data=df, bins=30, kde=True)
        plt.axvline(df[column].mean(), color="r", linestyle="--", label="Mean")
        plt.axvline(df[column].median(), color="g", linestyle="-", label="Median")
        # Request the grid explicitly; a bare grid() call toggles its state.
        plt.grid(True)
        # Without a legend the Mean/Median labels are never displayed.
        plt.legend()
        plt.title(f"{column} Distribution")

    # save_fig applies tight_layout once for the whole figure.
    save_fig("numerical_variable_distribution")
    plt.show()

In [None]:
# Figure height scales with the number of numerical columns.
plt.figure(figsize=(20, len(numerical_variables) * 2.5))
plot_num(df, numerical_variables)

# Feature Scaling

In [None]:
# Shuffle the rows with a fixed seed so that re-running this cell yields the
# same ordering (and therefore the same train/test split) every time.
df = df.sample(frac=1, random_state=42)
X = df.drop("Malware", axis=1)  # feature matrix
y = df["Malware"]  # target labels

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train

In [None]:
# Registry of the classifiers to compare, keyed by display name.
# Insertion order determines the order of rows in the result tables.
models = {
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "LinearSVC": LinearSVC(),
    "SVC": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Extra Tree": ExtraTreeClassifier(),
    "Logistic Regression": LogisticRegression(),
    "SGD": SGDClassifier(),
    "XGB": XGBClassifier(),
    "LGBM": LGBMClassifier(verbose=0),
    "CatBoost": CatBoostClassifier(verbose=0),
}

In [None]:
# Per-model metrics, keyed by the model's display name.
train_time, test_time, accuracy, precision, recall, f1 = {}, {}, {}, {}, {}, {}

# The pickled models are written here; create the folder if it is missing.
os.makedirs("../models", exist_ok=True)

for key, model in models.items():
    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    train_time[key] = time.perf_counter() - start_time

    start_time = time.perf_counter()
    predictions = model.predict(X_test)
    test_time[key] = time.perf_counter() - start_time

    accuracy[key] = accuracy_score(y_test, predictions)
    precision[key] = precision_score(y_test, predictions)
    recall[key] = recall_score(y_test, predictions)
    f1[key] = f1_score(y_test, predictions)

    cm = confusion_matrix(y_test, predictions)

    # Persist the fitted model for later use.
    pkl_filename = f"../models/{key}.pkl"
    with open(pkl_filename, "wb") as file:
        pickle.dump(model, file)

    # plot_confusion_matrix creates its own figure, so no separate
    # plt.figure() call is needed (that would leave an empty figure behind).
    fig, ax = plot_confusion_matrix(conf_mat=cm, show_absolute=True, show_normed=True, colorbar=True, figsize=(8, 8))
    ax.set_title(f"{key}")
    save_fig(f"{key} Confusion Matrix")

    # The report is a string; print it so it actually appears in the output.
    print(f"{key}")
    print(classification_report(y_test, predictions, target_names=["benign", "malware"]))

# Results

In [None]:
# One row per model, one column per collected metric.
results_df = pd.DataFrame(
    {
        column: list(scores.values())
        for column, scores in (
            ("Train Time", train_time),
            ("Test Time", test_time),
            ("Accuracy", accuracy),
            ("Precision", precision),
            ("Recall", recall),
            ("F1", f1),
        )
    },
    index=list(models.keys()),
)

results_df

In [None]:
# Horizontal bar chart of every metric for every model.
ax = results_df.plot.barh()
ax.legend(
    # One legend column per plotted metric (the legend entries are the
    # DataFrame columns, not the models).
    ncol=len(results_df.columns),
    bbox_to_anchor=(0, 1),
    loc='lower left',
    prop={'size': 7}
)
plt.tight_layout()

In [None]:
# Six stacked bar charts, one per metric, each annotated with its bar values.
plt.figure(figsize=(15, 25))

panels = [
    ("Accuracy", "Model / Accuracy Score"),
    ("F1", "Model / F1 Score"),
    ("Precision", "Model / Precision Score"),
    ("Recall", "Model / Recall Score"),
    ("Train Time", "Model / Train Time"),
    ("Test Time", "Model / Test Time"),
]

for position, (metric, panel_title) in enumerate(panels, start=1):
    plt.subplot(6, 1, position)
    ax = sns.barplot(data=results_df, x=results_df.index, y=metric)
    for container in ax.containers:
        ax.bar_label(container)
    plt.title(panel_title)

save_fig("results")
plt.show()

# Test

In [None]:
# Header fields copied one-to-one (attribute name == column name), listed in
# the column order the feature frame is built in.
_DOS_HEADER_FIELDS = (
    "e_magic", "e_cblp", "e_cp", "e_crlc", "e_cparhdr", "e_minalloc",
    "e_maxalloc", "e_ss", "e_sp", "e_csum", "e_ip", "e_cs", "e_lfarlc",
    "e_ovno", "e_oemid", "e_oeminfo", "e_lfanew",
)
_FILE_HEADER_FIELDS = (
    "Machine", "NumberOfSections", "TimeDateStamp", "PointerToSymbolTable",
    "NumberOfSymbols", "SizeOfOptionalHeader", "Characteristics",
)
_OPTIONAL_HEADER_FIELDS = (
    "Magic", "MajorLinkerVersion", "MinorLinkerVersion", "SizeOfCode",
    "SizeOfInitializedData", "SizeOfUninitializedData", "AddressOfEntryPoint",
    "BaseOfCode", "ImageBase", "SectionAlignment", "FileAlignment",
    "MajorOperatingSystemVersion", "MinorOperatingSystemVersion",
    "MajorImageVersion", "MinorImageVersion", "MajorSubsystemVersion",
    "MinorSubsystemVersion", "SizeOfHeaders", "CheckSum", "SizeOfImage",
    "Subsystem", "DllCharacteristics", "SizeOfStackReserve",
    "SizeOfStackCommit", "SizeOfHeapReserve", "SizeOfHeapCommit",
    "LoaderFlags", "NumberOfRvaAndSizes",
)
# (output column, key into pefile.DIRECTORY_ENTRY) for the data-directory sizes.
_DATA_DIRECTORY_FIELDS = (
    ("ImageDirectoryEntryExport", "IMAGE_DIRECTORY_ENTRY_EXPORT"),
    ("ImageDirectoryEntryImport", "IMAGE_DIRECTORY_ENTRY_IMPORT"),
    ("ImageDirectoryEntryResource", "IMAGE_DIRECTORY_ENTRY_RESOURCE"),
    ("ImageDirectoryEntryException", "IMAGE_DIRECTORY_ENTRY_EXCEPTION"),
    ("ImageDirectoryEntrySecurity", "IMAGE_DIRECTORY_ENTRY_SECURITY"),
)


def _min_max(values):
    """Return ``(min, max)`` of ``values``, or ``(0, 0)`` when it is empty."""
    return (min(values), max(values)) if values else (0, 0)


def _extract_pe_features(pe):
    """Return an ordered ``{column: value}`` dict of features for one parsed PE."""
    features = {}

    for field in _DOS_HEADER_FIELDS:
        features[field] = getattr(pe.DOS_HEADER, field)
    for field in _FILE_HEADER_FIELDS:
        features[field] = getattr(pe.FILE_HEADER, field)
    for field in _OPTIONAL_HEADER_FIELDS:
        features[field] = getattr(pe.OPTIONAL_HEADER, field)

    sections = list(pe.sections)
    features["SectionsLength"] = len(sections)

    # Statistics are taken over every section directly. Keying by the decoded
    # section name would crash on non-UTF-8 names and silently merge sections
    # that share a name.
    # NOTE(review): if the training CSV was generated with name-keyed dicts,
    # files with duplicate section names may differ slightly - confirm.
    low, high = _min_max([section.get_entropy() for section in sections])
    features["SectionMinEntropy"], features["SectionMaxEntropy"] = low, high

    low, high = _min_max([section.SizeOfRawData for section in sections])
    features["SectionMinRawsize"], features["SectionMaxRawsize"] = low, high

    low, high = _min_max([section.Misc_VirtualSize for section in sections])
    features["SectionMinVirtualsize"], features["SectionMaxVirtualsize"] = low, high

    # From here on the "Max" column precedes the "Min" column; the order is
    # kept because the resulting column order is part of the output.
    low, high = _min_max([section.Misc_PhysicalAddress for section in sections])
    features["SectionMaxPhysical"], features["SectionMinPhysical"] = high, low

    low, high = _min_max([section.VirtualAddress for section in sections])
    features["SectionMaxVirtual"], features["SectionMinVirtual"] = high, low

    low, high = _min_max([section.PointerToRawData for section in sections])
    features["SectionMaxPointerData"], features["SectionMinPointerData"] = high, low

    low, high = _min_max([section.Characteristics for section in sections])
    # "SectionMainChar" holds the minimum; the column name is kept as-is.
    features["SectionMaxChar"], features["SectionMainChar"] = high, low

    # These attributes are absent when the file has no import/export table.
    features["DirectoryEntryImport"] = len(getattr(pe, "DIRECTORY_ENTRY_IMPORT", ()))
    export_directory = getattr(pe, "DIRECTORY_ENTRY_EXPORT", None)
    features["DirectoryEntryExport"] = len(getattr(export_directory, "symbols", ()))

    directories = pe.OPTIONAL_HEADER.DATA_DIRECTORY
    for column, entry_name in _DATA_DIRECTORY_FIELDS:
        index = pefile.DIRECTORY_ENTRY[entry_name]
        # A header may declare fewer directories than the index asks for.
        features[column] = directories[index].Size if index < len(directories) else 0

    return features


def analyze(df):
    """Fill ``df`` with PE header and section features for each file in ``df["Name"]``.

    Every row's ``Name`` is treated as a file path and parsed with
    ``pefile``. The extracted values are written into ``df`` in place, one
    column per feature. Files that cannot be opened or parsed are reported
    and skipped, leaving their feature cells unset.

    Args:
        df: DataFrame with a ``Name`` column of file paths. Any index is
            accepted; it does not have to be ``0..n-1``.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    for i in df.index:
        file_path = str(df.loc[i, "Name"])
        try:
            pe = pefile.PE(file_path)
        except Exception as exc:  # best effort: one bad file must not stop the batch
            print(f"[!] Skipping {file_path}: {exc}")
            continue

        try:
            features = _extract_pe_features(pe)
        finally:
            # Release the file handle / mapping held by the parser.
            pe.close()

        for column, value in features.items():
            df.loc[i, column] = value
    return df

In [None]:
def test(file_path, models, scan_file):
    """Classify files with every model and print a majority-vote verdict.

    Args:
        file_path: A list of paths to scan. A single path string is also
            accepted and treated as a one-element list.
        models: Mapping of display name to fitted classifier. A prediction
            of ``0`` counts as a benign vote, anything else as malicious.
        scan_file: When true, print a header and each model's individual
            vote per file; otherwise print only the file name and verdict.
    """
    paths = [file_path] if isinstance(file_path, str) else list(file_path)
    result_df = analyze(pd.DataFrame({"Name": paths}))
    features = result_df.drop("Name", axis=1)

    # Files that could not be parsed have no usable feature values; predicting
    # on them would fail, so report and skip them.
    if features.shape[1] == 0:
        parsed = pd.Series(False, index=features.index)
    else:
        parsed = features.notna().all(axis=1)
    for skipped in result_df.loc[~parsed, "Name"]:
        print(f"[!] Skipping file: {skipped} (no PE features could be extracted).")

    features = features[parsed]
    n_models = len(models)
    if features.empty or n_models == 0:
        return

    # Run each model once over all files instead of once per file.
    votes = {key: model.predict(features) for key, model in models.items()}

    for row, path in enumerate(result_df.loc[parsed, "Name"]):
        if scan_file:
            print(f"[***] File: {Style.BRIGHT}{path}{Style.NORMAL} [***]")
        else:
            print(f"[*] Scanning file: {path}.")

        total_benign, total_malicious = 0, 0
        for key, predicted in votes.items():
            if predicted[row] == 0:
                total_benign += 1
                if scan_file:
                    print(f"{Fore.GREEN}[+]{Fore.RESET} Model {Style.BRIGHT}{key}{Style.NORMAL} labeled {Fore.GREEN}benign{Fore.RESET}.")
            else:
                total_malicious += 1
                if scan_file:
                    print(f"{Fore.RED}[-]{Fore.RESET} Model {Style.BRIGHT}{key}{Style.NORMAL} labeled {Fore.RED}malware{Fore.RESET}.")

        # Share of models behind the verdict, as a real percentage of the
        # number of models actually supplied.
        if total_benign > total_malicious:
            share = round(100 * total_benign / n_models, 2)
            print(f"{Fore.YELLOW}[=] File {Fore.RESET}{Back.GREEN}{share}%{Back.RESET}{Fore.YELLOW} is benign{Fore.RESET}.")
        elif total_malicious > total_benign:
            share = round(100 * total_malicious / n_models, 2)
            print(f"{Fore.YELLOW}[=] File {Fore.RESET}{Back.RED}{share}%{Back.RESET}{Fore.YELLOW} is malicious{Fore.RESET}.")
        else:
            # An even split would otherwise produce no output at all.
            print(f"{Fore.YELLOW}[=] File verdict is inconclusive: {total_benign} benign vs {total_malicious} malicious votes{Fore.RESET}.")


In [None]:
# Scan the sample executable (the same path is passed twice) and print only the verdicts.
test(["../test/python-3.12.0-amd64.exe", "../test/python-3.12.0-amd64.exe"], models, scan_file=False)