In this exercise, we build on the previous exercises to prepare a labeled dataset of binary feature vectors, and use it to train a *Random Forest* binary classifier of malware/benign feature vectors. 

In [1]:
!pip install nltk 
!pip install pefile
!pip install scikit-learn==1.2.1

Collecting pefile
  Downloading pefile-2023.2.7-py3-none-any.whl.metadata (1.4 kB)
Downloading pefile-2023.2.7-py3-none-any.whl (71 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.8/71.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pefile
Successfully installed pefile-2023.2.7
Collecting scikit-learn==1.2.1
  Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.1.post1
    Uninstalling scikit-learn-1.4.1.post1:
      Successfully uninstalled scikit-learn-1.4.1.post1
Successfully installed scikit-learn-1.2.1


In [2]:
import os
directoriesWithLabels = [("Samples/Benign",0), ("Samples/Malware",1)]
listOfSamples = []
labels = []
for datasetPath, label in directoriesWithLabels:
    samples = [f for f in os.listdir(datasetPath)]
    for file in samples:
        filePath = os.path.join(datasetPath, file)
        listOfSamples.append(filePath)
        labels.append(label)

In [3]:
#Train-Test data split
from sklearn.model_selection import train_test_split
samples_train, samples_test, labels_train, labels_test = train_test_split(listOfSamples, labels, test_size=0.33, stratify=labels, random_state=42)

In [4]:
import collections
from nltk import ngrams
import numpy as np
import pefile

def readFile(filePath):
    with open(filePath, "rb") as binary_file:
        data = binary_file.read()
    return data

def byteSequenceToNgrams(byteSequence, n):
    Ngrams = ngrams(byteSequence, n)
    return list(Ngrams)
    
def extractNgramCounts(file, N):
    fileByteSequence = readFile(file)
    fileNgrams = byteSequenceToNgrams(fileByteSequence, N)
    return collections.Counter(fileNgrams)

def getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list):
    K1 = len(K1_most_common_Ngrams_list)
    fv = K1*[0]
    fileNgrams = extractNgramCounts(file, N)
    for i in range(K1):
        fv[i]=fileNgrams[K1_most_common_Ngrams_list[i]]
    return fv

def preprocessImports(listOfDLLs):
    processedListOfDLLs = []
    temp = [x.decode().split(".")[0].lower() for x in listOfDLLs]
    return " ".join(temp)

def getImports(pe):
    listOfImports = []
    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        listOfImports.append(entry.dll)
    return preprocessImports(listOfImports)

def getSectionNames(pe):
    listOfSectionNames = []
    for eachSection in pe.sections:
        refined_name = eachSection.Name.decode().replace('\x00','').lower()
        listOfSectionNames.append(refined_name)
    return " ".join(listOfSectionNames)

In [5]:
# Generate 2-Grams, 
# and produce feature vectors based on the frequency method
# This may take a few minutes to run
N=2
totalNgramCount = collections.Counter([])
for file in samples_train:
    totalNgramCount += extractNgramCounts(file, N)
K1 = 100
K1_most_common_Ngrams = totalNgramCount.most_common(K1)
K1_most_common_Ngrams_list = [x[0] for x in K1_most_common_Ngrams]

In [6]:
# Extract N-gram features based on the frequency method
# Also, extracts some metadata such as DLL imports, 
# and PE Sections. We will combine these with
# our N-gram features to enrich the sample representation.
# This will take a few minutes to run.
# Some samples will generate errors such as 'not a PE file',
# 'DOS header not found', and 'invalid attribute'. These are OK.
importsCorpus_train = []
numSections_train = []
sectionNames_train = []
NgramFeaturesList_train = []
y_train = []
for i in range(len(samples_train)):
    file = samples_train[i]
    try:
        NGramFeatures = getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list)
        pe = pefile.PE(file)
        imports = getImports(pe)
        nSections = len(pe.sections)
        secNames = getSectionNames(pe)
        importsCorpus_train.append(imports)
        numSections_train.append(nSections)
        sectionNames_train.append(secNames)
        NgramFeaturesList_train.append(NGramFeatures)
        y_train.append(labels_train[i])
    except Exception as e: 
        print(file+":")
        print(e)

Samples/Benign/LogCollector.exe:
'DOS Header magic not found.'
Samples/Benign/malias.exe:
'Invalid e_lfanew value, probably not a PE file'
Samples/Benign/ldifde.exe:
'DOS Header magic not found.'
Samples/Malware/VirusShare_7a30183b105b4200fc201925aba4886c.exe:
'utf-8' codec can't decode byte 0xb8 in position 0: invalid start byte
Samples/Benign/InstallUtil.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/RegAsm.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/lc.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/pmsort.exe:
'Invalid e_lfanew value, probably not a PE file'
Samples/Benign/SettingSyncHost.exe:
'DOS Header magic not found.'
Samples/Benign/oisicon.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/urlproxy.exe:
'Invalid NT Headers signature. Probably a NE file'
Samples/Benign/fsynonym.exe:
'Invalid e_lfanew value, probably not a PE file'
Samples/Benign/aspnetca.exe:
'DOS Header m

In the following lines, we define a pipeline of sequential transforms (HashingVectorizer and TfidfTransformer) to extract N-gram featurs and construct feature vectors from the DLL imports and Section names extracted for each sample. 

In [7]:
from sklearn.feature_extraction.text import HashingVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
imports_featurizer = Pipeline([('vect', HashingVectorizer(input='content', ngram_range=(1, 2))),('tfidf', TfidfTransformer(use_idf=True, )),])
section_names_featurizer = Pipeline([('vect', HashingVectorizer(input='content', ngram_range=(1, 2))),('tfidf', TfidfTransformer(use_idf=True, )),])
importsCorpus_train_transformed = imports_featurizer.fit_transform(importsCorpus_train)
sectionNames_train_transformed = section_names_featurizer.fit_transform(sectionNames_train)

In [8]:
# Combine the binary N-gram features with 
# the DLL imports and section names features to create
# vectorized training samples
from scipy.sparse import hstack, csr_matrix
X_train = hstack([NgramFeaturesList_train, importsCorpus_train_transformed,sectionNames_train_transformed, csr_matrix(numSections_train).transpose()])

In [9]:
#Train the Random Forest classifier
# This may take a few minutes.
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(n_estimators=1)
clf = clf.fit(X_train,y_train)

In [10]:
# Training accuracy
clf.score(X_train, y_train)

0.9958071278825996

In [11]:
# Generate feature vectors for the test samples
# This may take a few minutes
importsCorpus_test = []
numSections_test = []
sectionNames_test = []
NgramFeaturesList_test = []
y_test = []
for i in range(len(samples_test)):
    file = samples_test[i]
    try:
        NGramFeatures = getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list)
        pe = pefile.PE(file)
        imports = getImports(pe)
        nSections = len(pe.sections)
        secNames = getSectionNames(pe)
        importsCorpus_test.append(imports)
        numSections_test.append(nSections)
        sectionNames_test.append(secNames)
        NgramFeaturesList_test.append(NGramFeatures)
        y_test.append(labels_test[i])
    except Exception as e: 
        print(file+":")
        print(e)

Samples/Malware/VirusShare_1a89b7d4fb8ded72e1f8e81ee9352262.exe:
'utf-8' codec can't decode byte 0xb1 in position 0: invalid start byte
Samples/Benign/pmgrant.exe:
'Invalid e_lfanew value, probably not a PE file'
Samples/Benign/Common.DBConnection64.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Malware/VirusShare_14f3035781bb698c37ad287483af569e.exe:
'utf-8' codec can't decode byte 0x8d in position 0: invalid start byte
Samples/Benign/adaminstall.exe:
'DOS Header magic not found.'
Samples/Benign/evntwin.exe:
'DOS Header magic not found.'
Samples/Benign/sysprep.exe:
'DOS Header magic not found.'
Samples/Benign/FixSqlRegistryKey_ia64.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'
Samples/Benign/BootExpCfg.exe:
'DOS Header magic not found.'
Samples/Benign/FixSqlRegistryKey_x64.exe:
'PE' object has no attribute 'DIRECTORY_ENTRY_IMPORT'


In [12]:
importsCorpus_test_transformed = imports_featurizer.transform(importsCorpus_test)
sectionNames_test_transformed = section_names_featurizer.transform(sectionNames_test)
X_test = hstack([NgramFeaturesList_test, importsCorpus_test_transformed,sectionNames_test_transformed, csr_matrix(numSections_test).transpose()])

In [13]:
clf.score(X_test, y_test)

0.9871244635193133

In [14]:
import joblib
joblib.dump(clf, "model.joblib")
joblib.dump(imports_featurizer, 'imports_featurizer.pkl')
joblib.dump(section_names_featurizer, 'section_names_featurizer.pkl')

['section_names_featurizer.pkl']