## PE 헤더 특성화
악성(malware)/정상(benign) 샘플 분류기 구축에 사용할 특성을 PE 헤더에서 추출한다.

In [31]:
import pefile
from os import listdir, path
from os.path import isfile, join

# Single PE file parsed up front; `pe` is rebound later by the dataset loop.
desired_file = "../python-3.7.2-amd64.exe"
pe = pefile.PE(desired_file)
# Folders (relative to the working directory) holding the labelled samples.
# NOTE(review): the second entry previously read "Malicious PE samles", which
# looks like a typo for "Samples" — confirm against the folder name on disk.
directories = ["Benign PE Samples", "Malicious PE Samples"]

In [2]:
# Collect a file's section names, normalizing them so they are readable and
# comparable across samples.
def get_section_names(pe):
    """Return the normalized section names of a PE file.

    Args:
        pe: Object exposing ``sections``, an iterable whose items carry the
            raw section name as ``bytes`` in a ``Name`` attribute.

    Returns:
        A list with one entry per section, in section order: the name decoded
        to ``str``, with NUL characters removed and lower-cased.
    """
    # Section names are raw bytes and are not guaranteed to be valid UTF-8.
    # Replacing undecodable bytes (instead of raising UnicodeDecodeError)
    # keeps one odd name from invalidating the whole file.
    return [
        sec.Name.decode("utf-8", errors="replace").replace("\x00", "").lower()
        for sec in pe.sections
    ]

In [3]:
# Normalize raw imported-DLL names: drop the extension and lower-case them.
def preprocess_imports(list_of_DLLs):
    """Normalize the naming of the imports of a PE file.

    Args:
        list_of_DLLs: Iterable of DLL names as ``bytes``
            (e.g. ``b"KERNEL32.dll"``).

    Returns:
        A list of ``str``, one per input name: the text before the first
        ``"."``, lower-cased (``b"KERNEL32.dll"`` -> ``"kernel32"``).
    """
    # Names come straight from the binary and may contain bytes that are not
    # valid UTF-8; replace them rather than raising UnicodeDecodeError.
    return [
        dll.decode("utf-8", errors="replace").split(".")[0].lower()
        for dll in list_of_DLLs
    ]

# Collect the names of the DLLs a file imports, as parsed by pefile.
def get_imports(pe):
    """Get a list of the imports of a PE file.

    Args:
        pe: Parsed PE object. Its ``DIRECTORY_ENTRY_IMPORT`` attribute, when
            present, is iterated and each entry's ``dll`` attribute is read.

    Returns:
        The imported DLL names normalized by ``preprocess_imports``; an empty
        list when the file has no import directory.
    """
    # pefile only sets DIRECTORY_ENTRY_IMPORT when the file actually has an
    # import directory; treat its absence as "no imports" instead of letting
    # an AttributeError escape.
    import_entries = getattr(pe, "DIRECTORY_ENTRY_IMPORT", [])
    return preprocess_imports([entry.dll for entry in import_entries])

In [32]:
# Walk every sample in every dataset directory and build three parallel
# feature lists: imported DLL names, number of sections, and section names.
# Entry i of each list describes the same successfully parsed file.
imports_corpus = []
num_sections = []
section_names = []
for dataset_path in directories:
    samples = [f for f in listdir(dataset_path) if isfile(join(dataset_path, f))]
    for file in samples:
        file_path = join(dataset_path, file)
        pe = None
        try:
            pe = pefile.PE(file_path)
            imports = get_imports(pe)
            n_sections = len(pe.sections)
            sec_names = get_section_names(pe)
        except Exception as e:
            # Deliberately broad: samples may be malformed in arbitrary ways,
            # so any file whose header cannot be parsed is reported and skipped.
            print(e)
            print("Unable to obtain imports from " + file_path)
        else:
            # Append only after all three features were extracted so the
            # lists stay index-aligned.
            imports_corpus.append(imports)
            num_sections.append(n_sections)
            section_names.append(sec_names)
        finally:
            # Release the file mapping held by the parsed PE before moving on.
            if pe is not None:
                pe.close()

FileNotFoundError: [Errno 2] No such file or directory: '../../PE Samples Dataset/Benign PE samples'

In [9]:
# Sanity check: show the first five entries of each collected feature list.
for feature_list in (imports_corpus, num_sections, section_names):
    print(feature_list[:5])

[['mscoree'], ['mscoree'], ['mscoree'], ['wincorlib', 'api-ms-win-eventing-provider-l1-1-0', 'api-ms-win-core-libraryloader-l1-2-0', 'api-ms-win-core-localization-l1-2-0', 'api-ms-win-core-processthreads-l1-1-0', 'api-ms-win-core-heap-l1-1-0', 'api-ms-win-core-debug-l1-1-0', 'api-ms-win-core-errorhandling-l1-1-0', 'api-ms-win-core-handle-l1-1-0', 'api-ms-win-core-synch-l1-1-0', 'api-ms-win-core-synch-l1-2-0', 'api-ms-win-core-com-l1-1-0', 'ext-ms-win-shell32-shellfolders-l1-1-0', 'api-ms-win-core-string-l1-1-0', 'api-ms-win-core-registry-l1-1-0', 'api-ms-win-core-util-l1-1-0', 'api-ms-win-core-winrt-error-l1-1-0', 'api-ms-win-core-winrt-error-l1-1-1', 'api-ms-win-core-winrt-string-l1-1-0', 'msvcrt', 'ntdll', 'api-ms-win-core-profile-l1-1-0', 'api-ms-win-core-sysinfo-l1-1-0'], ['advapi32', 'kernel32', 'msvcrt', 'ntdll', 'ole32', 'oleaut32', 'wintrust', 'fltlib', 'shell32', 'version', 'activeds']]
[3, 3, 3, 6, 6]
[['.text', '.rsrc', '.reloc'], ['.text', '.rsrc', '.reloc'], ['.text', '.rs