In [None]:
# Silence every warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# the libraries imported below, so API removals surface only as hard errors.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Execute the helper notebooks before anything else in this notebook.
# NOTE(review): presumably these define the extractor classes referenced by
# the functions below (ByteHistogram, OpcodeInfo, StringVector, ...) — confirm.
%run utils.ipynb
%run features.ipynb

In [3]:
import os

import joblib
import numpy as np
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors
from gensim.models.word2vec import PathLineSentences
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [4]:
def feature_ember(obj, sample_path, inter_path):
    """Build one feature vector per sample with `obj` and save them as a matrix.

    Args:
        obj: Extractor exposing `dim` (vector length), `name` (used in the
            output file name) and `feature_vector(data)`.
        sample_path: '/'-separated directory of samples whose last two
            components are the data split and the file kind ('pe' or anything
            else, which is treated as 'asm').
        inter_path: Working directory holding `<split>_filename.txt` and the
            `feature/` output folder.

    Side effects:
        Writes `{inter_path}/feature/{split}_{obj.name}.npy`.
    """
    # The split and the file kind are encoded in the trailing path components.
    path_parts = sample_path.split('/')
    data_type, file_type = path_parts[-2], path_parts[-1]

    with open(f"{inter_path}/{data_type}_filename.txt", 'r') as name_file:
        filename = name_file.read().split()

    arr = np.zeros((len(filename), obj.dim))
    is_pe = file_type == 'pe'

    with tqdm(total=len(filename), ncols=80, desc=f"{data_type}_{obj.name}") as progress:
        for row, sample in enumerate(filename):
            if is_pe:
                # PE samples are handed to the extractor as raw bytes.
                with open(f"{sample_path}/{sample}", "rb") as handle:
                    payload = handle.read()
            else:
                # ASM samples carry an '.asm' suffix and are decoded to text,
                # silently dropping undecodable bytes.
                with open(f"{sample_path}/{sample}.asm", "rb") as handle:
                    payload = handle.read().decode('utf-8', errors='ignore')
            arr[row, :] = obj.feature_vector(payload)
            progress.update(1)

    np.save(f"{inter_path}/feature/{data_type}_{obj.name}.npy", arr)

In [5]:
def feature_tfidf_df(obj, sample_path, inter_path):
    """Collect the per-sample word features and store them as a CSV for TF-IDF.

    Args:
        obj: Extractor exposing `name_tfidf` (used in the output file name)
            and `tfidf_features(file_obj)`, called with an open text file.
        sample_path: '/'-separated sample directory whose last two components
            are the data split and the file kind.
        inter_path: Working directory holding `<split>_filename.txt` and the
            `feature/` output folder.

    Side effects:
        Writes `{inter_path}/feature/{split}_{obj.name_tfidf}_tfidf.csv` with
        the columns `filename` and `word_feature`.
    """
    path_parts = sample_path.split('/')
    data_type, file_type = path_parts[-2], path_parts[-1]

    with open(f"{inter_path}/{data_type}_filename.txt", 'r') as name_file:
        filename = name_file.read().split()

    # ASM samples are stored with an '.asm' suffix; the suffixed name is also
    # what ends up in the 'filename' column of the CSV.
    if file_type == 'asm':
        filename = [stem + '.asm' for stem in filename]

    all_word_feature = []
    with tqdm(total=len(filename), ncols=80, desc=f"{obj.name_tfidf}_{data_type}") as progress:
        for sample in filename:
            with open(f"{sample_path}/{sample}", "r", encoding='utf-8', errors='ignore') as handle:
                extracted = obj.tfidf_features(handle)
            all_word_feature.append(extracted)
            progress.update(1)

    word_feature = pd.DataFrame({'filename': filename, "word_feature": all_word_feature})
    csv_path = f"{inter_path}/feature/{data_type}_{obj.name_tfidf}_tfidf.csv"
    word_feature.to_csv(csv_path, index=False)

In [6]:
def model_tfidf(obj, inter_path, tfidf_params):
    """Fit a TF-IDF vectorizer on the train and test word features and save it.

    Args:
        obj: Extractor whose `name_tfidf` selects the input CSVs and is part
            of the saved model's file name.
        inter_path: Working directory containing `feature/` (inputs) and
            `models/` (output).
        tfidf_params: Keyword arguments for `TfidfVectorizer`. Must contain
            'max_features', which is also embedded in the model file name.

    Side effects:
        Writes `{inter_path}/models/TFIDF_model_{name_tfidf}_{max_features}.pth`.

    Raises:
        KeyError: If `tfidf_params` has no 'max_features' entry.
    """
    train_words_ = pd.read_csv(f"{inter_path}/feature/train_{obj.name_tfidf}_tfidf.csv")
    test_words_ = pd.read_csv(f"{inter_path}/feature/test_{obj.name_tfidf}_tfidf.csv")
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # stacked frame (original indices kept).
    all_words_ = pd.concat([train_words_, test_words_])

    vectorizer = TfidfVectorizer(**tfidf_params)
    # Samples without any extracted words are read back as NaN; replace them
    # with a blank document so the vectorizer receives strings only.
    vectorizer.fit(all_words_.word_feature.fillna(' ').tolist())

    model_path = f"{inter_path}/models/TFIDF_model_{obj.name_tfidf}_{tfidf_params['max_features']}.pth"
    # Use a context manager so the file handle is flushed and closed.
    with open(model_path, "wb") as model_file:
        joblib.dump(vectorizer, model_file)

In [7]:
def feature_tfidf_np(data_type, name_tfidf, inter_path, max_features):
    """Transform one split's word features with a saved TF-IDF model.

    Args:
        data_type: Data split whose CSV is transformed (prefix of the input
            and output file names).
        name_tfidf: Feature family name used in the CSV and model file names.
        inter_path: Working directory containing `feature/` and `models/`.
        max_features: Vocabulary size that identifies which saved model to
            load; also part of the output file name.

    Side effects:
        Writes the dense TF-IDF matrix to
        `{inter_path}/feature/{data_type}_{name_tfidf}_{max_features}.npy`.
    """
    model_path = f"{inter_path}/models/TFIDF_model_{name_tfidf}_{max_features}.pth"
    # NOTE(review): joblib.load unpickles the file; only load models produced
    # by this pipeline.
    # Use a context manager so the file handle is closed after loading.
    with open(model_path, "rb") as model_file:
        vectorizer = joblib.load(model_file)
    vectorizer.max_features = max_features

    words_ = pd.read_csv(f"{inter_path}/feature/{data_type}_{name_tfidf}_tfidf.csv")
    # Empty word lists are read back as NaN; feed them as blank documents.
    words = vectorizer.transform(words_.word_feature.fillna(' ').tolist())
    np.save(f"{inter_path}/feature/{data_type}_{name_tfidf}_{max_features}.npy", words.toarray())

In [8]:
def feature_asm2txt(sample_path, inter_path):
    """Dump the opcode lines of every sample to text files for asm2vec.

    The split given by `sample_path` is processed first, followed by the
    'test' split (unless `sample_path` already is the test split), so that
    `{inter_path}/semantic/` ends up holding one `.txt` file per sample.

    Args:
        sample_path: '/'-separated directory of `.asm` samples whose
            second-to-last component is the data split.
        inter_path: Working directory holding `<split>_filename.txt` and the
            `semantic/` output folder.
    """
    dirs = sample_path.split('/')
    data_type = dirs[-2]
    # Swap only the split component. A plain str.replace('train', 'test')
    # would also rewrite any parent directory that happens to contain 'train'.
    test_path = '/'.join(dirs[:-2] + ['test', dirs[-1]])

    def asm2txt_by_datatype(spath, dtype):
        # Convert every sample listed for `dtype` that lives under `spath`.
        with open(f"{inter_path}/{dtype}_filename.txt", 'r') as fp:
            filenames = fp.read().split()

        with tqdm(total=len(filenames), ncols=80, desc=f"{dtype}_asm2txt") as pbar:
            for filename in filenames:
                with open(os.path.join(spath, filename) + '.asm', "r", encoding='utf-8', errors='ignore') as fp:
                    opline_list = OpcodeInfo().asm_to_txt(fp)
                with open(os.path.join(f"{inter_path}/semantic/", filename) + '.txt', 'w', encoding='utf-8') as f:
                    f.writelines(line + '\n' for line in opline_list)
                pbar.update(1)

    asm2txt_by_datatype(sample_path, data_type)
    # When the caller already passed the test split, a second pass would only
    # rewrite the same files.
    if data_type != 'test':
        asm2txt_by_datatype(test_path, 'test')

In [9]:
def feature_asm2vec(data_type, inter_path):
    """Build the asm2vec ("semantic") feature matrix for one data split.

    For the 'train' split a Word2Vec model is first trained on every text
    file under `{inter_path}/semantic/` and saved; for any split the saved
    vectors are then loaded and used to embed each sample's set of words.

    Args:
        data_type: Data split to featurize; 'train' additionally triggers
            model training.
        inter_path: Working directory containing `semantic/`, `models/`,
            `feature/` and `<split>_filename.txt`.

    Side effects:
        Writes `{inter_path}/models/asm2vec.bin` (train only) and
        `{inter_path}/feature/{data_type}_semantic.npy`.
    """
    model_file = f"{inter_path}/models/asm2vec.bin"

    if data_type == "train":
        # The corpus directory holds the opcode text of all converted samples.
        corpus = PathLineSentences(f"{inter_path}/semantic/")
        w2v = Word2Vec(sentences=corpus, vector_size=1024, window=5, min_count=5, workers=5)
        w2v.wv.save_word2vec_format(model_file, binary=True, sort_attr='count')

    # Always go through the saved file, so train and test use the same vectors.
    model_wv = KeyedVectors.load_word2vec_format(model_file, binary=True)

    with open(f"{inter_path}/{data_type}_filename.txt", 'r') as name_file:
        filename = name_file.read().split()

    obj = StringVector()
    arr = np.zeros((len(filename), obj.dim))
    with tqdm(total=len(filename), ncols=80, desc=obj.name) as progress:
        for row, stem in enumerate(filename):
            with open(f"{inter_path}/semantic/{stem}.txt", "rb") as handle:
                text = handle.read().decode('utf-8', errors='ignore')
            # Splitting on any whitespace covers newlines too; duplicates are
            # dropped so each distinct token is passed once.
            raw_words = list(set(text.split()))
            arr[row, :] = obj.feature_vector((model_wv, raw_words))
            progress.update(1)

    # Replace NaN entries produced by the extractor with zeros.
    arr[np.isnan(arr)] = 0
    np.save(f"{inter_path}/feature/{data_type}_semantic.npy", arr)

In [10]:
def feature_fusion(data_type, fused_label, features, inter_path):
    """Horizontally stack saved feature matrices into one float32 matrix.

    Args:
        data_type: Data split prefix of the input and output file names.
        fused_label: Name of the fused feature, used in the output file name.
        features: Names of the feature files to stack, in column order.
        inter_path: Working directory containing the `feature/` folder.

    Side effects:
        Writes `{inter_path}/feature/{data_type}_{fused_label}.npy`.
    """
    parts = []
    for feature_name in features:
        parts.append(np.load(f"{inter_path}/feature/{data_type}_{feature_name}.npy"))

    fused = np.hstack(parts).astype(np.float32)
    np.save(f"{inter_path}/feature/{data_type}_{fused_label}.npy", fused)

In [1]:
def feature_engineering(data_type, data_path, inter_path):
    """Run the whole feature pipeline for one data split.

    For the 'train' split this also prepares everything that must see both
    splits: the TF-IDF word tables for train and test, the fitted TF-IDF
    models, and the opcode text files used by asm2vec. Any other split relies
    on those artifacts already existing under `inter_path`.

    Args:
        data_type: Data split to process; 'train' triggers model fitting.
        data_path: Root directory containing `<split>/pe` and `<split>/asm`.
        inter_path: Working directory for intermediate features and models.
    """
    pe_path = f"{data_path}/{data_type}/pe"
    asm_path = f"{data_path}/{data_type}/asm"

    # Generate byte features
    pe_objs = [ByteHistogram(), ByteEntropyHistogram(), StringExtractor()]
    for obj in pe_objs:
        feature_ember(obj, pe_path, inter_path)

    # Generate format features
    asm_objs = [SectionInfo(), ImportsInfo(), ExportsInfo()]
    for obj in asm_objs:
        feature_ember(obj, asm_path, inter_path)

    if data_type == 'train':
        # Build the test-split paths from data_path directly. Replacing the
        # substring 'train' in the full path would also corrupt any parent
        # directory whose name contains 'train'.
        test_pe_path = f"{data_path}/test/pe"
        test_asm_path = f"{data_path}/test/asm"

        # Generate statistical features
        feature_tfidf_df(StringExtractor(), pe_path, inter_path)
        feature_tfidf_df(StringExtractor(), test_pe_path, inter_path)

        feature_tfidf_df(OpcodeInfo(), asm_path, inter_path)
        feature_tfidf_df(OpcodeInfo(), test_asm_path, inter_path)

        # Generate TF-IDF models
        words_tf_params1 = {'max_features': 1000}
        model_tfidf(StringExtractor(), inter_path, words_tf_params1)
        words_tf_params2 = {'max_features': 300}
        model_tfidf(StringExtractor(), inter_path, words_tf_params2)
        ins_tf_params = {'ngram_range': (1, 3), 'max_features': 1000}
        model_tfidf(OpcodeInfo(), inter_path, ins_tf_params)

        feature_asm2txt(asm_path, inter_path)

    # Generate TF-IDF feature
    # NOTE(review): 'words' and 'ins' presumably equal the `name_tfidf` of
    # StringExtractor and OpcodeInfo respectively — confirm.
    feature_tfidf_np(data_type, 'words', inter_path, max_features=300)
    feature_tfidf_np(data_type, 'words', inter_path, max_features=1000)
    feature_tfidf_np(data_type, 'ins', inter_path, max_features=1000)

    # Generate asm2vec feature
    feature_asm2vec(data_type, inter_path)

    # Feature fusion: 'ember' must be written first because the two later
    # fusions load it as an input.
    feature_fusion(data_type, 'ember', ['histogram', 'byteentropy', 'strings'], inter_path)
    feature_fusion(data_type, 'ember_section_ins_words', ['ember', 'section', 'ins_1000', 'words_300'], inter_path)
    feature_fusion(data_type, 'ember_section_ins_semantic', ['ember', 'section', 'ins_1000', 'semantic'], inter_path)