In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.externals import joblib

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("fliter_train_data.csv")
safe_type = data["safe_type"]
features = data.iloc[:, 2:]

In [7]:
train_data, test_data, train_label, test_label = train_test_split(features, 
                                                                  safe_type, 
                                                                  test_size=0.2, 
                                                                  random_state=0)

In [9]:
def plot(test_label, y_pred, model):
    font = {"color": "darkred",
            "size": 13, 
            "family" : "serif"}

    accs = accuracy_score(test_label, y_pred)
    fpr, tpr, _ = metrics.roc_curve(test_label,  y_pred)
    auc = metrics.roc_auc_score(test_label, y_pred)
    plt.style.use("fivethirtyeight")
    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label="{}, auc=".format(model)+str(auc), color='green', linewidth=2)
    ax.set_title("ROC curve", fontdict=font)
    leg = ax.legend(loc="best")
    text = leg.get_texts()
    _ = plt.setp(text, color="blue") 

In [8]:
model = XGBClassifier()               
model.fit(train_data, train_label)            
y_pred = model.predict(test_data)

In [3]:
grid_n_estimator = [10, 50, 100, 300]
grid_ratio = [0.1, 0.25, 0.5, 0.75, 1.0]
grid_learn = [0.01, 0.03, 0.05, 0.1, 0.25]
grid_max_depth = [2, 4, 6, 8, 10, None]
grid_min_samples = [5, 10, 0.03, 0.05, 0.10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]

layer_1 = [
            #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
#             ('ada', AdaBoostClassifier()),
            ('bc', BaggingClassifier()),
#             ('etc', ExtraTreesClassifier()),
            ('gbc', GradientBoostingClassifier()),
#             ('rfc', RandomForestClassifier()),

            #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
#             ('gpc', GaussianProcessClassifier()),

            #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
            ('lr', LogisticRegression()),

            #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
            ('bnb', naive_bayes.BernoulliNB()),
#             ('gnb', naive_bayes.GaussianNB()),

            #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
            ('knn', neighbors.KNeighborsClassifier()),

            #SVM: http://scikit-learn.org/stable/modules/svm.html
#             ('svc', svm.SVC(probability=True)),

            #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
           ('xgb', XGBClassifier())

          ]

grid_param = [
#                 [{
#                 #AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
#                 'n_estimators': grid_n_estimator, #default=50
#                 'learning_rate': grid_learn, #default=1
#                 #'algorithm': ['SAMME', 'SAMME.R'], #default=’SAMME.R
#                 'random_state': grid_seed
#                 }],


                [{
                #BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
                'n_estimators': grid_n_estimator, #default=10
                'max_samples': grid_ratio, #default=1.0
                'random_state': grid_seed
                 }],


#                 [{
#                 #ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
#                 'n_estimators': grid_n_estimator, #default=10
#                 'criterion': grid_criterion, #default=”gini”
#                 'max_depth': grid_max_depth, #default=None
#                 'random_state': grid_seed
#                  }],


                [{
                #GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
                #'loss': ['deviance', 'exponential'], #default=’deviance’
                'learning_rate': [.05], #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
                'n_estimators': [300], #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
                #'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”
                'max_depth': grid_max_depth, #default=3   
                'random_state': grid_seed
                 }],


#                 [{
#                 #RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
#                 'n_estimators': grid_n_estimator, #default=10
#                 'criterion': grid_criterion, #default=”gini”
#                 'max_depth': grid_max_depth, #default=None
#                 'oob_score': [True], #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
#                 'random_state': grid_seed
#                  }],

#                 [{    
#                 #GaussianProcessClassifier
#                 'max_iter_predict': grid_n_estimator, #default: 100
#                 'random_state': grid_seed
#                 }],


                [{
                #LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
                'fit_intercept': grid_bool, #default: True
                #'penalty': ['l1','l2'],
                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default: lbfgs
                'random_state': grid_seed
                 }],


                [{
                #BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB
                'alpha': grid_ratio, #default: 1.0
                 }],


                #GaussianNB - 
                [{}],

                [{
                #KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
                'n_neighbors': [1,2,3,4,5,6,7], #default: 5
                'weights': ['uniform', 'distance'], #default = ‘uniform’
                'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
                }],


#                 [{
#                 #SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
#                 #http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
#                 #'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
#                 'C': [1,2,3,4,5], #default=1.0
#                 'gamma': grid_ratio, #edfault: auto
#                 'decision_function_shape': ['ovo', 'ovr'], #default:ovr
#                 'probability': [True],
#                 'random_state': grid_seed
#                  }],


                [{
                #XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
                'learning_rate': grid_learn, #default: .3
                'max_depth': [1,2,4,6,8,10], #default 2
                'n_estimators': grid_n_estimator, 
                'seed': grid_seed  
                 }]   
            ]



start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter
for clf, param in zip (layer_1, grid_param): #https://docs.python.org/3/library/functions.html#zip

    #print(clf[1]) #vote_est is a list of tuples, index 0 is the name and index 1 is the algorithm
    #print(param)
    
    
    start = time.perf_counter()        
    best_search = GridSearchCV(estimator = clf[1], param_grid = param, cv = 5, scoring = 'roc_auc')
    best_search.fit(features, safe_type)
    run = time.perf_counter() - start

    best_param = best_search.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf[1].__class__.__name__, best_param, run))
    clf[1].set_params(**best_param) 


run_total = time.perf_counter() - start_total
print('Total optimization time was {:.2f} minutes.'.format(run_total/60))
print('-'*10)

The best parameter for BaggingClassifier is {'max_samples': 0.5, 'n_estimators': 300, 'random_state': 0} with a runtime of 1259.84 seconds.
The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 300, 'random_state': 0} with a runtime of 2480.75 seconds.
The best parameter for LogisticRegression is {'fit_intercept': False, 'random_state': 0, 'solver': 'newton-cg'} with a runtime of 300.54 seconds.
The best parameter for BernoulliNB is {'alpha': 0.1} with a runtime of 1.86 seconds.
The best parameter for KNeighborsClassifier is {} with a runtime of 67.43 seconds.
The best parameter for XGBClassifier is {'algorithm': 'auto', 'n_neighbors': 1, 'weights': 'uniform'} with a runtime of 771.57 seconds.
Total optimization time was 81.37 minutes.
----------


In [4]:
layer_1 = [
            #Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
#             ('ada', AdaBoostClassifier()),
            ('bc', BaggingClassifier()),
#             ('etc', ExtraTreesClassifier()),
            ('gbc', GradientBoostingClassifier()),
#             ('rfc', RandomForestClassifier()),

            #Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
#             ('gpc', GaussianProcessClassifier()),

            #GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
            ('lr', LogisticRegression()),

            #Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
            ('bnb', naive_bayes.BernoulliNB()),
#             ('gnb', naive_bayes.GaussianNB()),

            #Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
            ('knn', neighbors.KNeighborsClassifier()),

            #SVM: http://scikit-learn.org/stable/modules/svm.html
            ('svc', svm.SVC(probability=True)),

            #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
           ('xgb', XGBClassifier())

          ]

In [5]:
vote_soft = VotingClassifier(estimators=layer_1 , voting = 'soft')
vote_soft_cv = cross_validate(vote_soft, features, safe_type, cv=5)
vote_soft.fit(features, safe_type)

print("Soft Voting Training w/bin score mean: {:.2f}".format(vote_soft_cv['train_score'].mean()*100)) 
print("Soft Voting Test w/bin score mean: {:.2f}".format(vote_soft_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}".format(vote_soft_cv['test_score'].std()*100*3))
print('-'*10)

Soft Voting Training w/bin score mean: 98.43
Soft Voting Test w/bin score mean: 97.14
Soft Voting Test w/bin score 3*std: +/- 0.68
----------


In [9]:
gv_vote_soft = VotingClassifier(estimators=layer_1 , voting = 'soft')
gv_vote_soft_cv = cross_validate(gv_vote_soft, features, safe_type, cv=5)
gv_vote_soft.fit(features, safe_type)

print("Soft Voting Training w/bin score mean: {:.2f}".format(gv_vote_soft_cv['train_score'].mean()*100)) 
print("Soft Voting Test w/bin score mean: {:.2f}".format(gv_vote_soft_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}".format(gv_vote_soft_cv['test_score'].std()*100*3))
print('-'*10)

Soft Voting Training w/bin score mean: 98.79
Soft Voting Test w/bin score mean: 97.85
Soft Voting Test w/bin score 3*std: +/- 0.46
----------


In [10]:
joblib.dump(gv_vote_soft, "gv_vote_soft.m")

['gv_vote_soft.m']

In [11]:
test = pd.read_csv("fliter_test_data.csv")
id_ = test["id"]
test_features = test.iloc[:, 1:]

In [12]:
predict = gv_vote_soft.predict(test_features)

In [13]:
result = pd.DataFrame()
result["id"] = id_
result["safe_type"] = predict
result.to_csv("result.csv", encoding="utf-8", index=False)

In [21]:
bc = joblib.load("./models/bc_gr_model.m")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import glob
def read_data(file_type):
    data = []
    for path in glob.glob("./stage1_dataset/train/{}/*".format(file_type)):
        with open(path, "r") as fp:
            data.append(fp.read())
    return data

vectorizer = TfidfVectorizer(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=3000)
white_data = read_data("white")
black_data = read_data("black")
data = white_data + black_data
white = [0 for _ in range(len(white_data))]
black = [1 for _ in range(len(black_data))]
safe_type = white + black
features = vectorizer.fit_transform(data)

In [45]:
def get_oof(model, x_train, y_train, x_test, n_splits):
    """
    :@param x_train: feature matrix.
    :type x: np.array(M X N) or list(M X N).
    :@param y_train: class label.
    :type y: int.
    :@param x_test: test set feature matrix.
    :type x_test: np.array(M X N) or list(M X N).
    :@param n_splits: K-fold parameter.
    :type n_splits: int.
    """
    n_train, n_test = x_train.shape[0], x_test.shape[0]
    kf = StratifiedKFold(n_splits=n_splits, random_state=0)
    oof_train = np.empty((n_train, ))
    oof_test = np.empty((n_test, ))
    oof_test_skf = np.empty((n_splits, n_test))
    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        kf_x_train = x_train[train_index]
        kf_y_train = y_train[train_index]
        kf_x_test = x_train[test_index]
        model.fit(kf_x_train, kf_y_train)
        oof_train[test_index] = model.predict(kf_x_test)
        oof_test_skf[i, :] = model.predict(x_test)
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [67]:
import numpy as np
from PIL import Image
import binascii

def getMatrixfrom_bin(filename, width):
    with open(filename, 'rb') as f:
        content = f.read()
    hexst = binascii.hexlify(content)  #将二进制文件转换为十六进制字符串
    fh = np.array([int(hexst[i: i+2], 16) for i in range(0, len(hexst), 2)])  #按字节分割
    rn = len(fh) // width
    fh = np.reshape(fh[:rn * width], (-1, width))  #根据设定的宽度生成矩阵
    fh = np.uint8(fh)
    return fh


In [68]:
filename = "./pandalearning.exe"
im = Image.fromarray(getMatrixfrom_bin(filename, 512)) #转换为图像
# im.save("your_img_filename.png")

In [69]:
im.show()

In [70]:
import pefile
PEfile_Path = "pandalearning.exe"
pe = pefile.PE(PEfile_Path)

In [79]:
with open("test.txt", "w") as fp:
    fp.write(str(pe))

In [80]:
import re
from collections import *
# 从.asm文件获取Opcode序列
def getOpcodeSequence(filename):
    opcode_seq = []
    p = re.compile(r'\s([a-fA-F0-9]{2}\s)+\s*([a-z]+)')
    with open(filename) as f:
        for line in f:
            if line.startswith(".text"):
                m = re.findall(p, line)
                if m:
                    opc = m[0][10]
                    if opc != "align":
                        opcode_seq.append(opc)
    return opcode_seq
# 根据Opcode序列，统计对应的n-gram
def getOpcodeNgram(ops ,n = 3):
    opngramlist = [tuple(ops[i:i+n]) for i in range(len(ops)-n)]
    opngram = Counter(opngramlist)
    return opngram
file = "test.txt"
ops = getOpcodeSequence(file)
opngram = getOpcodeNgram(ops)

In [82]:
data = str(pe)