In [1]:
import os
import re
import copy
import time
import lief
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import StratifiedKFold

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [2]:
# Load the engineered feature values and the matching column names.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# pickles produced by a trusted pipeline.
with open("/home/jovyan/feature_engineering/feature_engineering_features.pkl", mode='rb') as features_file:
    feature_engineering_features = pickle.load(features_file)

with open("models/keys.pkl", mode='rb') as keys_file:
    keys = pickle.load(keys_file)

In [3]:
# Build the binary training labels: 1 for hashes on the black list, 0 otherwise.
# Despite its name, `train_features` holds the labels (it is passed as `label=`
# and as the stratification target further down).
with open("/home/datacon/malware/XXX/black.txt", 'r') as f:
    black_list = f.read().strip().split()

with open("/home/datacon/malware/XXX/white.txt", 'r') as f:
    white_list = f.read().strip().split()

# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# pickles produced by a trusted pipeline.
with open("models/hash_list.pkl", 'rb') as f:
    hash_list = pickle.load(f)

# A set gives constant-time membership tests; scanning the list for every hash
# made label construction O(len(hash_list) * len(black_list)).
# Assumes the entries of hash_list are hashable (e.g. str) -- TODO confirm.
black_set = set(black_list)

# NOTE(review): white_list is loaded but never consulted -- every hash that is
# not black-listed (including one missing from both files) is labelled 0.
train_features = np.array(
    [1 if ha in black_set else 0 for ha in hash_list],
    dtype=np.int32,
)

In [4]:
# Display the shape of the label vector (one entry per hash in hash_list).
np.shape(train_features)

(11647,)

In [5]:
# Assemble the engineered features into a DataFrame whose columns are named by `keys`.
train_df = pd.DataFrame(feature_engineering_features, columns=keys)

# Features and labels come from different pickles; fail fast with a clear
# message if they do not describe the same number of samples, instead of
# erroring later inside the cross-validation split.
assert len(train_df) == len(train_features), (
    "feature rows ({}) and labels ({}) are misaligned".format(len(train_df), len(train_features))
)

In [13]:
# LightGBM hyper-parameters for the binary classifier (label 1 = black-listed hash).
params = {'num_leaves': 20,
          'min_data_in_leaf': 1,
          'objective': 'binary',  # objective function
          'max_depth': 4,
          'learning_rate': 0.01,
          "min_sum_hessian_in_leaf": 4,
          "boosting": "gbdt",
          "feature_fraction": 0.9,  # ratio of features sampled
          "bagging_freq": 1,
          "bagging_fraction": 0.9,
          "bagging_seed": 11,
          "nthread": 10,
          'metric': {'binary_logloss'},
          "random_state": 6666,
}

n_splits = 5

# Stratified, shuffled folds with a fixed seed so the split is reproducible.
kf = StratifiedKFold(n_splits=n_splits, random_state=2200, shuffle=True)

# Out-of-fold predictions: each sample is filled in by the model of the fold
# in which it was held out.
prob_oof = np.zeros((len(train_features), ))

# Per-fold importance tables are collected here and concatenated once after
# the loop, rather than growing a DataFrame with pd.concat on every iteration
# (which re-copies the accumulated rows and starts from an empty frame).
fold_importance_frames = []

lgb_models = []

for fold_idx, (train_index, test_index) in enumerate(kf.split(train_df, train_features)):
    print("fold {}".format(fold_idx+1))
    trn_data = lgb.Dataset(train_df.iloc[train_index], label=train_features[train_index])
    val_data = lgb.Dataset(train_df.iloc[test_index], label=train_features[test_index])

    # Up to 3000 boosting rounds, stopping once the validation metric has not
    # improved for 50 rounds; progress is logged every 500 rounds.
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keywords are
    # not accepted by LightGBM >= 4.0 -- switch to callbacks when upgrading.
    lgb_model = lgb.train(params,
                          trn_data,
                          3000,
                          valid_sets=[trn_data, val_data],
                          early_stopping_rounds=50,
                          verbose_eval=500)
    # Score only the held-out rows, using the best iteration found by early stopping.
    prob_oof[test_index] = lgb_model.predict(train_df.iloc[test_index], num_iteration=lgb_model.best_iteration)

    # Keep an independent copy of this fold's booster for later persistence.
    lgb_models.append(copy.deepcopy(lgb_model))

    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = keys
    fold_importance_df["importance"] = lgb_model.feature_importance()
    fold_importance_df["fold"] = fold_idx + 1
    fold_importance_frames.append(fold_importance_df)

# One row per (feature, fold); the per-fold row index is intentionally kept.
feature_importance_df = pd.concat(fold_importance_frames, axis=0)

fold 1
Training until validation scores don't improve for 50 rounds
[500]	training's binary_logloss: 0.0622826	valid_1's binary_logloss: 0.0692178
[1000]	training's binary_logloss: 0.0380324	valid_1's binary_logloss: 0.054276
[1500]	training's binary_logloss: 0.026614	valid_1's binary_logloss: 0.0492136
[2000]	training's binary_logloss: 0.0193443	valid_1's binary_logloss: 0.046569
[2500]	training's binary_logloss: 0.0148735	valid_1's binary_logloss: 0.045438
Early stopping, best iteration is:
[2740]	training's binary_logloss: 0.0132744	valid_1's binary_logloss: 0.0452006
Finished loading model, total used 2740 iterations
fold 2
Training until validation scores don't improve for 50 rounds
[500]	training's binary_logloss: 0.0592194	valid_1's binary_logloss: 0.0770158
[1000]	training's binary_logloss: 0.0367177	valid_1's binary_logloss: 0.0628496
[1500]	training's binary_logloss: 0.0256375	valid_1's binary_logloss: 0.0581123
[2000]	training's binary_logloss: 0.0187458	valid_1's binary_log

In [14]:
# Show the per-fold feature importances, most important first.
# (Chain .to_csv("importance.csv") onto the expression to export the table.)
feature_importance_df.sort_values("importance", ascending=False)

Unnamed: 0,Feature,importance,fold
33,entr_X,1166,1
33,entr_X,1089,2
34,size_R_weight,1032,1
10,size_X_weight,1016,2
10,size_X_weight,996,1
...,...,...,...
26,pe_mean,0,1
2,mz_mean,0,4
2,mz_mean,0,2
26,pe_mean,0,3


In [15]:
# Number of out-of-fold predictions that are not below the 0.5 threshold,
# i.e. samples the cross-validated models classify as positive.
int(np.count_nonzero(~(prob_oof < 0.5)))

3775

In [16]:
# with open("oof/feature_engineerin_train.pkl", "wb") as fp:
#     pickle.dump(prob_oof.reshape((len(train_features), 1)), fp)

In [17]:
# Persist the list of per-fold boosters collected during cross-validation.
with open("models/lgb_models.pkl", mode="wb") as models_file:
    pickle.dump(lgb_models, models_file)