In [4]:
import os
import re
import copy
import time
import lief
import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.model_selection import StratifiedKFold

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

In [5]:
# Engineered feature rows saved by the feature-engineering step.
with open("../feature_engineering/feature_engineering_features.pkl", 'rb') as features_file:
    feature_engineering_features = pickle.load(features_file)

# Names that are later used as the DataFrame column labels for those rows.
with open("../models/feature_engineering_keys.pkl", 'rb') as keys_file:
    keys = pickle.load(keys_file)

In [8]:
# Ordered sample hashes; the label array below follows this order.
# NOTE: pickle.load can execute arbitrary code, so only load trusted artifacts.
with open("../models/hash_list.pkl", "rb") as f:
    hash_list = pickle.load(f)

# Collection of hashes that are treated as the positive class.
with open("../models/black_list.pkl", "rb") as f:
    black_list = pickle.load(f)

# One binary label per entry of hash_list: 1 if the hash is in black_list, else 0.
# NOTE: despite its name, `train_features` holds the labels; the name is kept
# because later cells refer to it.
train_features = np.array(
    [1 if ha in black_list else 0 for ha in hash_list],
    dtype=np.int32,
)

In [11]:
train_features.shape # because the test sample set contains only 10 + 10 samples

(20,)

In [4]:
# NOTE(review): this cell carries an out-of-order execution count and a different
# output than the shape check above; presumably it is left over from a run on the
# full dataset - confirm, or drop the cell.
train_features.shape

(11647,)

In [12]:
# Tabular view of the engineered features, one column per entry of `keys`.
train_df = pd.DataFrame(data=feature_engineering_features, columns=keys)

In [20]:
# LightGBM hyper-parameters for the binary classifier.
params = {'num_leaves': 20,
          'min_data_in_leaf': 1,
          'objective': 'binary',  # objective function
          'max_depth': 4,
          'learning_rate': 0.01,
          "min_sum_hessian_in_leaf": 4,
          "boosting": "gbdt",
          "feature_fraction": 0.9,  # fraction of the features that is sampled
          "bagging_freq": 1,
          "bagging_fraction": 0.9,
          "bagging_seed": 11,
          "nthread": 10,
          'metric': {'binary_logloss'},
          "random_state": 6666,
}

n_splits = 5

# Stratified, shuffled folds with a fixed seed so the split is reproducible.
kf = StratifiedKFold(n_splits=n_splits, random_state=2200, shuffle=True)

# Out-of-fold probabilities: each position is written by the model of the fold
# in which that sample was held out.
prob_oof = np.zeros((len(train_features), ))

# Per-fold importance tables; concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
fold_importance_frames = []

lgb_models = []

for fold_idx, (train_index, test_index) in enumerate(kf.split(train_df, train_features)):
    print("fold {}".format(fold_idx+1))
    trn_data = lgb.Dataset(train_df.iloc[train_index], label=train_features[train_index])
    val_data = lgb.Dataset(train_df.iloc[test_index], label=train_features[test_index])

    # Up to 3000 boosting rounds, stopping once the validation metric has not
    # improved for 50 rounds; evaluation is logged every 500 rounds.
    # NOTE(review): `early_stopping_rounds` / `verbose_eval` were removed from
    # lgb.train in LightGBM 4.0 (replaced by callbacks); this cell targets the
    # older API that the recorded output was produced with.
    lgb_model = lgb.train(params,
                          trn_data,
                          3000,
                          valid_sets=[trn_data, val_data],
                          early_stopping_rounds=50,
                          verbose_eval=500)

    # Score the held-out fold with the best iteration found by early stopping.
    # `boost_from_average` is a training-time objective parameter and has no
    # effect at prediction time, so it is no longer passed here; if it is
    # wanted, it belongs in `params`.
    prob_oof[test_index] = lgb_model.predict(train_df.iloc[test_index],
                                             num_iteration=lgb_model.best_iteration)

    # NOTE(review): every fold already produces a fresh Booster, so the deepcopy
    # may be unnecessary; it is kept so the saved models are unchanged - confirm
    # before removing.
    lgb_models.append(copy.deepcopy(lgb_model))

    fold_importance_df = pd.DataFrame({
        "Feature": keys,
        "importance": lgb_model.feature_importance(),
        "fold": fold_idx + 1,
    })
    fold_importance_frames.append(fold_importance_df)

# Long-format table: one row per (feature, fold).
feature_importance_df = pd.concat(fold_importance_frames, axis=0)

fold 1
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 8, number of negative: 8
[LightGBM] [Info] Total Bins 250
[LightGBM] [Info] Number of data: 16, number of used features: 54
Training until validation scores don't improve for 50 rounds
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Early stopping, best iteration is:
[1]	training's binary_logloss: 0.693147	valid_1's binary_logloss: 0.693147
Finished loading model, total used 1 iterations
fold 2
This may cause significantly different results comparing to the previous versions of LightGBM.
Try to set boost_from_average=false, if your old models produce bad results
[LightGBM] [Info] Number of positive: 8, number of negative: 8
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data: 16, number of used features: 54
[LightGBM] [Info] [bin

In [21]:
# Show the per-fold importances, largest first (chain .to_csv("importance.csv") to export).
feature_importance_df.sort_values("importance", ascending=False)

Unnamed: 0,Feature,importance,fold
0,entry,0,1
16,btc_count,0,4
22,paths_count,0,4
21,xmr_mean,0,4
20,xmr_count,0,4
...,...,...,...
43,yargen_count,0,2
44,av_count,0,2
45,dbg_count,0,2
46,pool_name_count,0,2


In [22]:
# Number of out-of-fold probabilities that are not below the 0.5 threshold.
int(np.count_nonzero(~(prob_oof < 0.5)))

20

In [23]:
# Save the out-of-fold probabilities as a column vector with one row per label.
# NOTE(review): the file name reads "feature_engineerin" (no trailing "g"); it is a
# runtime path and is kept as-is - confirm what downstream readers expect.
with open("../oof/feature_engineerin_train.pkl", "wb") as oof_file:
    oof_column = prob_oof.reshape((len(train_features), 1))
    pickle.dump(oof_column, oof_file)

In [24]:
# Persist the list of per-fold boosters for later use.
with open("../models/lgb_models.pkl", "wb") as models_file:
    pickle.dump(lgb_models, models_file)