In [1]:
import glob
import os
import pickle
import re
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.preprocessing import MinMaxScaler
from sklearn.externals import joblib

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [16]:
def yield_origin_csv(file_type, base_dir="./stage1_dataset/train"):
    """Collect the raw API-call fields of every file in one dataset folder.

    Each file under ``<base_dir>/<file_type>/`` is read as text and scanned
    with three regular expressions; all matches of each expression are joined
    with single spaces so that every file becomes one row.

    Parameters
    ----------
    file_type : str
        Name of the sub-folder to scan (the notebook uses "black", "white"
        and "test").
    base_dir : str, optional
        Folder that contains the ``file_type`` sub-folders. Defaults to the
        location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (file name without directory and final extension),
        ``api_name``, ``call_pid`` and ``ret_value`` (space-joined matches).
        Rows are ordered by sorted file path.
    """
    api_name_regex = re.compile('<action api_name="(.*?)" call_name')
    call_pid_regex = re.compile('call_pid="(.*?)" call_time=')
    ret_value_regex = re.compile('ret_value="(.*?)"')

    # glob returns paths in filesystem order; sorting keeps the row order
    # reproducible between runs and machines.
    paths = sorted(glob.glob(os.path.join(base_dir, file_type, "*")))

    records = []
    for processed, path in enumerate(paths, start=1):
        with open(path, "r") as fp:
            xml = fp.read()

        records.append({
            # basename + splitext is separator-agnostic and keeps dots that
            # are part of the file name itself (only the last extension goes).
            "id": os.path.splitext(os.path.basename(path))[0],
            "api_name": " ".join(api_name_regex.findall(xml)),
            "call_pid": " ".join(call_pid_regex.findall(xml)),
            "ret_value": " ".join(ret_value_regex.findall(xml)),
        })

        # Lightweight progress indicator: one line per 300 files.
        if processed % 300 == 0:
            print(processed)

    # Explicit column list keeps the schema stable even when no file matched.
    return pd.DataFrame(records, columns=["id", "api_name", "call_pid", "ret_value"])

In [24]:
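# NOTE: disabled parsing step. Later cells use `white`, `black` and
# `origin_test`, so these lines must be uncommented and run on a fresh kernel.
# black = yield_origin_csv("black")
# white = yield_origin_csv("white")
# origin_test = yield_origin_csv("test")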

In [21]:
# Stack the "white" rows first and the "black" rows second; `white` and `black`
# are the frames returned by yield_origin_csv("white") / yield_origin_csv("black").
origin_data = pd.concat([white, black])
# Label white rows 0 and black rows 1. The counts are taken from the frames
# themselves rather than hard-coded, so the labels always line up with the rows.
origin_data["safe_type"] = [0] * len(white) + [1] * len(black)

In [27]:
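# NOTE: disabled export step. The n-gram section reads origin_data.csv and
# origin_test.csv, so these lines must be run once to create those files.
# origin_data.to_csv("origin_data.csv", encoding="utf-8", index=False)
# origin_test.to_csv("origin_test.csv", encoding="utf-8", index=False)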

In [None]:
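'''
Per-file statistical features (names match the columns built in make_features):

api_distinct_cnt: number of distinct APIs called by the file;
api_cnt: total number of API calls made by the file;
api_cnt_mean: average number of calls per distinct API in the file;
call_pid_distinct_cnt: number of distinct calling processes (call_pid values) in the file;
call_pid_cnt_max, call_pid_cnt_min, call_pid_cnt_mean: max / min / mean number of API calls made per call_pid in the file;

ret_value_equals0_cnt: number of calls in the file whose return value is 0;
ret_value_equals0_rate: fraction of calls in the file whose return value is 0;
ret_value_distinct_cnt: number of distinct return values in the file;

'''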

In [28]:
def get_value(x, kind="mean"):
    """Summarise how often each whitespace-separated token occurs in ``x``.

    Parameters
    ----------
    x : str
        Space-separated tokens (e.g. the joined API names of one file).
    kind : {"mean", "max", "min"}
        Statistic over the per-token occurrence counts:
        "mean" is total tokens / distinct tokens, "max" / "min" are the counts
        of the most / least frequent token.

    Returns
    -------
    float or int
        The requested statistic; 0 when ``x`` contains no tokens.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported names (previously this case
        silently returned None).
    """
    if kind not in ("mean", "max", "min"):
        raise ValueError("kind must be 'mean', 'max' or 'min', got {!r}".format(kind))

    occurrences = Counter(x.split())
    # A file with no recorded calls yields an empty string; report 0 instead of
    # failing with ZeroDivisionError / IndexError.
    if not occurrences:
        return 0.0 if kind == "mean" else 0

    counts = occurrences.values()
    if kind == "mean":
        return sum(counts) / len(occurrences)
    if kind == "max":
        return max(counts)
    return min(counts)
    

In [31]:
def make_features(data):
    """Replace the raw token columns of ``data`` with per-file count features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with string columns ``api_name``, ``call_pid`` and ``ret_value``,
        each holding whitespace-separated tokens.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ten feature columns added and
        the three raw columns removed.

    Notes
    -----
    ``data`` is modified in place. Because the raw columns are dropped, calling
    this function a second time on the same frame raises ``KeyError``.
    The per-token mean/max/min statistics are delegated to ``get_value``.
    """
    # Tokenise each raw column once instead of re-splitting for every feature.
    api_tokens = data["api_name"].apply(lambda x: x.split())
    pid_tokens = data["call_pid"].apply(lambda x: x.split())
    ret_tokens = data["ret_value"].apply(lambda x: x.split())

    data["api_cnt"] = api_tokens.apply(len)
    data["api_distinct_cnt"] = api_tokens.apply(lambda tokens: len(set(tokens)))
    data["api_cnt_mean"] = data["api_name"].apply(lambda x: get_value(x))

    data["call_pid_distinct_cnt"] = pid_tokens.apply(lambda tokens: len(set(tokens)))
    data["call_pid_cnt_mean"] = data["call_pid"].apply(lambda x: get_value(x))
    data["call_pid_cnt_max"] = data["call_pid"].apply(lambda x: get_value(x, kind="max"))
    data["call_pid_cnt_min"] = data["call_pid"].apply(lambda x: get_value(x, kind="min"))

    data["ret_value_equals0_cnt"] = ret_tokens.apply(lambda tokens: tokens.count('0'))
    # Guard the ratio: a file without any return value would otherwise divide by zero.
    data["ret_value_equals0_rate"] = ret_tokens.apply(
        lambda tokens: tokens.count('0') / len(tokens) if tokens else 0.0
    )
    data["ret_value_distinct_cnt"] = ret_tokens.apply(lambda tokens: len(set(tokens)))

    data.drop(["call_pid", "api_name", "ret_value"], axis=1, inplace=True)
    return data

In [32]:
# Turn the raw token columns into count features, labelled set first.
# make_features returns the frame it was given, so each result is the same
# object as its input.
train_features = make_features(data=origin_data)
test_features = make_features(data=origin_test)

In [41]:
# Persist both feature tables as UTF-8 CSV files without the index column.
train_features.to_csv(path_or_buf="train_features.csv", index=False, encoding="utf-8")
test_features.to_csv(path_or_buf="test_features.csv", index=False, encoding="utf-8")

## <center>Get n-gram features</center>

In [3]:
# Reload the raw (un-featurised) token tables from the CSV files on disk.
origin_train_data = pd.read_csv(filepath_or_buffer="origin_data.csv")
origin_test_data = pd.read_csv(filepath_or_buffer="origin_test.csv")

In [45]:
# Select the API-name token strings. A file with no recorded calls was written
# as an empty CSV field, which read_csv loads as NaN; turn those back into empty
# strings so every document handed to the vectorizer is a str.
train_data_api_name = origin_train_data["api_name"].fillna("")
test_data_api_name = origin_test_data["api_name"].fillna("")

In [51]:
# TF-IDF over 1- to 5-grams of the API-name sequences. Terms seen in fewer than
# 3 documents or in more than 90% of documents are discarded. The vocabulary is
# learned on the training documents only and then reused for the test set.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 5),
    min_df=3,
    max_df=0.9,
)
train_tfidf_features = vectorizer.fit_transform(list(train_data_api_name))
test_tfidf_features = vectorizer.transform(list(test_data_api_name))

In [82]:
# Serialise the current TF-IDF matrices to disk. The names
# train_tfidf_features / test_tfidf_features are rebound by the ret_value
# vectorizer cell, so run this right after the api_name vectorizer cell.
with open("train_tfidf_features.pkl", mode="wb") as fp:
    pickle.Pickler(fp).dump(train_tfidf_features)
with open("test_tfidf_features.pkl", mode="wb") as fp:
    pickle.Pickler(fp).dump(test_tfidf_features)

In [4]:
# Select the return-value token strings. Empty CSV fields (files without any
# return value) are loaded by read_csv as NaN; restore them to empty strings so
# every document handed to the vectorizer is a str.
train_data_ret_value = origin_train_data["ret_value"].fillna("")
test_data_ret_value = origin_test_data["ret_value"].fillna("")

In [9]:
# TF-IDF over 1- to 5-grams of the return-value sequences.
# The default token_pattern only keeps tokens of two or more word characters,
# which silently drops single-character return values such as "0" and strips
# the sign from values like "-1". Treat every whitespace-separated token as a
# term instead. NOTE: this changes the feature matrix, so downstream models
# must be retrained on the regenerated pickles.
# The names vectorizer / train_tfidf_features / test_tfidf_features are reused
# from the api_name cell and are overwritten here.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 5),
    min_df=3,
    max_df=0.9,
    token_pattern=r"\S+",
)
train_tfidf_features = vectorizer.fit_transform(train_data_ret_value.tolist())
test_tfidf_features = vectorizer.transform(test_data_ret_value.tolist())

In [15]:
# Serialise the current TF-IDF matrices as the ret_value feature files.
# train_tfidf_features / test_tfidf_features are also assigned by the api_name
# vectorizer cell, so this must run right after the ret_value vectorizer cell.
with open("train_ret_value_tfidf_features.pkl", mode="wb") as fp:
    pickle.Pickler(fp).dump(train_tfidf_features)

with open("test_ret_value_tfidf_features.pkl", mode="wb") as fp:
    pickle.Pickler(fp).dump(test_tfidf_features)