In [1]:
import re
import glob
import time
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn import svm
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn.svm import LinearSVC
# from xgboost import XGBClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.externals import joblib

%config InlineBackend.figure_format = 'svg'
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

In [4]:
def yield_origin_csv(dataset_glob="./stage2_dataset/*"):
    """Parse raw XML reports into one row of text features per file.

    Parameters
    ----------
    dataset_glob : str, optional
        Glob pattern selecting the report files. Defaults to the original
        hard-coded location, ``"./stage2_dataset/*"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (file name without directory and final extension),
        ``api_name`` (space-joined ``api_name`` attributes of ``<action>``
        tags, in file order) and ``exinfos`` (space-joined base names of the
        ``C:``-rooted ``.dll`` paths found in ``<exInfo value="...">``).
    """
    import os  # local import so the function does not depend on the import cell

    api_name_regex = re.compile('<action api_name="(.*?)" call_name')
    exinfos_regex = re.compile('<exInfo value="(.*?)"')

    # glob order depends on the filesystem; sorting makes the row order
    # reproducible. Directories matching the pattern are skipped because
    # open() would fail on them.
    paths = sorted(p for p in glob.glob(dataset_glob) if os.path.isfile(p))

    id_, api_name_list, exinfos_list = [], [], []
    for flag, path in enumerate(paths, start=1):
        with open(path, "r") as fp:
            xml = fp.read()

        api_names = api_name_regex.findall(xml)
        exinfos = exinfos_regex.findall(xml)
        # Keep only DLLs on the C: drive and reduce each path to the bare
        # module name (last backslash-separated component, text before the first dot).
        dll_exinfos = [ef.split("\\")[-1].split('.')[0] for ef in exinfos
                       if ef.endswith(".dll") and ef.startswith("C:")]

        # os.path handles absolute paths, either path separator and extra dots
        # in the file name; path.split(".")[1] did not.
        id_.append(os.path.splitext(os.path.basename(path))[0])
        api_name_list.append(" ".join(api_names))
        exinfos_list.append(" ".join(dll_exinfos))

        # Lightweight progress indicator: print every 300th file index.
        if flag % 300 == 0:
            print(flag)

    return pd.DataFrame({"id": id_, "api_name": api_name_list, "exinfos": exinfos_list})

In [10]:
# Load the feature table from CSV; the commented-out call below rebuilds it
# from the raw XML files instead (presumably much slower — TODO confirm).
# NOTE(review): the preview output shows an "Unnamed: 0" column, which suggests
# the CSV was written with its index — confirm and consider index_col=0.
# data = yield_origin_csv()
data = pd.read_csv("stage2_api_name_exinfos.csv")

In [3]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,id,api_name,exinfos
0,3ec88410420dd913bf5676b2ba0ae4baa41dad0d55df9b...,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...,user32 gdi32 mfc42 msvcrt imm32 advapi32 rpcrt...
1,2dfd653c6b862500ff7c47615ad0725a8ce88ddb8ee083...,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Fak...,mpr advapi32 rpcrt4 secur32 user32 gdi32 imm32...
2,fb7ae8ad837ee4c2afc58bc321e6bfddb6564a6bce3743...,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Unp...,user32 gdi32 advapi32 rpcrt4 secur32 oleaut32 ...
3,c97a29518ee63fecae29dd973941b8395bd3aaceb11c52...,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...,user32 gdi32 advapi32 rpcrt4 secur32 iphlpapi ...
4,fb146a3d534cfc36b325bc1c4d7995122b722eb5ae04d5...,AnalyzeStart Fake_BeCreatedEx TryToAnalyze NtQ...,mfc42 msvcrt gdi32 user32 imm32 advapi32 rpcrt...


In [4]:
# A sample with no extracted API names / DLLs is stored as an empty string, and
# read_csv turns empty fields into NaN. Restore the empty string rather than
# forward-filling, which would copy the previous sample's text into this one
# (and would leave a NaN in the first row unfilled).
# Rebinding instead of inplace=True keeps the cell safe to re-run.
data = data.fillna({"api_name": "", "exinfos": ""})

In [5]:
# Confirm row count, dtypes and that no column still contains nulls.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 3 columns):
id          60000 non-null object
api_name    60000 non-null object
exinfos     60000 non-null object
dtypes: object(3)
memory usage: 1.4+ MB


In [13]:
# Both text columns are vectorised with the same TF-IDF settings: word n-grams
# of length 1-5, terms kept only if they occur in at least 3 documents and in
# at most 90% of them, vocabulary capped at 100000 features.
tfidf_params = dict(ngram_range=(1, 5), min_df=3, max_df=0.9, max_features=100000)

api_name_vectorizer = TfidfVectorizer(**tfidf_params)
api_name_docs = data["api_name"].tolist()
api_name_train_tfidf_features = api_name_vectorizer.fit_transform(api_name_docs)

exinfos_vectorizer = TfidfVectorizer(**tfidf_params)
exinfos_docs = data["exinfos"].tolist()
exinfos_train_tfidf_features = exinfos_vectorizer.fit_transform(exinfos_docs)

In [10]:
# Cache both TF-IDF matrices on disk so later sessions can skip vectorisation.
for pickle_path, tfidf_matrix in (
    ("api_name_train_tfidf_features.pkl", api_name_train_tfidf_features),
    ("exinfos_train_tfidf_features.pkl", exinfos_train_tfidf_features),
):
    with open(pickle_path, "wb") as fp:
        pickle.dump(tfidf_matrix, fp)

In [2]:
# Reload the cached TF-IDF matrices. Each variable reads the pickle written
# under its own name; the exinfos matrix was previously read from
# "exinfos_svded_features.pkl", i.e. the SVD output rather than the TF-IDF input.
# Only load pickles you produced yourself: unpickling can execute arbitrary code.
api_name_train_tfidf_features = pd.read_pickle("api_name_train_tfidf_features.pkl")
exinfos_train_tfidf_features = pd.read_pickle("exinfos_train_tfidf_features.pkl")

In [None]:
# Reduce each TF-IDF matrix with truncated SVD and cache the dense results.
# The component counts are named once and reused in the output file names so a
# file's label always matches its contents (the api_name result, built with
# 1000 components, used to be written to a file labelled "10000").
# NOTE(review): the merged matrix shown later has 2000 columns, which suggests
# 1000 components per view were used for the saved results — confirm whether
# 10000 for exinfos is intended.
api_name_svd_components = 1000
exinfos_svd_components = 10000

# Separate estimators keep both fitted decompositions available afterwards.
# TruncatedSVD accepts scipy sparse input directly, so no .tolil() conversion is needed.
api_name_svd = TruncatedSVD(n_components=api_name_svd_components, algorithm="arpack", random_state=0)
svded_train = api_name_svd.fit_transform(api_name_train_tfidf_features)

exinfos_svd = TruncatedSVD(n_components=exinfos_svd_components, algorithm="arpack", random_state=0)
exinfos_svded_train = exinfos_svd.fit_transform(exinfos_train_tfidf_features)

with open(f"api_name_svded_{api_name_svd_components}_features.pkl", "wb") as fp:
    pickle.dump(svded_train, fp)
with open(f"exinfos_svded_{exinfos_svd_components}_features.pkl", "wb") as fp:
    pickle.dump(exinfos_svded_train, fp)

In [3]:
# Load the SVD-reduced feature matrices from disk.
# NOTE(review): these file names differ from the ones the SVD cell writes —
# confirm which run produced them.
api_name_svded_train = pd.read_pickle("api_name_svded_features.pkl")
exinfos_svded_train = pd.read_pickle("exinfos_svded_features.pkl")

In [4]:
# Concatenate the two reduced matrices side by side: one combined feature row per sample.
merge_data = np.hstack([api_name_svded_train, exinfos_svded_train])

In [5]:
# Check the dimensions of the combined matrix.
merge_data.shape

(60000, 2000)

In [7]:
# K-means with 50 clusters; random_state is fixed so repeated runs start from the same seed.
kmeans = KMeans(n_clusters=50, random_state=0)

In [8]:
# Fit on the merged features and keep the cluster index assigned to each sample.
y_pred = kmeans.fit_predict(merge_data)

In [11]:
# Pair every sample id with its predicted cluster ("family_id").
result = pd.DataFrame({"id": data["id"], "family_id": y_pred})

In [14]:
# Write the id / family_id table as UTF-8 CSV without the DataFrame index.
result.to_csv("result.csv", encoding="utf-8", index=False)

In [25]:
# Reload the cluster labels from the saved result file; this replaces any in-memory y_pred.
y_pred = pd.read_csv("result.csv")["family_id"]

In [44]:
# NOTE(review): `exinfos` is not referenced by any later cell visible in this
# notebook — possibly a leftover from exploration.
exinfos = pd.read_pickle("exinfos_svded_features.pkl")

In [6]:
# Project the merged features to two dimensions with t-SNE for plotting.
# NOTE(review): this import sits mid-notebook while the others are in the first cell.
from sklearn.manifold import TSNE

# random_state is fixed so the embedding is repeatable.
X_tsne = TSNE(n_components=2, random_state=33).fit_transform(merge_data)

In [None]:
# Cache the t-SNE embedding on disk.
# NOTE(review): "stne" in the file name looks like a typo for "tsne", and the
# load cell reads "call_name_tsne_data.pkl", not this file — confirm.
with open("api_name_exinfos_stne_data.pkl", "wb") as fp:
    pickle.dump(X_tsne, fp)

In [2]:
# Load a previously saved t-SNE embedding.
# NOTE(review): this reads "call_name_tsne_data.pkl", which is not the file the
# dump cell writes — confirm it is the embedding the following plots should show.
X_tsne =  pd.read_pickle("call_name_tsne_data.pkl")

In [9]:
# Text style used for the plot title.
font = {"color": "darkred",
        "size": 13, 
        "family" : "serif"}

# Scatter the 2-D embedding without labels (all points drawn in one colour).
plt.style.use("dark_background")
plt.figure()
plt.scatter(X_tsne[:, 0], X_tsne[:, 1])
plt.title("origin_data_t-SNE", fontdict=font)

In [10]:
# Load cluster labels from a saved result file.
# NOTE(review): the file name suggests a k=100 run that scored 34.78 — confirm.
y_pred = pd.read_csv("34.78_k=100.csv")["family_id"]

In [19]:
# Text style shared by the title and the colorbar label.
font = {"color": "darkred",
        "size": 13, 
        "family" : "serif"}

# Colour each point of the 2-D embedding by its cluster label and save the figure.
plt.style.use("dark_background")
plt.figure()
# plt.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
# Matplotlib 3.7 and removed in 3.9; it returns the same 100-colour "rainbow" map.
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_pred.values, alpha=0.6, 
            cmap=plt.get_cmap('rainbow', 100))
plt.title("api_name_and_exinfos_t-SNE", fontdict=font)
cbar = plt.colorbar() 
cbar.set_label(label='family id', fontdict=font)
# Fix the colour scale limits so the colorbar range does not depend on the data.
plt.clim(-5, 100)
plt.tight_layout()
plt.savefig("api_name_and_exinfos_TSNE.pdf")

In [21]:
# Load three SVD-reduced feature matrices (call_name, api_name, exinfos) ...
call_name_svded_features = pd.read_pickle("call_name_svded_features.pkl")
api_name_svded_features = pd.read_pickle("api_name_svded_features.pkl")
exinfos_svded_features = pd.read_pickle("exinfos_svded_features.pkl")
# ... and stack them side by side; this replaces the earlier two-view merge_data.
merge_data = np.hstack([api_name_svded_features, exinfos_svded_features, call_name_svded_features])

In [26]:
# Re-cluster the merged features, this time with 100 clusters and the same fixed seed.
kmeans = KMeans(n_clusters=100, random_state=0)
y_pred = kmeans.fit_predict(merge_data)

In [27]:
# Number of clusters, used only to label the output file.
cluster = 100

# Sample ids come from a header-less one-column CSV; pair them with the labels.
sample_ids = pd.read_csv("id.csv", names=["id"])["id"]
result = pd.DataFrame({"id": sample_ids, "family_id": y_pred})

output_path = f"k-means_cluster={cluster}_result.csv"
result.to_csv(output_path, encoding="utf-8", index=False)

In [48]:
# Scratch arithmetic. NOTE(review): what these numbers represent is not evident
# from the notebook — add context or remove this cell.
10.53 - 7.89

2.6399999999999997

In [49]:
# Scratch arithmetic. NOTE(review): what these numbers represent is not evident
# from the notebook — add context or remove this cell.
2.63 * 3

7.89