# detector.py
import json
from datetime import datetime
# numpy library
import numpy as np
# Matplotlib library
from matplotlib import pyplot as plt
# scikit-learn library
from sklearn import metrics
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import normalize
from sklearn.svm import LinearSVC
# Module-level DictVectorizer shared across this file; it is fitted inside
# vectorize() via fit_transform().
vec = DictVectorizer()
def load_features(shas=False):
    """Load the raw features and labels from their JSON files.

    Reads ``data/apg-X.json`` (features) and ``data/apg-y.json`` (labels),
    both resolved relative to the current working directory.

    Args:
        shas: Accepted for backward compatibility; not used by this function.

    Returns:
        A ``(X, y)`` tuple with the decoded JSON content of the two files.

    Raises:
        FileNotFoundError: If either file does not exist.
        json.JSONDecodeError: If either file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; pin the encoding so decoding does not
    # depend on the platform's locale default.
    with open('data/apg-X.json', 'rt', encoding='utf-8') as f:
        X = json.load(f)
    with open('data/apg-y.json', 'rt', encoding='utf-8') as f:
        y = json.load(f)
    return X, y
def vectorize(X, y):
    """Vectorize feature mappings with the module-level ``vec``.

    Fits ``vec`` on ``X`` and transforms it in one step, so every call
    re-fits the shared vectorizer.

    Args:
        X: Feature mappings accepted by ``DictVectorizer.fit_transform``.
        y: Labels; converted with ``np.asarray``.

    Returns:
        A ``(X, y, features_names)`` tuple: the transformed feature matrix,
        the labels as a NumPy array, and the list of feature names learned
        by the vectorizer.
    """
    X = vec.fit_transform(X)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2. Prefer its replacement and fall back only on older releases.
    if hasattr(vec, 'get_feature_names_out'):
        # The replacement returns an ndarray; keep the historical list type.
        features_names = vec.get_feature_names_out().tolist()
    else:
        features_names = vec.get_feature_names()
    y = np.asarray(y)
    return X, y, features_names
def get_split_dataset(X, y, test_size=0.33, random_state=None):
    """Split the data into stratified train and test partitions.

    Args:
        X: Feature matrix.
        y: Labels; also used as the stratification target.
        test_size: Passed to ``train_test_split``; defaults to the original
            hard-coded value of 0.33.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the original unseeded behavior; pass an int for a
            reproducible split.

    Returns:
        The sequence returned by ``train_test_split``, unpacked by callers
        as ``x_train, x_test, y_train, y_test``.
    """
    split_dataset = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )
    return split_dataset
def TPR_FPR(y_true, y_pred):
    """Compute and print the true-positive and false-positive rates.

    Label 1 is treated as the positive class and 0 as the negative class,
    matching the ``labels=[1, 0]`` / Malware-Goodware convention used by
    ``report`` in this file.

    Args:
        y_true: Ground-truth binary labels (0 or 1).
        y_pred: Predicted binary labels (0 or 1).

    Returns:
        A ``(tpr, fpr)`` tuple of floats. A rate is ``nan`` when its
        denominator is zero, i.e. ``y_true`` has no positive (for TPR) or no
        negative (for FPR) samples.
    """
    # Pin the label order so the matrix is always 2x2. Without it, input
    # containing a single class yields a 1x1 matrix and the unpack fails.
    tn, fp, fn, tp = metrics.confusion_matrix(
        y_true, y_pred, labels=[0, 1]
    ).ravel()
    positives = tp + fn
    negatives = fp + tn
    # Guard the divisions so an absent class gives nan without a NumPy
    # divide-by-zero warning.
    tpr = float(tp / positives) if positives else float('nan')
    fpr = float(fp / negatives) if negatives else float('nan')
    print(f'TPR = {tpr}, FPR = {fpr}')
    return tpr, fpr
def report(y_true, y_pred):
    """Print scikit-learn's classification report for the predictions.

    Label 1 is shown as 'Malware' and label 0 as 'Goodware', in that order.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
    """
    summary = metrics.classification_report(
        y_true,
        y_pred,
        labels=[1, 0],
        target_names=['Malware', 'Goodware'],
    )
    print('---- default threshold classification report ---- \n', summary)
def train(num_features=10000):
    """Train a linear SVM on the loaded dataset and print its test metrics.

    Steps: load and vectorize the data, split it into train and test parts,
    select features with ``SelectFromModel`` driven by a ``LinearSVC``, fit a
    second ``LinearSVC`` on the selected features, then print TPR/FPR and the
    classification report for the test part.

    Args:
        num_features: Upper bound on selected features, passed to
            ``SelectFromModel`` as ``max_features``.

    Returns:
        A ``(clf, 0)`` tuple: the fitted classifier and the constant ``0``.
    """
    raw_X, raw_y = load_features()
    X, y, _feature_names = vectorize(raw_X, raw_y)

    # Hold out a test partition.
    x_train, x_test, y_train, y_test = get_split_dataset(X, y)
    print(x_train.shape)
    print(x_test.shape)

    # Fit the selector on the training part only, then apply it to both parts.
    selector = SelectFromModel(
        estimator=LinearSVC(C=0.01, dual=True, max_iter=10000),
        prefit=False,
        max_features=num_features,
    )
    X_train_new = selector.fit_transform(x_train, y_train)
    X_test_new = selector.transform(x_test)
    print(f'training data = {X_train_new.shape}, testing data = {X_test_new.shape}')

    # Disabled hyper-parameter search kept for reference:
    #   parameters = {'C': [0.1, 1]}
    #   GridSearchCV(LinearSVC(dual=True, max_iter=10000), parameters,
    #                cv=5, scoring='f1', n_jobs=-1)

    clf = LinearSVC(C=0.1, dual=True, max_iter=10000)
    clf.fit(X_train_new, y_train)
    y_pred = clf.predict(X_test_new)

    TPR_FPR(y_test, y_pred)
    report(y_test, y_pred)
    return clf, 0