In [1]:
import pandas as pd
import numpy as np
import pickle
import joblib
import sklearn.ensemble as ske
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

data = pd.read_csv('data.csv', sep='|')
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = data['legitimate'].values

In [2]:
print('Researching important feature based on %i total features\n' % X.shape[1])

fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X_new, y ,test_size=0.2)

features = []

Researching important feature based on 54 total features



In [3]:
print('%i features identified as important:' % nb_features)

indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))

for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2+f])

12 features identified as important:
1. feature DllCharacteristics (0.150006)
2. feature Machine (0.096744)
3. feature Characteristics (0.078339)
4. feature VersionInformationSize (0.064946)
5. feature SectionsMaxEntropy (0.062634)
6. feature Subsystem (0.059955)
7. feature MajorSubsystemVersion (0.058530)
8. feature ImageBase (0.056111)
9. feature SizeOfOptionalHeader (0.050628)
10. feature ResourcesMinEntropy (0.040687)
11. feature ResourcesMaxEntropy (0.040418)
12. feature MajorOperatingSystemVersion (0.026422)


In [4]:
algorithms = {
        "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
        "RandomForest": ske.RandomForestClassifier(n_estimators=50),
        "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
        "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
        "GNB": GaussianNB()
    }

results = {}
print("\nNow testing algorithms")
for algo in algorithms:
    clf = algorithms[algo]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score*100))
    results[algo] = score


Now testing algorithms
DecisionTree : 99.127128 %
RandomForest : 99.391525 %
GradientBoosting : 98.779428 %
AdaBoost : 98.525896 %
GNB : 70.365809 %


In [5]:
winner = max(results, key=results.get)
print('\n Algorithm with highest accuracy on train/test is %s with a %f %% success' % (winner, results[winner]*100))


 Algorithm with highest accuracy on train/test is RandomForest with a 99.391525 % success


In [6]:
print('Saving algorithm and feature list in classifier directory...')
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
open('classifier/features.pkl', 'wb').write(pickle.dumps(features))
print('Saved')

Saving algorithm and feature list in classifier directory...
Saved


In [7]:
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))

False positive rate : 0.458102 %
False negative rate : 0.965534 %
