In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [4]:
# Load the two pipe-delimited CSV files and stack them into a single frame.
data_original = pd.read_csv('../data.csv', sep='|')
dasmalwerk = pd.read_csv('../dasmalwerk_data.csv', sep='|')
frames = [data_original, dasmalwerk]
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source keeps its own default row labels (both starting at 0), so labels
# repeat and any label-based lookup on `result` becomes ambiguous.
result = pd.concat(frames, ignore_index=True)

In [6]:
# Separate the feature matrix from the label column. 'Name' and 'md5' are
# dropped together with the 'legitimate' label, so they are not used as features.
features = result.drop(['Name', 'md5', 'legitimate'], axis=1)
feature_names = features.columns
X = features.values
y = result['legitimate'].values

# Feature reduction: fit a tree ensemble, then keep only the columns that
# SelectFromModel retains based on the fitted feature importances.
# random_state is fixed so the selected feature set is the same on every run.
fsel = ExtraTreesClassifier(random_state=42).fit(X, y)
model = SelectFromModel(fsel, prefit=True)

X_new = model.transform(X)
nb_features = X_new.shape[1]
# Positions of the nb_features most important columns, most important first.
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    # Look names up in the frame X was built from; indexing result.columns with
    # a fixed offset only works for one particular column order.
    print("%d. feature %s (%f)" % (f + 1, feature_names[indices[f]], fsel.feature_importances_[indices[f]]))

1. feature DllCharacteristics (0.191842)
2. feature Machine (0.144488)
3. feature Characteristics (0.106757)
4. feature Subsystem (0.055404)
5. feature ResourcesMinEntropy (0.054552)
6. feature SectionsMinEntropy (0.050408)
7. feature VersionInformationSize (0.048475)
8. feature ResourcesMaxEntropy (0.043858)
9. feature ImageBase (0.036713)
10. feature MajorSubsystemVersion (0.030514)
11. feature SizeOfOptionalHeader (0.028957)
12. feature SectionsMaxEntropy (0.026164)


In [7]:
# Candidate classifiers to compare, keyed by the display name used in the
# report. The tree-based and ensemble estimators are randomised, so each gets a
# fixed random_state to make scores (and the reported winner) repeatable across
# runs. GaussianNB takes no seed.
algorithms = {
        "DecisionTree": DecisionTreeClassifier(max_depth=10, random_state=42),
        "RandomForest": RandomForestClassifier(n_estimators=50, random_state=42),
        "GradientBoosting": GradientBoostingClassifier(n_estimators=50, random_state=42),
        "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=42),
        "GNB": GaussianNB()
    }

In [8]:
# Fit every candidate on the same 75/25 split and record its test accuracy.
results = {}
# random_state pins the split so scores are comparable between runs;
# stratify=y keeps the class ratio identical in the train and test partitions.
# NOTE(review): X_new was produced by a selector fitted on the full data set,
# so the held-out rows already influenced which features were kept. Scores here
# may be optimistic; fitting the selector on X_train only would avoid that.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.25, random_state=42, stratify=y)
print("\nNow testing algorithms")
for algo, clf in algorithms.items():
    clf.fit(X_train, y_train)
    # score() is the mean accuracy on the held-out rows (a fraction in [0, 1]).
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score*100))
    results[algo] = score

# The winner is the name with the highest held-out accuracy.
winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))


Now testing algorithms
GNB : 76.213201 %
DecisionTree : 98.893169 %
RandomForest : 99.310028 %
AdaBoost : 98.470561 %
GradientBoosting : 98.804048 %

Winner algorithm is RandomForest with a 99.310028 % success
