In [81]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util


def extract_feats(ffs, direc="train", global_feat_dict=None):
    """
    arguments:
      ffs are a list of feature-functions.
      direc is a directory containing xml files (expected to be train or test).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that 
      the columns of the test matrix align correctly.

    returns: 
      a sparse design matrix, a dict mapping features to column-numbers,
      a vector of target classes, and a list of system-call-history ids in order 
      of their rows in the design matrix.
      
      Note: the vector of target classes returned will contain the true indices of the
      target classes on the training data, but will contain only -1's on the test
      data
    """
    fds = [] # list of feature dicts
    classes = []
    ids = [] 
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)
        rowfd = {}
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # accumulate features
        [rowfd.update(ff(tree)) for ff in ffs]
        fds.append(rowfd)
        
    X,feat_dict = make_design_mat(fds,global_feat_dict)
    return X, feat_dict, np.array(classes), ids


In [99]:
#for debugging purposes
tree = ET.parse(os.path.join('train','00bee48acc9d1774e4edf96f9582fac06b2ec1f14.None.xml'))

In [82]:
def make_design_mat(fds, global_feat_dict=None):
    """
    arguments:
      fds is a list of feature dicts (one for each row).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that 
      the columns of the test matrix align correctly.
       
    returns: 
        a sparse NxD design matrix, where N == len(fds) and D is the number of
        the union of features defined in any of the fds 
    """
    if global_feat_dict is None:
        all_feats = set()
        [all_feats.update(fd.keys()) for fd in fds]
        feat_dict = dict([(feat, i) for i, feat in enumerate(sorted(all_feats))])
    else:
        feat_dict = global_feat_dict
        
    cols = []
    rows = []
    data = []        
    for i in xrange(len(fds)):
        temp_cols = []
        temp_data = []
        for feat,val in fds[i].iteritems():
            try:
                # update temp_cols iff update temp_data
                temp_cols.append(feat_dict[feat])
                temp_data.append(val)
            except KeyError as ex:
                if global_feat_dict is not None:
                    pass  # new feature in test data; nbd
                else:
                    raise ex

        # all fd's features in the same row
        k = len(temp_cols)
        cols.extend(temp_cols)
        data.extend(temp_data)
        rows.extend([i]*k)

    assert len(cols) == len(rows) and len(rows) == len(data)
   

    X = sparse.csr_matrix((np.array(data),
                   (np.array(rows), np.array(cols))),
                   shape=(len(fds), len(feat_dict)))
    return X, feat_dict
    

In [83]:
## Here are two example feature-functions. They each take an xml.etree.ElementTree object, 
# (i.e., the result of parsing an xml file) and returns a dictionary mapping 
# feature-names to numeric values.
## TODO: modify these functions, and/or add new ones.
def first_last_system_call_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'first_call-x' to 1 if x was the first system call
      made, and 'last_call-y' to 1 if y was the last system call made. 
      (in other words, it returns a dictionary indicating what the first and 
      last system calls made by an executable were.)
    """
    c = Counter()
    in_all_section = False
    first = True # is this the first system call
    last_call = None # keep track of last call we've seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            if first:
                c["first_call-"+el.tag] = 1
                first = False
            last_call = el.tag  # update last call seen
            
    # finally, mark last call seen
    c["last_call-"+last_call] = 1
    return c

def system_call_count_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
      as well as:
         - number of each kind of tag
         - proportion of each kind of tag
         - average number of threads per process
         - average number of systems calls per section
         - average number of system calls per process
         - total number of tags
    """
    #initiate counters
    c = Counter()
    c_all = Counter()
    n_el = 0
    in_all_section = False
    for el in tree.iter():
        #keep track of all the kind of tags, and the total number of tags
        c_all["num_"+str(el.tag)] += 1
        n_el += 1        
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c['num_system_calls'] += 1
    #calculate proportions for each tag, and merge everything in a dictionary      
    for key, val in c_all.items():
        c[key] = val
        c["ratio-"+key] = float(val)/n_el
    c["n_el"] = n_el
    if c['num_processes'] != 0:
        c["ratio-threads-processes"] = float(c['num_threads'])/c['num_processes']        
    if c['num_sections'] != 0:
        c["ratio-system_calls-sections"] = float(c['num_system_calls'])/c['num_sections']
    if c['num_processes'] != 0:
        c["ratio-system_calls-processes"] = float(c['num_system_calls'])/c['num_processes']
    return c


In [235]:
#def main():
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# extract features from smaller dataset

train_dir = "train_small"
test_dir = "test_small"
outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument

# TODO put the names of the feature functions you've defined above in this list
ffs = [first_last_system_call_feats, system_call_count_feats]


print "extracting training features..."
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
print "done extracting training features"
print


print "extracting test features..."
X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
print "done extracting test features"
print

#MLP parameters
solver = 'adam'
max_iter = 100


alpha = 1e-6
tol = 1e-12
rate_init = 0.01
learning_rate = 'adaptive'
early_stopping = True

accuracy = []

#Vary number of Nodes 
N_nodes = np.linspace(1, 201, 41)
N_layers = 3

for nodes in N_nodes:

    layers = np.multiply(nodes, np.ones(N_layers)) 
    scaler = StandardScaler(with_mean = False)
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    mlp = MLPClassifier(hidden_layer_sizes=layers, max_iter=max_iter, alpha=alpha,
                        solver=solver, verbose=10, tol=tol, random_state=1,
                        learning_rate_init=rate_init, learning_rate = learning_rate,
                       early_stopping = early_stopping)
    mlp.fit(X_train_scaled, t_train)
    print "done learning"
    print

    X_test_scaled = scaler.transform(X_test)
    preds = mlp.predict(X_test_scaled)
    #print "done making predictions"
    #print

    #print "writing predictions..."
    #util.write_predictions(preds, test_ids, outputfile)
    #print "done!"

    #test predictions
    num_correct = np.sum(t_ignore == preds);
    accuracy = np.append(accuracy,num_correct/float(t_ignore.size))
    #print('Accuracy  = '+str(accuracy))
plt.clf()
plt.plot(N_nodes, accuracy, linestyle='--', marker='o', color='b')
#plt.title('N_nodes = '+ str(N_nodes)+'_N_layers = ' + str(N_layers))
plt.xlabel('Number of nodes')
plt.ylabel('Accuracy')
#plt.xlim((n_vals[0]-1, n_vals[-1]+1))
#plt.ylim((min(RMSE), max(RMSE)))
#plt.show()
plt.savefig('Plots/'+_N_nodes_'+ str(N_nodes[[1]])+'-'+str(N_nodes[[-1]])+'_N_layers' + str(N_layers)+'.pdf')

Iteration 1, loss = 2.48107582
Validation score: 0.533981
Iteration 2, loss = 2.32736243
Validation score: 0.533981
Iteration 3, loss = 2.19125051
Validation score: 0.533981
Iteration 4, loss = 2.07697919
Validation score: 0.533981
Validation score did not improve more than tol=0.000000 for two consecutive epochs. Stopping.
done learning

Iteration 1, loss = 2.24406234
Validation score: 0.533981
Iteration 2, loss = 1.74400484
Validation score: 0.533981
Iteration 3, loss = 1.40400943
Validation score: 0.527508
Iteration 4, loss = 1.14270241
Validation score: 0.624595
Iteration 5, loss = 0.91486502
Validation score: 0.792880
Iteration 6, loss = 0.77249607
Validation score: 0.796117
Iteration 7, loss = 0.67192259
Validation score: 0.799353
Iteration 8, loss = 0.62762974
Validation score: 0.812298
Iteration 9, loss = 0.59437660
Validation score: 0.844660
Iteration 10, loss = 0.56524034
Validation score: 0.844660
Iteration 11, loss = 0.54533759
Validation score: 0.851133
Iteration 12, loss 

<matplotlib.text.Text at 0x14f969b0>

In [139]:
##
#Vary number of layers

N_nodes = 50
N_layers = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 18, 22, 26, 30])
accuracy = []

for nlayers in N_layers:

    layers = np.multiply(N_nodes, np.ones(nlayers)) 
    scaler = StandardScaler(with_mean = False)
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    mlp = MLPClassifier(hidden_layer_sizes=layers, max_iter=max_iter, alpha=alpha,
                        solver=solver, verbose=10, tol=tol, random_state=1,
                        learning_rate_init=rate_init)
    mlp.fit(X_train_scaled, t_train)

    X_test_scaled = scaler.transform(X_test)
    preds = mlp.predict(X_test_scaled)

    #test predictions
    num_correct = np.sum(t_ignore == preds);
    accuracy = np.append(accuracy,num_correct/float(t_ignore.size))

plt.clf()
plt.plot(N_layers, accuracy, linestyle='--', marker='o', color='b')
#plt.title('N_nodes = '+ str(N_nodes)+'_N_layers = ' + str(N_layers))
plt.xlabel('Number of layers')
plt.ylabel('Accuracy')
plt.savefig('Plots/'+_N_nodes_'+ str(N_nodes)+'_N_layers' + str(N_layers)+'.pdf')

Iteration 1, loss = 2.18325777
Iteration 2, loss = 0.91712465
Iteration 3, loss = 0.61017820
Iteration 4, loss = 0.44364811
Iteration 5, loss = 0.35111230
Iteration 6, loss = 0.29204699
Iteration 7, loss = 0.25798270
Iteration 8, loss = 0.22419806
Iteration 9, loss = 0.20543412
Iteration 10, loss = 0.19113647
Iteration 11, loss = 0.17874617
Iteration 12, loss = 0.15886617
Iteration 13, loss = 0.14713972
Iteration 14, loss = 0.13732509
Iteration 15, loss = 0.12546136
Iteration 16, loss = 0.11614377
Iteration 17, loss = 0.10977312
Iteration 18, loss = 0.10478274
Iteration 19, loss = 0.10186774
Iteration 20, loss = 0.09619029
Iteration 21, loss = 0.09096884
Iteration 22, loss = 0.08841356
Iteration 23, loss = 0.08755624
Iteration 24, loss = 0.08324524
Iteration 25, loss = 0.07908845
Iteration 26, loss = 0.08118494
Iteration 27, loss = 0.07662288
Iteration 28, loss = 0.07310190
Iteration 29, loss = 0.07082420
Iteration 30, loss = 0.07041504
Iteration 31, loss = 0.07278512
Iteration 32, los

In [None]:
#Try decition tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train,t_train)
preds = clf.predict(X_test)
accuracy = np.sum(t_ignore == preds)/float(t_ignore.size)
print(accuracy)



In [292]:
#Try random forest
X_subset_train, X_subset_test, t_subset_train, t_subset_test = train_test_split(X_train, t_train, test_size=0.2, random_state=1)
scaler = StandardScaler(with_mean = False)
scaler.fit(X_subset_train)
X_train_scaled = scaler.transform(X_subset_train)
clf = RandomForestClassifier(n_jobs=-1, n_estimators = 10000)# class_weight='balanced')
#y, _ = pd.factorize(train['species'])
clf.fit(X_train_scaled, t_subset_train)

X_test_scaled = scaler.transform(X_subset_test)
preds = clf.predict(X_test_scaled)
accuracy = np.sum(t_subset_test == preds)/float(t_subset_test.size)
print(accuracy)

0.888349514563


In [244]:
#predict using MLP

train_dir = "train"
test_dir = "test"
outputfile = "MLP_preds.csv"  # feel free to change this or take it as an argument

# TODO put the names of the feature functions you've defined above in this list
ffs = [first_last_system_call_feats, system_call_count_feats]

# extract features
print "extracting training features..."
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
print "done extracting training features"
print


print "extracting test features..."
X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
print "done extracting test features"
print

solver = 'adam'
#momentum = 0.9 #only is sgd
#nesterovs_momentum = True # only if sgd

max_iter =100
batch_size = 200
solver = 'adam'
max_iter = 500


alpha = 1e-4
tol = 1e-12
rate_init = 0.01
learning_rate = 'adaptive' #only matters for sgd
activation = 'relu'
early_stopping = True
#validation_fraction = 0.1
#warm_start = False

N_nodes = 25
N_layers = 3

layers = np.multiply(N_nodes, np.ones(N_layers)) 
scaler = StandardScaler(with_mean = False)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
mlp = MLPClassifier(hidden_layer_sizes=layers, max_iter=max_iter, alpha=alpha,
                        solver=solver, verbose=10, tol=tol, random_state=1,
                        learning_rate_init=rate_init, activation = activation,
                   learning_rate = learning_rate, early_stopping=early_stopping,
                   #validation_fraction = validation_fraction, warm_start = warm_start, batch_size = batch_size, 
                   #momentum = momentum, nesterovs_momentum=nesterovs_momentum
                   )
mlp.fit(X_train_scaled, t_train)

X_test_scaled = scaler.transform(X_test)
preds = mlp.predict(X_test_scaled)


print "writing predictions..."
util.write_predictions(preds, test_ids, outputfile)
print "done!"
#num_correct = np.sum(t_ignore == preds);
#accuracy = num_correct/float(t_ignore.size)
#print(accuracy)

Iteration 1, loss = 1.65149104
Validation score: 0.783172
Iteration 2, loss = 0.76746184
Validation score: 0.796117
Iteration 3, loss = 0.56940306
Validation score: 0.841424
Iteration 4, loss = 0.45678843
Validation score: 0.857605
Iteration 5, loss = 0.39835289
Validation score: 0.870550
Iteration 6, loss = 0.35105035
Validation score: 0.877023
Iteration 7, loss = 0.31999594
Validation score: 0.880259
Iteration 8, loss = 0.28653051
Validation score: 0.877023
Iteration 9, loss = 0.27527829
Validation score: 0.883495
Iteration 10, loss = 0.25088694
Validation score: 0.899676
Iteration 11, loss = 0.24069678
Validation score: 0.899676
Iteration 12, loss = 0.23830228
Validation score: 0.889968
Iteration 13, loss = 0.24147302
Validation score: 0.899676
Validation score did not improve more than tol=0.000000 for two consecutive epochs. Stopping.
writing predictions...
done!


In [278]:
#Try SVM


X_subset_train, X_subset_test, t_subset_train, t_subset_test = train_test_split(X_train, t_train, test_size=0.2, random_state=1)

kernel = 'rbf'
tol = 1e-3
#class_weight = 'balanced'

scaler = StandardScaler(with_mean = False)
scaler.fit(X_subset_train)
X_train_scaled = scaler.transform(X_subset_train)
clf = svm.SVC(verbose = True, kernel = kernel, tol = tol)
clf.fit(X_train_scaled, t_subset_train)

X_test_scaled = scaler.transform(X_subset_test)
preds = clf.predict(X_test_scaled)

#test predictions
num_correct = np.sum(t_subset_test == preds);
accuracy = num_correct/float(t_subset_test.size)
print(accuracy)

[LibSVM]0.864077669903


In [280]:
#Predict using SVM
outputfile = 'SVM_preds.csv'
scaler = StandardScaler(with_mean = False)
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
clf = svm.SVC(verbose = True, kernel = kernel, tol = tol)
clf.fit(X_train_scaled, t_train)

X_test_scaled = scaler.transform(X_test)
preds = clf.predict(X_test_scaled)
print "writing predictions..."
util.write_predictions(preds, test_ids, outputfile)
print "done!"

[LibSVM]writing predictions...
done!
