In [55]:
## This file provides starter code for extracting features from the xml files and
## for doing some learning.
##
## The basic set-up: 
## ----------------
## main() will run code to extract features, learn, and make predictions.
## 
## extract_feats() is called by main(), and it will iterate through the 
## train/test directories and parse each xml file into an xml.etree.ElementTree, 
## which is a standard python object used to represent an xml file in memory.
## (More information about xml.etree.ElementTree objects can be found here:
## http://docs.python.org/2/library/xml.etree.elementtree.html
## and here: http://eli.thegreenplace.net/2012/03/15/processing-xml-in-python-with-elementtree/)
## It will then use a series of "feature-functions" that you will write/modify
## in order to extract dictionaries of features from each ElementTree object.
## Finally, it will produce an N x D sparse design matrix containing the union
## of the features contained in the dictionaries produced by your "feature-functions."
## This matrix can then be plugged into your learning algorithm.
##
## The learning and prediction parts of main() are largely left to you, though
## it does contain code that randomly picks class-specific weights and predicts
## the class with the weights that give the highest score. If your prediction
## algorithm involves class-specific weights, you should, of course, learn 
## these class-specific weights in a more intelligent way.
##
## Feature-functions:
## --------------------
## "feature-functions" are functions that take an ElementTree object representing
## an xml file (which contains, among other things, the sequence of system calls a
## piece of potential malware has made), and returns a dictionary mapping feature names to 
## their respective numeric values. 
## For instance, a simple feature-function might map a system call history to the
## dictionary {'first_call-load_image': 1}. This is a boolean feature indicating
## whether the first system call made by the executable was 'load_image'. 
## Real-valued or count-based features can of course also be defined in this way. 
## Because this feature-function will be run over ElementTree objects for each 
## software execution history instance, we will have the (different)
## feature values of this feature for each history, and these values will make up 
## one of the columns in our final design matrix.
## Of course, multiple features can be defined within a single dictionary, and in
## the end all the dictionaries returned by feature functions (for a particular
## training example) will be unioned, so we can collect all the feature values 
## associated with that particular instance.
##
## Two example feature-functions, first_last_system_call_feats() and 
## system_call_count_feats(), are defined below.
## The first of these functions indicates what the first and last system-calls 
## made by an executable are, and the second records the total number of system
## calls made by an executable.
##
## What you need to do:
## --------------------
## 1. Write new feature-functions (or modify the example feature-functions) to
## extract useful features for this prediction task.
## 2. Implement an algorithm to learn from the design matrix produced, and to
## make predictions on unseen data. Naive code for these two steps is provided
## below, and marked by TODOs.
##
## Computational Caveat
## --------------------
## Because the biggest of any of the xml files is only around 35MB, the code below 
## will parse an entire xml file and store it in memory, compute features, and
## then get rid of it before parsing the next one. Storing the biggest of the files 
## in memory should require at most 200MB or so, which should be no problem for
## reasonably modern laptops. If this is too much, however, you can lower the
## memory requirement by using ElementTree.iterparse(), which does parsing in
## a streaming way. See http://eli.thegreenplace.net/2012/03/15/processing-xml-in-python-with-elementtree/
## for an example. 

import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util


# In[93]:


import matplotlib.pyplot as plt
import sklearn

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn import linear_model, datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score

In [2]:
def extract_feats(ffs, direc="train", global_feat_dict=None):
    """
    arguments:
      ffs are a list of feature-functions.
      direc is a directory containing xml files (expected to be train or test).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that 
      the columns of the test matrix align correctly.

    returns: 
      a sparse design matrix, a dict mapping features to column-numbers,
      a vector of target classes, and a list of system-call-history ids in order 
      of their rows in the design matrix.
      
      Note: the vector of target classes returned will contain the true indices of the
      target classes on the training data, but will contain only -1's on the test
      data
    """
    fds = [] # list of feature dicts
    classes = []
    ids = []
    counting = 0
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)
        rowfd = {}
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))    
                
        # accumulate features
        [rowfd.update(ff(tree)) for ff in ffs]
        fds.append(rowfd)
           
    X,feat_dict = make_design_mat(fds,global_feat_dict)
    return X, feat_dict, np.array(classes), ids

In [3]:
def make_design_mat(fds, global_feat_dict=None):
    """
    arguments:
      fds is a list of feature dicts (one for each row).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that 
      the columns of the test matrix align correctly.
       
    returns: 
        a sparse NxD design matrix, where N == len(fds) and D is the number of
        the union of features defined in any of the fds 
    """
    if global_feat_dict is None:
        all_feats = set()
        [all_feats.update(fd.keys()) for fd in fds]
        feat_dict = dict([(feat, i) for i, feat in enumerate(sorted(all_feats))])
    else:
        feat_dict = global_feat_dict
        
    cols = []
    rows = []
    data = []        
    for i in range(len(fds)):
        temp_cols = []
        temp_data = []
        for feat,val in fds[i].items():
            try:
                # update temp_cols iff update temp_data
                temp_cols.append(feat_dict[feat])
                temp_data.append(val)
            except KeyError as ex:
                if global_feat_dict is not None:
                    pass  # new feature in test data; nbd
                else:
                    raise ex

        # all fd's features in the same row
        k = len(temp_cols)
        cols.extend(temp_cols)
        data.extend(temp_data)
        rows.extend([i]*k)

    assert len(cols) == len(rows) and len(rows) == len(data)
   
    
    X = sparse.csr_matrix((np.array(data),
                   (np.array(rows), np.array(cols))),
                   shape=(len(fds), len(feat_dict)))
    return X, feat_dict

In [4]:
## Here are two example feature-functions. They each take an xml.etree.ElementTree object, 
# (i.e., the result of parsing an xml file) and returns a dictionary mapping 
# feature-names to numeric values.
## TODO: modify these functions, and/or add new ones.
def first_last_system_call_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'first_call-x' to 1 if x was the first system call
      made, and 'last_call-y' to 1 if y was the last system call made. 
      (in other words, it returns a dictionary indicating what the first and 
      last system calls made by an executable were.)
    """
    c = Counter()
    in_all_section = False
    first = True # is this the first system call
    last_call = None # keep track of last call we've seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            if first:
                c["first_call-"+el.tag] = 1
                first = False
            last_call = el.tag  # update last call seen
            
    # finally, mark last call seen
    c["last_call-"+last_call] = 1
    return c

In [5]:
def system_call_count_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c['num_system_calls'] += 1
    return c

In [6]:
def system_call_histogram_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c['hist-'+el.tag] += 1
    return c

In [7]:
def most_frequent_system_call_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c[el.tag] += 1
    d = Counter()
    key, val = c.most_common(1)[0]
    #print(key, val)
    d[key] = val
    return d

In [8]:
def distinct_system_call_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c[el.tag] += 1

    d = Counter()
    d['num_distinct_calls'] = len(c)
    return d

In [9]:
def system_call_histogram_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c['hist-'+el.tag] += 1
    return c

In [10]:
def system_call_histogram_feats_normalized(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = system_call_histogram_feats(tree)

    #normalize the counter https://stackoverflow.com/questions/22428842/how-to-normalize-a-counter-and-combine-2-normalized-counters-python
    total = sum(c.values(), 0.0)
    for key in c:
        c[key] /= total
    return c

In [11]:
def system_call_bigram_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    
    
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c[el.tag] += 1

    d = Counter()
    # iterate over all pairs
    for k1 in list(c):
        for k2 in list(c):
            combo_key = '-'.join(sorted([k1, k2]))
            combo_key = 'bigram-'+combo_key
            #print(combo_key)
            d[combo_key] = 1
        
    return d

In [12]:
def system_call_ordered_bigram_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    
    last1 = ''
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c[el.tag+'-'+last1] += 1
        last1 = el.tag
        
    return c

In [13]:
def system_call_ordered_bigram_feats_normalized(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = system_call_ordered_bigram_feats(tree)

    #normalize the counter https://stackoverflow.com/questions/22428842/how-to-normalize-a-counter-and-combine-2-normalized-counters-python
    total = sum(c.values(), 0.0)
    for key in c:
        c[key] /= total
    return c

In [14]:
def system_call_ordered_trigram_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    
    last1 = ''
    last2 = ''
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c[el.tag+'-'+last1+'-'+last2] += 1
        last2 = last1
        last1 = el.tag
        
    return c

In [15]:
def system_call_ordered_trigram_feats_normalized(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = system_call_ordered_trigram_feats(tree)

    #normalize the counter https://stackoverflow.com/questions/22428842/how-to-normalize-a-counter-and-combine-2-normalized-counters-python
    total = sum(c.values(), 0.0)
    for key in c:
        c[key] /= total
    return c

In [17]:
def extract__better_feats(vectorizer, scaler, direc="train", training=True):
    """
    arguments:
      ffs are a list of feature-functions.
      direc is a directory containing xml files (expected to be train or test).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that 
      the columns of the test matrix align correctly.

  
  returns: 
      a sparse design matrix, a dict mapping features to column-numbers,
      a vector of target classes, and a list of system-call-history ids in order 
      of their rows in the design matrix.
      
      Note: the vector of target classes returned will contain the true indices of the
      target classes on the training data, but will contain only -1's on the test
      data
    """
    classes = []
    ids = []
    cv_files = []
    counting = 0
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)
        rowfd = {}
        # parse file as an xml document
        #tree = ET.parse(os.path.join(direc,datafile))
        cv_files.append(os.path.join(direc,datafile))  

    if training == False:
        raw_X = vectorizer.transform(cv_files)
        X = scaler.transform(raw_X)
    elif training == True:
        raw_X = vectorizer.fit_transform(cv_files)
        X = scaler.fit_transform(raw_X)

    return X, vectorizer, scaler, np.array(classes), ids

In [18]:
    train_dir = "train"
    test_dir = "test"
    outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument

    # TODO put the names of the feature functions you've defined above in this list
    #ffs = [first_last_system_call_feats, system_call_count_feats, system_call_histogram_feats, distinct_system_call_feats, most_frequent_system_call_feats, system_call_bigram_feats]
    #ffs = [system_call_ordered_bigram_feats_normalized,system_call_ordered_bigram_feats,system_call_ordered_trigram_feats_normalized,system_call_ordered_trigram_feats,system_call_count_feats]
    ffs = [first_last_system_call_feats, system_call_count_feats, system_call_histogram_feats, distinct_system_call_feats, most_frequent_system_call_feats, system_call_ordered_bigram_feats_normalized,system_call_ordered_bigram_feats,system_call_ordered_trigram_feats_normalized,system_call_ordered_trigram_feats,system_call_bigram_feats]

    # extract features
    print "extracting training features..."
    X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
    print(X_train.shape)
    print(t_train)
    print "done extracting training features"
    print

    #turn sparse matrix back into an array and separate into training and validation data
    X_mat = X_train.toarray()
    X_tr = X_mat[0:2625,:]
    X_val = X_mat[2625:3086,:]
    t_tr = t_train[0:2625]
    t_val = t_train[2625:3086]
    print(X_tr.shape)
    print(X_val.shape)
    print(t_tr.shape)
    print(t_val.shape)

    # TODO train here, and learn your classification parameters
    #print "learning..."
    #learned_W = np.random.random((len(global_feat_dict),len(util.malware_classes)))
    #print "done learning"
    #print

extracting training features...
(3086, 17116)
[ 8 12 10 ..., 13 10 10]
done extracting training features

(2625, 17116)
(461, 17116)
(2625,)
(461,)


In [19]:
print("extracting test features...")
X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
print("done extracting test features")
print

extracting test features...
done extracting test features



In [21]:
SVD = TruncatedSVD(n_components=1000)

In [22]:
X_tr_SVD = SVD.fit_transform(X_tr)
X_val_SVD = SVD.transform(X_val)
print X_tr_SVD.shape
print X_val_SVD.shape

(2625, 1000)
(461, 1000)


In [34]:
#MLP Classification 
mlp_model_SVD= MLPClassifier(alpha=1e-03, hidden_layer_sizes=(1000,))
mlp_model_SVD.fit(X_tr_SVD, t_tr)
mlp_preds_SVD = mlp_model_SVD.predict(X_val_SVD)
B = t_val == mlp_preds_SVD 

acc = 1.*sum(B)/len(B)
print("Accuracy of MLP model given validation data:", acc)
print mlp_model_SVD.n_iter_

('Accuracy of MLP model given validation data:', 0.8850325379609545)
12


In [20]:
    #CV = CountVectorizer(input='filename', encoding='utf-8',
                         #decode_error='strict', strip_accents='ascii',
                         #lowercase=True, preprocessor=None,
                         #tokenizer=None, stop_words=None,
                         #token_pattern=’(?u)\b\w\w+\b’,
                         #ngram_range=(1, 5),
                         #analyzer='word', max_df=1.0, min_df=1,
                         #max_features=None, vocabulary=None
                         #binary=False,
                         #dtype=<class 'numpy.int64'>
                         #dtype=np.uint8
                        #)

In [None]:
#prep variable call
vectorizer = CountVectorizer(input='filename', encoding='utf-8',
                         decode_error='strict', strip_accents='ascii',
                         lowercase=True, preprocessor=None,
                         tokenizer=None, stop_words=None,
                         #token_pattern=’(?u)\b\w\w+\b’,
                         ngram_range=(1, 2),
                         analyzer='word', max_df=1.0, min_df=1,
                         max_features=None, vocabulary=None,
                         #binary=False,
                         #dtype=<class 'numpy.int64'>
                         dtype=np.uint8#
                        )
scaler = StandardScaler(copy=False, with_mean=False, with_std=True)

In [None]:
X, vectorizer, svd, Y, ids = extract__better_feats(vectorizer, scaler, direc="train", training=True)

In [None]:
Xt, vectorizer, svd, Yt, idst = extract__better_feats(vectorizer, scaler, direc="test", training=False)

In [None]:
print type(Xt)
print type(Xt.shape)
print np.ndarray.max(Xt.toarray())
print np.ndarray.min(Xt.toarray())
print type(X)
print type(X.shape)
print np.ndarray.max(X.toarray())
print np.ndarray.min(X.toarray())
print type(X_mat)
print type(X_mat.shape)
print np.ndarray.max(X_mat.toarray())
print np.ndarray.min(X_mat.toarray())
print type(X_test)
print type(X_test.shape)
print np.ndarray.max(X_test.toarray())
print np.ndarray.min(X_test.toarray())

In [None]:
print vectorizer.vocabulary_
print vectorizer.stop_words_

In [245]:
    #turn sparse matrix back into an array and separate into training and validation data
    X_mat1 = X.toarray()
    X_test1 = Xt.toarray()
    X_tr1 = X_mat1[0:2625,:]
    X_val1 = X_mat1[2625:3086,:]
    t_tr1 = Y[0:2625]
    t_val1 = Y[2625:3086]
    print(X_tr1.shape)
    print(X_val1.shape)
    print(t_tr1.shape)
    print(t_val1.shape)

(2625, 1386)
(461, 1386)
(2625,)
(461,)


In [None]:
print X_tr.shape
print X_tr1.toarray().shape

print type(X_tr)
print type(X_tr1.toarray())
print type(X_test)
print type(Xt.toarray())

In [289]:
X_tr2 = np.concatenate((X_tr, X_tr1.toarray()), axis=1)
X_val2 = np.concatenate((X_val, X_val1.toarray()), axis=1)
print X_tr2.shape
print X_val2.shape

In [301]:
#MLP Classification 
mlp_model= MLPClassifier(alpha=1e-05)
mlp_model.fit(X_tr2, t_tr1)
mlp_preds = mlp_model.predict(X_val2)
B = t_val1 == mlp_preds 

acc = 1.*sum(B)/len(B)
print("Accuracy of MLP model given validation data:", acc)
print mlp_model.n_iter_

('Accuracy of MLP model given validation data:', 0.8915401301518439)


In [304]:
X_tr3 = SVD.fit_transform(X_tr2)
X_val3 = SVD.transform(X_val2)
print X_tr3.shape
print X_val3.shape

In [312]:
#MLP Classification 
mlp_model= MLPClassifier(alpha=1e-05)
mlp_model.fit(X_tr3, t_tr)
mlp_preds = mlp_model.predict(X_val3)
B = t_val1 == mlp_preds 

acc = 1.*sum(B)/len(B)
print("Accuracy of MLP model given validation data:", acc)
print mlp_model.n_iter_

('Accuracy of MLP model given validation data:', 0.8980477223427332)
18


In [63]:
alphas = 10.0 ** -np.arange(4, 9)
names = []
for i in alphas:
    names.append('alpha ' + str(i))
print names

['alpha 0.0001', 'alpha 1e-05', 'alpha 1e-06', 'alpha 1e-07', 'alpha 1e-08']


In [64]:
SIZES = [[200,], [400,], [600,]] 
print SIZES

[[200], [400], [600]]


In [65]:
pipe = Pipeline([
    #('reduce_dim', PCA()),
    ('classify', MLPClassifier())
])

In [66]:
param_grid = [
    {
#        'reduce_dim': [PCA(iterated_power=7), NMF()],
#        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__hidden_layer_sizes': SIZES,
        'classify__alpha': alphas
    }
]
print param_grid

[{'classify__hidden_layer_sizes': [[200], [400], [600]], 'classify__alpha': array([  1.00000000e-04,   1.00000000e-05,   1.00000000e-06,
         1.00000000e-07,   1.00000000e-08])}]


In [67]:
grid = GridSearchCV(pipe, cv=3, n_jobs=1, param_grid=param_grid,verbose=100)
print grid

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('classify', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'classify__hidden_layer_sizes': [[200], [400], [600]], 'classify__alpha': array([  1.00000e-04,   1.00000e-05,   1.00000e-06,   1.00000e-07,
         1.00000e-08])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=100)


In [47]:
grid.fit(X_tr_SVD, t_tr)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] classify__hidden_layer_sizes=[100], classify__alpha=0.1 .........
[CV]  classify__hidden_layer_sizes=[100], classify__alpha=0.1, score=0.846590909091, total=   2.1s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s
[CV] classify__hidden_layer_sizes=[100], classify__alpha=0.1 .........
[CV]  classify__hidden_layer_sizes=[100], classify__alpha=0.1, score=0.867579908676, total=   1.9s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.0s remaining:    0.0s
[CV] classify__hidden_layer_sizes=[100], classify__alpha=0.1 .........
[CV]  classify__hidden_layer_sizes=[100], classify__alpha=0.1, score=0.853855005754, total=   1.8s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    5.9s remaining:    0.0s
[CV] classify__hidden_layer_sizes=[500], classify__alpha=0.1 .........
[CV]  classify__hidden_layer_sizes=[500], classify__alpha=0.1, score=0.856818181818, total=   6.9s
[Parallel(n_jobs=1)

[CV]  classify__hidden_layer_sizes=[1000], classify__alpha=0.0001, score=0.870454545455, total=  13.7s
[Parallel(n_jobs=1)]: Done  34 out of  34 | elapsed:  3.9min remaining:    0.0s
[CV] classify__hidden_layer_sizes=[1000], classify__alpha=0.0001 .....
[CV]  classify__hidden_layer_sizes=[1000], classify__alpha=0.0001, score=0.87100456621, total=  14.5s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  4.2min remaining:    0.0s
[CV] classify__hidden_layer_sizes=[1000], classify__alpha=0.0001 .....
[CV]  classify__hidden_layer_sizes=[1000], classify__alpha=0.0001, score=0.85500575374, total=  13.8s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  4.4min remaining:    0.0s
[CV] classify__hidden_layer_sizes=[100], classify__alpha=1e-05 .......
[CV]  classify__hidden_layer_sizes=[100], classify__alpha=1e-05, score=0.845454545455, total=   2.1s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:  4.4min remaining:    0.0s
[CV] classify__hidden_layer_sizes=[100], classify__alpha=1e

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('classify', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'classify__hidden_layer_sizes': [[100], [500], [1000]], 'classify__alpha': array([  1.00000e-01,   1.00000e-02,   1.00000e-03,   1.00000e-04,
         1.00000e-05])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=100)

In [68]:
grid.fit(X_tr_SVD, t_tr)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[CV] classify__hidden_layer_sizes=[200], classify__alpha=0.0001 ......
[CV]  classify__hidden_layer_sizes=[200], classify__alpha=0.0001, score=0.8625, total=   4.6s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.6s remaining:    0.0s
[CV] classify__hidden_layer_sizes=[200], classify__alpha=0.0001 ......
[CV]  classify__hidden_layer_sizes=[200], classify__alpha=0.0001, score=0.876712328767, total=   3.5s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    8.2s remaining:    0.0s
[CV] classify__hidden_layer_sizes=[200], classify__alpha=0.0001 ......
[CV]  classify__hidden_layer_sizes=[200], classify__alpha=0.0001, score=0.851553509781, total=   3.4s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   11.6s remaining:    0.0s
[CV] classify__hidden_layer_sizes=[400], classify__alpha=0.0001 ......
[CV]  classify__hidden_layer_sizes=[400], classify__alpha=0.0001, score=0.851136363636, total=   4.3s
[Parallel(n_job

[CV]  classify__hidden_layer_sizes=[400], classify__alpha=1e-07, score=0.858457997699, total=   5.0s
[Parallel(n_jobs=1)]: Done  33 out of  33 | elapsed:  3.1min remaining:    0.0s
[CV] classify__hidden_layer_sizes=[600], classify__alpha=1e-07 .......
[CV]  classify__hidden_layer_sizes=[600], classify__alpha=1e-07, score=0.846590909091, total=   7.2s
[Parallel(n_jobs=1)]: Done  34 out of  34 | elapsed:  3.2min remaining:    0.0s
[CV] classify__hidden_layer_sizes=[600], classify__alpha=1e-07 .......
[CV]  classify__hidden_layer_sizes=[600], classify__alpha=1e-07, score=0.852739726027, total=   8.0s
[Parallel(n_jobs=1)]: Done  35 out of  35 | elapsed:  3.4min remaining:    0.0s
[CV] classify__hidden_layer_sizes=[600], classify__alpha=1e-07 .......
[CV]  classify__hidden_layer_sizes=[600], classify__alpha=1e-07, score=0.850402761795, total=   7.1s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  3.5min remaining:    0.0s
[CV] classify__hidden_layer_sizes=[200], classify__alpha=1e-08 

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('classify', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'classify__hidden_layer_sizes': [[200], [400], [600]], 'classify__alpha': array([  1.00000e-04,   1.00000e-05,   1.00000e-06,   1.00000e-07,
         1.00000e-08])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=100)

In [69]:
mlp_grid_preds = grid.predict(X_val_SVD)
B = t_val == mlp_grid_preds

acc = 1.*sum(B)/len(B)
print("Accuracy of MLP model given validation data:", acc)

('Accuracy of MLP model given validation data:', 0.8872017353579176)


In [49]:
print grid.get_params()
print grid.best_index_
print grid.cv_results_['params'][grid.best_index_]
print grid.best_params_
print grid.best_score_
print grid.best_estimator_
print grid.cv_results_

{'estimator__classify__tol': 0.0001, 'estimator__classify__verbose': False, 'n_jobs': 1, 'verbose': 100, 'estimator__classify__nesterovs_momentum': True, 'estimator__classify__warm_start': False, 'estimator__classify__max_iter': 200, 'estimator__memory': None, 'estimator__classify__random_state': None, 'estimator__steps': [('classify', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False))], 'estimator__classify__momentum': 0.9, 'estimator__classify__learning_rate_init': 0.001, 'param_grid': [{'classify__hidden_layer_sizes': [[100], [500], [1000]], 'classify__alpha': array([  1.00000000e-01,   1.00000000e-02,   1

In [70]:
print grid.best_params_
X_mal_SVD = SVD.transform(X_mat)
X_test_SVD = SVD.transform(X_test)

{'classify__hidden_layer_sizes': [600], 'classify__alpha': 0.0001}


In [75]:
#Optimized MLP Classification 
mlp_final_model= MLPClassifier(alpha=1e-06, hidden_layer_sizes=(1000,))
mlp_final_model.fit(X_mat, t_train)
mlp_super_preds = mlp_final_model.predict(X_mat)
B = t_train == mlp_super_preds 

acc = 1.*sum(B)/len(B)
print("Accuracy of MLP model on training data:", acc)

('Accuracy of MLP model on training data:', 0.9115359688917692)


In [76]:
mlp_final_model.n_iter_
accuracy_score(t_train,mlp_super_preds)

0.91153596889176924

In [78]:
print X_test_SVD.shape
mlp_final_preds = mlp_final_model.predict(X_test)
print mlp_final_preds.shape
print("done making predictions")

(3724, 1000)
(3724,)
done making predictions


In [79]:
outputfile = "P2_MLP_WMB.csv"
print("writing predictions...")
util.write_predictions(mlp_final_preds, test_ids, outputfile)
print("done!")

writing predictions...
done!
