In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
# from GaussianGenerativeModel import GaussianGenerativeModel
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
from collections import Counter
from operator import itemgetter
%matplotlib inline



In [2]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util

In [3]:
def extract_feats(ffs, direc="train", global_feat_dict=None):
    """
    arguments:
      ffs are a list of feature-functions.
      direc is a directory containing xml files (expected to be train or test).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that 
      the columns of the test matrix align correctly.

    returns: 
      a sparse design matrix, a dict mapping features to column-numbers,
      a vector of target classes, and a list of system-call-history ids in order 
      of their rows in the design matrix.
      
      Note: the vector of target classes returned will contain the true indices of the
      target classes on the training data, but will contain only -1's on the test
      data
    """
    fds = [] # list of feature dicts
    classes = []
    ids = [] 
    #datafile: filename(without path)
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2] #id_str: id  #clazz: class name for this file
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)
        rowfd = {}
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile)) #direc: train     datafile: the whole file name
        
        
        # accumulate features
        [rowfd.update(ff(tree)) for ff in ffs]
        fds.append(rowfd)
        
    X,feat_dict = make_design_mat(fds,global_feat_dict)
    return X, feat_dict, np.array(classes), ids

In [4]:
## Here are two example feature-functions. They each take an xml.etree.ElementTree object, 
# (i.e., the result of parsing an xml file) and returns a dictionary mapping 
# feature-names to numeric values.
## TODO: modify these functions, and/or add new ones.
def first_last_system_call_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'first_call-x' to 1 if x was the first system call
      made, and 'last_call-y' to 1 if y was the last system call made. 
      (in other words, it returns a dictionary indicating what the first and 
      last system calls made by an executable were.)
    """
    c = Counter()
    in_all_section = False
    first = True # is this the first system call
    last_call = None # keep track of last call we've seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element  
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            if first:
                c["first_call-"+el.tag] = 1
                first = False
            last_call = el.tag  # update last call seen
            
    # finally, mark last call seen
    c["last_call-"+last_call] = 1
    return c

In [5]:
def system_call_count_feats(tree):
    """
    arguments:
      tree is an xml.etree.ElementTree object
    returns:
      a dictionary mapping 'num_system_calls' to the number of system_calls
      made by an executable (summed over all processes)
    """
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c['num_system_calls'] += 1
    return c

In [6]:
def make_design_mat(fds, global_feat_dict=None):
    """
    arguments:
      fds is a list of feature dicts (one for each row).
      global_feat_dict is a dictionary mapping feature_names to column-numbers; it
      should only be provided when extracting features from test data, so that 
      the columns of the test matrix align correctly.
       
    returns: 
        a sparse NxD design matrix, where N == len(fds) and D is the number of
        the union of features defined in any of the fds 
    """
    if global_feat_dict is None:
        all_feats = set()
        [all_feats.update(fd.keys()) for fd in fds]
        feat_dict = dict([(feat, i) for i, feat in enumerate(sorted(all_feats))])
    else:
        feat_dict = global_feat_dict
        
    cols = []
    rows = []
    data = []        
    for i in xrange(len(fds)):
        temp_cols = []
        temp_data = []
        for feat,val in fds[i].iteritems():
            try:
                # update temp_cols iff update temp_data
                temp_cols.append(feat_dict[feat])
                temp_data.append(val)
            except KeyError as ex:
                if global_feat_dict is not None:
                    pass  # new feature in test data; nbd
                else:
                    raise ex

        # all fd's features in the same row
        k = len(temp_cols)
        cols.extend(temp_cols)
        data.extend(temp_data)
        rows.extend([i]*k)

    assert len(cols) == len(rows) and len(rows) == len(data)
   

    X = sparse.csr_matrix((np.array(data),
                   (np.array(rows), np.array(cols))),
                   shape=(len(fds), len(feat_dict)))
    return X, feat_dict

Add our functions below

In [7]:
def exist_VBA(tree):
    c = Counter()
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "open_key" and  "key" in el.attrib.keys() and "VBA" in el.attrib['key']:
            c['exist_VBA'] = 1
    return c

In [8]:
def exist_VBA_getusername(tree):
    c = Counter()
    found_VBA = False
    #found_getusername = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "open_key" and  "key" in el.attrib.keys() and "VBA" in el.attrib['key']:
            found_VBA = True
        if found_VBA== True and el.tag == "get_username":
            #found_getusername = True
            c['exist_VBA_getusername'] = 1
    return c

In [9]:
def exist_Swizzor_0002DF(tree):
    c = Counter()
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "open_key" and  "key" in el.attrib.keys() and "0002DF01-0000-0000-C000-000000000046" in el.attrib['key']:
            c['exist_Swizzor_0002DF'] = 1
    return c

In [10]:
def all_system_call_count(tree):
    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element  
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
                c["call-"+el.tag] += 1
    return c

In [11]:
#function to calculate accuracy
def model_accuracy(y, y_pred):
    p = (np.array(y) == np.array(y_pred))
    return float(p.sum())/len(y)

## Random forest

In [12]:
train_dir = "train"
test_dir = "test"
outputfile = "mypredictions_rf.csv"  # feel free to change this or take it as an argument

# TODO put the names of the feature functions you've defined above in this list
ffs = [all_system_call_count]

# extract features
print "extracting training features..."
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
#sparse feature matrix, feature dict with its column index, class name, file idea
print "done extracting training features"

extracting training features...
done extracting training features


In [13]:
feat_matrix = X_train.toarray()
feat_matrix.shape

(3086, 102)

In [14]:
#Split training files randomly as 7:3
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(feat_matrix, t_train, test_size=0.3, random_state=42)

In [15]:
x_train.shape
y_train.shape

(2160,)

In [17]:
#Use 5-fold cross validation to tune parameters
RF_score = []
kf = KFold(n = x_train.shape[0], n_folds=5)

n_est_regularization = np.arange(5, 30, 5)
n_ft_regularization = np.arange(20, 100, 10)
for c in n_est_regularization:
    for c1 in n_ft_regularization:
        rf_score = []
        rf = RandomForestClassifier(n_estimators = c, max_features = c1)
        for train_index, test_index in kf:
            train_x, test_x = x_train[train_index], x_train[test_index]
            train_y, test_y = y_train[train_index], y_train[test_index]
            rf.fit(train_x, train_y)
            rf_score.append(rf.score(test_x,test_y))
        RF_score.append((c,c1, np.average(rf_score)))



In [18]:
max(RF_score,key=itemgetter(2))

(20, 40, 0.88518518518518507)

In [20]:
#Test Score
rf = RandomForestClassifier( n_estimators = 20, max_features = 40 )
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
model_accuracy(y_test, y_pred)

0.8909287257019438

In [21]:
#Use the parameter, train the whole train file
rf = RandomForestClassifier( n_estimators = 20, max_features = 40 )
rf.fit(feat_matrix, t_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=40, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [22]:
#Extract feature matrix of test files
print "extracting training features..."
X_test,test_feat_dict,t_test,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
#sparse feature matrix, feature dict with its column index, class name, file idea
print "done extracting training features"
test_feat_matrix=X_test.toarray()

extracting training features...
done extracting training features


In [23]:
#predict
y_pred_test = rf.predict(test_feat_matrix)

In [24]:
print "writing predictions..."
util.write_predictions(y_pred_test, test_ids, outputfile)
print "done!"

writing predictions...
done!


## Extra Tree

In [25]:
from sklearn.cross_validation import KFold
ET_score = []
kf = KFold(n = x_train.shape[0], n_folds=5)

n_est_regularization = np.arange(5, 30, 5)
n_ft_regularization = np.arange(20, 100, 10)
for c in n_est_regularization:
    for c1 in n_ft_regularization:
        et_score = []
        et = ExtraTreesClassifier(n_estimators = c, max_features = c1)
        for train_index, test_index in kf:
            train_x, test_x = x_train[train_index], x_train[test_index]
            train_y, test_y = y_train[train_index], y_train[test_index]
            et.fit(train_x, train_y)
            et_score.append(et.score(test_x,test_y))
        ET_score.append((c,c1, np.average(et_score)))


In [26]:
max(ET_score,key=itemgetter(2))

(25, 30, 0.88287037037037042)

In [29]:
#Get test score
et = ExtraTreesClassifier(n_estimators = 25, max_features = 30 )
et.fit(x_train, y_train)

y_pred = et.predict(x_test)
model_accuracy(y_test, y_pred)

0.8984881209503239

In [30]:
#Use the parameter, train the whole train file
et = ExtraTreesClassifier( n_estimators = 25, max_features = 30 )
et.fit(feat_matrix, t_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features=30, max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=25, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [31]:
#Predict
y_pred_test  = et.predict(test_feat_matrix)
y_pred_test 

array([ 8, 10,  8, ..., 12,  8,  0])

In [32]:
outputfile = "mypredictions_ET.csv"
print "writing predictions..."
util.write_predictions(y_pred_test, test_ids, outputfile)
print "done!"

writing predictions...
done!


## Gaussian Naive Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x_train, y_train)

GaussianNB(priors=None)

In [34]:
y_pred = gnb.predict(x_test)
model_accuracy(y_test, y_pred)

0.28077753779697623

## Random forest with new features

Random forest with 1 new features: exist_VBA_getusername

In [35]:
outputfile = "mypredictions_rf_VBA.csv"  # feel free to change this or take it as an argument

# TODO put the names of the feature functions you've defined above in this list
ffs = [all_system_call_count, exist_VBA_getusername]

# extract features
print "extracting training features..."
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
#sparse feature matrix, feature dict with its column index, class name, file idea
print "done extracting training features"

extracting training features...
done extracting training features


In [36]:
feat_matrix = X_train.toarray()
feat_matrix.shape

(3086, 103)

In [37]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(feat_matrix, t_train, test_size=0.3, random_state=42)

In [38]:
x_train.shape
y_train.shape

(2160,)

In [39]:
from sklearn.cross_validation import KFold
RF_score = []
kf = KFold(n = x_train.shape[0], n_folds=5)

n_est_regularization = np.arange(5, 30, 5)
n_ft_regularization = np.arange(20, 100, 10)
for c in n_est_regularization:
    for c1 in n_ft_regularization:
        rf_score = []
        rf = RandomForestClassifier(n_estimators = c, max_features = c1)
        for train_index, test_index in kf:
            train_x, test_x = x_train[train_index], x_train[test_index]
            train_y, test_y = y_train[train_index], y_train[test_index]
            rf.fit(train_x, train_y)
            rf_score.append(rf.score(test_x,test_y))
        RF_score.append((c,c1, np.average(rf_score)))


In [40]:
max(RF_score,key=itemgetter(2))

(15, 40, 0.8842592592592593)

In [41]:
rf = RandomForestClassifier( n_estimators = 15, max_features = 40 )
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=40, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=15, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [42]:
#get test score
y_pred = rf.predict(x_test)
model_accuracy(y_test, y_pred)

0.8822894168466523

Random forest with 1 new features: exist_Swizzor_0002DF

In [43]:
outputfile = "mypredictions_rf_Swizzor.csv"  # feel free to change this or take it as an argument

# TODO put the names of the feature functions you've defined above in this list
ffs = [all_system_call_count, exist_Swizzor_0002DF]

# extract features
print "extracting training features..."
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
#sparse feature matrix, feature dict with its column index, class name, file idea
print "done extracting training features"

extracting training features...
done extracting training features


In [44]:
feat_matrix = X_train.toarray()
feat_matrix.shape

(3086, 103)

In [46]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(feat_matrix, t_train, test_size=0.3, random_state=42)

In [47]:
from sklearn.cross_validation import KFold
RF_score = []
kf = KFold(n = x_train.shape[0], n_folds=5)

n_est_regularization = np.arange(5, 30, 5)
n_ft_regularization = np.arange(20, 100, 10)
for c in n_est_regularization:
    for c1 in n_ft_regularization:
        rf_score = []
        rf = RandomForestClassifier(n_estimators = c, max_features = c1)
        for train_index, test_index in kf:
            train_x, test_x = x_train[train_index], x_train[test_index]
            train_y, test_y = y_train[train_index], y_train[test_index]
            rf.fit(train_x, train_y)
            rf_score.append(rf.score(test_x,test_y))
        RF_score.append((c,c1, np.average(rf_score)))

In [48]:
max(RF_score,key=itemgetter(2))

(25, 30, 0.88564814814814807)

In [49]:
#test score
rf = RandomForestClassifier( n_estimators = 25, max_features = 30 )
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)
model_accuracy(y_test, y_pred)

0.8876889848812095

Random forest with 2 new features: exist_VBA_getusername, exist_Swizzor_0002DF

In [50]:
outputfile = "mypredictions_rf_VBA.csv"  # feel free to change this or take it as an argument

# TODO put the names of the feature functions you've defined above in this list
ffs = [all_system_call_count, exist_VBA_getusername, exist_Swizzor_0002DF]

# extract features
print "extracting training features..."
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
#sparse feature matrix, feature dict with its column index, class name, file idea
print "done extracting training features"

extracting training features...
done extracting training features


In [51]:
feat_matrix = X_train.toarray()
feat_matrix.shape

(3086, 104)

In [52]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(feat_matrix, t_train, test_size=0.3, random_state=42)

In [53]:
from sklearn.cross_validation import KFold
RF_score = []
kf = KFold(n = x_train.shape[0], n_folds=5)

n_est_regularization = np.arange(5, 30, 5)
n_ft_regularization = np.arange(20, 100, 10)
for c in n_est_regularization:
    for c1 in n_ft_regularization:
        rf_score = []
        rf = RandomForestClassifier(n_estimators = c, max_features = c1)
        for train_index, test_index in kf:
            train_x, test_x = x_train[train_index], x_train[test_index]
            train_y, test_y = y_train[train_index], y_train[test_index]
            rf.fit(train_x, train_y)
            rf_score.append(rf.score(test_x,test_y))
        RF_score.append((c,c1, np.average(rf_score)))


In [54]:
max(RF_score,key=itemgetter(2))

(25, 50, 0.88472222222222219)

In [55]:
#get test score
rf = RandomForestClassifier( n_estimators = 25, max_features = 50 )
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)
model_accuracy(y_test, y_pred)

0.8833693304535637

## Decision Tree

We only use the 102 feature now, extract features again

In [57]:
outputfile = "mypredictions_dt.csv"  # feel free to change this or take it as an argument

# TODO put the names of the feature functions you've defined above in this list
ffs = [all_system_call_count]

# extract features
print "extracting training features..."
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
#sparse feature matrix, feature dict with its column index, class name, file idea
print "done extracting training features"

extracting training features...
done extracting training features


In [58]:
feat_matrix = X_train.toarray()
feat_matrix.shape

(3086, 102)

In [59]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(feat_matrix, t_train, test_size=0.3, random_state=42)

In [60]:
from sklearn import tree
DT_score = []
kf = KFold(n = len(x_train), n_folds=5)
for c in range(1,100):
    dt_score = []
    clf = tree.DecisionTreeClassifier(max_depth=c)
    for train_index, test_index in kf:
        train_x, test_x = x_train[train_index], x_train[test_index]
        train_y, test_y = y_train[train_index], y_train[test_index]
        clf.fit(train_x, train_y)
        dt_score.append(clf.score(test_x, test_y))
    DT_score.append(np.average(dt_score))

In [61]:
DT_score.index(max(DT_score))

13

In [62]:
clf = tree.DecisionTreeClassifier(max_depth=13)
clf.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=13,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [63]:
y_pred = clf.predict(x_test)
model_accuracy(y_test, y_pred)

0.8606911447084233

In [64]:
#Now train on the whole dataset
clf = tree.DecisionTreeClassifier(max_depth=13)
clf.fit(feat_matrix, t_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=13,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [65]:
#Extract feature matrix of test files
print "extracting training features..."
X_test,test_feat_dict,t_test,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
#sparse feature matrix, feature dict with its column index, class name, file idea
print "done extracting training features"
test_feat_matrix=X_test.toarray()

extracting training features...
done extracting training features


In [66]:
#predict
y_pred_test = clf.predict(test_feat_matrix)
y_pred_test

array([ 8, 10,  7, ..., 12,  8,  8])

In [67]:
print "writing predictions..."
util.write_predictions(y_pred_test, test_ids, outputfile)
print "done!"

writing predictions...
done!


## Ensemble

In [68]:
est = GradientBoostingClassifier(n_estimators=60, learning_rate=0.01,
     max_depth= 5, random_state=5, loss='deviance')
est.fit(x_train, y_train)
y_pred = est.predict(x_test)
model_accuracy(y_test,y_pred)

0.8736501079913607

In [69]:
#Now train on the whole dataset
est.fit(feat_matrix, t_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=60, presort='auto', random_state=5,
              subsample=1.0, verbose=0, warm_start=False)

In [70]:
#Now use actual test file
y_test_pred = est.predict(test_feat_matrix)
y_test_pred

array([ 8, 10,  8, ..., 12,  8,  8])

In [71]:
print "writing predictions..."
outputfile = "mypredictions_ensemble.csv"
util.write_predictions(y_test_pred, test_ids, outputfile)
print "done!"

writing predictions...
done!


## SVC - with class prior

In [72]:
Cs = [pow(10, i) for i in range(-5, 5)]
Cs

[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

In [73]:
weight = [3.69, 1.62, 1.20, 1.03, 1.33, 1.26, 1.72, 1.33, 52.14, 0.68, 17.56, 1.04, 12.18, 1.91, 1.30]
weights = [x/100 for x in weight]

In [74]:
weight_dict = dict(zip(range(15),weights))
weight_dict

{0: 0.0369,
 1: 0.016200000000000003,
 2: 0.012,
 3: 0.0103,
 4: 0.013300000000000001,
 5: 0.0126,
 6: 0.0172,
 7: 0.013300000000000001,
 8: 0.5214,
 9: 0.0068000000000000005,
 10: 0.17559999999999998,
 11: 0.0104,
 12: 0.12179999999999999,
 13: 0.0191,
 14: 0.013000000000000001}

In [None]:
SVC_score = []
kf = KFold(n = len(x_train), n_folds=5)
for c in Cs:
    svc_score = []
    svcmodel = SVC(kernel='rbf', C=c, class_weight = weight_dict)
    for train_index, test_index in kf:
        train_x, test_x = x_train[train_index], x_train[test_index]
        train_y, test_y = y_train[train_index], y_train[test_index]
        svcmodel.fit(train_x, train_y)
        svc_score.append(svcmodel.score(test_x, test_y))
    SVC_score.append(np.average(svc_score))

In [37]:
SVC_score.index(max(SVC_score))

8

In [38]:
Cs[8]

1000

In [40]:
svcmodel = SVC(kernel='rbf', C=1000, class_weight = weight_dict)
svcmodel.fit(x_train, y_train)
y_pred = svcmodel.predict(x_test)
model_accuracy(y_test, y_pred)

0.7980561555075594

## Random forest with class prior

In [85]:
RF_score = []
kf = KFold(n = x_train.shape[0], n_folds=5)

n_est_regularization = np.arange(5, 30, 5)
n_ft_regularization = np.arange(20, 100, 10)
for c in n_est_regularization:
    for c1 in n_ft_regularization:
        rf_score = []
        rf = RandomForestClassifier(n_estimators = c, max_features = c1, class_weight = weight_dict)
        for train_index, test_index in kf:
            train_x, test_x = x_train[train_index], x_train[test_index]
            train_y, test_y = y_train[train_index], y_train[test_index]
            rf.fit(train_x, train_y)
            rf_score.append(rf.score(test_x,test_y))
        RF_score.append((c,c1, np.average(rf_score)))


In [86]:
max(RF_score,key=itemgetter(2))

(15, 20, 0.87870370370370365)

In [87]:
rf = RandomForestClassifier( n_estimators = 15, max_features = 20, class_weight = weight_dict)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
model_accuracy(y_test, y_pred)

0.8855291576673866

In [88]:
#Use the parameter, train the whole train file
rf = RandomForestClassifier( n_estimators = 15, max_features = 20, class_weight = weight_dict)
rf.fit(feat_matrix, t_train)

RandomForestClassifier(bootstrap=True,
            class_weight={0: 0.0369, 1: 0.016200000000000003, 2: 0.012, 3: 0.0103, 4: 0.013300000000000001, 5: 0.0126, 6: 0.0172, 7: 0.013300000000000001, 8: 0.5214, 9: 0.0068000000000000005, 10: 0.17559999999999998, 11: 0.0104, 12: 0.12179999999999999, 13: 0.0191, 14: 0.013000000000000001},
            criterion='gini', max_depth=None, max_features=20,
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [89]:
y_pred_test = rf.predict(test_feat_matrix)
y_pred_test

array([ 8, 10,  4, ..., 12,  8,  0])

In [90]:
print "writing predictions..."
outputfile = "mypredictions_rf_prior.csv"
util.write_predictions(y_pred_test, test_ids, outputfile)
print "done!"

writing predictions...
done!
