In [2]:
import json
import random
import re
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model

In [3]:
with open("/scripts/table_flag.json") as f:
    table = json.load(f)
    
positive_examples = [i for i, e in enumerate(table) if e["flag-any"] == 1]
negative_examples = [i for i, e in enumerate(table) if e["flag-any"] == 0]
random.shuffle(positive_examples)
random.shuffle(negative_examples)
negative_examples = negative_examples[:len(positive_examples)]
print "%d positive and %d negative examples." % (len(positive_examples), len(negative_examples))

train_table = [table[i] for i in positive_examples[:200]] + [table[i] for i in negative_examples[:200]]
test_table = [table[i] for i in positive_examples[200:]] + [table[i] for i in negative_examples[200:]]

328 positive and 328 negative examples.


In [4]:
def tokenize_js(script):
    script = re.sub(r'(\/\*[^*]+\*\/)', "", script)
    script = re.sub(r'\/\/.+', "", script)
    return re.findall(r'(\"[^"]+\"|\'[^\']+\'|[\w\\\/\-_\"\']+|{|}|,|[\+\*]|\(|\)|\.|/\*.+\*\/)', script)

def parse_scripts(tbl):
    for item in tbl:
        #print "Reading %s.js" % item["sha"]
        with open("/scripts/%s.js" % item["sha"]) as f:
            yield f.read().decode(errors='replace')
            

In [5]:
vectorizer = TfidfVectorizer(tokenizer=tokenize_js)
X_train = vectorizer.fit_transform(parse_scripts(train_table))
Y_train = np.array([item["flag-any"] for item in train_table])
                    
X_test = vectorizer.transform(parse_scripts(test_table))
Y_test = np.array([item["flag-any"] for item in test_table])

In [6]:
print X_train
print Y_train
print "%d/%d positive examples" % (sum(Y_train), len(Y_train))

  (0, 10242)	0.000373772501762
  (0, 14566)	0.000373772501762
  (0, 10244)	0.000373772501762
  (0, 18244)	0.000373772501762
  (0, 48812)	0.000373772501762
  (0, 10241)	0.000373772501762
  (0, 16714)	0.000373772501762
  (0, 14132)	0.000373772501762
  (0, 10243)	0.000373772501762
  (0, 49086)	0.000349719783828
  (0, 16928)	0.000373772501762
  (0, 14494)	0.000373772501762
  (0, 13450)	0.000747545003525
  (0, 29924)	0.000973721120399
  (0, 17781)	0.000373772501762
  (0, 17838)	0.000349719783828
  (0, 17774)	0.000373772501762
  (0, 17234)	0.000373772501762
  (0, 28571)	0.000332654108866
  (0, 30174)	0.000373772501762
  (0, 30173)	0.000373772501762
  (0, 16509)	0.000349719783828
  (0, 28174)	0.000319416942976
  (0, 41758)	0.0002434302801
  (0, 37762)	0.0002434302801
  :	:
  (399, 37158)	0.0320761356944
  (399, 43053)	0.02759053557
  (399, 53825)	0.113768529932
  (399, 38149)	0.0665676610275
  (399, 35640)	0.0453144816464
  (399, 42744)	0.157270602207
  (399, 38964)	0.0929346427543
  (399, 49

In [7]:
model = BernoulliNB()
model.fit(X_train, Y_train)
test_pred = model.predict(X_test)

print "Baseline: %d/%d = %f%%" % (sum(Y_test == 0), len(Y_test), ((float(sum(Y_test == 0)) / len(Y_test)) * 100))
print "Accuracy: %d/%d = %f%%" % (sum(Y_test == test_pred), len(Y_test), ((float(sum(Y_test == test_pred)) / len(Y_test)) * 100))

Baseline: 128/256 = 50.000000%
Accuracy: 186/256 = 72.656250%


In [8]:
model = SVC()
model.fit(X_train, Y_train)
test_pred = model.predict(X_test)

print "Baseline: %d/%d = %f%%" % (sum(Y_test == 0), len(Y_test), ((float(sum(Y_test == 0)) / len(Y_test)) * 100))
print "Accuracy: %d/%d = %f%%" % (sum(Y_test == test_pred), len(Y_test), ((float(sum(Y_test == test_pred)) / len(Y_test)) * 100))

Baseline: 128/256 = 50.000000%
Accuracy: 163/256 = 63.671875%


In [9]:
model = linear_model.SGDClassifier(class_weight="auto")
model.fit(X_train, Y_train)
test_pred = model.predict(X_test)

print "Baseline: %d/%d = %f%%" % (sum(Y_test == 0), len(Y_test), ((float(sum(Y_test == 0)) / len(Y_test)) * 100))
print "Accuracy: %d/%d = %f%%" % (sum(Y_test == test_pred), len(Y_test), ((float(sum(Y_test == test_pred)) / len(Y_test)) * 100))

Baseline: 128/256 = 50.000000%
Accuracy: 190/256 = 74.218750%
