In [22]:
import re

labeled_corpus = []
with open('mds/headlines_corpus.txt') as corpus:
    for line in corpus:
        if re.match(r'^\#+\s+\w+', line):
            labeled_corpus.append((line, True))
        elif line.strip():
            labeled_corpus.append((line, False))
    

In [8]:
list(filter(lambda e: e[1],labeled_corpus))[0:100]

[('# Awesome WAF [![Awesome](https://cdn.rawgit.com/sindresorhus/awesome/d7305f38d29fed78fa85652e3a63e154dd8e8829/media/badge.svg "Awesome")](https://github.com/0xinfection/awesome-waf)\n',
  True),
 ('### Contents:\n', True),
 ('## Introduction:\n', True),
 ('### How WAFs Work:\n', True),
 ('### Operation Modes:\n', True),
 ('## Testing Methodology:\n', True),
 ('### Where To Look:\n', True),
 ('### Detection Techniques:\n', True),
 ('## WAF Fingerprints\n', True),
 ('## Evasion Techniques\n', True),
 ('### Fuzzing/Bruteforcing:\n', True),
 ('#### Method:  \n', True),
 ('#### Technique:\n', True),
 ('#### Drawbacks:\n', True),
 ('### Regex-Reversing:\n', True),
 ('#### Method:\n', True),
 ('#### Techniques:\n', True),
 ('### Keyword Filter Detection/Bypass\n', True),
 ('### Obfuscation:\n', True),
 ('#### Method:\n', True),
 ('#### Techniques:\n', True),
 ('### Browser Bugs:\n', True),
 ('#### Charset Bugs:\n', True),
 ('#### Null Bytes:\n', True),
 ('#### Parsing Bugs:\n', True),
 ('

In [87]:
import random

random.shuffle(labeled_corpus)

train_idx = int(len(labeled_corpus) * 0.7)
train_data = labeled_corpus[:train_idx]
test_data = labeled_corpus[train_idx:]

headlines = list(filter(lambda h: h[1], train_data))
non_headlines = list(filter(lambda h: not h[1], train_data))

print(len(train_data))
print(len(test_data))

146202
62658


In [120]:
def split_by_words(entry):
    words = list(filter(lambda w: len(w.strip()) > 0, entry[0].split(' ')))
    if entry[1]:
        return words[1:]
    return words

def isCapitalized(words):
    return words[0].istitle()


def num_words_lt_15(words):
    return len(words) < 15

def all_alpha(words):
    for w in words:
        if not re.match('[a-zA-Z0-9\-\:\?\!\)\)]', w):
            return False
    return True

def num_of_capitalized(words):
    n = 0
    for w in words:

        if w.istitle():
            n +=1
    return n        


ws = split_by_words(non_headlines[1])

print(isCapitalized(ws))
print(num_words_lt_15(ws))
print(all_alpha(split_by_words("DFf adas df-sf 1!")))
num_of_capitalized(split_by_words(("Ddfa adas df-sf 1!", False)))

True
True
True


1

In [129]:
vdata = []
ldata = []

def feature_vector(e):
    ws = split_by_words(e)
    return [int(isCapitalized(ws)), len(ws), all_alpha(ws), num_of_capitalized(ws)]
    #return [int(isCapitalized(ws)), len(ws), all_alpha(ws)]

for e in train_data:
    vdata.append(feature_vector(e))
    ldata.append(e[1])

In [62]:
vdata[:10]
ldata[:10]

[True, False, False, False, False, False, False, True, False, False]

In [130]:
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(vdata, ldata)

In [131]:
vtdata = []
ltdata = []
pdata = []

for e in train_data:
    vtdata.append(feature_vector(e))
    ltdata.append(e[1])
    
pdata = clf.predict(vtdata)

In [81]:
print(vtdata[:10])
print(pdata[:10])
print(ltdata[:10])

[[1, 4, False], [0, 9, False], [0, 1, False], [0, 52, False], [1, 5, False], [1, 63, False], [1, 4, False], [1, 1, True], [0, 2, False], [0, 4, False]]
[False False False False False False False  True False False]
[True, False, False, False, False, False, False, True, False, False]


In [132]:
tp = 0
tn = 0
fp = 0
fn = 0

for i in range(len(vtdata)):
    if pdata[i] and ldata[i]:
        tp += 1
    elif pdata[i] and not ldata[i]:
        fp += 1
    elif not pdata[i] and ldata[i]:
        fn += 1
    else:
        tn += 1

print(tp, fp, fn, tn)        

pr = tp/(tp + fp)
rc = tp/(tp + fn)
f1 = 2 * (pr * rc)/(pr + rc)

print("Accuracy: {}".format((tp + tn)/len(ldata)))
print("Precicison: {}".format(pr))
print("Recall: {}".format(rc))
print("F1: {}".format(f1))

4803 1660 4465 135274
Accuracy: 0.9581059082639088
Precicison: 0.7431533343648461
Recall: 0.5182347863616745
F1: 0.6106414086834913
