In [23]:
import re
import math
from collections import Counter
import pandas as pd
import numpy as np
import snowballstemmer

import fetch_csv

## Load the CSV

In [2]:
fetch_csv.fetch('data.csv')

data.csv already exists, skipping.


In [3]:
def xbool(val):
    if val in ['x', 'X']:
        return True
    elif val in ['', '0']:
        return False
    raise ValueError(val)

df = pd.read_csv('data.csv', converters={
    'Cloud': xbool,
    'Cybersecurity': xbool,
    'Governance - Implementation': xbool,
})
pd.set_option('display.max_columns', None)
df[:3]

Unnamed: 0,policyNumber,policyTitle,uriPolicyID,ombPolicyID,policyType,policyIssuanceYear,policySunset,policyStatus,reqStatus,precedent,reqID,relatedReqs,issuingBody,policySection,policySubSection,reqText,ombDataCollection,reqVerb,agenciesImpacted,reqDeadline,Citation,Acquisition/Contracts,Human Capital,Cloud,Data Centers,Cybersecurity,Privacy,Shared Services,IT Project Management,Software,Digital Services,Mobile,Hardware/Government Furnished Equipment (GFE),"IT Transparency (Open Data, FOIA, Public Records, etc.)",Agency Statistics,Customer Services,Governance,Financial Systems,Budget,Governance - Org Structure,Governance - Implementation,Data Management/Standards,Definitions,Reporting,Other
0,1,25 Point Implementation Plan To Reform Federal...,https://www.whitehouse.gov/sites/default/files...,,Strategy,12/9/2010,,Active,Active,TBA,1.01,,Office of the Federal Chief Information Office...,A. Apply “Light Technology” and Shared Solutions,,Agencies must focus on consolidating existing ...,TBA,Must,All CFO-Act Agencies,,,,,True,x,False,,x,,,,,,,,,,,,,False,,,,
1,1,25 Point Implementation Plan To Reform Federal...,https://www.whitehouse.gov/sites/default/files...,,Strategy,12/10/2010,,Active,Active,TBA,1.02,,Office of the Federal Chief Information Office...,A. Apply “Light Technology” and Shared Solutions,1.1 Identify agency data center program manage...,"Within the next six months, each agency will d...",TBA,Will; Must; Will be,All CFO-Act Agencies,6/8/2011,,,,False,x,False,,,,,,,,,,,,,,x,False,,,,
2,1,25 Point Implementation Plan To Reform Federal...,https://www.whitehouse.gov/sites/default/files...,,Strategy,12/10/2010,,Active,Active,TBA,1.03,,Office of the Federal Chief Information Office...,A. Apply “Light Technology” and Shared Solutions,1.2 Launch a Data Center Consolidation Task F...,"Within the next three months, the Federal CIO ...",TBA,Will; Will be,CIOC,3/10/2011,,,,False,x,False,,,,,,,,,,,,,,,True,,,,


## Tokenize the requirement texts

Right now we are going to be as simple as possible and not even do any stemming.

In [30]:
STOP_WORDS = "the and to of a for in or that is with as be an are by on this it its they your".split(" ")
non_alphabetic_re = re.compile('[\W0-9_\-]+')
stemmer = snowballstemmer.stemmer('english')

def tokenize(text):
    if not isinstance(text, str): return []
    words = []
    for word in text.lower().split():
        word = non_alphabetic_re.sub('', word)
        if not word: continue
        if word in STOP_WORDS: continue
        word = stemmer.stemWords([word])[0]
        words.append(word)
    return words

df['reqTextTokenized'] = df['reqText'].map(tokenize)

## Construct a vocabulary

In [49]:
VOCAB_SIZE = 1000
LABEL_TO_TRAIN = 'Cybersecurity'

token_counts = Counter()
for tokens in df['reqTextTokenized']:
    for token in tokens:
        token_counts[token] += 1

num_examples = df.shape[0]

token_counts_df = pd.DataFrame({'token': list(token_counts.keys()), 'count': list(token_counts.values())})
token_counts_df.sort_values(by=['count'], ascending=False)

vocab = list(token_counts_df['token'][:VOCAB_SIZE])

## Vectorize all the things

In [50]:
examples = np.zeros((VOCAB_SIZE, num_examples))

for (i, tokens) in df['reqTextTokenized'].iteritems():
    for token in tokens:
        if token in vocab:
            examples[vocab.index(token)][i] = 1

label_ground_truth = df[LABEL_TO_TRAIN].values.reshape(1, num_examples)

## Create train, dev, and test sets

In [51]:
example_ordering = list(range(df.shape[0]))
np.random.seed(1)
np.random.shuffle(example_ordering)

examples = examples[:, example_ordering]
label_ground_truth = label_ground_truth[:, example_ordering]
training_set_size = math.floor(num_examples * 0.6)
cross_validation_set_size = math.floor(num_examples * 0.2)
test_set_start_index = training_set_size + cross_validation_set_size

def create_examples_subset(start, end):
    return {
        'X': examples[:, start:end],
        'y': label_ground_truth[:, start:end],
    }

training_set = create_examples_subset(0, training_set_size)

cross_validation_set = create_examples_subset(training_set_size, test_set_start_index)

test_set = create_examples_subset(test_set_start_index, examples.shape[1])

## Define logistic regression primitives

In [52]:
PROBABILITY_THRESHOLD = 0.75

# Much of the math/theory behind this can be found at:
# https://www.coursera.org/learn/neural-networks-deep-learning/lecture/5sdh6/logistic-regression-gradient-descent

def sigmoid(x):
    return 1 / (1 + np.power(np.e, -x))

# Sanity checks...
assert sigmoid(0) == 0.5
np.testing.assert_almost_equal(sigmoid(-100), 0)
np.testing.assert_almost_equal(sigmoid(100), 1)

def compute_activations(X, W, b):
    return sigmoid(np.dot(W.T, X) + b)

def predict(a):
    return a >= PROBABILITY_THRESHOLD

def true_positives(predictions, y):
    return np.sum((predictions == True) & (y == True))

assert true_positives(np.array([[1, 1, 0]]), np.array([[1, 0, 1]])) == 1

def false_positives(predictions, y):
    return np.sum((predictions == True) & (y == False))

assert false_positives(np.array([[1, 1, 0]]), np.array([[1, 0, 1]])) == 1

def false_negatives(predictions, y):
    return np.sum((predictions == False) & (y == True))

assert false_negatives(np.array([[1, 1, 0]]), np.array([[1, 0, 1]])) == 1

def accuracy(predictions, y):
    correct_predictions = np.sum(predictions == y)
    return correct_predictions / y.shape[1]

assert accuracy(np.array([[1, 1, 0, 1]]), np.array([[1, 0, 1, 1]])) == 0.5

def cost(a, y):
    m = y.shape[1]
    return np.sum(-(y * np.log(a) + (1 - y) * np.log(1 - a))) / m

def compute_gradients(X, W, b, a, y):
    m = y.shape[1]
    dims = W.shape[0]
    dz = a - y
    db = np.sum(dz) / m
    dW = np.sum(np.repeat(dz, dims, axis=0) * X, axis=1).reshape(dims, 1) / m

    return {'db': db, 'dW': dW}

def descend_gradient(X, y, num_iterations, learning_rate):
    W = np.zeros((VOCAB_SIZE, 1))
    b = 0
    for i in range(num_iterations):
        a = compute_activations(X, W, b)
        grads = compute_gradients(X, W, b, a, y)
        W -= learning_rate * grads['dW']
        b -= learning_rate * grads['db']
        yield (i, W, b)

## Perform gradient descent

In [55]:
for (i, W, b) in descend_gradient(num_iterations=2001, learning_rate=3.0, **training_set):
    if i % 100 == 0:
        a = compute_activations(training_set['X'], W, b)
        curr_cost = cost(a, training_set['y'])
        predictions = predict(a)
        acc = accuracy(predictions, training_set['y'])
        fp = false_positives(predictions, training_set['y'])
        fn = false_negatives(predictions, training_set['y'])
        tp = true_positives(predictions, training_set['y'])
        print(f"cost on iteration {i}: {curr_cost} w/ accuracy {acc}, {tp} tp, {fp} fp, and {fn} fn")


cost on iteration 0: 0.7179801508937516 w/ accuracy 0.6747619972799689, 0 tp, 0 fp, and 1674 fn
cost on iteration 100: 0.34460801224355553 w/ accuracy 0.7998834272391685, 716 tp, 72 fp, and 958 fn
cost on iteration 200: 0.32338477809527094 w/ accuracy 0.8144550223431125, 797 tp, 78 fp, and 877 fn
cost on iteration 300: 0.3126204138986876 w/ accuracy 0.8200893724499708, 824 tp, 76 fp, and 850 fn
cost on iteration 400: 0.3057027049214824 w/ accuracy 0.8251408587526715, 852 tp, 78 fp, and 822 fn
cost on iteration 500: 0.3007351459589023 w/ accuracy 0.827083738099864, 861 tp, 77 fp, and 813 fn
cost on iteration 600: 0.29692758359127086 w/ accuracy 0.829415193316495, 875 tp, 79 fp, and 799 fn
cost on iteration 700: 0.2938803725336101 w/ accuracy 0.8303866329900913, 881 tp, 80 fp, and 793 fn
cost on iteration 800: 0.29136537639921595 w/ accuracy 0.8323295123372838, 893 tp, 82 fp, and 781 fn
cost on iteration 900: 0.28924106257067694 w/ accuracy 0.8333009520108802, 897 tp, 81 fp, and 777 fn
c

In [54]:
def print_true_positives(maximum=10):
    found = 0
    for i in range(training_set_size):
        if predictions[0][i] and training_set['y'][0][i]:
            orig_index = example_ordering.index(i)
            df_row = df.loc[example_ordering[i]]
            print(f'-- example row {orig_index}')
            print(df_row['reqText'])
            print(df_row['reqTextTokenized'])
            assert df_row[LABEL_TO_TRAIN] == True
            found += 1
            if found == maximum:
                return

print_true_positives(1)

-- example row 2535
Under the authority of the Attorney General, the Director of the Federal Bureau of Investigation (FBI) shall be responsible for the operation of the NCIJTF. This authority does not allow the Director of the FBI to direct the operations of other agencies. The Director of the FBI shall ensure that participants share the methodology and, to the extent appropriate, case information related to criminal cyber intrusion investigations among law enforcement organizations represented in the NCIJTF in accordance with paragraphs 32 - 33. [Ref. reqs. 1143.52 and 1143.53]
['under', 'author', 'attorney', 'general', 'director', 'feder', 'bureau', 'investig', 'fbi', 'shall', 'respons', 'oper', 'ncijtf', 'author', 'doe', 'not', 'allow', 'director', 'fbi', 'direct', 'oper', 'other', 'agenc', 'director', 'fbi', 'shall', 'ensur', 'particip', 'share', 'methodolog', 'extent', 'appropri', 'case', 'inform', 'relat', 'crimin', 'cyber', 'intrus', 'investig', 'among', 'law', 'enforc', 'organ'