In [1]:
import re
import math
from collections import Counter
import pandas as pd
import numpy as np
import snowballstemmer

import fetch_csv

## Load the CSV

In [2]:
fetch_csv.fetch('data.csv')

data.csv already exists, skipping.


In [3]:
def xbool(val):
    if val in ['x', 'X']:
        return True
    elif val in ['', '0']:
        return False
    raise ValueError(val)

df = pd.read_csv('data.csv', converters={
    'Cloud': xbool,
    'Cybersecurity': xbool,
    'Governance - Implementation': xbool,
})
pd.set_option('display.max_columns', None)
df[:3]

Unnamed: 0,policyNumber,policyTitle,uriPolicyID,ombPolicyID,policyType,policyIssuanceYear,policySunset,policyStatus,reqStatus,precedent,reqID,relatedReqs,issuingBody,policySection,policySubSection,reqText,ombDataCollection,reqVerb,agenciesImpacted,reqDeadline,Citation,Acquisition/Contracts,Human Capital,Cloud,Data Centers,Cybersecurity,Privacy,Shared Services,IT Project Management,Software,Digital Services,Mobile,Hardware/Government Furnished Equipment (GFE),"IT Transparency (Open Data, FOIA, Public Records, etc.)",Agency Statistics,Customer Services,Governance,Financial Systems,Budget,Governance - Org Structure,Governance - Implementation,Data Management/Standards,Definitions,Reporting,Other
0,1,25 Point Implementation Plan To Reform Federal...,https://www.whitehouse.gov/sites/default/files...,,Strategy,12/9/2010,,Active,Active,TBA,1.01,,Office of the Federal Chief Information Office...,A. Apply “Light Technology” and Shared Solutions,,Agencies must focus on consolidating existing ...,TBA,Must,All CFO-Act Agencies,,,,,True,x,False,,x,,,,,,,,,,,,,False,,,,
1,1,25 Point Implementation Plan To Reform Federal...,https://www.whitehouse.gov/sites/default/files...,,Strategy,12/10/2010,,Active,Active,TBA,1.02,,Office of the Federal Chief Information Office...,A. Apply “Light Technology” and Shared Solutions,1.1 Identify agency data center program manage...,"Within the next six months, each agency will d...",TBA,Will; Must; Will be,All CFO-Act Agencies,6/8/2011,,,,False,x,False,,,,,,,,,,,,,,x,False,,,,
2,1,25 Point Implementation Plan To Reform Federal...,https://www.whitehouse.gov/sites/default/files...,,Strategy,12/10/2010,,Active,Active,TBA,1.03,,Office of the Federal Chief Information Office...,A. Apply “Light Technology” and Shared Solutions,1.2 Launch a Data Center Consolidation Task F...,"Within the next three months, the Federal CIO ...",TBA,Will; Will be,CIOC,3/10/2011,,,,False,x,False,,,,,,,,,,,,,,,True,,,,


## Tokenize the requirement texts

In [4]:
STOP_WORDS = "the and to of a for in or that is with as be an are by on this it its they your".split(" ")
non_alphabetic_re = re.compile('[\W0-9_\-]+')
stemmer = snowballstemmer.stemmer('english')

def tokenize(text):
    if not isinstance(text, str): return []
    words = []
    for word in text.lower().split():
        word = non_alphabetic_re.sub('', word)
        if not word: continue
        if word in STOP_WORDS: continue
        word = stemmer.stemWords([word])[0]
        words.append(word)
    return words

df['reqTextTokenized'] = df['reqText'].map(tokenize)

## Construct a vocabulary

In [5]:
VOCAB_SIZE = 1500

token_counts = Counter()
for tokens in df['reqTextTokenized']:
    for token in tokens:
        token_counts[token] += 1

num_examples = df.shape[0]

token_counts_df = pd.DataFrame({'token': list(token_counts.keys()), 'count': list(token_counts.values())})
token_counts_df.sort_values(by=['count'], ascending=False)

vocab = list(token_counts_df['token'][:VOCAB_SIZE])

## Vectorize all the things

In [6]:
examples = np.zeros((VOCAB_SIZE, num_examples))

for (i, tokens) in df['reqTextTokenized'].iteritems():
    for token in tokens:
        if token in vocab:
            examples[vocab.index(token)][i] += 1

def center_and_rescale(examples):
    """
    For each dimension in each example, subtract the mean and divide
    by the standard deviation. This is taught as a reasonable
    strategy to speed up gradient descent in Coursera's ML class.

    In practice it allows us to achieve in 300 iterations of
    gradient descent what once took 2000.
    """

    dims = examples.shape[0]
    m = examples.shape[1]
    means = (np.sum(examples, axis=1) / m).reshape(dims, 1)
    stddevs = np.std(examples, axis=1).reshape(dims, 1)
    return (examples - means) / stddevs

examples = center_and_rescale(examples)

## Decide on a label to train

In [7]:
LABEL_TO_TRAIN = 'Cybersecurity'

label_ground_truth = df[LABEL_TO_TRAIN].values.reshape(1, num_examples)

## Create train, dev, and test sets

In [8]:
example_ordering = list(range(df.shape[0]))
np.random.seed(1)
np.random.shuffle(example_ordering)

examples = examples[:, example_ordering]
label_ground_truth = label_ground_truth[:, example_ordering]
training_set_size = math.floor(num_examples * 0.6)
cross_validation_set_size = math.floor(num_examples * 0.2)
test_set_start_index = training_set_size + cross_validation_set_size

def create_examples_subset(start, end):
    return {
        'X': examples[:, start:end],
        'y': label_ground_truth[:, start:end],
    }

training_set = create_examples_subset(0, training_set_size)

cross_validation_set = create_examples_subset(training_set_size, test_set_start_index)

test_set = create_examples_subset(test_set_start_index, examples.shape[1])

## Define logistic regression primitives

In [9]:
PROBABILITY_THRESHOLD = 0.75

# Much of the math/theory behind this can be found at:
# https://www.coursera.org/learn/neural-networks-deep-learning/lecture/5sdh6/logistic-regression-gradient-descent

def sigmoid(x):
    return 1 / (1 + np.power(np.e, -x))

# Sanity checks...
assert sigmoid(0) == 0.5
np.testing.assert_almost_equal(sigmoid(-100), 0)
np.testing.assert_almost_equal(sigmoid(100), 1)

def compute_activations(X, W, b):
    return sigmoid(np.dot(W.T, X) + b)

def predict(a):
    return a >= PROBABILITY_THRESHOLD

def true_positives(predictions, y):
    return np.sum((predictions == True) & (y == True))

assert true_positives(np.array([[1, 1, 0]]), np.array([[1, 0, 1]])) == 1

def false_positives(predictions, y):
    return np.sum((predictions == True) & (y == False))

assert false_positives(np.array([[1, 1, 0]]), np.array([[1, 0, 1]])) == 1

def false_negatives(predictions, y):
    return np.sum((predictions == False) & (y == True))

assert false_negatives(np.array([[1, 1, 0]]), np.array([[1, 0, 1]])) == 1

def accuracy(predictions, y):
    correct_predictions = np.sum(predictions == y)
    return correct_predictions / y.shape[1]

assert accuracy(np.array([[1, 1, 0, 1]]), np.array([[1, 0, 1, 1]])) == 0.5

def cost(a, y):
    m = y.shape[1]

    # TODO: I'm not sure if this is the best solution, but sometimes the
    # result of the activation function is exactly 0 or 1, which makes us
    # return NaN, so we'll clip our values to be within the open interval
    # (0, 1).
    a = np.clip(a, 1e-7, 1 - 1e-7)

    return np.sum(-(y * np.log(a) + (1 - y) * np.log(1 - a))) / m

def compute_gradients(X, W, b, a, y):
    m = y.shape[1]
    dims = W.shape[0]
    dz = a - y
    db = np.sum(dz) / m
    dW = np.sum(np.repeat(dz, dims, axis=0) * X, axis=1).reshape(dims, 1) / m

    return {'db': db, 'dW': dW}

def descend_gradient(X, y, num_iterations, learning_rate):
    W = np.zeros((VOCAB_SIZE, 1))
    b = 0
    for i in range(num_iterations):
        a = compute_activations(X, W, b)
        grads = compute_gradients(X, W, b, a, y)
        W -= learning_rate * grads['dW']
        b -= learning_rate * grads['db']
        yield (i, W, b)

## Perform gradient descent

In [10]:
for (i, W, b) in descend_gradient(num_iterations=301, learning_rate=3.0, **training_set):
    if i % 100 == 0:
        a = compute_activations(training_set['X'], W, b)
        curr_cost = cost(a, training_set['y'])
        predictions = predict(a)
        acc = accuracy(predictions, training_set['y'])
        fp = false_positives(predictions, training_set['y'])
        fn = false_negatives(predictions, training_set['y'])
        tp = true_positives(predictions, training_set['y'])
        print(f"cost on iteration {i}: {curr_cost} w/ accuracy {acc}, {tp} tp, {fp} fp, and {fn} fn")


cost on iteration 0: 0.429086413055763 w/ accuracy 0.8404896055954926, 1106 tp, 253 fp, and 568 fn
cost on iteration 100: 0.1336331227141149 w/ accuracy 0.9220905381775791, 1305 tp, 32 fp, and 369 fn
cost on iteration 200: 0.11780483479177055 w/ accuracy 0.9314163590441034, 1351 tp, 30 fp, and 323 fn
cost on iteration 300: 0.10928065041714953 w/ accuracy 0.9364678453468039, 1376 tp, 29 fp, and 298 fn


In [11]:
def print_true_positives(maximum=10):
    found = 0
    for i in range(training_set_size):
        if predictions[0][i] and training_set['y'][0][i]:
            orig_index = example_ordering.index(i)
            df_row = df.loc[example_ordering[i]]
            print(f'-- example row {orig_index}')
            print(df_row['reqText'])
            print(df_row['reqTextTokenized'])
            assert df_row[LABEL_TO_TRAIN] == True
            found += 1
            if found == maximum:
                return

print_true_positives(1)

-- example row 6855
[The following is part of a Reporting Template for SAOPs for annual FISMA and privacy reporting, ref. Reqs. 1357.01-1357.06] 
5. PIA and Web Privacy Policies and Processes 
Section 208 of the E-Government Act requires that agencies (a) conduct PIAs under appropriate circumstances, (b) post web privacy policies on their web sites, and (c) ensure machine-readability of web privacy policies. 

Does the agency have a written policy or process for each of the following? Indicate Yes or No for each item in the table below. 

PIA Policies 
a. Determining whether a PIA is needed 
b. Conducting a PIA 
c. Evaluating changes in business process or technology that the PIA indicates may be required
d. Ensuring that systems owners and privacy and information technology experts participate in conducting the PIA 
e. Making PIAs available to the public in the required circumstances 
f. Making PIAs available in other than required circumstances 

Web Policies 
g. Determining continue

In [12]:
def run_cross_validation_set():
    a = compute_activations(cross_validation_set['X'], W, b)
    curr_cost = cost(a, cross_validation_set['y'])
    predictions = predict(a)
    acc = accuracy(predictions, cross_validation_set['y'])
    fp = false_positives(predictions, cross_validation_set['y'])
    fn = false_negatives(predictions, cross_validation_set['y'])
    tp = true_positives(predictions, cross_validation_set['y'])
    print(f"cost for cross validation set: {curr_cost} w/ accuracy {acc}, {tp} tp, {fp} fp, and {fn} fn")

run_cross_validation_set()

cost for cross validation set: 0.9511206265392399 w/ accuracy 0.8011661807580175, 342 tp, 135 fp, and 206 fn
