In [167]:
import re
import math
from collections import Counter
import pandas as pd
import numpy as np

import fetch_csv

## Load the CSV

In [168]:
# Make sure data.csv is available locally before it is read in the next cell.
# NOTE(review): fetch_csv is a project-local helper; judging by the recorded
# output it skips the download when the file already exists -- confirm.
fetch_csv.fetch('data.csv')

data.csv already exists, skipping.


In [169]:
def xbool(val):
    """Convert a check-mark style CSV cell into a boolean.

    Args:
        val: Raw cell value handed over by the CSV reader.

    Returns:
        True when ``val`` is 'x' or 'X'; False when it is '' or '0'.

    Raises:
        ValueError: For any other value; the offending value is the
            exception argument.
    """
    is_marked = val in ('x', 'X')
    # Anything that is neither a mark nor an explicit "empty" is rejected
    # rather than silently coerced.
    if not is_marked and val not in ('', '0'):
        raise ValueError(val)
    return is_marked

# Show every column when a frame is displayed; this CSV is very wide.
pd.set_option('display.max_columns', None)

# Only these columns are parsed into real booleans via xbool; every other
# column is left to pandas' default parsing.
df = pd.read_csv(
    'data.csv',
    converters={
        column: xbool
        for column in ('Cloud', 'Cybersecurity', 'Governance - Implementation')
    },
)

# Preview the first three rows.
df.head(3)

Unnamed: 0,policyNumber,policyTitle,uriPolicyID,ombPolicyID,policyType,policyIssuanceYear,policySunset,policyStatus,reqStatus,precedent,reqID,relatedReqs,issuingBody,policySection,policySubSection,reqText,ombDataCollection,reqVerb,agenciesImpacted,reqDeadline,Citation,Acquisition/Contracts,Human Capital,Cloud,Data Centers,Cybersecurity,Privacy,Shared Services,IT Project Management,Software,Digital Services,Mobile,Hardware/Government Furnished Equipment (GFE),"IT Transparency (Open Data, FOIA, Public Records, etc.)",Agency Statistics,Customer Services,Governance,Financial Systems,Budget,Governance - Org Structure,Governance - Implementation,Data Management/Standards,Definitions,Reporting,Other
0,1,25 Point Implementation Plan To Reform Federal...,https://www.whitehouse.gov/sites/default/files...,,Strategy,12/9/2010,,Active,Active,TBA,1.01,,Office of the Federal Chief Information Office...,A. Apply “Light Technology” and Shared Solutions,,Agencies must focus on consolidating existing ...,TBA,Must,All CFO-Act Agencies,,,,,True,x,False,,x,,,,,,,,,,,,,False,,,,
1,1,25 Point Implementation Plan To Reform Federal...,https://www.whitehouse.gov/sites/default/files...,,Strategy,12/10/2010,,Active,Active,TBA,1.02,,Office of the Federal Chief Information Office...,A. Apply “Light Technology” and Shared Solutions,1.1 Identify agency data center program manage...,"Within the next six months, each agency will d...",TBA,Will; Must; Will be,All CFO-Act Agencies,6/8/2011,,,,False,x,False,,,,,,,,,,,,,,x,False,,,,
2,1,25 Point Implementation Plan To Reform Federal...,https://www.whitehouse.gov/sites/default/files...,,Strategy,12/10/2010,,Active,Active,TBA,1.03,,Office of the Federal Chief Information Office...,A. Apply “Light Technology” and Shared Solutions,1.2 Launch a Data Center Consolidation Task F...,"Within the next three months, the Federal CIO ...",TBA,Will; Will be,CIOC,3/10/2011,,,,False,x,False,,,,,,,,,,,,,,,True,,,,


## Tokenize the requirement texts

Right now we are going to be as simple as possible and not even do any stemming.

In [170]:
# Very common words that are dropped from every token list.
STOP_WORDS = "the and to of a for in or that is with as be an are by on this it its they your".split(" ")

# Matches runs of non-word characters, digits, underscores and hyphens.
# Raw string: '\W' and '\-' are not valid Python string escapes, so the
# non-raw form triggers an invalid-escape warning on current Python versions.
non_alphabetic_re = re.compile(r'[\W0-9_\-]+')


def tokenize(text):
    """Split a text into lower-cased, letters-only tokens.

    The text is lower-cased and split on whitespace; each piece then has every
    run matched by ``non_alphabetic_re`` removed (so "data-center," becomes
    "datacenter"). Pieces that end up empty or that appear in ``STOP_WORDS``
    are discarded. No stemming is performed.

    Args:
        text: The text to tokenize. Any non-``str`` value (e.g. the NaN that
            pandas uses for a missing cell) yields an empty list.

    Returns:
        list[str]: The remaining tokens, in order of appearance.
    """
    if not isinstance(text, str):
        return []
    words = []
    for word in text.lower().split():
        word = non_alphabetic_re.sub('', word)
        # Skip pieces that were pure punctuation/digits, and stop words.
        if not word or word in STOP_WORDS:
            continue
        words.append(word)
    return words

# Add a column holding the token list for each requirement text. tokenize()
# returns [] for non-string cells, so every row ends up with a list.
df['reqTextTokenized'] = df['reqText'].map(tokenize)

## Construct a vocabulary

In [171]:
VOCAB_SIZE = 100
LABEL_TO_TRAIN = 'Cybersecurity'

# Count how often each token occurs across all requirement texts.
token_counts = Counter()
for tokens in df['reqTextTokenized']:
    token_counts.update(tokens)

num_examples = df.shape[0]

# Counter.most_common() yields (token, count) pairs ordered by descending
# count, so the frame is built already sorted. (DataFrame.sort_values()
# returns a new frame; calling it without keeping the result leaves the frame
# in first-seen order, which would make the vocabulary the first VOCAB_SIZE
# tokens encountered instead of the most frequent ones.)
token_counts_df = pd.DataFrame(token_counts.most_common(), columns=['token', 'count'])

# The VOCAB_SIZE most frequent tokens (fewer if there are not that many
# distinct tokens).
vocab = list(token_counts_df['token'].iloc[:VOCAB_SIZE])

## Vectorize all the things

In [172]:
# Presence matrix: one row per vocabulary slot, one column per example.
# A cell is 1 when that vocabulary token occurs in that example's text.
examples = np.zeros((VOCAB_SIZE, num_examples))

# Token -> row lookup, so each token costs one dict probe instead of two
# linear scans of the vocab list.
vocab_index = {token: row for row, token in enumerate(vocab)}

# enumerate() supplies the positional column, independent of df's index
# labels. It also replaces Series.iteritems(), which was removed in pandas 2.0.
for col, tokens in enumerate(df['reqTextTokenized']):
    for token in tokens:
        row = vocab_index.get(token)
        if row is not None:
            examples[row, col] = 1

# Labels as a (1, num_examples) row vector, in the same column order as
# `examples`.
label_ground_truth = df[LABEL_TO_TRAIN].values.reshape(1, num_examples)

## Create train, dev, and test sets

In [173]:
# Partition sizes: 60% training, 20% cross-validation, remainder is the test set.
training_set_size = math.floor(num_examples * 0.6)
cross_validation_set_size = math.floor(num_examples * 0.2)

# Seed the global NumPy RNG so the shuffled order is reproducible.
np.random.seed(1)
example_ordering = list(range(df.shape[0]))
np.random.shuffle(example_ordering)

# Reorder the example columns according to the shuffled ordering.
# NOTE(review): `examples` is overwritten here, so re-running this cell alone
# permutes already-permuted data. Also, only `examples` is reordered in this
# cell -- `label_ground_truth` must be indexed with the same `example_ordering`
# before it is paired with these splits; confirm that a later cell does so.
examples = examples[:, example_ordering]

# Consecutive column ranges of the shuffled matrix.
training_set = examples[:, :training_set_size]
cross_validation_set = examples[:, training_set_size:training_set_size + cross_validation_set_size]
test_set = examples[:, training_set_size + cross_validation_set_size:]