In [51]:
import pickle
import shelve
import string

import pandas as pd
import us
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer, SnowballStemmer

import C2V.data_cleaning as dc
import C2V.preprocessing as pc

In [41]:
# Populates the interactive namespace from numpy and matplotlib (see the cell output)
# and renders figures inline.
# NOTE(review): this is a star-import; later cells may rely on names it injects — confirm
# before replacing it with explicit imports.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## Cleaning

In [69]:
# Start from an empty frame that carries one column per field of a speech record;
# it is filled with the 112th Congress records in the next cell.
df = pd.DataFrame(
    columns=[
        'congress',
        'date',
        'gender',
        'party',
        'speaker',
        'state',
        'text',
    ]
)

In [71]:
with shelve.open("data/USGPO.shelf") as data:
    # an example of what our dictionary contains
    print(data['Mrs_DAHLKEMPER_20090401_1.txt'])
    # Collect the matching records first and build the frame once afterwards:
    # growing a DataFrame row by row copies it on every iteration, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    records = []
    for key in data.keys():
        dictionary = data[key]
        # we will only be looking at the 112th congress for the sake of computation
        if dictionary['congress'] == '112th':
            # clean up the text
            dictionary = dc.strip_html(dictionary)
            # keep the record only if it still contains text
            # (value comparison: `is not 0` tested object identity against an int literal)
            if len(dictionary['text']) != 0:
                records.append(dictionary)

# Build the frame in one go. Rows get a fresh 0..n-1 index, exactly as
# append(..., ignore_index=True) produced, and re-running the cell no longer
# duplicates rows.
df = pd.DataFrame(
    records,
    columns=['congress', 'date', 'gender', 'party', 'speaker', 'state', 'text'],
)

{'text': '<html>\n<head>\n<title>Congressional Record, Volume 155 Issue 55 (Wednesday, April 1, 2009)</title>\n</head>\n<body><pre>\n[Congressional Record Volume 155, Number 55 (Wednesday, April 1, 2009)]\n[House]\n[Page H4393]\nFrom the Congressional Record Online through the Government Printing Office [<a href="http://www.gpo.gov">www.gpo.gov</a>]\n\n\n\n\n                             GENERAL LEAVE\n\n  Mrs. DAHLKEMPER. Madam Speaker, I ask unanimous consent that all \nMembers may have 5 legislative days in which to revise and extend their \nremarks and include extraneous material on H.R. 1256.\n  The SPEAKER pro tempore. Is there objection to the request of the \ngentlewoman from Pennsylvania?\n  There was no objection.\n\n                          ____________________\n\n\n</pre></body>\n</html>', 'state': 'Pennsylvania', 'gender': 'F', 'congress': '111th', 'date': '20090401', 'party': '(D)', 'speaker': 'Mrs DAHLKEMPER'}


*The dataset looks like it contains scraped entries from the Congressional Record for speaker, state, party, Congress number, gender, and text of the given speech for that day. Data exists from 2002 to 2014. We'll focus on the 112th Congress (2011-2012)* 

In [73]:
# Show the last rows of the frame as a quick check of what was loaded.
df.tail()

Unnamed: 0,congress,date,gender,party,speaker,state,text
13554,112th,20111013,M,(D),Mr CICILLINE,Rhode Island,H.R. 3190. Congress has the power...
13555,112th,20110706,M,(D),Mr CONNOLLY of Virginia,Virginia,of Virginia: H.R. 2419. Congress h...
13556,112th,20120518,M,(R),Mr DUNCAN of Tennessee,Tennessee,of Tennessee: H.R. 5840. Congres...
13557,112th,20110301,M,(R),Mr DUNCAN of Tennessee,Tennessee,"of Tennessee. Mr. Speaker, last Tuesday the to..."
13558,112th,20120131,F,(R),Mrs BLACKBURN,Tennessee,". Madam Speaker, it is so true that ``to whom ..."


In [38]:
# Peek at the first 200 characters of one speech (row label 13546).
df.loc[13546, 'text'][:200]

' First, let me thank my colleagues Congressmen McGovern and Jones, Congresswomen Woolsey and Waters, and Congressman Honda for their efforts to bring the war in Afghanistan to a swift and safe end.  M'

*It looks like many of the documents are not actual speeches but procedural "protocol" language. Let's see what percentage of our documents are protocol:*

In [132]:
# Flag each document with the result of dc.detect_protocol on its text.
df['protocol'] = df['text'].apply(dc.detect_protocol)

In [150]:
# Fraction of documents flagged as protocol.
flagged_count = sum(df['protocol'])
flagged_count / len(df)

0.48462275979054503

 *48% of the data is composed of protocol format text. For now, we will choose to model with protocol speech included because it might still contain meaningful information; however, it might be worth throwing out down the road.*

In [193]:
# Run the text column through each cleaning helper in turn; the order matters
# because every step works on the output of the previous one.
for cleaning_step in (
    dc.strip_states,
    dc.replace_underscore,
    dc.clean_numbers,
    dc.clean_beginning,
):
    df['text'] = df['text'].apply(cleaning_step)

*In a few cases, speeches belonging to the same Congress person were recorded under inconsistent speaker names. I identified and corrected these cases by hand:*

In [123]:
# We want a single, consistent speaker name per congress person so that records
# can later be joined per person.
# Log of manual changes: row label -> corrected speaker name.
speaker_fixes = {
    148: 'Mr BISHOP of Utah',
    3085: 'Mr GRAVES of Missouri',
    618: 'Mr TURNER of Ohio',
    2995: 'Mr TURNER of Ohio',
    5309: 'Mr TURNER of Ohio',
    7435: 'Mr TURNER of Ohio',
    7479: 'Mr TURNER of Ohio',
    8168: 'Mr TURNER of Ohio',
    11630: 'Mr TURNER of Ohio',
    13312: 'Mr TURNER of Ohio',
    2758: 'Mr WALZ of Minnesota',
    6379: 'Mr WALZ of Minnesota',
    4562: 'Ms BASS of California',
}
# DataFrame.set_value was deprecated in pandas 0.21 and removed in 1.0;
# .at is the label-based scalar setter that replaces it.
# NOTE(review): the fixes are keyed on row labels, so they are only valid as long
# as the frame is loaded in the same order — confirm after any change to loading.
for row_label, speaker_name in speaker_fixes.items():
    df.at[row_label, 'speaker'] = speaker_name

*In addition, it's useful to capture word count for later use as a feature in our model:*

## Preprocessing

In [124]:
# Add a word count field to explore: strip punctuation, split on whitespace, count.
# Requires `import string` in the imports cell.
# The translation table is built once here instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)
df['length'] = df['text'].apply(
    lambda text: len(text.translate(punctuation_table).split())
)

*We can also pull in useful features like who were members of the Tea Party*

In [52]:
# Surnames taken from https://en.wikipedia.org/wiki/Tea_Party_Caucus, upper-cased
# so they can be compared against the upper-case speaker names in the dataset.
# NOTE(review): 'Carolina' and 'Georgia' look like state names rather than surnames,
# and 'Smith' / 'Sessions' appear twice — confirm against the source list.
t_party = [
    surname.upper()
    for surname in [
        'Franks', 'Schweikert', 'McClintock', 'Royce', 'Coffman', 'Lamborn', 'Bilirakis',
        'Crenshaw', 'Nugent', 'Ross', 'Price', 'Westmoreland', 'King', 'Stutzman',
        'Huelskamp', 'Jenkins', 'Fleming', 'Scalise', 'Walberg', 'Palazzo', 'Hartzler',
        'Luetkemeyer', 'Smith', 'Pearce', 'Carolina', 'Duncan', 'Mulvaney', 'Wilson',
        'Black', 'Fincher', 'Roe', 'Barton', 'Burgess', 'Carter', 'Culberson',
        'Farenthold', 'Gohmert', 'Marchant', 'Neugebauer', 'Poe', 'Sessions', 'Smith',
        'Bishop', 'McKinley', 'Lummis', 'Bachmann', 'Georgia', 'Cassidy', 'Coble',
        'Gingrey', 'Miller', 'Sessions', 'McCain', 'Rubio', 'Risch', 'Moran',
        'McConnell', 'Paul', 'Blunt', 'Toomey', 'Scott', 'Cornyn', 'Cruz', 'Lee',
        'Johnson', 'Enzi', 'DeMint',
    ]
]
# Append the Tea Party feature to the dataset; the helper receives the speaker name,
# the surname list and the speaker's party for every record.
df['t_party'] = df.apply(
    lambda record: pc.findTParty(record['speaker'], t_party=t_party, party=record['party']),
    axis=1,
)

## Normalizing

*Although we could simply pass these in as functions to CountVectorizer() in sklearn, we need this step for the word2vec and doc2vec formats, so it is worthwhile to process them manually:*

In [53]:
# Build the NLTK normalisers once and reuse them for every document.
stemmer = SnowballStemmer('english')
lemmer = WordNetLemmatizer()

speech_text = df['text']
df['text_tokenize'] = speech_text.apply(pc.tokenize)
df['text_stemmatize'] = speech_text.apply(pc.stemmatize, stemmer=stemmer)
# this one takes a very, very long time to run
df['text_lemmatize'] = speech_text.apply(pc.lemmatize, lemmer=lemmer)

In [55]:
# We choose to remove the tokens returned by pc.get_haps from only one of the
# normalizations (presumably hapax legomena, i.e. words seen once — TODO confirm).
stem_haps = pc.get_haps(df['text_stemmatize'])
# Build the lookup set once; the original rebuilt set(stem_haps) for every token tested.
stem_hap_set = set(stem_haps)
df['text_stemmatized_haplatized'] = df['text_stemmatize'].apply(
    lambda tokens: [w for w in tokens if w not in stem_hap_set]
)

In [56]:
# Persist the processed frame, with every column added in this notebook, as a
# pickle so it can be reloaded later without re-running the steps above.
df.to_pickle('data/gov_doc_df.pkl')