In [104]:
import ast
import os
import pickle

import numpy as np
import pandas as pd
from fuzzywuzzy import process
pd.options.display.max_columns = 100

# Construct Bill Dataset for Model
## Pull in Missing Bills
In the original API pull, a number of bills present in the model datasets were missing from the bill dataset. We made a separate call for just the missing bills and stored each one individually. Now we need to combine them with the rest of the bills.

In [2]:
# Load the full bill pull and keep a single row per bill_id
# (the first occurrence wins).
df_bills = (
    pd.read_csv('../data/propublica/billsfull.csv')
    .drop_duplicates(subset=['bill_id'])
)

  interactivity=interactivity, compiler=compiler, result=result)


In [51]:
# Flatten the individually pickled bill files into one dataframe that uses
# the same column names as df_bills.
path = "../data/propublica/missing_bills/"
bill_files = [x for x in os.listdir(path) if x.startswith('billinfo')]

# Fields removed from the flattened bill records.
col_drop = ['actions', 'bill', 'bill_type', 'cosponsors',
            'cosponsors_by_party', 'house_passage_vote',
            'senate_passage_vote', 'subcommittee_codes',
            'versions', 'vetoed', 'votes', 'withdrawn_cosponsors']
# Renames applied so the columns line up with df_bills.
col_names = {'congress': 'session', 'sponsor': 'sponsor_name'}

# bill details
bill_records = []
for file in bill_files:
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were written by our own API pull.
    with open(path + file, 'rb') as f:
        bill = pickle.load(f)

    # Per-party cosponsor counts. Fall back to 0 only when the breakdown is
    # absent, is not a mapping, or has no entry for the party; any other
    # error is allowed to surface instead of being silently swallowed.
    for party, column in (('D', 'dsponsor'), ('R', 'rsponsor')):
        try:
            bill[column] = bill['cosponsors_by_party'][party]
        except (KeyError, TypeError):
            bill[column] = 0

    # The first letter of bill_type selects the chamber ('h...' / 's...').
    bill['chamber'] = {'h': 'House', 's': 'Senate'}[bill['bill_type'][0]]
    bill_records.append(bill)

if not bill_records:
    # Fail loudly here rather than leaving df_newbills undefined (or stale
    # from an earlier run) for the cells below.
    raise FileNotFoundError("no 'billinfo' files found in " + path)

# Build the frame once instead of concatenating inside the loop.
df_newbills = (
    pd.DataFrame(bill_records)
    .drop(col_drop, axis=1)
    .rename(columns=col_names)
    .set_index('bill_id')
)

# cosponsor details
# Read every pickled 'cosponsors*' file and collect one record per bill.
bill_files = [x for x in os.listdir(path) if x.startswith('cosponsors')]
cosponsor_records = []
for file in bill_files:
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that were written by our own API pull.
    with open(path + file, 'rb') as f:
        bill = pickle.load(f)
    # The id list is wrapped in an outer list, so each cell holds [[id, ...]].
    cosponsor_records.append({
        'cosponsor': [[x['cosponsor_id'] for x in bill['cosponsors']]],
        'bill_id': bill['bill_id'],
    })

# Build the frame once (no concat inside the loop, no empty seed frame).
# Passing `columns` keeps both columns present even when no file was found.
df_cospon = (
    pd.DataFrame(cosponsor_records, columns=['cosponsor', 'bill_id'])
    .set_index('bill_id')
)

# Attach the cosponsor lists to the bill details by aligning on the bill_id
# index, then move bill_id back into a regular column. The rename covers the
# case where the combined index comes back unnamed (column called 'index').
df_newbills = (
    pd.concat([df_newbills, df_cospon], axis=1)
    .reset_index()
    .rename(columns={'index': 'bill_id'})
)

## Drop Bills Not in Model Sets

In [71]:
def get_full_set(path):
    """Read train.csv, dev.csv and test.csv under `path` and stack them row-wise.

    `path` is prepended to each file name as-is, so it must already end with
    a path separator. Files are read with latin1 encoding. The rows are
    returned in train, dev, test order and each file's own row index is kept.
    """
    frames = [
        pd.read_csv(path + split + '.csv', encoding='latin1')
        for split in ('train', 'dev', 'test')
    ]
    return pd.concat(frames)
# Total broke_from_party per bill across train/dev/test, largest first,
# kept as a one-column dataframe indexed by bill_id.
df_votes = get_full_set('../data/model/')
df_votes = df_votes.groupby('bill_id').broke_from_party.sum().sort_values(ascending = False)
df_votes = pd.DataFrame(df_votes)

# Index df_bills by bill_id so it aligns with df_votes. Guarded so that
# re-running this cell does not raise KeyError once bill_id is the index.
if df_bills.index.name != 'bill_id':
    df_bills = df_bills.set_index('bill_id')

# Align votes and bill details on bill_id, keep only bills that appear in the
# model sets (non-null broke_from_party), and order by that total.
df_mbills = (
    pd.concat([df_votes, df_bills], axis = 1)
    .dropna(subset = ['broke_from_party'])
    .sort_values('broke_from_party', ascending = False)
)
# A bill with votes but no row in df_bills has no chamber after the concat.
df_mbills['is_missing'] = df_mbills['chamber'].isnull()
# Text after the last '-' of the bill_id (e.g. 'hr6124-110' -> '110').
df_mbills['session2'] = df_mbills.index.str.split('-').str[-1]

In [76]:
# Only a handful of the bills in the model sets are missing from the bill
# data; count bills per session2 value, split by the is_missing flag.
df_mbills.groupby(['session2', 'is_missing']).size()

session2  is_missing
105       False         346
          True            1
106       False         267
          True            1
107       False         346
108       False         136
          True            1
109       False         307
          True            2
110       False         610
111       False         468
112       False         438
          True            2
113       False         447
114       False         463
115       False         225
          True            1
dtype: int64

In [78]:
# session2 was only needed for the per-session count above; remove it.
df_mbills = df_mbills.drop(columns = ['session2'])

## Get Committee Codes
Prior to session 113, ProPublica did not assign codes to each committee. We will attempt to fuzzy-match committee codes for these records.

In [100]:
# Committee reference data previously pulled from the propublica committees
# endpoint; surrounding whitespace is stripped from each subject.
df_committees = pd.read_csv('../data/propublica/committees.csv')
df_committees = df_committees.assign(
    subject = df_committees['subject'].str.strip()
)
df_committees.head()

Unnamed: 0,body,code,name,session,subject
0,house,HSAG,Committee on Agriculture,110,agriculture
1,house,HSAP,Committee on Appropriations,110,appropriations
2,house,HSAS,Committee on Armed Services,110,armed services
3,house,HSED,Committee on Education and the Workforce,110,education workforce
4,house,HSIF,Committee on Energy and Commerce,110,energy commerce


In [87]:
# Lookup from "<body> <subject>" to committee code. When the same key occurs
# on several rows, the last row's code is the one kept.
com_dict = {
    " ".join([body, subject]): code
    for body, subject, code in zip(
        df_committees['body'], df_committees['subject'], df_committees['code']
    )
}
# Candidate strings handed to the fuzzy matcher.
choices = com_dict.keys()

In [122]:
def format_names(subject):
    """Normalise a committee name so the fuzzy match has an easier job.

    The name is lower-cased and the word "committee" is removed. If it
    contains "house", "senate" or "joint", that word is removed and put back
    at the front followed by a space. A fixed sequence of substring removals
    is then applied in order (' on', 'joint ', ',', "'", 'the ', ' and'), and
    finally double spaces are collapsed in a single pass. Note that the
    'joint ' removal also drops a 'joint ' prefix added in the previous step.
    """
    cleaned = subject.lower().replace("committee", "")

    # Move the chamber word to the front; checked in this fixed order.
    for body in ('house', 'senate', 'joint'):
        if body in cleaned:
            cleaned = body + ' ' + cleaned.replace(body, "")

    # Order matters: each replacement sees the result of the previous one.
    replacements = (
        (' on', ""),
        ('joint ', ""),
        (",", ""),
        ("'", ""),
        ("the ", ""),
        (" and", ""),
        ("  ", " "),
    )
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)

    return cleaned

def fuzzy_match_codes(row):
    """Fuzzy-match each committee named in `row.committees` to a committee code.

    `row.committees` is split on ';'. Each non-blank name is normalised with
    `format_names` and matched against `choices` with `process.extractOne`;
    the best-matching key is then mapped to its code through `com_dict`.

    Returns a list with one code per non-blank name, or np.nan when there is
    nothing to match: the value is not a string (e.g. NaN or None), is blank,
    or contains only blank fragments.
    """
    committees = row.committees
    # Covers NaN floats from pandas as well as None and any other non-text
    # value, plus strings made up only of whitespace.
    if not isinstance(committees, str) or not committees.strip():
        return np.nan

    codes = []
    for name in committees.split(";"):
        if not name.strip():
            # e.g. a trailing ';' -- a blank query cannot be matched
            # meaningfully, so it contributes no code.
            continue
        best_key = process.extractOne(format_names(name), choices)[0]
        codes.append(com_dict[best_key])
    return codes if codes else np.nan

In [144]:
def fill_missing_codes(row):
    """Return the row's committee codes, falling back to the fuzzy-matched ones.

    `row.committee_codes` may be a list, a string holding a list literal
    (e.g. "['HSAG', 'HSFA']"), or a missing value. A string is parsed into a
    list first. `row.com_codes_fuzzy` is returned when the codes are missing:
    None, a float (NaN), a blank string, or an empty list/tuple. Otherwise the
    (parsed) original codes are returned.

    Note: for some rows the original committee codes list more committees
    than the committees name field does.

    Raises ValueError or SyntaxError if a non-blank string is not a valid
    Python literal.
    """
    codes = row.committee_codes

    if isinstance(codes, str):
        if not codes.strip():
            return row.com_codes_fuzzy
        # literal_eval only accepts Python literals, so unlike eval it cannot
        # run arbitrary code embedded in the data.
        codes = ast.literal_eval(codes)

    is_empty_sequence = isinstance(codes, (list, tuple)) and len(codes) == 0
    if codes is None or isinstance(codes, float) or is_empty_sequence:
        return row.com_codes_fuzzy

    return codes
    

In [123]:
# Fuzzy-match committee codes for every bill, then build the combined column
# that falls back to the fuzzy match only where the original codes are missing.
df_mbills = df_mbills.assign(
    com_codes_fuzzy = lambda frame: frame.apply(fuzzy_match_codes, axis = 1)
)
df_mbills = df_mbills.assign(
    com_codes_fuzzy2 = lambda frame: frame.apply(fill_missing_codes, axis = 1)
)
# Move the index back into a bill_id column and drop the leftover
# 'Unnamed: 0' column.
df_mbills = (
    df_mbills
    .reset_index()
    .rename(columns = {'index': 'bill_id'})
    .drop('Unnamed: 0', axis = 1)
)

In [154]:
# Preview the first rows of the model bill table, including the new
# com_codes_fuzzy and com_codes_fuzzy2 columns.
df_mbills.head()

Unnamed: 0,bill_id,broke_from_party,active,bill_slug,bill_uri,chamber,committee_codes,committees,congressdotgov_url,cosponsor,dsponsor,enacted,govtrack_url,gpo_pdf_uri,house_passage,introduced_date,last_vote,latest_major_action,latest_major_action_date,number,primary_subject,rsponsor,senate_passage,session,short_title,sponsor_id,sponsor_name,sponsor_party,sponsor_state,sponsor_title,sponsor_uri,summary,summary_short,title,is_missing,com_codes_fuzzy,com_codes_fuzzy2
0,hr6124-110,423.0,,hr6124,https://api.propublica.org/congress/v1/110/bil...,House,[],House Agriculture; House Foreign Affairs,https://www.congress.gov/bill/110th-congress/h...,[[]],0.0,,https://www.govtrack.us/congress/bills/110/hr6124,,,2008-05-22,2008-06-18,Became Public Law No: 110-246,2008-06-18,H.R.6124,,0.0,2008-06-05,110,,P000258,Collin C. Peterson,D,MN,,https://api.propublica.org/congress/v1/members...,,,To provide for the continuation of agricultura...,False,"[HSAG, HSFA]","[HSAG, HSFA]"
1,hjres87-107,322.0,,hjres87,https://api.propublica.org/congress/v1/107/bil...,House,[],House Energy and Commerce,https://www.congress.gov/bill/107th-congress/h...,"[[B000657, U000031, T000326, T000058, B001135,...",4.0,,https://www.govtrack.us/congress/bills/107/hjr...,,2002-05-08,2002-04-11,2002-05-08,Became Public Law No: 107-200,2002-07-23,H.J.RES.87,,6.0,,107,,B000213,Joe L. Barton,R,TX,,https://api.propublica.org/congress/v1/members...,,,"Approving the site at Yucca Mountain, Nevada, ...",False,[HSIF],[HSIF]
2,hr434-106,276.0,,hr434,https://api.propublica.org/congress/v1/106/bil...,House,[],House International Relations; House Ways and ...,https://www.congress.gov/bill/106th-congress/h...,"[[R000053, M000404, R000487, D000492, J000070,...",38.0,,https://www.govtrack.us/congress/bills/106/hr434,,,1999-02-02,2000-05-04,Became Public Law No: 106-200,2000-05-18,H.R.434,,39.0,,106,,C000873,Philip M. Crane,R,IL,,https://api.propublica.org/congress/v1/members...,,,An act to authorize a new trade and investment...,False,"[HSAG, HSWM, HSBA]","[HSAG, HSWM, HSBA]"
3,hr1432-105,276.0,,hr1432,https://api.propublica.org/congress/v1/105/bil...,House,[],House International Relations; House Ways and ...,https://www.congress.gov/bill/105th-congress/h...,"[[R000053, M000404, H000814, J000070, M001138,...",43.0,,https://www.govtrack.us/congress/bills/105/hr1432,,,1997-04-24,1998-03-11,Read the second time. Placed on Senate Legisla...,1998-07-21,H.R.1432,,9.0,,105,,C000873,Philip M. Crane,R,IL,,https://api.propublica.org/congress/v1/members...,,,To authorize a new trade and investment policy...,False,"[HSAG, HSWM, HSBA]","[HSAG, HSWM, HSBA]"
4,hr6406-109,275.0,,hr6406,https://api.propublica.org/congress/v1/109/bil...,House,[],House Ways and Means,https://www.congress.gov/bill/109th-congress/h...,[[]],0.0,,https://www.govtrack.us/congress/bills/109/hr6406,,2006-12-08,2006-12-07,2006-12-08,Pursuant to section 2 of House Resolution 1100...,2006-12-08,H.R.6406,,0.0,,109,,T000188,William M. Thomas,R,CA,,https://api.propublica.org/congress/v1/members...,,,To modify temporarily certain rates of duty an...,False,[HSWM],[HSWM]


In [155]:
# Write the model bill table to CSV without the row index.
df_mbills.to_csv('../data/propublica/billsfull_model.csv', index=False)

In [156]:
# (rows, columns) of the table that was just written.
df_mbills.shape

(4061, 37)