In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 75

## Campaign Finance

- Raw data split into election years, 1990 - 2016
- Text fields are surrounded by the pipe character (ascii 124). Date and numeric fields are not.  Commas separate all fields.
- Data dictionaries and user guide are in raw/campaign_finance

### PAC Contributions
Helpful notes from user guide:
- In all cases, be sure to set realcode to Not like z9* and Not like z4* to eliminate transfers and joint fund raising committees.
- To calculate direct contributions to candidates, you limit to DI = D.
- Note that the FEC every now and then puts PAC to candidate data in the PAC to PAC table –to be thorough, check there as well

In [8]:
def direct_contributions(year):
    """Read pac contributiontable and keep direct contributions, 
        join with candidate and pac info.
    Args:
        year: election year of data
    Returns:
        dataframe of direct PAC contributions for election year
    """
    
    # read PAC to candidate data
    year = str(int(year))
    file_loc = 'raw/campaign_finance/'
    file_loc += year + "/"
    file_name = 'pacs' + year[-2:] + '.txt'
    names_pacs = ['Cycle','FECRecNo','PACID','CID',
                  'Amount','Date','RealCode','Type',
                  'DI','FECCandID']
    df = pd.read_table(file_loc + file_name, 
                       delimiter=",", 
                       quotechar="|", 
                       names=names_pacs)
    
    # read candidate info
    names_cand = ['Cycle','FECCandID','CID','FirstLastP',
                  'Party','DistIDRunFor','DistIDCurr','CurrCand', 'CycleCand', 
                  'CRPICO', 'RecipCode', 'NoPacs']
    file_name = 'cands' + year[-2:] + '.txt'
    df_cand = pd.read_table(file_loc + file_name, 
                            delimiter=",", 
                            quotechar="|", 
                            names=names_cand)
    
    # filter our presidential candidates and candidates who dropped out
    df_cand = df_cand[df_cand.DistIDRunFor != 'PRES']
    df_cand = df_cand[df_cand.CurrCand == 'Y']
    df_cand.drop_duplicates(subset=['CID'], inplace=True)
    
    # read pac to pac data to be thorough
    # check if there are any direct candidate contributions
    # append to df
    names_ptop = ['Cycle','FECRecNo','Filerid','DonorCmte','ContribLendTrans',
                  'City','State','Zip','FECOccEmp','Primcode','Date','Amount',
                  'RecipID','Party','Otherid','RecipCode','RecipPrimcode','Amend', 
                  'Report', 'PG','Microfilm','Type','RealCode','Source']
    file_name = 'pac_other' + year[-2:] + '.txt'
    df_ptop = pd.read_table(file_loc + file_name, 
                            delimiter=",", 
                            quotechar="|", 
                            names=names_ptop)
    df_ptop = df_ptop[df_ptop.Type.str.startswith('2')]
    df_ptop.dropna(subset = ['RecipID'], inplace = True)
    df_ptop = df_ptop[df_ptop.RecipID.str.startswith('N')]
    df_ptop.rename(columns = {'Filerid':'PACID',
                              'RecipID':'CID',}, inplace = True)
    df_ptop['DI'] = np.nan
    df_ptop = pd.merge(df_ptop, df_cand[['CID','FECCandID']], how = 'left', on = 'CID')
    df_ptop = df_ptop[names_pacs]
    df = pd.concat([df,df_ptop])
    
    # filter data
    df = df[df.RealCode.str.lower().str.startswith('z9') == False]
    df = df[df.RealCode.str.lower().str.startswith('z4') == False]
    df = df[df.DI == 'D']
    
    # join on candidate info
    df.drop('FECCandID', axis = 1, inplace=True)
    df = pd.merge(df, df_cand, how = 'left', on = 'CID')
    df.dropna(subset=['Cycle_y'], inplace=True)
    df.drop('Cycle_y', axis = 1, inplace=True)
    df.rename(columns={'Cycle_x': 'Cycle'}, inplace = True)
    
    # read in PAC information
    names_cmtes = ['Cycle', 'CmteID', 'PACShort', 'Affiliate', 'Ultorg',
                   'RecipID', 'RecipCode','FECCandID', 'Party', 'PrimCode', 
                   'Source', 'Sensitive', 'Foreign', 'Active']
    file_name = 'cmtes' + year[-2:] + '.txt'
    df_cmtes = pd.read_table(file_loc + file_name, 
                             delimiter=",", 
                             quotechar="|", 
                             names=names_cmtes, 
                             encoding = 'latin1')
    df_cmtes['CandidateCommittee'] = df_cmtes.CmteID != df_cmtes.RecipID
    df_cmtes.rename(columns={'CmteID': 'PACID'}, inplace=True)
    keep = ['PACID', 'PACShort', 'Affiliate', 'Ultorg',
        'PrimCode', 'Source','Sensitive','Foreign',
        'Active','CandidateCommittee']
    df_cmtes = df_cmtes[keep]
    df_cmtes.drop_duplicates(subset=['PACID'], inplace=True)
    df = pd.merge(df, df_cmtes, how = 'left', on = 'PACID')
    return df

# loop through all years and process data
export_path = "preprocessed/campaign_finance/"
dat_years = np.linspace(1990,2016,14)
for i, year in enumerate(dat_years):
    file_name = "pac_direct_" + str(int(year)) + ".csv"
    df_sub = direct_contributions(year)
    df_sub.to_csv(export_path+file_name, index = False)
    if i == 0:
        df = df_sub
    else:
        df = pd.concat([df, df_sub])
file_name = "pac_direct_all.csv"
df.to_csv(export_path+ file_name, index = False)        


  if self.run_code(code, result):
  if self.run_code(code, result):
  if self.run_code(code, result):


https://www.opensecrets.org/resources/create/data_doc.php

In [14]:
test = pd.read_table('raw/lobbying/lob_lobbying.txt',delimiter=",",quotechar="|", encoding = 'latin1')
test.head()

Unnamed: 0,B035CC2E-6E80-4A58-8E46-A1FBF084C2E4,CARMEN GROUP INC,Carmen Group,y,"POLSON, ERIC","Polson, Eric","Polson, Eric.1",0.0,B4000,wda16,n,Unnamed: 12,n.1,.1,2005,mtn,MID-YEAR TERMINATION (NO ACTIVITY),Unnamed: 17
0,B035D063-95B9-40EF-8B64-5A9AEA1DD3EA,Alpha Strategies,Alpha Strategies,y,Flight Landata,Flight Landata,Flight Landata,0.0,M9200,webDM,n,,n,,2010.0,q2n,SECOND QUARTER (NO ACTIVITY),
1,B035D8AE-A329-4FF2-9C85-0F0853CDB648,"Lindsay, Hart, Neil & Weigler","Lindsay, Hart et al",y,Timberland,Timberland Co,Timberland Co,5000.0,M3200,Hvr07,n,,y,y,2008.0,q3,THIRD QUARTER REPORT,
2,B035EFE3-EC72-4D10-A52B-D4C610B86090,Jude Benedict & Associates,Jude Benedict & Assoc,y,OriginOil,OriginOil,OriginOil,10000.0,Y4000,,n,,y,y,2009.0,q4,FOURTH QUARTER REPORT,
3,B03629F3-38CE-42D9-B5C8-6369BDEF44E0,INVACARE CORP,Invacare Corp,,INVACARE CORP,Invacare Corp,Invacare Corp,266539.0,H4100,pac,p,,n,,2006.0,m,MID-YEAR REPORT,
4,B0365E79-0C2A-49A4-90F3-E8F00B000C11,BALL JANIK,Ball Janik LLP,y,OREGON MUSEUM OF SCIENCE & INDUSTRY,Oregon Museum of Science & Industry,Oregon Museum of Science & Industry,60000.0,X4200,name,n,,y,y,2003.0,er,YEAR-END REPORT,


In [119]:
df_cmtes.shape

(17804, 14)

In [113]:
df.head()

Unnamed: 0,Cycle_x,FECRecNo,PACID,CID,Amount,Date,RealCode,Type,DI,FECCandID,FirstLastP,Party,DistIDRunFor,DistIDCurr,CurrCand,CycleCand,CRPICO,RecipCode,NoPacs
0,2016,4071320161306572461,C00002469,N00033508,5000.0,06/21/2016,LM100,24K,D,H2CA15094,Eric Swalwell (D),D,CA15,CA15,Y,Y,I,DW,
1,2016,4052020161293006119,C00125641,N00030071,1500.0,04/06/2016,C5110,24K,D,H8OR05107,Kurt Schrader (D),D,OR05,OR05,Y,Y,I,DW,
2,2016,4101220161340839612,C00331173,N00003682,1000.0,09/15/2016,C5000,24K,D,S0OH00133,Rob Portman (R),R,OHS2,OHS2,Y,Y,I,RW,
3,2016,4073120151248144443,C00235572,N00002593,5000.0,03/30/2015,J2200,24K,D,S6GA00119,Johnny Isakson (R),R,GAS2,GAS2,Y,Y,I,RW,
4,2016,4101320161343026342,C00114025,N00035403,2500.0,09/29/2016,B5100,24K,D,H4FL26038,Carlos Curbelo (R),R,FL26,FL26,Y,Y,I,RW,


In [107]:
df.shape

(240242, 19)

In [114]:
240000 * 14

3360000

In [96]:
df[df.CID.duplicated()].shape

(0, 12)

In [92]:
df[df.CID.duplicated()].sort_values('CID')

Unnamed: 0,Cycle,FECCandID,CID,FirstLastP,Party,DistIDRunFor,DistIDCurr,CurrCand,CycleCand,CRPICO,RecipCode,NoPacs
7408,2016,H6MA07101,N00000270,Ed Markey (D),D,MAS2,,,,I,DI,
5226,2016,H8NJ03073,N00000781,Frank Pallone Jr (D),D,NJ06,NJ06,Y,Y,I,DW,
5975,2016,H0NJ01066,N00000826,Robert E Andrews (D),D,NJ01,,,,,DN,
5018,2016,H6NJ12144,N00000860,Rush Holt (D),D,NJ12,,,,,DN,
7697,2016,H6PA14184,N00001373,Mike Doyle (D),D,PA14,,,,I,DI,
7652,2016,H6PA14176,N00001373,Mike Doyle (D),D,PA14,,,,I,DI,
6841,2016,S4IL00404,N00001885,Alan Keyes (R),R,ILS2,,,Y,C,RC,
3961,2016,P80005580,N00002526,Bob Barr (R),R,GA11,,,,,RN,
3232,2016,H6ID02191,N00002776,Richard Grayson (D),D,ID02,,,Y,C,DN,
5079,2016,H8AZ06012,N00002776,Richard Grayson (D),D,ID02,,,,C,DC,


In [87]:
df.head()

Unnamed: 0,Cycle,FECCandID,CID,FirstLastP,Party,DistIDRunFor,DistIDCurr,CurrCand,CycleCand,CRPICO,RecipCode,NoPacs
0,2016,H4GA02060,N00035294,Greg Duke (R),R,GA02,,Y,Y,C,RC,
1,2016,H4GA02078,N00036257,Vivian Childs (R),R,GA02,,,,,RN,
2,2016,H4GA04116,N00035798,Thomas Brown (D),D,GA04,,,,,DN,
3,2016,H4GA04124,N00035862,Thomas Wight (D),D,GA07,,,,,DN,
4,2016,H4GA06087,N00026160,Tom Price (R),R,GA06,GA06,Y,Y,I,RW,


In [75]:
df.drop_duplicates(subset=['CID']).shape

(1373, 10)

In [77]:
df.dropna(subset=['FECCandID']).drop_duplicates(subset=['CID','FECCandID']).shape

(1498, 10)

In [42]:
df.shape

(373967, 10)

In [54]:
df.DI.value_counts()

D    267478
I    106489
Name: DI, dtype: int64

In [61]:
df.head()

Unnamed: 0,Cycle,FECRecNo,PACID,CID,Amount,Date,RealCode,Type,DI,FECCandID
0,2016,4071320161306572461,C00002469,N00033508,5000,06/21/2016,LM100,24K,D,H2CA15094
1,2016,4052020161293006119,C00125641,N00030071,1500,04/06/2016,C5110,24K,D,H8OR05107
2,2016,4101220161340839612,C00331173,N00003682,1000,09/15/2016,C5000,24K,D,S0OH00133
3,2016,4073120151248144443,C00235572,N00002593,5000,03/30/2015,J2200,24K,D,S6GA00119
4,2016,4101320161343026342,C00114025,N00035403,2500,09/29/2016,B5100,24K,D,H4FL26038


In [59]:
df.shape

(373967, 10)

In [60]:
df.CID.str.startswith('N').shape

(373967,)

### Individual Contributions
- You do not want to include the money listed in the Indivs table going to PACs because it will show up again as contributions from the PAC.  So, limit the money counted from Indivs:
    - Join Indivs to Cmtes on Indivs.CmteID = Cmtes.CmteID with Indivs.RecipCode not like P* 
    - Note that this will exclude contributions to leadership PACs. If you want to include individual contributions to leadership PACs, do NOT exclude based on Recipcode. Instead, limit to where Indivs.Party is not null and Indivs.Party<>”” (does not equal blank.)Additionally, restrictions are required for the PAC to PAC data because if those PACs getting money from other PACs then turn around and give that money to candidates, that will also result in double counting.