In [13]:
import ast
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd

# Create Committee CRP Industry Codes Map
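This notebook finds the CRP (Center for Responsive Politics) industry codes, which are used to categorize PACs and organizations, that are most commonly associated with each congressional committee. It does so by joining MapLight bill data with ProPublica bill/committee data.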

## Load MapLight Data

In [17]:
# Count, for every bill, how many of its listed organizations carry each
# industry code (`catcode`). One record per bill is appended to `rows`.
RAW_DIR = 'raw/'

rows = []
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# reproducible between runs and machines.
for filename in sorted(os.listdir(RAW_DIR)):
    path = os.path.join(RAW_DIR, filename)
    # Skip sub-directories and hidden files (e.g. .DS_Store,
    # .ipynb_checkpoints) that would otherwise crash open()/json.load().
    if filename.startswith('.') or not os.path.isfile(path):
        continue
    with open(path) as f:
        ml_bills = json.load(f)['bills']
    for bill in ml_bills:
        industry_count = defaultdict(int)
        for org in bill['organizations']:
            # organizations without a category code end up under the '' key
            industry_count[org['catcode']] += 1
        rows.append({
            'session': bill['session'],
            # second-to-last URL path segment, e.g. '109-hr-6111'
            'bill_id': bill['url'].split('/')[-2],
            'measure': bill['measure'],
            'industries': industry_count,
        })

In [19]:
# One row per bill; the `industries` column holds that bill's
# {catcode: organization count} mapping.
bill_industries = pd.DataFrame(rows)

In [24]:
# Inspect the result. Some organizations are missing category codes: they are
# counted under the empty-string key '' in `industries`.
bill_industries

Unnamed: 0,bill_id,industries,measure,session
0,109-hr-6111,{},H.R. 6111 (109<sup>th</sup>),109
1,109-hr-5682,{},H.R. 5682 (109<sup>th</sup>),109
2,109-hr-758,{'T1200': 1},H.R. 758 (109<sup>th</sup>),109
3,109-s-3880,{'A3000': 1},S. 3880 (109<sup>th</sup>),109
4,109-hr-5602,"{'M2300': 2, 'M3300': 2, 'H4300': 3, 'F3100': ...",H.R. 5602 (109<sup>th</sup>),109
5,109-hr-5825,"{'': 6, 'J3600': 1}",H.R. 5825 (109<sup>th</sup>),109
6,109-hr-5122,{'': 2},H.R. 5122 (109<sup>th</sup>),109
7,109-hr-4954,{'': 4},H.R. 4954 (109<sup>th</sup>),109
8,109-hr-5574,{'': 1},H.R. 5574 (109<sup>th</sup>),109
9,109-hr-683,{'G1100': 1},H.R. 683 (109<sup>th</sup>),109


In [30]:
# The bill-type codes (hr, s, hres, ...) are the same ones ProPublica uses;
# only the position of the parts inside the id needs to be adjusted.
# Tally the bill types, i.e. the second '-'-separated field of bill_id.
bill_industries.bill_id.str.split('-').str[1].value_counts()

hr         6143
s          3525
hres         86
hjres        62
sjres        42
sres         33
hconres      26
sconres      18
Name: bill_id, dtype: int64

In [37]:
def reformat_bill_id(row):
    """Convert a MapLight bill id into the ProPublica layout.

    MapLight ids look like ``<session>-<type>-<number>`` (e.g. ``109-hr-6111``);
    the ProPublica data uses ``<type><number>-<session>`` (e.g. ``hr6111-109``).

    Parameters
    ----------
    row : object with a ``bill_id`` string attribute
        Typically a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        The id as ``<type><number>-<session>``. An id with exactly two
        ``-``-separated parts is treated as already converted and returned
        unchanged, so applying the function twice is harmless.
    """
    bill_id = row.bill_id
    parts = bill_id.split("-")
    # Already in <type><number>-<session> form (e.g. the cell was re-run on
    # a column that had been rewritten in place): nothing to do.
    if len(parts) == 2:
        return bill_id
    session, bill_type, number = parts[0], parts[1], parts[2]
    return bill_type + number + "-" + session

# Rewrite bill_id so it matches the ProPublica ids used for the join below.
# NOTE: this overwrites the column, so the cell's input is different after
# the first run.
bill_industries['bill_id'] = bill_industries.apply(reformat_bill_id, axis = 1)

## Join with ProPublica bill data

In [66]:
# Get the committees for each bill in long format (one row per
# bill/committee pair); a bill can be in more than one committee.
cols =['body','session','bill_id','date',
       'sponsor_party','cs_count_r','cs_count_d', 'cosponsors',
       'topic','sponsor','bill_slug_official','bill_title','bill_slug','committee','committee_code']
bills = pd.read_csv('../propublica/bills2.csv', names=cols)
# Chain .dropna() rather than calling dropna(inplace=True) on a column
# slice, which mutates a possible copy and triggers SettingWithCopyWarning.
bills = bills[['bill_id', 'committee_code']].dropna()
# committee_code is presumably the string form of a Python list of codes
# (TODO confirm against the CSV). ast.literal_eval parses literals only,
# whereas eval() would execute any expression found in the file.
committee_rows = [
    {'bill_id': bill_id, 'committee': committee_code}
    for bill_id, codes in zip(bills['bill_id'], bills['committee_code'])
    for committee_code in ast.literal_eval(codes)
]
# A separate list name keeps `rows` (the MapLight records) intact, so the
# cell that builds bill_industries can still be re-run afterwards.
bills = pd.DataFrame(committee_rows)

  interactivity=interactivity, compiler=compiler, result=result)


In [67]:
# Preview the long-format bill -> committee table.
bills.head()

Unnamed: 0,bill_id,committee
0,hr4452-115,HSBA
1,hr4453-115,HSAS
2,hconres94-115,HSII
3,hr4437-115,HSAS
4,hjres121-115,HSJU


In [113]:
# Attach each bill's industry counts to its committee rows, discard rows
# with any missing value (e.g. bills with no match in bill_industries), then
# gather the per-bill industry dicts into one list per committee.
committee = (
    bills
    .merge(bill_industries, how='left', on='bill_id')
    .dropna()
    .groupby('committee')
    .industries
    .agg(lambda per_bill: list(per_bill))
)
committee

committee
HLIG    [{'J3000': 1}, {'J7000': 1}, {'J1100': 1}, {'J...
HSAG    [{'J1100': 6, 'J4000': 3, 'G1300': 1}, {'J7600...
HSAP    [{'J7000': 1}, {'H0000': 1}, {'L1200': 1, 'JE3...
HSAS    [{'J7000': 1}, {'H5000': 16, 'H5100': 8, 'M100...
HSBA    [{'F1300': 1}, {'J3000': 1}, {'F1100': 1, 'J30...
HSBU    [{'J1100': 1}, {'J1100': 16, 'L1100': 1, 'G100...
HSED    [{'J1100': 1}, {'LT000': 1}, {'L1300': 1}, {'A...
HSFA    [{'JD200': 1}, {'X7000': 12, 'J7000': 5, 'J750...
HSGO    [{'J3000': 1}, {'J7300': 1}, {'J1100': 1}, {},...
HSHA    [{'J4000': 1}, {'J4000': 1}, {'H0000': 1}, {'J...
HSHM    [{'X7000': 1, 'J3000': 1, 'JE300': 2, 'G5290':...
HSIF    [{'H0000': 3, 'JH100': 1, 'H1130': 1}, {'H3200...
HSII    [{'M1000': 1}, {'X7000': 1}, {'JE300': 4, 'A30...
HSJU    [{'J1100': 1}, {'LA100': 1, 'J7000': 1, 'L0000...
HSPW    [{'X7000': 1}, {'LT300': 1}, {'LT300': 1}, {'E...
HSRU    [{'F1100': 1}, {'J1100': 1}, {'LT300': 1, 'LM1...
HSSM    [{'G1200': 1}, {}, {'H4500': 13, 'C5000': 12, ...
HSSO

In [114]:
def sum_industry_codes(row, percentile=90):
    """Total the industry counts for one committee and keep the top codes.

    Parameters
    ----------
    row : iterable of dict
        One ``{industry_code: count}`` mapping per bill.
    percentile : float, optional
        Percentile (0-100) of the summed counts used as the cut-off.
        Defaults to 90, i.e. roughly the top 10% of codes are kept.

    Returns
    -------
    dict
        ``{industry_code: total_count}`` for every code whose total is at or
        above the cut-off. Empty when no coded organization is present.
    """
    totals = defaultdict(int)
    for industry_counts in row:
        for industry, count in industry_counts.items():
            # '' marks organizations without a category code; leave them out
            if industry == "":
                continue
            totals[industry] += count

    # Nothing to rank: a percentile of an empty sequence is not meaningful
    # (it errors or yields NaN depending on the NumPy version).
    if not totals:
        return {}

    threshold = np.percentile(list(totals.values()), percentile)
    return {
        industry: count
        for industry, count in totals.items()
        if count >= threshold
    }

# Collapse each committee's list of per-bill dicts into a single dict holding
# only its most common industry codes.
# NOTE: this rebinds `committee`, so the cell cannot be re-run on its own:
# re-run the merge cell above first, otherwise the function receives dicts
# instead of lists of dicts.
committee = committee.apply(sum_industry_codes)

In [116]:
# Number of entries kept for each committee.
committee.apply(len)

committee
HLIG     8
HSAG    26
HSAP    15
HSAS    15
HSBA    30
HSBU    29
HSED    33
HSFA    16
HSGO    28
HSHA    16
HSHM    14
HSIF    35
HSII    28
HSJU    38
HSPW    26
HSRU    26
HSSM    22
HSSO     1
HSSY    24
HSVR    11
HSWM    37
SCNC     1
SLIA     4
SLIN    10
SPAG     4
SSAF    17
SSAP    13
SSAS     6
SSBK    21
SSBU    10
SSCM    29
SSEG    18
SSEV    24
SSFI    37
SSFR     7
SSGA    31
SSHR    32
SSJU    31
SSRA     8
SSSB    27
SSVA     5
dtype: int64

In [120]:
# Write one row per committee. to_frame(name=...) sets the column header
# explicitly, so the CSV column is 'industry_codes' whether or not the Series
# carries a name (renaming column 0 only has an effect on an unnamed Series).
committee.to_frame(name='industry_codes').to_csv('committee_industry_codes.csv')