In [52]:
import xml.etree.ElementTree as ET
import json
import glob
import re
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Match Filing IDs with names

In [55]:
# Key table for the matching step; later cells read its 'client.name.check',
# 'company' and 'parent_company' columns.
keys = pd.read_csv(filepath_or_buffer='../data/matching_process/key-matches.csv')

In [56]:
# Load the client list and derive a normalised name ('client.name.check') that
# is used as the join key against keys['client.name.check']: commas and
# periods are removed and the result is upper-cased.
clients = pd.read_csv('../data/matching_process/clients.csv')
clients['client.name.check'] = (
    clients['client.name']
    # regex=False pins literal replacement. "." is a regex wildcard, and the
    # default value of `regex` differs between pandas versions, so relying on
    # the default is version-dependent (and warns on pandas 1.x).
    .str.replace(",", "", regex=False)
    .str.replace(".", "", regex=False)
    .str.upper()
)

  


In [57]:
# Keep only the client rows whose normalised name appears in the key table.
filings = clients.loc[
    clients['client.name.check'].isin(keys['client.name.check'].unique())
].copy()

In [58]:
# Attach 'company' and 'parent_company' from the key table to each matched
# filing, joining on the normalised client name.
filings = pd.merge(
    filings,
    keys[['client.name.check', 'company', 'parent_company']],
    on="client.name.check",
).copy()

In [59]:
# Narrow the frame to the columns used downstream, in a fixed order.
filings = filings.loc[:, [
    'filing.id',
    'parent_company',
    'company',
    'client.name',
    'client.name.check',
    'filing.period',
    'filing.year',
]]

### Pull filings and save as a JSON blob

In [60]:
# Filing IDs to look for in the XML exports.
ids = filings['filing.id'].tolist()

In [61]:
# Walk every XML file under ../data/lobbying/ and collect, as plain dicts, the
# <Filing> elements whose ID attribute is listed in `ids`.
#
# Each collected dict holds the filing's own attributes plus one entry per
# child element, keyed by the child's tag:
#   * a child with sub-elements -> list of the sub-elements' attribute dicts
#   * a child without sub-elements -> the child's own attribute dict
wanted_ids = set(ids)  # set membership instead of scanning the list per filing
blob = []
# sorted() makes the order of `blob` independent of filesystem listing order.
for xmlfile in sorted(Path('../data/lobbying/').glob('**/*.xml')):
    with open(xmlfile, "rb") as xml_handle:
        root = ET.parse(xml_handle).getroot()
    for filing in root.iter('Filing'):
        if filing.attrib['ID'] not in wanted_ids:
            continue
        # Copy the attributes so the parsed element is not mutated while the
        # child entries are added below.
        file = dict(filing.attrib)
        for child in filing:
            grandchildren = [dict(c.attrib) for c in child]
            if grandchildren:
                # A container element is expected to carry no attributes of
                # its own; they would be dropped by the assignment below.
                assert not child.attrib, (
                    f"<{child.tag}> has both attributes and sub-elements in {xmlfile}"
                )
                file[child.tag] = grandchildren
            else:
                file[child.tag] = dict(child.attrib)
        blob.append(file)

In [62]:
# Persist the collected filings under a single top-level 'Filings' key.
with open('../data/matching_process/filings.json', 'w') as outfile:
    outfile.write(json.dumps({'Filings': blob}, indent=2))

### Reopen the JSON

In [63]:
# Read the saved filings back; `data['Filings']` is the list written above.
with open('../data/matching_process/filings.json', 'r') as json_file:
    data = json.loads(json_file.read())

### Filter Out Duplicate/Registration Filings

In [64]:
# Flatten each filing dict into one record with the fields used downstream.
d = [
    {
        'filing.id': filing['ID'],
        'client.name': filing['Client']['ClientName'],
        'registrant.name': filing['Registrant']['RegistrantName'],
        'registrant.id': filing['Registrant']['RegistrantID'],
        'is_self_filer': filing['Client']['SelfFiler'],
        'date': filing['Received'],
        'period': filing['Period'],
        'type': filing['Type'],
        'spending': filing['Amount'],
    }
    for filing in data['Filings']
]
df = pd.DataFrame(d)
# 'Amount' is read from the JSON as-is; convert it so it can be compared numerically.
df['spending'] = pd.to_numeric(df['spending'])

In [65]:
# Attach company / parent_company to every filing record via the client name.
spending = df.merge(
    filings[['client.name', 'company', 'parent_company']].drop_duplicates(),
    on='client.name',
)

In [66]:
# Sort by 'date' descending, then keep the first row for each
# (client, registrant, period) combination.
no_dups = (
    spending
    .sort_values(by='date', ascending=False)
    .drop_duplicates(subset=['client.name', 'registrant.name', 'period'])
    .copy()
)

Remove all filings that don't report a positive spending amount (missing or zero).

In [67]:
# Keep only rows whose spending is strictly greater than zero.
no_dups = no_dups.loc[no_dups['spending'] > 0]

Create a key to match subsidiaries with their parent companies

In [68]:
# Unique (parent_company, company) pairs for rows that have a parent company.
subsidiaries_df = (
    no_dups
    .loc[no_dups['parent_company'].notna(), ['parent_company', 'company']]
    .drop_duplicates()
    .copy()
)
# Lookup dict: company -> parent_company.
subsidiaries = dict(zip(subsidiaries_df['company'], subsidiaries_df['parent_company'].values))

Add link to filing and export for easy reference

In [69]:
# Build the reference table: a copy of no_dups with underscore-named aliases
# of three columns plus a URL derived from the lower-cased filing ID.
reference = no_dups.assign(
    filing_id=no_dups['filing.id'],
    lobbying_firm=no_dups['registrant.name'],
    client_name=no_dups['client.name'],
)
reference['url'] = reference['filing_id'].map(
    lambda fid: 'https://lda.senate.gov/filings/public/filing/' + fid.lower() + '/print/'
)

In [70]:
# Columns written to the reference CSV, and the keys used to order its rows.
ref_cols = [
    'filing_id', 'url', 'parent_company', 'company', 'client_name',
    'lobbying_firm', 'is_self_filer', 'type', 'period', 'spending',
]
sort_cols = ['company', 'period', 'is_self_filer', 'spending']
(
    reference
    .loc[:, ref_cols]
    .sort_values(by=sort_cols)
    .to_csv('../data/findings/data-broker-filings.csv', index=False)
)