In [1]:
import pandas as pd
import numpy as np
import os
import os.path
import datetime

# from functools import reduce
from download_data import get_df 
# hooray nice sublime keymaps!
import my_keymap

# Display settings for DataFrame output in this notebook.
# Fully qualified option names are required: the short forms 'max_columns' and
# 'max_rows' also match the styler.render.* options on newer pandas and raise
# an OptionError there.
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 10)
# thousands separator and two decimals for every float, e.g. 1,234.50
pd.set_option('display.float_format', '{:,.2f}'.format)

<IPython.core.display.Javascript object>

In [2]:
# only include net connections greater than this number of dollars (not including official, which have no value)
# read by process() as a strict lower bound (sum > cutoff_val) on each link's summed amount;
# the official links are concatenated after that filter, so they are never dropped by it
cutoff_val = 1

In [3]:

# our big function that runs for a given set of years and outputs the links and nodes .csv files for our visualization
# applying amendments is slow because it requires parsing the transaction dates and then grouping them
def process(desired_years, apply_amendments, force_data_download=False):
    """Build the link and node tables for ``desired_years`` and write them to CSV.

    Links are the summed positive transaction amounts from a committee to
    another committee or to a candidate, kept only when the sum is greater
    than the module-level ``cutoff_val``, plus the official committee/candidate
    linkages (``sum`` of 0, ``official`` set to True). Nodes are every id that
    appears as the source or target of a link, annotated with name, party,
    bioguide id, the committee's candidate id, total disbursements and total
    receipts.

    Parameters
    ----------
    desired_years : sequence
        Years handed to ``get_df``. The first and last entries are also used
        to build the suffix of the output file names.
    apply_amendments : bool
        Handed to ``get_df`` for the 'cm', 'oth', 'pas2' and 'cn' tables.
    force_data_download : bool, optional
        Handed to ``get_df`` for the same four tables.

    Returns
    -------
    list
        ``[every_node, every_link]`` as DataFrames (mostly useful for debugging).

    Side effects
    ------------
    Prints row counts along the way, creates the ``out`` directory if needed
    and writes ``out/every_node_<years>.csv`` and ``out/every_link_<years>.csv``.
    """
    # load up our data
    pacs = get_df('cm', desired_years, apply_amendments, force_data_download)
    pac_to_pac_transactions = get_df('oth', desired_years, apply_amendments, force_data_download)
    pac_to_cand_transactions = get_df('pas2', desired_years, apply_amendments, force_data_download)
    candidates = get_df('cn', desired_years, apply_amendments, force_data_download)
    # there's no header to download for this one
    # has cand_id, a simple way to do the cand_to_pac_linkages, and meta stuff like total disbursements and receipts
    pac_summary = get_df('webk', desired_years)
    # official links, not monetary directly
    cand_to_pac_linkages = get_df('ccl', desired_years)

    # Links (getting net transactions between nodes)
    # -----------------------------

    print("all transactions: {:,}".format(len(pac_to_pac_transactions)))

    # https://docs.google.com/spreadsheets/d/1-0rz62rxhsfdVrYFlKP6aJEGG57AQvKEDwbEUOKnYtU/edit?usp=sharing
    excluded_tp_str = '24A|16C|20C|16G|20G|20F|16F|24N|24R|41Z|42Z'

    # drop opposing, loans, loan repayments and other transaction types
    print("pre-drop transaction tp p2p: {:,}".format(len(pac_to_pac_transactions)))
    p2p_excluded = pac_to_pac_transactions.TRANSACTION_TP.str.contains(excluded_tp_str, na=False)
    pac_to_pac_transactions = pac_to_pac_transactions[~p2p_excluded]
    print("post drop transaction tp p2p: {:,}".format(len(pac_to_pac_transactions)))

    # drop all the negative (and zero) transactions, for now
    # todo: figure out what's up with those, they should probably be dealt with differently
    pac_to_pac_transactions = pac_to_pac_transactions[pac_to_pac_transactions['TRANSACTION_AMT'] > 0]
    print('only positive transactions:', len(pac_to_pac_transactions))

    # sum the transactions per (source, recipient) pair and flatten the result
    pac_to_pac_links = (
        pac_to_pac_transactions
        .groupby(['CMTE_ID', 'OTHER_ID'])['TRANSACTION_AMT']
        .sum()
        .reset_index()
    )

    # ok now on to from pacs to candidates
    print('ok begin pac_to_cand_transactions with:', len(pac_to_cand_transactions))

    # drop opposing etc
    p2c_excluded = pac_to_cand_transactions.TRANSACTION_TP.str.contains(excluded_tp_str, na=False)
    pac_to_cand_transactions = pac_to_cand_transactions[~p2c_excluded]
    print('ok after tp drop p2C with:', len(pac_to_cand_transactions))

    # drop the negative for these too
    print("all p2c transactions: {:,}".format(len(pac_to_cand_transactions)))
    pac_to_cand_transactions = pac_to_cand_transactions[pac_to_cand_transactions['TRANSACTION_AMT'] > 0]
    print("only positive p2c transactions: {:,}".format(len(pac_to_cand_transactions)))

    # this time group by committee and the candidate - to get net contribs between
    # (there's an other_id present, not sure what that represents)
    pac_to_cand_links = (
        pac_to_cand_transactions
        .groupby(['CMTE_ID', 'CAND_ID'])['TRANSACTION_AMT']
        .sum()
        .reset_index()
    )

    # getting official links
    official_pac_to_cand_linkage = cand_to_pac_linkages[['CMTE_ID', 'CAND_ID']].copy()
    official_pac_to_cand_linkage.columns = ['source', 'target']
    official_pac_to_cand_linkage['sum'] = 0
    official_pac_to_cand_linkage['official'] = True

    # standardize the column names
    pac_to_cand_links.columns = ['source', 'target', 'sum']
    pac_to_pac_links.columns = ['source', 'target', 'sum']

    # only include links greater than the cutoff_val
    pac_to_cand_links = pac_to_cand_links[pac_to_cand_links['sum'] > cutoff_val]
    pac_to_pac_links = pac_to_pac_links[pac_to_pac_links['sum'] > cutoff_val]

    # hooray, every link between a pac and another pac, and between a pac and a candidate
    # XXX OFFICIAL LINKS - remove official_pac_to_cand_linkage from the list to leave them out
    # sort=False keeps the column order of the first frame on every pandas version;
    # reset_index(drop=True) renumbers after the sort without adding an 'index' column
    every_link = (
        pd.concat(
            [pac_to_cand_links, pac_to_pac_links, official_pac_to_cand_linkage],
            ignore_index=True,
            sort=False,
        )
        .sort_values('sum', ascending=False)
        .reset_index(drop=True)
    )

    # drop any rows that snuck through lacking a source or target
    # (assign the result back: an in-place replace on every_link.source acts on a
    # temporary object under pandas copy-on-write and would change nothing)
    every_link[['source', 'target']] = every_link[['source', 'target']].replace('', np.nan)
    every_link = every_link.dropna(subset=['source', 'target'])

    # Nodes (pacs and candidates)
    # -----------------------------

    # rebuild the node list from the links so it is exhaustive: d3-force is very
    # picky, and we also need to add fields from the webk summary
    every_node = pd.DataFrame({'id': np.unique(every_link[['source', 'target']].values)})

    # index the lookup tables by id; rebinding (instead of inplace=True) leaves
    # the frames returned by get_df untouched
    candidates = candidates.set_index('CAND_ID')
    pacs = pacs.set_index('CMTE_ID')
    pac_summary = pac_summary.set_index('CMTE_ID')

    # XXX this is fairly un-pandas, it'd be better to join or merge, todo
    # NOTE(review): if an id occurs more than once in one of these indexes
    # (possibly when several years are combined - confirm against get_df),
    # frame[column][node_id] yields a Series rather than a scalar.
    def first_match(node_id, sources):
        """Return the value from the first (frame, column) pair whose index holds node_id, else None."""
        for frame, column in sources:
            # Index.contains() was removed in pandas 1.0; `in` is the supported test
            if node_id in frame.index:
                return frame[column][node_id]
        return None

    name_sources = [(pacs, 'CMTE_NM'), (candidates, 'CAND_NAME')]
    party_sources = [(pacs, 'CMTE_PTY_AFFILIATION'), (candidates, 'CAND_PTY_AFFILIATION')]
    # todo, using the actual linkage file here would be better, but the cand_id field seems pretty good
    linkage_sources = [(pacs, 'CAND_ID')]
    disb_sources = [(pac_summary, 'TTL_DISB')]
    recs_sources = [(pac_summary, 'TTL_RECEIPTS')]

    node_ids = every_node['id']
    # set up the name using the pacs and candidates tables
    every_node['name'] = node_ids.map(lambda node_id: first_match(node_id, name_sources))
    # and party
    every_node['party'] = node_ids.map(lambda node_id: first_match(node_id, party_sources))
    # and the bioguide id, for candidate photos
    every_node['bioguide_id'] = node_ids.map(get_bioguide_id_from_fec_id)
    every_node['pac_cand_id'] = node_ids.map(lambda node_id: first_match(node_id, linkage_sources))
    every_node['disb'] = node_ids.map(lambda node_id: first_match(node_id, disb_sources))
    every_node['recs'] = node_ids.map(lambda node_id: first_match(node_id, recs_sources))

    # set up a string for the year for the name of the file
    # (== rather than `is`: identity comparison against an int literal is not a value test)
    if len(desired_years) == 1:
        year_string = str(desired_years[0])
    else:
        year_string = str(desired_years[0]) + '-' + str(desired_years[-1])

    # almost done!
    print("number of nodes: {:,}".format(len(every_node)))
    print("number of links: {:,}".format(len(every_link)))

    # and finally export it, making sure the output directory is there
    os.makedirs('out', exist_ok=True)
    every_node.to_csv('out/every_node_' + year_string + '.csv', index=False)
    every_link.to_csv('out/every_link_' + year_string + '.csv', index=False, columns=["source", "target", "sum", "official"])
    print('Done, wrote every_node and every_link .csv files for ' + year_string)
    return [every_node, every_link]


# XXX this doesn't directly relate to the FEC data, but it makes it possible to load photos in the vis
# let's try to get the bioguide id for a given FEC candidate id
# helper, simpler than the other one: not a zip, no concat
# basically just caches the file locally
def get_df_from_url(file_url):
    """Read the CSV at ``file_url`` into a DataFrame, caching it locally.

    The cache file lives in the current working directory and is named after
    the last path segment of the URL. When a file with that name already
    exists it is read as-is and nothing is downloaded.

    Parameters
    ----------
    file_url : str
        URL of a comma-separated file.

    Returns
    -------
    pandas.DataFrame
        The parsed file.

    Raises
    ------
    urllib.error.URLError
        If the file is not cached and the download fails.
    """
    # imported here so the function carries its own dependency
    import urllib.request

    file_name = file_url.split('/')[-1]
    if not os.path.isfile(file_name):
        # Download under a temporary name and rename only on success. Writing
        # straight to file_name would leave an empty or truncated file behind
        # after a failed transfer, and every later run would treat it as cache.
        partial_name = file_name + '.part'
        try:
            urllib.request.urlretrieve(file_url, partial_name)
            os.replace(partial_name, file_name)
        finally:
            if os.path.exists(partial_name):
                os.remove(partial_name)
    return pd.read_csv(file_name, sep=',', low_memory=False)

# Load the current and the historical legislators files and stack them into one table.
every_legis = get_df_from_url('https://theunitedstates.io/congress-legislators/legislators-current.csv')
hist_url = 'https://theunitedstates.io/congress-legislators/legislators-historical.csv'
hist_legis = get_df_from_url(hist_url)
# DataFrame.append was removed in pandas 2.0; this concat is what it did internally
every_legis = pd.concat([every_legis, hist_legis], ignore_index=True, sort=False)

# only rows that have both ids can be used to map an FEC id to a bioguide id
legis = every_legis.dropna(subset=['fec_ids', 'bioguide_id'])

def get_bioguide_id_from_fec_id(fec_id, legislators=None):
    """Return the bioguide id of the legislator whose ``fec_ids`` contains ``fec_id``.

    Parameters
    ----------
    fec_id : str
        Id to look for; matched as a literal substring of the ``fec_ids`` column.
    legislators : pandas.DataFrame, optional
        Table with ``fec_ids`` and ``bioguide_id`` columns. Defaults to the
        module-level ``legis`` table.

    Returns
    -------
    The ``bioguide_id`` of the matching row, or None when no row or more than
    one row matches.
    """
    if legislators is None:
        legislators = legis
    # regex=False: the id is data, not a pattern, so characters such as '.'
    # must not act as wildcards
    matches = legislators[legislators.fec_ids.str.contains(fec_id, regex=False)]
    # == rather than `is`: identity comparison against an int literal is not a value test
    if len(matches) == 1:
        return matches.bioguide_id.iloc[0]
    return None

In [None]:
print('Begin!')
print([2018])
# The returned frames are only kept for debugging; the real output is the pair of files process() writes.
# To get new data from the FEC website instead of the cached files:
# nodes, links = process([2018], apply_amendments=True, force_data_download=True)
nodes, links = process([2018], apply_amendments=True, force_data_download=False)

In [None]:
# Peek at the first three rows of both tables. Only the last expression of a
# cell is rendered automatically, so each frame is displayed explicitly.
display(links[0:3])
display(nodes[0:3])

In [4]:
# passed as the force_data_download argument of the per-year process(...) calls that follow
force_data_download = True

In [5]:
# build the node and link files for 2016
print([2016])
process([2016], apply_amendments=True, force_data_download=force_data_download)

[2016]
loading: cm16.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2016/cm16.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    337      0 --:--:-- --:--:-- --:--:--   337
100  723k  100  723k    0     0   232k      0  0:00:03  0:00:03 --:--:--  310k
loading: oth16.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2016/oth16.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    122      0  0:00:01  0:00:01 --:--:--   122
100 75.7M  100 75.7M    0     0   405k      0  0:03:11  0:03:11 --:--:--  699k   340k      0  0:03:47  0:01:01  0:02:46  458k0   342k      0  0:03:46  0:01:27  0:02:19  146k  0   384k      0  0:03:21  0:01:58  0:01:23  427kM    0     0   358k      0  0:0

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  result_df = result_df.loc[result_df.groupby('TRAN_ID')['TRANSACTION_DT'].idxmax()]


loading: pas216.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2016/pas216.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    339      0 --:--:-- --:--:-- --:--:--   339
100 14.6M  100 14.6M    0     0  1364k      0  0:00:10  0:00:10 --:--:-- 2281k
loading: cn16.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2016/cn16.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    550      0 --:--:-- --:--:-- --:--:--   550
100  273k  100  273k    0     0   224k      0  0:00:01  0:00:01 --:--:--  981k
loading: webk16.zip
loading: ccl16.zip
all transactions: 2,030,441
pre-drop transaction tp p2p: 2,030,441
post drop transaction tp p2p: 1,959,830
only positive transactions: 194225

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




number of nodes: 17,151
number of links: 314,733
Done, wrote every_node and every_link .csv files for 2016


[              id                                               name party  \
 0       8NM01257                                               None  None   
 1      C00000000                                               None  None   
 2      C00000059                                 HALLMARK CARDS PAC   UNK   
 3      C00000422  AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...   NaN   
 4      C00000489               D R I V E POLITICAL FUND CHAPTER 886   NaN   
 ...          ...                                                ...   ...   
 17146  S8WA00194                                    CANTWELL, MARIA   DEM   
 17147  S8WI00026                             FEINGOLD, RUSSELL DANA   DEM   
 17148  S8WI00158                                    NEUMANN, MARK W   REP   
 17149  S8WI00578                                               None  None   
 17150  SOIN00095                                               None  None   
 
       bioguide_id pac_cand_id         disb         recs  
 0 

In [6]:
# build the node and link files for 2014
print([2014])
process([2014], apply_amendments=True, force_data_download=force_data_download)

[2014]
loading: cm14.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2014/cm14.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0   1013      0 --:--:-- --:--:-- --:--:--  1013
100  609k  100  609k    0     0   302k      0  0:00:02  0:00:02 --:--:--  471k
loading: oth14.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2014/oth14.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    956      0 --:--:-- --:--:-- --:--:--   956
100 18.6M  100 18.6M    0     0   862k      0  0:00:22  0:00:22 --:--:-- 1089k:05  923k


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  result_df = result_df.loc[result_df.groupby('TRAN_ID')['TRANSACTION_DT'].idxmax()]


loading: pas214.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2014/pas214.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    427      0 --:--:-- --:--:-- --:--:--   427
100 11.3M  100 11.3M    0     0  1164k      0  0:00:10  0:00:10 --:--:-- 1462k
loading: cn14.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2014/cn14.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    452      0 --:--:-- --:--:-- --:--:--   452
100  201k  100  201k    0     0   159k      0  0:00:01  0:00:01 --:--:--  247k
loading: webk14.zip
loading: ccl14.zip
all transactions: 507,323
pre-drop transaction tp p2p: 507,323
post drop transaction tp p2p: 464,374
only positive transactions: 452884
ok be

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




number of nodes: 15,327
number of links: 312,473
Done, wrote every_node and every_link .csv files for 2014


[              id                                               name party  \
 0      149518968                                               None  None   
 1      C00000000                                               None  None   
 2      C00000059                                 HALLMARK CARDS PAC   UNK   
 3      C00000422  AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...   NaN   
 4      C00000489               D R I V E POLITICAL FUND CHAPTER 886   NaN   
 ...          ...                                                ...   ...   
 15322  S8WA00194                                    CANTWELL, MARIA   DEM   
 15323  S8WI00026                                FEINGOLD, RUSSELL D   DEM   
 15324  S8WI00158                                    NEUMANN, MARK W   REP   
 15325  S8WV00159                                               None  None   
 15326  V00465211                                               None  None   
 
       bioguide_id pac_cand_id         disb         recs  
 0 

In [7]:
# build the node and link files for 2012
print([2012])
process([2012], apply_amendments=True, force_data_download=force_data_download)

[2012]
loading: cm12.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2012/cm12.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    402      0 --:--:-- --:--:-- --:--:--   402
100  594k  100  594k    0     0   369k      0  0:00:01  0:00:01 --:--:--  543k
loading: oth12.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2012/oth12.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    354      0 --:--:-- --:--:-- --:--:--   354
100 33.9M  100 33.9M    0     0  1084k      0  0:00:32  0:00:32 --:--:-- 1141k


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  result_df = result_df.loc[result_df.groupby('TRAN_ID')['TRANSACTION_DT'].idxmax()]


loading: pas212.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2012/pas212.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    525      0 --:--:-- --:--:-- --:--:--   525
100 11.6M  100 11.6M    0     0  1053k      0  0:00:11  0:00:11 --:--:-- 1229k
loading: cn12.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2012/cn12.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    314      0 --:--:-- --:--:-- --:--:--   314
100  202k  100  202k    0     0   129k      0  0:00:01  0:00:01 --:--:--  324k
loading: webk12.zip
loading: ccl12.zip
all transactions: 933,195
pre-drop transaction tp p2p: 933,195
post drop transaction tp p2p: 878,420
only positive transactions: 867035
ok be

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




number of nodes: 15,175
number of links: 306,103
Done, wrote every_node and every_link .csv files for 2012


[              id                                               name party  \
 0          C0000                                               None  None   
 1      C00000000                                               None  None   
 2      C00000042  ILLINOIS TOOL WORKS INC. FOR BETTER GOVERNMENT...   NaN   
 3      C00000059                                 HALLMARK CARDS PAC   UNK   
 4      C00000422  AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...   NaN   
 ...          ...                                                ...   ...   
 15170  S8WA00194                                    CANTWELL, MARIA   DEM   
 15171  S8WI00026                                FEINGOLD, RUSSELL D   DEM   
 15172  S8WI00158                                    NEUMANN, MARK W   REP   
 15173  S8WV00101                               FLETCHER, SHEIRL LEE   DEM   
 15174  S8WY00171                               GOODENOUGH, KEITH B.   DEM   
 
       bioguide_id pac_cand_id         disb         recs  
 0 

In [8]:
# build the node and link files for 2010
print([2010])
process([2010], apply_amendments=True, force_data_download=force_data_download)

[2010]
loading: cm10.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2010/cm10.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    666      0 --:--:-- --:--:-- --:--:--   669
100  500k  100  500k    0     0   323k      0  0:00:01  0:00:01 --:--:--  389k
loading: oth10.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2010/oth10.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    736      0 --:--:-- --:--:-- --:--:--   733    0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 15.7M  100 15.7M    0     0  1032k      0  0:00:15  0:00:15 --:--:--  966k


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  result_df = result_df.loc[result_df.groupby('TRAN_ID')['TRANSACTION_DT'].idxmax()]


loading: pas210.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2010/pas210.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    527      0 --:--:-- --:--:-- --:--:--   527
100 10.4M  100 10.4M    0     0  1152k      0  0:00:09  0:00:09 --:--:-- 1628k
loading: cn10.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2010/cn10.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    687      0 --:--:-- --:--:-- --:--:--   690
100  185k  100  185k    0     0   179k      0  0:00:01  0:00:01 --:--:--  504k
loading: webk10.zip
loading: ccl10.zip
all transactions: 436,150
pre-drop transaction tp p2p: 436,150
post drop transaction tp p2p: 422,679
only positive transactions: 413201
ok be

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




number of nodes: 13,594
number of links: 291,009
Done, wrote every_node and every_link .csv files for 2010


[              id                                               name party  \
 0      C00000000                                               None  None   
 1      C00000042  ILLINOIS TOOL WORKS FOR BETTER GOVERNMENT COMM...   NaN   
 2      C00000059                                 HALLMARK CARDS PAC   UNK   
 3      C00000422  AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...   NaN   
 4      C00000489  D R I V E POLITICAL FUND, TEAMSTERS LOCAL UNIO...   NaN   
 ...          ...                                                ...   ...   
 13589  S8WV00093                                               None  None   
 13590  S8WV00101                               FLETCHER, SHEIRL LEE   DEM   
 13591  S8WY00155                                 CARTER, NICHOLAS H   DEM   
 13592  S8WY00163                            ROTHFUSS, CHRISTOPHER J   DEM   
 13593  S8WY00171                               GOODENOUGH, KEITH B.   DEM   
 
       bioguide_id pac_cand_id         disb         recs  
 0 

In [9]:
# build the node and link files for 2008
print([2008])
process([2008], apply_amendments=True, force_data_download=force_data_download)

[2008]
loading: cm08.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2008/cm08.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    405      0 --:--:-- --:--:-- --:--:--   404
100  493k  100  493k    0     0   272k      0  0:00:01  0:00:01 --:--:--  655k
loading: oth08.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2008/oth08.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    639      0 --:--:-- --:--:-- --:--:--   641
100 22.8M  100 22.8M    0     0  1179k      0  0:00:19  0:00:19 --:--:-- 1143k


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  result_df = result_df.loc[result_df.groupby('TRAN_ID')['TRANSACTION_DT'].idxmax()]


loading: pas208.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2008/pas208.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    286      0 --:--:-- --:--:-- --:--:--   286
100  9.7M  100  9.7M    0     0   809k      0  0:00:12  0:00:12 --:--:-- 1050k
loading: cn08.zip
downloading from:  https://www.fec.gov/files/bulk-downloads/2008/cn08.zip
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   154  100   154    0     0    719      0 --:--:-- --:--:-- --:--:--   719
100  150k  100  150k    0     0   145k      0  0:00:01  0:00:01 --:--:--  254k
loading: webk08.zip
loading: ccl08.zip
all transactions: 616,124
pre-drop transaction tp p2p: 616,124
post drop transaction tp p2p: 602,239
only positive transactions: 592190
ok be

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




number of nodes: 11,968
number of links: 278,664
Done, wrote every_node and every_link .csv files for 2008


[              id                                               name party  \
 0      C00000000                                               None  None   
 1      C00000042  ILLINOIS TOOL WORKS FOR BETTER GOVERNMENT COMM...   NaN   
 2      C00000059                                 HALLMARK CARDS PAC   UNK   
 3      C00000125                                               None  None   
 4      C00000422  AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...   NaN   
 ...          ...                                                ...   ...   
 11963  S8WV00093                                 WOLFE, MATTHEW JAY   REP   
 11964  S8WV00101                                 FLETCHER, SHEIRL L   DEM   
 11965  S8WY00155                                 CARTER, NICHOLAS H   DEM   
 11966  S8WY00163                            ROTHFUSS, CHRISTOPHER J   DEM   
 11967  S8WY00171                               GOODENOUGH, KEITH B.   DEM   
 
       bioguide_id pac_cand_id         disb         recs  
 0 

In [10]:
# print([2012, 2014, 2016, 2018])
# process([2016, 2018])

In [11]:
# Write the time of this run to out/last_updated.json.
# The timestamp is taken in UTC explicitly: to_json serialises datetimes as
# epoch milliseconds, and a naive local time would be written as if it were
# UTC, i.e. shifted by the machine's UTC offset.
d = datetime.datetime.now(datetime.timezone.utc)
last_updated = pd.DataFrame([d])

# make sure the output directory exists before writing into it
os.makedirs('out', exist_ok=True)
last_updated.to_json('out/last_updated.json', orient="values")