Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
240 lines (200 sloc) 10.5 KB
See README for overview and LICENSE for license
Corrections and coding advice always welcome!
import sqlite3, csv, re
from gender import *
#can set path to where SQLite database is stored to be different from code directory if desired
path = ""
#returns sqlite queries as dictionaries instead of lists
def dict_factory(cursor, row):
d = {}
for idx, col in enumerate(cursor.description):
d[col[0]] = row[idx]
return d
#establish database connection
conn = sqlite3.connect(path + 'all_names.sqlite')
conn.row_factory = dict_factory
c = conn.cursor()
#read the (large) csv files from the FEC into SQLite databases
def load_data(filename=""):
#make table once
c.execute('CREATE TABLE IF NOT EXISTS donations \
"candidate" VARCHAR(50), \
"name" VARCHAR(50), \
"first" VARCHAR(30), \
"last" VARCHAR(30), \
"employer" VARCHAR(50), \
"occupation" VARCHAR(50), \
"city" VARCHAR(50), \
"state" VARCHAR(50), \
"zip" VARCHAR(10), \
"date" DATE, \
"amount" FLOAT, \
"desc" VARCHAR(20), \
"election_type" VARCHAR(10))')
c.execute('DELETE FROM donations')
#this is a large file, so we don't want to load it all into memory with .read() or .readlines()
#format: cmte_id,cand_id,cand_nm,contbr_nm,contbr_city,contbr_st,contbr_zip,contbr_employer,contbr_occupation,contb_receipt_amt,contb_receipt_dt,receipt_desc,memo_cd,memo_text,form_tp,file_num,tran_id,election_tp
count = 1
for line in csv.reader(open(path + filename, "r"), delimiter=",", quotechar='"'):
if count % 10000 == 0:
print "added %i records" % count
if line[0] != 'cmte_id':
last, first = split_name(line[3])
#important to exclude those without first name -- often it's a big campaign expenditure
if first != "":
dt = get_date(line[10])
#note: cut off zip after 5 digits and remove periods from names, to avoid double-counting due to inconsistent data entry
c.execute('INSERT INTO donations ("candidate", "name", "last", "first", "employer", "occupation", "city", "state", "zip", "date", "amount", "desc", "election_type") \
VALUES ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", %.2f, "%s", "%s" )' %
(line[2], line[3], last, first, line[7], line[8], line[4], line[5], line[6][0:5], dt, float(line[9]), line[11], line[17]))
except Exception as e:
print line, e
count = count + 1
print "Added a total of %i records for %s" % (count, candidate)
#The names arrive as strings, so we need to intelligently split into first and last names
def split_name(name):
#remove suffixes and salutations that occur after surname in some records
#this is tailored to problems I found in FEC records, not a general solution
name = name.replace(".", "").replace("DR ","").replace(",DR", "").replace("REV ", "").replace(", PHD","").replace(",PHD", "").replace(", III","").replace(", JR","").replace(", SR", "").replace(", MD", "").title()
parts = re.split(",+", name) #sometimes two quotes end up adjacent, but doesn't appear to denote blank field
last = parts[0]
if len(parts) > 1:
first = re.split("[\s,]+", parts[1].strip())[0].replace("(", "").replace(")", "")
first = ""
return last, first
#for converting date to SQL format
months = { "JAN": "01", "FEB": "02", "MAR": "03", "APR": "04", "MAY": "05", "JUN": "06", "JUL": "07", "AUG": "08", "SEP": "09", "OCT": "10", "NOV": "11", "DEC": "12" }
party = {
"Bachmann, Michele" : "Rep",
"Cain, Herman" : "Rep",
"Gingrich, Newt" : "Rep",
"Huntsman, Jon" : "Rep",
"Johnson, Gary Earl" : "Rep",
"McCotter, Thaddeus G" : "Rep",
"Obama, Barack" : "Dem",
"Paul, Ron" : "Rep",
"Pawlenty, Timothy" : "Rep",
"Perry, Rick" : "Rep",
"Roemer, Charles E. 'Buddy' III" : "Rep",
"Romney, Mitt" : "Rep",
"Santorum, Rick" : "Rep"
def get_date(dt):
dt = dt.split("-")
dt = "20" + dt[2] + "-" + months[dt[1]] + "-" + dt[0]
return dt
#after raw data is loaded, make a database with every unique person
def get_names():
c.execute('''CREATE TABLE IF NOT EXISTS "names"
"candidate" VARCHAR(50),
"party" VARCHAR(10),
"first" VARCHAR(25),
"last" VARCHAR(25),
'count' INTEGER,
'amount' FLOAT,
"zip" VARCHAR(10), CONSTRAINT unq UNIQUE (candidate, first, last, zip))''')
#the FEC database lists every contribution, so many people are listed multiple times
#there is no failsafe way to identify unique individuals, given noise in the data, but we can get close by looking at unqiue names per zipcode
#this will undercount two people with the same name in the same zip code, but the collision space appears very low
#this will ocercount one person who reports two different zipcodes. This also appears to occur with tolerable infrequency
names = c.execute("SELECT candidate, first, last, zip, sum(amount), count(*) from donations group by candidate, first, last, zip order by first").fetchall()
for name in names:
candidate = name['candidate']
c.execute('''INSERT OR IGNORE INTO "names" ("candidate", "party", "first", "last", "count", "amount", "zip") VALUES (?, ?, ?, ?, ?, ?, ?)''', (candidate, party[candidate], name['first'], name['last'], name['count(*)'], name['sum(amount)'], name['zip']))
except Exception as e:
print last, first, e
def get_genders():
not_founds = c.execute("SELECT * FROM stats where gender = 'Not found'").fetchall()
for not_found in not_founds:
h = not_found['name'].split('-')
if len(h) > 1:
g0 = get_gender(h[0])
g1 = get_gender(h[1])
if g0 == g1 and (g0 == "male" or g0 == "female"):
c.execute("update stats set gender = \"%s\" where name = \"%s\"" % (g0, not_found['name']))
g = get_gender(h[0])
if g == "male" or g == "female":
c.execute("update stats set gender = \"%s\" where name = \"%s\"" % (g, not_found['name']))
#reduces to groups of unique first or last names
def compile_names(first=True):
name_type = "first" if first else "last"
c.execute('CREATE TABLE IF NOT EXISTS "%s" \
"name" VARCHAR(25), \
"gender" VARCHAR(15), \
"party" VARCHAR(5), \
"count" INTEGER, \
"amount" FLOAT, CONSTRAINT unq UNIQUE (name, party))' % name_type)
c.execute('DELETE FROM stats')
#select every unique first name
names = c.execute("SELECT %s, party, count(*) as count, sum(amount) as amount FROM names group by %s, party order by %s" % (name_type, name_type, name_type)).fetchall()
first_letter = ''
for name in names:
nm = name[name_type]
if len(nm) > 1:
#track progreess
if nm[0] != first_letter:
first_letter = nm[0]
print "Searching %s names beginning with %s..." % (name_type, first_letter)
#don't bother with gender for surnames
if name_type == "first":
c.execute('INSERT INTO "%s" ("name", "gender", "party", "count", "amount") VALUES ("%s", "%s", "%s", %i, %.2f)' %
(name_type, nm, get_gender(nm), name['party'], name['count'], name['amount']))
c.execute('INSERT INTO "%s" ("name", "party", "count", "amount") VALUES ("%s", "%s", %i, %.2f)' %
(name_type, nm, name['party'], name['count'], name['amount']))
def write_stats(threshold, first=True):
name_type = "first" if first else "last"
f = open("data/stats_%s_%i.csv" % (name_type, threshold), "w")
totals = {}
for party in c.execute("SELECT party, count(*) FROM %s group by party" % name_type).fetchall():
totals[party["party"]] = party["count(*)"]
#this joins records for the same name in different parties. It is not elegant
#we miss names here that show up on one list but not another. Bad conceptually, OK for our purposes since any name of appreciable frequency shows up on both lists
#surely a better way here, but I'm a JOIN novice
names = c.execute('SELECT, as "name", d.gender as gender, d.count as "d_count", d.amount as "d_amount", \, as "r_name", r.count as "r_count", r.amount as "r_amount" \
from %s as d LEFT OUTER JOIN %s as r ON = \
WHERE = "Dem" AND = "Rep" AND (d.count >= %i OR r.count >= %i) order by d.count desc' % (name_type, name_type, threshold, threshold)).fetchall()
for name in names:
l = 26 - ord(name['name'][0]) + 65
dc = name['d_count']
rc = name['r_count']
d_amount = float(name['d_amount'])
r_amount = float(name['r_amount'])
drate = round(1000 * float(dc) / totals["Dem"], 3)
rrate = round(1000 * float(rc) / totals["Rep"], 3)
if len(name['name']) > 1:
tilt = 100 * float(rc) / (dc + rc)
advantage = 100 * float(r_amount) / (d_amount + r_amount)
if len(name['name']) > 1:
f.write("%s,%s,%i,%i,%i,%.3f,%.3f,%.2f,%.2f,%.1f,%.1f,%i\r" % (name['name'], name['gender'], (dc + rc), dc, rc, drate, rrate, d_amount, r_amount, advantage, tilt, l))
#Data downloaded from here:
#Warning: 450 MB uncompressed
write_stats(25, True)
write_stats(25, False)