Permalink
Browse files

stable 0.1

  • Loading branch information...
1 parent 70fe098 commit b5f1f566af94fd40455fe79fc99e7de00e3975ad @wilson428 committed Sep 28, 2012
Showing with 77 additions and 59 deletions.
  1. +77 −59 get_names.py
View
@@ -11,16 +11,17 @@ def dict_factory(cursor, row):
return d
#establish database connection
-conn = sqlite3.connect(path + 'names.sqlite')
+conn = sqlite3.connect(path + 'all_names.sqlite')
conn.row_factory = dict_factory
c = conn.cursor()
#read the (large) csv files from the FEC into SQLite databases
#http://fec.gov/disclosurep/PDownload.do
-def load_data(candidate, filename):
+def load_data(candidate="", filename=""):
#make table once
- c.execute('CREATE TABLE IF NOT EXISTS %s \
+ c.execute('CREATE TABLE IF NOT EXISTS donations \
("id" INTEGER PRIMARY KEY AUTOINCREMENT, \
+ "candidate" VARCHAR(50), \
"name" VARCHAR(50), \
"first" VARCHAR(30), \
"last" VARCHAR(30), \
@@ -32,9 +33,9 @@ def load_data(candidate, filename):
"date" DATE, \
"amount" FLOAT, \
"desc" VARCHAR(20), \
- "election_type" VARCHAR(10))' % candidate)
+ "election_type" VARCHAR(10))')
- c.execute('DELETE FROM %s' % candidate)
+ c.execute('DELETE FROM donations')
conn.commit()
#this is a large file, so we don't want to load it all into memory with .read() or .readlines()
@@ -45,17 +46,17 @@ def load_data(candidate, filename):
if count % 10000 == 0:
print "added %i records" % count
conn.commit()
-
+
if line[0] != 'cmte_id':
last, first = split_name(line[3])
#important to exclude those without first name -- often it's a big campaign expenditure
if first != "":
dt = get_date(line[10])
#note: cut off zip after 5 digits and remove periods from names, to avoid double-counting due to inconsistent data entry
try:
- c.execute('INSERT INTO %s ("name", "last", "first", "employer", "occupation", "city", "state", "zip", "date", "amount", "desc", "election_type") \
- VALUES ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", %.2f, "%s", "%s" )' %
- (candidate, line[3], last, first, line[7], line[8], line[4], line[5], line[6][0:5], dt, float(line[9]), line[11], line[17]))
+ c.execute('INSERT INTO donations ("candidate", "name", "last", "first", "employer", "occupation", "city", "state", "zip", "date", "amount", "desc", "election_type") \
+ VALUES ("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s", %.2f, "%s", "%s" )' %
+ (line[2], line[3], last, first, line[7], line[8], line[4], line[5], line[6][0:5], dt, float(line[9]), line[11], line[17]))
except Exception as e:
print line, e
count = count + 1
@@ -79,33 +80,52 @@ def split_name(name):
#convert date to SQL format
#http://www.sqlite.org/lang_datefunc.html
months = { "JAN": "01", "FEB": "02", "MAR": "03", "APR": "04", "MAY": "05", "JUN": "06", "JUL": "07", "AUG": "08", "SEP": "09", "OCT": "10", "NOV": "11", "DEC": "12" }
+party = {
+ "Bachmann, Michele" : "Rep",
+ "Cain, Herman" : "Rep",
+ "Gingrich, Newt" : "Rep",
+ "Huntsman, Jon" : "Rep",
+ "Johnson, Gary Earl" : "Rep",
+ "McCotter, Thaddeus G" : "Rep",
+ "Obama, Barack" : "Dem",
+ "Paul, Ron" : "Rep",
+ "Pawlenty, Timothy" : "Rep",
+ "Perry, Rick" : "Rep",
+ "Roemer, Charles E. 'Buddy' III" : "Rep",
+ "Romney, Mitt" : "Rep",
+ "Santorum, Rick" : "Rep"
+}
+
+
def get_date(dt):
dt = dt.split("-")
dt = "20" + dt[2] + "-" + months[dt[1]] + "-" + dt[0]
return dt
#after raw data is loaded, make a database with every unique person
-def get_names(candidate):
+def get_names():
c.execute('''CREATE TABLE IF NOT EXISTS "names"
("id" INTEGER PRIMARY KEY AUTOINCREMENT,
"candidate" VARCHAR(50),
+ "party" VARCHAR(10),
"first" VARCHAR(25),
"last" VARCHAR(25),
'count' INTEGER,
'amount' FLOAT,
- "zip" VARCHAR(10), CONSTRAINT unq UNIQUE (first, last, zip))''')
+ "zip" VARCHAR(10), CONSTRAINT unq UNIQUE (candidate, first, last, zip))''')
#the FEC database lists every contribution, so many people are listed multiple times
#there is no failsafe way to identify unique individuals, given noise in the data, but we can get close by looking at unqiue names per zipcode
#this will undercount two people with the same name in the same zip code, but the collision space appears very low
#this will ocercount one person who reports two different zipcodes. This also appears to occur with tolerable infrequency
- names = c.execute("SELECT first, last, zip, sum(amount), count(*) from %s group by first, last, zip order by first" % candidate).fetchall()
+ names = c.execute("SELECT candidate, first, last, zip, sum(amount), count(*) from donations group by candidate, first, last, zip order by first").fetchall()
for name in names:
+ candidate = name['candidate']
try:
- c.execute('''INSERT OR IGNORE INTO "names" ("candidate", "first", "last", "count", "amount", "zip") VALUES (?, ?, ?, ?, ?, ?)''', (candidate, name['first'], name['last'], name['count(*)'], name['sum(amount)'], name['zip']))
+ c.execute('''INSERT OR IGNORE INTO "names" ("candidate", "party", "first", "last", "count", "amount", "zip") VALUES (?, ?, ?, ?, ?, ?, ?)''', (candidate, party[candidate], name['first'], name['last'], name['count(*)'], name['sum(amount)'], name['zip']))
except Exception as e:
print last, first, e
conn.commit()
@@ -126,78 +146,76 @@ def get_genders():
c.execute("update stats set gender = \"%s\" where name = \"%s\"" % (g, not_found['name']))
conn.commit()
-#reduces to groups of first names
+#reduces to groups of unique first names
def compile_names():
c.execute('''CREATE TABLE IF NOT EXISTS "stats"
("id" INTEGER PRIMARY KEY AUTOINCREMENT,
- "name" VARCHAR(25) UNIQUE,
+ "name" VARCHAR(25),
"gender" VARCHAR(15),
- "obama" INTEGER,
- "romney" INTEGER,
- "amt_obama" FLOAT,
- "amt_romney" FLOAT)''')
+ "party" VARCHAR(5),
+ "count" INTEGER,
+ "amount" FLOAT, CONSTRAINT unq UNIQUE (name, party))''')
c.execute('DELETE FROM stats')
- conn.commit()
+ conn.commit()
- names = c.execute("SELECT first, count(*) FROM names group by first order by count(*) desc").fetchall()
+ #select every unique first name
+ names = c.execute("SELECT first, party, count(*) as count, sum(amount) as amount FROM names group by first, party order by first").fetchall()
+
+ first_letter = ''
for name in names:
if len(name['first']) > 1:
- split = c.execute("SELECT first, candidate, count(*), sum(amount) FROM names WHERE first = \"%s\" group by candidate order by candidate" % name['first']).fetchall()
-
- oc = 0
- rc = 0
- for s in split:
- if s['candidate'] == "Obama":
- oc = s['count(*)']
- o_amt = s['sum(amount)']
- elif s['candidate'] == "Romney":
- rc = s['count(*)']
- r_amt = s['sum(amount)']
-
- c.execute('INSERT INTO "stats" ("name", "gender", "obama", "romney", "amt_obama", "amt_romney") VALUES ("%s", "%s", %i, %i, %.2f, %.2f)' % (name['first'], get_gender(name['first']), oc, rc, o_amt, r_amt))
+ #track progreess
+ if name['first'][0] != first_letter:
+ first_letter = name['first'][0]
+ print "Searching names beginning with %s..." % first_letter
+
+ c.execute('INSERT INTO "stats" ("name", "gender", "party", "count", "amount") VALUES ("%s", "%s", "%s", %i, %.2f)' %
+ (name['first'], get_gender(name['first']), name['party'], name['count'], name['amount']))
conn.commit()
-def write_stats(threshhold):
- f = open("data/stats_%i.csv" % threshhold, "w")
- f.write("name,gender,total,obama_count,romney_count,obama_ratio,romney_ratio,obama_amt,romney_amt,advantage,split,tilt,letter\r")
+def write_stats(threshold):
+ f = open("data/stats_%i.csv" % threshold, "w")
+ f.write("name,gender,total,d_count,r_count,drate,rrate,r_amount,d_amount,advantage,tilt,letter\r")
totals = {}
- for cand in c.execute("SELECT candidate, count(*) FROM names group by candidate").fetchall():
- totals[cand["candidate"]] = cand["count(*)"]
-
- names = c.execute("SELECT * FROM stats").fetchall()
+ for party in c.execute("SELECT party, count(*) FROM names group by party").fetchall():
+ totals[party["party"]] = party["count(*)"]
+
+ #this joins records for the same name in different parties. It is not elegant
+ #we miss names here that show up on one list but not another. Bad conceptually, OK for our purposes since any name of appreciable frequency shows up on both lists
+ #surely a better way here, but I'm a JOIN novice
+ names = c.execute('SELECT d.party, d.name as "name", d.gender as gender, d.count as "d_count", d.amount as "d_amount", \
+ r.party, r.name as "r_name", r.count as "r_count", r.amount as "r_amount" \
+ from stats as d LEFT OUTER JOIN stats as r ON d.name = r.name \
+ WHERE d.party = "Dem" AND r.party = "Rep" AND (d.count >= %i OR r.count >= %i) order by d.count desc' % (threshold, threshold)).fetchall()
for name in names:
l = 26 - ord(name['name'][0]) + 65
- oc = name['obama']
- rc = name['romney']
- oamt = float(name['amt_obama'])
- ramt = float(name['amt_romney'])
- o = round(1000 * float(oc) / totals["Obama"], 3)
- r = round(1000 * float(rc) / totals["Romney"], 3)
-
- if len(name['name']) > 1 and (oc >= threshhold or rc >= threshhold):
- if rc == 0:
- ratio = 100
- else:
- ratio = round(o / r, 3)
-
- tilt = 100 * float(oc) / (oc + rc)
- advantage = 100 * float(oamt) / (oamt + ramt)
+ dc = name['d_count']
+ rc = name['r_count']
+ d_amount = float(name['d_amount'])
+ r_amount = float(name['r_amount'])
+ drate = round(1000 * float(dc) / totals["Dem"], 3)
+ rrate = round(1000 * float(rc) / totals["Rep"], 3)
+
+ if len(name['name']) > 1:
+ tilt = 100 * float(rc) / (dc + rc)
+
+ advantage = 100 * float(r_amount) / (d_amount + r_amount)
if len(name['name']) > 1:
- f.write("%s,%s,%i,%i,%i,%.2f,%.2f,%.3f,%.3f,%.3f,%.3f,%.3f,%i\r" % (name['name'], name['gender'], (oc + rc), oc, rc, o, r, oamt, ramt, advantage, ratio, tilt, l))
+ f.write("%s,%s,%i,%i,%i,%.3f,%.3f,%.2f,%.2f,%.2f,%.1f,%i\r" % (name['name'], name['gender'], (dc + rc), dc, rc, drate, rrate, d_amount, r_amount, advantage, tilt, l))
f.close()
-
+#load_data("", "P00000001-ALL.csv")
#load_data("Obama", "P80003338-ALL.csv")
#load_data("Romney", "P80003353-ALL.csv")
-#get_names("Obama")
+#get_names()
#get_names("Romney")
#compile_names()
#get_genders()

0 comments on commit b5f1f56

Please sign in to comment.