Skip to content

Commit

Permalink
Removed local directories from source
Browse files Browse the repository at this point in the history
  • Loading branch information
wilson428 committed Sep 28, 2012
1 parent 5f4634a commit 82c9eec
Showing 1 changed file with 52 additions and 37 deletions.
89 changes: 52 additions & 37 deletions get_names.py
@@ -1,9 +1,15 @@
'''
See README for overview and LICENSE for license
Corrections and coding advice always welcome! cewilson@yahoo-inc.com
'''

import sqlite3, csv, re import sqlite3, csv, re
from gender import * from gender import *


path = "/Users/cewilson/Desktop/source/FEC/" #can set path to where SQLite database is stored to be different from code directory if desired
path = ""


#optional: returns sqlite queries as dictionaries instead of lists #returns sqlite queries as dictionaries instead of lists
def dict_factory(cursor, row): def dict_factory(cursor, row):
d = {} d = {}
for idx, col in enumerate(cursor.description): for idx, col in enumerate(cursor.description):
Expand All @@ -17,7 +23,7 @@ def dict_factory(cursor, row):


#read the (large) csv files from the FEC into SQLite databases #read the (large) csv files from the FEC into SQLite databases
#http://fec.gov/disclosurep/PDownload.do #http://fec.gov/disclosurep/PDownload.do
def load_data(candidate="", filename=""): def load_data(filename=""):
#make table once #make table once
c.execute('CREATE TABLE IF NOT EXISTS donations \ c.execute('CREATE TABLE IF NOT EXISTS donations \
("id" INTEGER PRIMARY KEY AUTOINCREMENT, \ ("id" INTEGER PRIMARY KEY AUTOINCREMENT, \
Expand Down Expand Up @@ -77,9 +83,10 @@ def split_name(name):
first = "" first = ""
return last, first return last, first


#convert date to SQL format #for converting date to SQL format
#http://www.sqlite.org/lang_datefunc.html #http://www.sqlite.org/lang_datefunc.html
months = { "JAN": "01", "FEB": "02", "MAR": "03", "APR": "04", "MAY": "05", "JUN": "06", "JUL": "07", "AUG": "08", "SEP": "09", "OCT": "10", "NOV": "11", "DEC": "12" } months = { "JAN": "01", "FEB": "02", "MAR": "03", "APR": "04", "MAY": "05", "JUN": "06", "JUL": "07", "AUG": "08", "SEP": "09", "OCT": "10", "NOV": "11", "DEC": "12" }

party = { party = {
"Bachmann, Michele" : "Rep", "Bachmann, Michele" : "Rep",
"Cain, Herman" : "Rep", "Cain, Herman" : "Rep",
Expand All @@ -95,8 +102,6 @@ def split_name(name):
"Romney, Mitt" : "Rep", "Romney, Mitt" : "Rep",
"Santorum, Rick" : "Rep" "Santorum, Rick" : "Rep"
} }




def get_date(dt): def get_date(dt):
dt = dt.split("-") dt = dt.split("-")
Expand Down Expand Up @@ -146,51 +151,61 @@ def get_genders():
c.execute("update stats set gender = \"%s\" where name = \"%s\"" % (g, not_found['name'])) c.execute("update stats set gender = \"%s\" where name = \"%s\"" % (g, not_found['name']))
conn.commit() conn.commit()


#reduces to groups of unique first names #reduces to groups of unique first or last names
def compile_names(): def compile_names(first=True):
c.execute('''CREATE TABLE IF NOT EXISTS "stats" name_type = "first" if first else "last"
("id" INTEGER PRIMARY KEY AUTOINCREMENT,
"name" VARCHAR(25), c.execute('CREATE TABLE IF NOT EXISTS "%s" \
"gender" VARCHAR(15), ("id" INTEGER PRIMARY KEY AUTOINCREMENT, \
"party" VARCHAR(5), "name" VARCHAR(25), \
"count" INTEGER, "gender" VARCHAR(15), \
"amount" FLOAT, CONSTRAINT unq UNIQUE (name, party))''') "party" VARCHAR(5), \
"count" INTEGER, \
"amount" FLOAT, CONSTRAINT unq UNIQUE (name, party))' % name_type)


c.execute('DELETE FROM stats') c.execute('DELETE FROM stats')
conn.commit() conn.commit()


#select every unique first name #select every unique first name
names = c.execute("SELECT first, party, count(*) as count, sum(amount) as amount FROM names group by first, party order by first").fetchall() names = c.execute("SELECT %s, party, count(*) as count, sum(amount) as amount FROM names group by %s, party order by %s" % (name_type, name_type, name_type)).fetchall()


first_letter = '' first_letter = ''
for name in names: for name in names:
if len(name['first']) > 1: nm = name[name_type]
if len(nm) > 1:
#track progress #track progress
if name['first'][0] != first_letter: if nm[0] != first_letter:
first_letter = name['first'][0] first_letter = nm[0]
print "Searching names beginning with %s..." % first_letter print "Searching %s names beginning with %s..." % (name_type, first_letter)


c.execute('INSERT INTO "stats" ("name", "gender", "party", "count", "amount") VALUES ("%s", "%s", "%s", %i, %.2f)' % #don't bother with gender for surnames
(name['first'], get_gender(name['first']), name['party'], name['count'], name['amount'])) if name_type == "first":
c.execute('INSERT INTO "%s" ("name", "gender", "party", "count", "amount") VALUES ("%s", "%s", "%s", %i, %.2f)' %
(name_type, nm, get_gender(nm), name['party'], name['count'], name['amount']))
else:
c.execute('INSERT INTO "%s" ("name", "party", "count", "amount") VALUES ("%s", "%s", %i, %.2f)' %
(name_type, nm, name['party'], name['count'], name['amount']))


conn.commit() conn.commit()




def write_stats(threshold): def write_stats(threshold, first=True):
f = open("data/stats_%i.csv" % threshold, "w") name_type = "first" if first else "last"

f = open("data/stats_%s_%i.csv" % (name_type, threshold), "w")
f.write("name,gender,total,d_count,r_count,drate,rrate,r_amount,d_amount,advantage,tilt,letter\r") f.write("name,gender,total,d_count,r_count,drate,rrate,r_amount,d_amount,advantage,tilt,letter\r")


totals = {} totals = {}
for party in c.execute("SELECT party, count(*) FROM names group by party").fetchall(): for party in c.execute("SELECT party, count(*) FROM %s group by party" % name_type).fetchall():
totals[party["party"]] = party["count(*)"] totals[party["party"]] = party["count(*)"]


#this joins records for the same name in different parties. It is not elegant #this joins records for the same name in different parties. It is not elegant
#we miss names here that show up on one list but not another. Bad conceptually, OK for our purposes since any name of appreciable frequency shows up on both lists #we miss names here that show up on one list but not another. Bad conceptually, OK for our purposes since any name of appreciable frequency shows up on both lists
#surely a better way here, but I'm a JOIN novice #surely a better way here, but I'm a JOIN novice
names = c.execute('SELECT d.party, d.name as "name", d.gender as gender, d.count as "d_count", d.amount as "d_amount", \ names = c.execute('SELECT d.party, d.name as "name", d.gender as gender, d.count as "d_count", d.amount as "d_amount", \
r.party, r.name as "r_name", r.count as "r_count", r.amount as "r_amount" \ r.party, r.name as "r_name", r.count as "r_count", r.amount as "r_amount" \
from stats as d LEFT OUTER JOIN stats as r ON d.name = r.name \ from %s as d LEFT OUTER JOIN %s as r ON d.name = r.name \
WHERE d.party = "Dem" AND r.party = "Rep" AND (d.count >= %i OR r.count >= %i) order by d.count desc' % (threshold, threshold)).fetchall() WHERE d.party = "Dem" AND r.party = "Rep" AND (d.count >= %i OR r.count >= %i) order by d.count desc' % (name_type, name_type, threshold, threshold)).fetchall()


for name in names: for name in names:
l = 26 - ord(name['name'][0]) + 65 l = 26 - ord(name['name'][0]) + 65
Expand All @@ -207,18 +222,18 @@ def write_stats(threshold):
advantage = 100 * float(r_amount) / (d_amount + r_amount) advantage = 100 * float(r_amount) / (d_amount + r_amount)


if len(name['name']) > 1: if len(name['name']) > 1:
f.write("%s,%s,%i,%i,%i,%.3f,%.3f,%.2f,%.2f,%.2f,%.1f,%i\r" % (name['name'], name['gender'], (dc + rc), dc, rc, drate, rrate, d_amount, r_amount, advantage, tilt, l)) f.write("%s,%s,%i,%i,%i,%.3f,%.3f,%.2f,%.2f,%.1f,%.1f,%i\r" % (name['name'], name['gender'], (dc + rc), dc, rc, drate, rrate, d_amount, r_amount, advantage, tilt, l))


f.close() f.close()


#Data downloaded from here: ftp://ftp.fec.gov/FEC/Presidential_Map/2012/P00000001/P00000001-ALL.zip
#Warning: 450 MB uncompressed


#load_data("", "P00000001-ALL.csv") load_data("P00000001-ALL.csv")
#load_data("Obama", "P80003338-ALL.csv") get_names()
#load_data("Romney", "P80003353-ALL.csv") compile_names(True)
#get_names() compile_names(False)
#get_names("Romney") write_stats(25, True)
#compile_names() write_stats(25, False)
#get_genders()
write_stats(10)


conn.close() conn.close()

0 comments on commit 82c9eec

Please sign in to comment.