Removed local directories from source

wilson428 · Sep 28, 2012 · 82c9eec · 82c9eec
1 parent 5f4634a
commit 82c9eec
Showing 1 changed file with 52 additions and 37 deletions.
diff --git a/get_names.py b/get_names.py
@@ -1,9 +1,15 @@
+'''
+See README for overview and LICENSE for license
+Corrections and coding advice always welcome! cewilson@yahoo-inc.com
+'''
+
 import sqlite3, csv, re
 from gender import *
 
-path = "/Users/cewilson/Desktop/source/FEC/"
+#can set path to where SQLite database is stored to be different from code directory if desired
+path = ""
 
-#optional: returns sqlite queries as dictionaries instead of lists
+#returns sqlite queries as dictionaries instead of lists
 def dict_factory(cursor, row):
     d = {}
     for idx, col in enumerate(cursor.description):
@@ -17,7 +23,7 @@ def dict_factory(cursor, row):
 
 #read the (large) csv files from the FEC into SQLite databases
 #http://fec.gov/disclosurep/PDownload.do
-def load_data(candidate="", filename=""):
+def load_data(filename=""):
     #make table once
     c.execute('CREATE TABLE IF NOT EXISTS donations \
                 ("id" INTEGER PRIMARY KEY AUTOINCREMENT, \
@@ -77,9 +83,10 @@ def split_name(name):
         first = ""
     return last, first
 
-#convert date to SQL format
+#for converting date to SQL format
 #http://www.sqlite.org/lang_datefunc.html
 months = { "JAN": "01", "FEB": "02", "MAR": "03", "APR": "04", "MAY": "05", "JUN": "06", "JUL": "07", "AUG": "08", "SEP": "09", "OCT": "10", "NOV": "11", "DEC": "12" }
+
 party = {
     "Bachmann, Michele" : "Rep",
     "Cain, Herman" : "Rep",
@@ -95,8 +102,6 @@ def split_name(name):
     "Romney, Mitt" : "Rep",
     "Santorum, Rick" : "Rep"
 }
-
-
 
 def get_date(dt):
     dt = dt.split("-")
@@ -146,51 +151,61 @@ def get_genders():
                 c.execute("update stats set gender = \"%s\" where name = \"%s\"" % (g, not_found['name']))
             conn.commit()
 
-#reduces to groups of unique first names
+#reduces to groups of unique first or last names
-def compile_names():
+def compile_names(first=True):
-    c.execute('''CREATE TABLE IF NOT EXISTS "stats"
+    name_type = "first" if first else "last"
-               ("id" INTEGER PRIMARY KEY AUTOINCREMENT,
+
-                "name" VARCHAR(25),
+    c.execute('CREATE TABLE IF NOT EXISTS "%s" \
-                "gender" VARCHAR(15),
+               ("id" INTEGER PRIMARY KEY AUTOINCREMENT, \
-                "party" VARCHAR(5),
+                "name" VARCHAR(25), \
-                "count" INTEGER,
+                "gender" VARCHAR(15), \
-                "amount" FLOAT, CONSTRAINT unq UNIQUE (name, party))''')
+                "party" VARCHAR(5), \
+                "count" INTEGER, \
+                "amount" FLOAT, CONSTRAINT unq UNIQUE (name, party))' % name_type)
 
     c.execute('DELETE FROM stats')
     conn.commit()
 
     #select every unique first name
-    names = c.execute("SELECT first, party, count(*) as count, sum(amount) as amount FROM names group by first, party order by first").fetchall()
+    names = c.execute("SELECT %s, party, count(*) as count, sum(amount) as amount FROM names group by %s, party order by %s" % (name_type, name_type, name_type)).fetchall()
 
     first_letter = ''
     for name in names:
-        if len(name['first']) > 1:
+        nm = name[name_type]
+        if len(nm) > 1:
             #track progreess
-            if name['first'][0] != first_letter:
+            if nm[0] != first_letter:
-                first_letter = name['first'][0]
+                first_letter = nm[0]
-                print "Searching names beginning with %s..." % first_letter
+                print "Searching %s names beginning with %s..." % (name_type, first_letter)
-
+
-            c.execute('INSERT INTO "stats" ("name", "gender", "party", "count", "amount") VALUES ("%s", "%s", "%s", %i, %.2f)' %
+            #don't bother with gender for surnames
-                      (name['first'], get_gender(name['first']), name['party'], name['count'], name['amount']))
+            if name_type == "first":
+                c.execute('INSERT INTO "%s" ("name", "gender", "party", "count", "amount") VALUES ("%s", "%s", "%s", %i, %.2f)' %
+                      (name_type, nm, get_gender(nm), name['party'], name['count'], name['amount']))
+            else:
+                c.execute('INSERT INTO "%s" ("name", "party", "count", "amount") VALUES ("%s", "%s", %i, %.2f)' %
+                      (name_type, nm, name['party'], name['count'], name['amount']))
 
     conn.commit()
 
 
-def write_stats(threshold):
+def write_stats(threshold, first=True):
-    f = open("data/stats_%i.csv" % threshold, "w")
+    name_type = "first" if first else "last"
+
+    f = open("data/stats_%s_%i.csv" % (name_type, threshold), "w")
     f.write("name,gender,total,d_count,r_count,drate,rrate,r_amount,d_amount,advantage,tilt,letter\r")
 
     totals = {}
-    for party in c.execute("SELECT party, count(*) FROM names group by party").fetchall():
+    for party in c.execute("SELECT party, count(*) FROM %s group by party" % name_type).fetchall():
         totals[party["party"]] = party["count(*)"]
 
     #this joins records for the same name in different parties. It is not elegant
     #we miss names here that show up on one list but not another. Bad conceptually, OK for our purposes since any name of appreciable frequency shows up on both lists
     #surely a better way here, but I'm a JOIN novice
     names = c.execute('SELECT d.party, d.name as "name", d.gender as gender, d.count as "d_count", d.amount as "d_amount", \
                         r.party, r.name as "r_name", r.count as "r_count", r.amount as "r_amount" \
-                        from stats as d LEFT OUTER JOIN stats as r ON d.name = r.name \
+                        from %s as d LEFT OUTER JOIN %s as r ON d.name = r.name \
-                        WHERE d.party = "Dem" AND r.party = "Rep" AND (d.count >= %i OR r.count >= %i) order by d.count desc' % (threshold, threshold)).fetchall()
+                        WHERE d.party = "Dem" AND r.party = "Rep" AND (d.count >= %i OR r.count >= %i) order by d.count desc' % (name_type, name_type, threshold, threshold)).fetchall()
 
     for name in names:
         l = 26 - ord(name['name'][0]) + 65        
@@ -207,18 +222,18 @@ def write_stats(threshold):
             advantage = 100 * float(r_amount) / (d_amount + r_amount)
 
             if len(name['name']) > 1:
-                f.write("%s,%s,%i,%i,%i,%.3f,%.3f,%.2f,%.2f,%.2f,%.1f,%i\r" % (name['name'], name['gender'], (dc + rc), dc, rc, drate, rrate, d_amount, r_amount, advantage, tilt, l))
+                f.write("%s,%s,%i,%i,%i,%.3f,%.3f,%.2f,%.2f,%.1f,%.1f,%i\r" % (name['name'], name['gender'], (dc + rc), dc, rc, drate, rrate, d_amount, r_amount, advantage, tilt, l))
 
     f.close()
 
+#Data downloaded from here: ftp://ftp.fec.gov/FEC/Presidential_Map/2012/P00000001/P00000001-ALL.zip
+#Warning: 450 MB uncompressed    
 
-#load_data("", "P00000001-ALL.csv")
+load_data("P00000001-ALL.csv")
-#load_data("Obama", "P80003338-ALL.csv")
+get_names()
-#load_data("Romney", "P80003353-ALL.csv")
+compile_names(True)
-#get_names()
+compile_names(False)
-#get_names("Romney")
+write_stats(25, True)
-#compile_names()
+write_stats(25, False)
-#get_genders()
-write_stats(10)
 
 conn.close()