import script improvements

yourcelf · Oct 31, 2010 · 9da371f · 9da371f
1 parent 26e9739
commit 9da371f
Show file tree

Hide file tree

Showing 4 changed files with 91 additions and 47 deletions.
diff --git a/afg/management/commands/import_wikileaks.py b/afg/management/commands/import_wikileaks.py
@@ -1,3 +1,4 @@
+from __future__ import print_function
 import re
 import os
 import csv
@@ -12,29 +13,50 @@
 
 from afg.models import DiaryEntry, import_fields
 
-def clean_summary(text):
-    # Fix ampersand mess
-    while text.find("&amp;") != -1:
-        text = text.replace("&amp;", "&")
-    text = re.sub('&(?!(#[a-z\d]+|\w+);)/gi', "&amp;", text)
-    return text
+# Word counts of the phrases to extract (e.g. [2] means two-word phrases).
+PHRASE_LENGTHS = [2]
+# A phrase is only linked if the number of reports containing it falls
+# strictly between these two values (both bounds are exclusive).
+PHRASE_LINK_LIMITS = [1, 10]
+OUTPUT_NAME = "data/processed.csv"
+
+class StatusPrinter(object):
+    def __init__(self, c=0, n=0):
+        self.c = c
+        self.n = n
+        self.previous = ""
+
+    def inc(self):
+        self.c += 1
+
+    def print(self):
+        print("\b" * len(self.previous), end="")
+        self.previous = "{0} / {1}".format(self.c, self.n)
+        print(self.previous, end="")
+
+    def end(self):
+        print()
+
+def thru(string):
+    if string.strip() == '<null value>':
+        return ""
+    else:
+        return string
 
 class Command(BaseCommand):
     args = '<csv_file> <release_name>'
     help = """Import the wikileaks Afghan War Diary CSV file(s)."""
 
     def handle(self, *args, **kwargs):
-        print args
         if len(args) < 2:
-            print """Requires two arguments: the path to the wikileaks Afghan War Diary CSV file, and a string identifying this release (e.g. "2010 July 25").  The CSV file can be downloaded here:
+            print("""Requires two arguments: the path to the wikileaks Afghan War Diary CSV file, and a string identifying this release (e.g. "2010 July 25").  The CSV file can be downloaded here:
 
 http://wikileaks.org/wiki/Afghan_War_Diary,_2004-2010
 
-"""
+""")
             return
 
         fields = [a[0] for a in import_fields]
-        thru = lambda f: f
         conversions = []
         for f in import_fields:
             if len(f) > 1:
@@ -44,67 +66,81 @@ def handle(self, *args, **kwargs):
 
         rows = []
         phrases = defaultdict(set)
+
         for i in range(0, len(args), 2):
             filename = args[i]
             release = args[i + 1]
 
+            print("Loading", filename)
+            sp = StatusPrinter()
+
             with open(filename) as fh:
-                reader = csv.reader(fh)
+                reader = csv.reader(fh, delimiter=",", quotechar='"')
                 for c, row in enumerate(reader):
-                    print "Loading", filename, c
+                    if len(row) == 0:
+                        continue
+                    sp.print()
+                    sp.inc()
                     values = map(lambda t: conversions[t[0]](t[1]), enumerate(row))
                     kwargs = dict(zip(fields, values))
                     kwargs['release'] = release
                     rows.append(kwargs)
-
+                     
                     # get phrases
                     summary = re.sub(r'<[^>]*?>', '', kwargs['summary'])
                     summary = re.sub(r'&[^;\s]+;', ' ', summary)
                     summary = re.sub(r'[^A-Z ]', ' ', summary.upper())
                     summary = re.sub(r'\s+', ' ', summary).strip()
                     words = summary.split(' ')
-                    for i in range(3, 1, -1):
+                    for i in PHRASE_LENGTHS:
                         for j in range(i, len(words)):
-                            phrases[" ".join(words[j-i:j])].add(kwargs['report_key'])
+                            phrase = " ".join(words[j-i:j])
+                            if len(phrases[phrase]) <= PHRASE_LINK_LIMITS[1]:
+                                phrases[phrase].add(kwargs['report_key'])
+            sp.end()
 
-        print "Calcuting phrase links..."
+        print("Calcuting phrase links...")
         phrase_links = defaultdict(dict)
         n = len(phrases)
-        for c, (phrase, report_keys) in enumerate(phrases.iteritems()):
-            print "Phrases:", c, n
-            if len(report_keys) > 2 and len(report_keys) < 10:
+        sp = StatusPrinter(0, n)
+        for phrase, report_keys in phrases.iteritems():
+            sp.print()
+            sp.inc()
+            if len(report_keys) > PHRASE_LINK_LIMITS[0] and \
+                    len(report_keys) < PHRASE_LINK_LIMITS[1]:
                 key_list = list(report_keys)
                 for report_key in report_keys:
                     phrase_links[report_key][phrase] = key_list
         phrases = None
+        sp.end()
 
-        print "Writing CSV"
+        print("Writing CSV")
         # Write to CSV and bulk import.
         fields = rows[0].keys()
         fields.append('phrase_links')
-        temp = tempfile.NamedTemporaryFile(delete=False)
-        writer = csv.writer(temp)
-        name = temp.name
-        n = len(rows)
-        c = 0
-        # Pop rows to preserve memory (adding the json in phrase_links grows
-        # too fast).
-        while len(rows) > 0:
-            row = rows.pop(0)
-            print "CSV", c, len(rows), n
-            row['phrase_links'] = json.dumps(phrase_links[row['report_key']])
-            writer.writerow([row[f] for f in fields])
-            c += 1
-        temp.close()
-
-        print "Loading into postgres"
-        cmd = '''psql -U %(user)s -c "\copy %(table)s (%(fields)s) FROM '%(filename)s' WITH CSV NULL AS 'NULL' "''' % {
+        with open(OUTPUT_NAME, 'w') as fh:
+            writer = csv.writer(fh)
+            n = len(rows)
+            c = 0
+            # Pop each row as it is written to preserve memory (the JSON
+            # added as phrase_links makes the rows grow too fast otherwise).
+            sp = StatusPrinter(c, n)
+            while len(rows) > 0:
+                row = rows.pop(0)
+                sp.print()
+                sp.inc()
+                row['phrase_links'] = json.dumps(phrase_links[row['report_key']])
+                writer.writerow([row[f] for f in fields])
+                c += 1
+        sp.end()
+
+        print("Loading into postgres")
+        cmd = '''psql -U %(user)s -c "\copy %(table)s (%(fields)s) FROM '%(filename)s' WITH CSV NULL AS '<null value>' "''' % {
             'user': connection.settings_dict['USER'],
             'table': DiaryEntry._meta.db_table,
             'fields': ",".join('"%s"' % f for f in fields),
-            'filename': name,
+            'filename': OUTPUT_NAME,
         }
-        print cmd
+        print(cmd)
         proc = subprocess.Popen(cmd, shell=True)
         proc.wait()
-        os.remove(name)
diff --git a/afg/models.py b/afg/models.py
@@ -6,6 +6,8 @@
 
 def clean_summary(text):
     # Fix ampersand mess
+    if text.strip() == "<null value>":
+        return ""
     while text.find("&amp;") != -1:
         text = text.replace("&amp;", "&")
     text = re.sub('&(?!(#[a-z\d]+|\w+);)/gi', "&amp;", text)
@@ -17,7 +19,13 @@ def force_int(a):
 def float_or_null(f):
     if f:
         return float(f)
-    return "NULL"
+    return "<null value>"
+
+def complex_attack(f):
+    if f == "<null value>":
+        return f
+    else:
+        return bool(f)
 
 import_fields = [
     ("report_key",),       # 0
@@ -29,7 +37,7 @@ def float_or_null(f):
     ("summary", clean_summary),          # 6
     ("region",),           # 7 
     ("attack_on",),        # 8
-    ("complex_attack", lambda f: bool(f)),   # 9 
+    ("complex_attack", complex_attack),   # 9 
     ("reporting_unit",),   # 10
     ("unit_name",),        # 11
     ("type_of_unit",),     # 12 
@@ -47,7 +55,7 @@ def float_or_null(f):
     ("longitude", float_or_null),        # 24
     ("originator_group",), # 25
     ("updated_by_group",), # 26
-    ("ccir", lambda f: f or ""),             # 27
+    ("ccir",),             # 27
     ("sigact",),           # 28
     ("affiliation",),      # 29
     ("dcolor",),           # 30
@@ -65,7 +73,7 @@ class DiaryEntry(models.Model):
     summary = models.TextField()
     region = models.CharField(max_length=255)
     attack_on = models.CharField(max_length=255)
-    complex_attack = models.BooleanField()
+    complex_attack = models.NullBooleanField(null=True)
     reporting_unit = models.CharField(max_length=255)
     unit_name = models.CharField(max_length=255)
     type_of_unit = models.CharField(max_length=255)

diff --git a/afg/search_indexes.py b/afg/search_indexes.py
@@ -15,7 +15,7 @@ class DiaryEntryIndex(indexes.SearchIndex):
     summary = indexes.CharField(model_attr='summary')
     region = indexes.CharField(model_attr='region', faceted=True)
     attack_on = indexes.CharField(model_attr='attack_on', faceted=True)
-    complex_attack = indexes.BooleanField(model_attr='complex_attack', faceted=True)
+    complex_attack = indexes.BooleanField(model_attr='complex_attack', faceted=True, null=True)
     reporting_unit = indexes.CharField(model_attr='reporting_unit', faceted=True)
     unit_name = indexes.CharField(model_attr='unit_name', faceted=True)
     type_of_unit = indexes.CharField(model_attr='type_of_unit', faceted=True)
@@ -40,7 +40,7 @@ class DiaryEntryIndex(indexes.SearchIndex):
     classification = indexes.CharField(model_attr='classification', faceted=True)
     total_casualties = indexes.IntegerField(model_attr='total_casualties', faceted=True)
 
-    search_facet_display = ('date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
+    search_facet_display = ('release', 'date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
     offer_to_sort_by = (('Date', 'date'), ('Casualties', 'total_casualties')) # (display, field) pairs
 
     min_date = datetime.datetime(2004, 1, 1, 0, 0, 0)

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-django==1.2.3
+django>=1.2.3
 -e git://github.com/toastdriven/django-haystack.git#egg=django-haystack
 pysolr
 httplib2