
Commit

adding release identification
yourcelf committed Sep 12, 2010
1 parent fde03f5 commit 04e2944
Showing 6 changed files with 23 additions and 14 deletions.
25 changes: 14 additions & 11 deletions afg/management/commands/import_wikileaks.py
@@ -17,18 +17,19 @@ def clean_summary(text):
return text

class Command(BaseCommand):
args = '<csv_file>'
args = '<csv_file> <release_name>'
help = """Import the wikileaks Afghan War Diary CSV file."""

def handle(self, *args, **kwargs):
if len(args) < 1:
print """Requires one argument: the path to the wikileaks Afghan War Diary CSV file. It can be downloaded here:
if len(args) < 2:
print """Requires two arguments: the path to the wikileaks Afghan War Diary CSV file, and a string identifying this release (e.g. "2010 July 25"). The CSV file can be downloaded here:
http://wikileaks.org/wiki/Afghan_War_Diary,_2004-2010
"""
return

release = args[1]
fields = [a[0] for a in import_fields]
thru = lambda f: f
conversions = []
@@ -47,6 +48,7 @@ def handle(self, *args, **kwargs):
print c
values = map(lambda t: conversions[t[0]](t[1]), enumerate(row))
kwargs = dict(zip(fields, values))
kwargs['release'] = release
entry = DiaryEntry.objects.create(**kwargs)

# Get words for phrases
@@ -62,22 +64,23 @@ def handle(self, *args, **kwargs):

n = len(phrases)
cursor = connection.cursor()
transaction.commit_unless_managed()
# Drop the join reference constraint for efficiency. We're confident
# that the 4 million rows we're about to add all satisfy the
# constraint, and it saves about 5 hours of computation time.
cursor.execute('''ALTER TABLE "afg_phrase_entries" DROP CONSTRAINT "phrase_id_refs_id_48aa97f2"''')
transaction.commit_unless_managed()
for c, (phrase, entry_ids) in enumerate(phrases.iteritems()):
if c % 10000 == 0:
transaction.commit_unless_managed()
print c, n
if len(entry_ids) > 1 and len(entry_ids) <= 10:
if c % 1000 == 0:
print c, n
cursor.execute("INSERT INTO afg_phrase (phrase, entry_count) VALUES (%s, %s) RETURNING id", (phrase, len(entry_ids)))
phrase_id = cursor.fetchone()[0]
cursor.execute("""
INSERT INTO afg_phrase_entries (phrase_id, diaryentry_id) VALUES
""" + ",".join("(%s, %s)" % (phrase_id, entry_id) for entry_id in entry_ids)
)
transaction.commit_unless_managed()
phrase_entries = []
for entry_id in entry_ids:
phrase_entries.append((phrase_id, entry_id))
cursor.executemany("""INSERT INTO afg_phrase_entries (phrase_id, diaryentry_id) VALUES (%s, %s)""", phrase_entries)

cursor.execute('''ALTER TABLE "afg_phrase_entries" ADD CONSTRAINT "phrase_id_refs_id_48aa97f2" FOREIGN KEY ("phrase_id") REFERENCES "afg_phrase" ("id") DEFERRABLE INITIALLY DEFERRED;''')
transaction.commit_unless_managed()
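The importer now takes the release label as a second positional argument. For reference, a minimal sketch of driving the same command from Python rather than the shell (assumes Django settings are already configured; the file path and release string are example values):

    # Hypothetical invocation of the updated importer via Django's call_command.
    from django.core.management import call_command

    call_command('import_wikileaks', 'data/afg.csv', '2010 July 25')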

1 change: 1 addition & 0 deletions afg/models.py
@@ -54,6 +54,7 @@ def float_or_null(f):
]
# No DB indexes because we're kicking all that to SOLR.
class DiaryEntry(models.Model):
release = models.CharField(max_length=255)
report_key = models.CharField(max_length=255, unique=True)
date = models.DateTimeField()
type = models.CharField(max_length=255)
3 changes: 2 additions & 1 deletion afg/search_indexes.py
@@ -5,6 +5,7 @@

class DiaryEntryIndex(indexes.SearchIndex):
text = indexes.CharField(document=True, use_template=True)
release = indexes.CharField(model_attr='release', faceted=True)
report_key = indexes.CharField(model_attr='report_key')
date = indexes.DateTimeField(model_attr='date', faceted=True)
type_ = indexes.CharField(model_attr='type', faceted=True)
@@ -39,7 +40,7 @@ class DiaryEntryIndex(indexes.SearchIndex):
classification = indexes.CharField(model_attr='classification', faceted=True)
total_casualties = indexes.IntegerField(model_attr='total_casualties', faceted=True)

search_facet_display = ('date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
search_facet_display = ('release', 'date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
offer_to_sort_by = (('Date', 'date'), ('Casualties', 'total_casualties')) # (display, field) pairs

min_date = datetime.datetime(2004, 1, 1, 0, 0, 0)
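Because the new release field is declared with faceted=True, it participates in faceting like the existing fields. A rough sketch of querying it through Haystack (the release_exact name for narrowing is an assumption about the generated shadow field; values are illustrative):

    # Sketch: count entries per release, then narrow to a single one.
    from haystack.query import SearchQuerySet

    sqs = SearchQuerySet().facet('release')
    print sqs.facet_counts()   # {'fields': {'release': [(value, count), ...]}, ...}
    one_release = sqs.narrow('release_exact:"2010 July 25"')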
2 changes: 1 addition & 1 deletion afg/views.py
@@ -204,9 +204,9 @@ def search(request, about=False, api=False):
field_name, lookup = (key + "__exact").rsplit(r'__')[0:2]
# "type" is a reserved name for Solr, so munge it to "type_"
field_name = "type_" if field_name == "type" else field_name
# Dates are handled specially below
field = DiaryEntryIndex.fields.get(field_name, None)
if field and field.faceted:
# Dates are handled specially below
if isinstance(field, haystack.fields.DateTimeField):
continue
elif isinstance(field, haystack.fields.IntegerField):
2 changes: 1 addition & 1 deletion reset_afgexplorer.sh
@@ -6,7 +6,7 @@ DBUSER=afg2
sudo su postgres -c "dropdb $DBNAME"
sudo su postgres -c "createdb -O $DBUSER $DBNAME"
python manage.py syncdb --noinput
python manage.py import_wikileaks data/afg.csv
python manage.py import_wikileaks data/afg.csv "2010 July 25"
python manage.py build_solr_schema > schema.xml
echo "Please reset Solr now to reflect the new schema, then press [Enter]"
read foo
4 changes: 4 additions & 0 deletions schema.xml
@@ -192,6 +192,10 @@

<field name="tracking_number" type="text" indexed="true" stored="true" multiValued="false" />

<field name="release" type="text" indexed="true" stored="true" multiValued="false" />

<field name="release_exact" type="string" indexed="true" stored="true" multiValued="false" />

<field name="sigact" type="text" indexed="true" stored="true" multiValued="false" />

<field name="sigact_exact" type="string" indexed="true" stored="true" multiValued="false" />
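As with the other faceted fields, Haystack pairs the analyzed release field with an untokenized release_exact string copy for exact matching and facet counts. A hedged sketch of faceting on it directly against Solr with pysolr (the Solr URL is an assumption):

    # Sketch: raw Solr facet query on the string copy of the new field.
    import pysolr

    solr = pysolr.Solr('http://localhost:8983/solr')
    results = solr.search('*:*', **{'facet': 'true', 'facet.field': 'release_exact', 'rows': 0})
    print results.facets['facet_fields']['release_exact']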
