Permalink
Browse files

adding release identification

  • Loading branch information...
1 parent fde03f5 commit 04e2944e7c1daf9622a609b50fc191f03c7b6abd @yourcelf committed Sep 12, 2010
Showing 6 changed files with 23 additions and 14 deletions.
  1. +14 −11 afg/management/commands/import_wikileaks.py
  2. +1 −0 afg/models.py
  3. +2 −1 afg/search_indexes.py
  4. +1 −1 afg/views.py
  5. +1 −1 reset_afgexplorer.sh
  6. +4 −0 schema.xml
@@ -17,18 +17,19 @@ def clean_summary(text):
return text
class Command(BaseCommand):
- args = '<csv_file>'
+ args = '<csv_file> <release_name>'
help = """Import the wikileaks Afghan War Diary CSV file."""
def handle(self, *args, **kwargs):
- if len(args) < 1:
- print """Requires one argument: the path to the wikileaks Afghan War Diary CSV file. It can be downloaded here:
+ if len(args) < 2:
+ print """Requires two arguments: the path to the wikileaks Afghan War Diary CSV file, and a string identifying this release (e.g. "2010 July 25"). The CSV file can be downloaded here:
http://wikileaks.org/wiki/Afghan_War_Diary,_2004-2010
"""
return
+ release = args[1]
fields = [a[0] for a in import_fields]
thru = lambda f: f
conversions = []
@@ -47,6 +48,7 @@ def handle(self, *args, **kwargs):
print c
values = map(lambda t: conversions[t[0]](t[1]), enumerate(row))
kwargs = dict(zip(fields, values))
+ kwargs['release'] = release
entry = DiaryEntry.objects.create(**kwargs)
# Get words for phrases
@@ -62,22 +64,23 @@ def handle(self, *args, **kwargs):
n = len(phrases)
cursor = connection.cursor()
+ transaction.commit_unless_managed()
# Drop the join reference constraint for efficiency. We're confident
# that the 4 million rows we're about to add all satisfy the
# constraint, and it saves about 5 hours of computation time.
cursor.execute('''ALTER TABLE "afg_phrase_entries" DROP CONSTRAINT "phrase_id_refs_id_48aa97f2"''')
- transaction.commit_unless_managed()
for c, (phrase, entry_ids) in enumerate(phrases.iteritems()):
+ if c % 10000 == 0:
+ transaction.commit_unless_managed()
+ print c, n
if len(entry_ids) > 1 and len(entry_ids) <= 10:
- if c % 1000 == 0:
- print c, n
cursor.execute("INSERT INTO afg_phrase (phrase, entry_count) VALUES (%s, %s) RETURNING id", (phrase, len(entry_ids)))
phrase_id = cursor.fetchone()[0]
- cursor.execute("""
- INSERT INTO afg_phrase_entries (phrase_id, diaryentry_id) VALUES
- """ + ",".join("(%s, %s)" % (phrase_id, entry_id) for entry_id in entry_ids)
- )
- transaction.commit_unless_managed()
+ phrase_entries = []
+ for entry_id in entry_ids:
+ phrase_entries.append((phrase_id, entry_id))
+ cursor.executemany("""INSERT INTO afg_phrase_entries (phrase_id, diaryentry_id) VALUES (%s, %s)""", phrase_entries)
+
cursor.execute('''ALTER TABLE "afg_phrase_entries" ADD CONSTRAINT "phrase_id_refs_id_48aa97f2" FOREIGN KEY ("phrase_id") REFERENCES "afg_phrase" ("id") DEFERRABLE INITIALLY DEFERRED;''')
transaction.commit_unless_managed()
View
@@ -54,6 +54,7 @@ def float_or_null(f):
]
# No DB indexes because we're kicking all that to SOLR.
class DiaryEntry(models.Model):
+ release = models.CharField(max_length=255)
report_key = models.CharField(max_length=255, unique=True)
date = models.DateTimeField()
type = models.CharField(max_length=255)
@@ -5,6 +5,7 @@
class DiaryEntryIndex(indexes.SearchIndex):
text = indexes.CharField(document=True, use_template=True)
+ release = indexes.CharField(model_attr='release', faceted=True)
report_key = indexes.CharField(model_attr='report_key')
date = indexes.DateTimeField(model_attr='date', faceted=True)
type_ = indexes.CharField(model_attr='type', faceted=True)
@@ -39,7 +40,7 @@ class DiaryEntryIndex(indexes.SearchIndex):
classification = indexes.CharField(model_attr='classification', faceted=True)
total_casualties = indexes.IntegerField(model_attr='total_casualties', faceted=True)
- search_facet_display = ('date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
+ search_facet_display = ('release', 'date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
offer_to_sort_by = (('Date', 'date'), ('Casualties', 'total_casualties')) # (display, field) pairs
min_date = datetime.datetime(2004, 1, 1, 0, 0, 0)
View
@@ -204,9 +204,9 @@ def search(request, about=False, api=False):
field_name, lookup = (key + "__exact").rsplit(r'__')[0:2]
# "type" is a reserved name for Solr, so munge it to "type_"
field_name = "type_" if field_name == "type" else field_name
- # Dates are handled specially below
field = DiaryEntryIndex.fields.get(field_name, None)
if field and field.faceted:
+ # Dates are handled specially below
if isinstance(field, haystack.fields.DateTimeField):
continue
elif isinstance(field, haystack.fields.IntegerField):
@@ -6,7 +6,7 @@ DBUSER=afg2
sudo su postgres -c "dropdb $DBNAME"
sudo su postgres -c "createdb -O $DBUSER $DBNAME"
python manage.py syncdb --noinput
-python manage.py import_wikileaks data/afg.csv
+python manage.py import_wikileaks data/afg.csv "2010 July 25"
python manage.py build_solr_schema > schema.xml
echo "Please reset Solr now to reflect the new schema, then press [Enter]"
read foo
View
@@ -192,6 +192,10 @@
<field name="tracking_number" type="text" indexed="true" stored="true" multiValued="false" />
+ <field name="release" type="text" indexed="true" stored="true" multiValued="false" />
+
+ <field name="release_exact" type="string" indexed="true" stored="true" multiValued="false" />
+
<field name="sigact" type="text" indexed="true" stored="true" multiValued="false" />
<field name="sigact_exact" type="string" indexed="true" stored="true" multiValued="false" />

0 comments on commit 04e2944

Please sign in to comment.