
Commit

adding release identification
yourcelf committed Sep 12, 2010
1 parent fde03f5 commit 04e2944
Showing 6 changed files with 23 additions and 14 deletions.
25 changes: 14 additions & 11 deletions afg/management/commands/import_wikileaks.py
@@ -17,18 +17,19 @@ def clean_summary(text):
return text

class Command(BaseCommand):
args = '<csv_file>'
args = '<csv_file> <release_name>'
help = """Import the wikileaks Afghan War Diary CSV file."""

def handle(self, *args, **kwargs):
if len(args) < 1:
print """Requires one argument: the path to the wikileaks Afghan War Diary CSV file. It can be downloaded here:
if len(args) < 2:
print """Requires two arguments: the path to the wikileaks Afghan War Diary CSV file, and a string identifying this release (e.g. "2010 July 25"). The CSV file can be downloaded here:
http://wikileaks.org/wiki/Afghan_War_Diary,_2004-2010
"""
return

release = args[1]
fields = [a[0] for a in import_fields]
thru = lambda f: f
conversions = []
@@ -47,6 +48,7 @@ def handle(self, *args, **kwargs):
print c
values = map(lambda t: conversions[t[0]](t[1]), enumerate(row))
kwargs = dict(zip(fields, values))
kwargs['release'] = release
entry = DiaryEntry.objects.create(**kwargs)

# Get words for phrases
@@ -62,22 +64,23 @@ def handle(self, *args, **kwargs):

n = len(phrases)
cursor = connection.cursor()
transaction.commit_unless_managed()
# Drop the join reference constraint for efficiency. We're confident
# that the 4 million rows we're about to add all satisfy the
# constraint, and it saves about 5 hours of computation time.
cursor.execute('''ALTER TABLE "afg_phrase_entries" DROP CONSTRAINT "phrase_id_refs_id_48aa97f2"''')
transaction.commit_unless_managed()
for c, (phrase, entry_ids) in enumerate(phrases.iteritems()):
if c % 10000 == 0:
transaction.commit_unless_managed()
print c, n
if len(entry_ids) > 1 and len(entry_ids) <= 10:
if c % 1000 == 0:
print c, n
cursor.execute("INSERT INTO afg_phrase (phrase, entry_count) VALUES (%s, %s) RETURNING id", (phrase, len(entry_ids)))
phrase_id = cursor.fetchone()[0]
cursor.execute("""
INSERT INTO afg_phrase_entries (phrase_id, diaryentry_id) VALUES
""" + ",".join("(%s, %s)" % (phrase_id, entry_id) for entry_id in entry_ids)
)
transaction.commit_unless_managed()
phrase_entries = []
for entry_id in entry_ids:
phrase_entries.append((phrase_id, entry_id))
cursor.executemany("""INSERT INTO afg_phrase_entries (phrase_id, diaryentry_id) VALUES (%s, %s)""", phrase_entries)

cursor.execute('''ALTER TABLE "afg_phrase_entries" ADD CONSTRAINT "phrase_id_refs_id_48aa97f2" FOREIGN KEY ("phrase_id") REFERENCES "afg_phrase" ("id") DEFERRABLE INITIALLY DEFERRED;''')
transaction.commit_unless_managed()
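The importer now takes the release label as a second positional argument. For reference, a minimal sketch of driving the same command from Python rather than the shell (assumes Django settings are already configured; the file path and release string are example values):

    # Hypothetical invocation of the updated importer via Django's call_command.
    from django.core.management import call_command

    call_command('import_wikileaks', 'data/afg.csv', '2010 July 25')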

1 change: 1 addition & 0 deletions afg/models.py
@@ -54,6 +54,7 @@ def float_or_null(f):
]
# No DB indexes because we're kicking all that to SOLR.
class DiaryEntry(models.Model):
release = models.CharField(max_length=255)
report_key = models.CharField(max_length=255, unique=True)
date = models.DateTimeField()
type = models.CharField(max_length=255)
3 changes: 2 additions & 1 deletion afg/search_indexes.py
@@ -5,6 +5,7 @@

class DiaryEntryIndex(indexes.SearchIndex):
text = indexes.CharField(document=True, use_template=True)
release = indexes.CharField(model_attr='release', faceted=True)
report_key = indexes.CharField(model_attr='report_key')
date = indexes.DateTimeField(model_attr='date', faceted=True)
type_ = indexes.CharField(model_attr='type', faceted=True)
@@ -39,7 +40,7 @@ class DiaryEntryIndex(indexes.SearchIndex):
classification = indexes.CharField(model_attr='classification', faceted=True)
total_casualties = indexes.IntegerField(model_attr='total_casualties', faceted=True)

search_facet_display = ('date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
search_facet_display = ('release', 'date', 'type_', 'region', 'attack_on', 'type_of_unit', 'affiliation', 'dcolor', 'classification', 'category', 'total_casualties', 'civilian_kia', 'civilian_wia', 'host_nation_kia', 'host_nation_wia', 'friendly_kia', 'friendly_wia', 'enemy_kia', 'enemy_wia', 'enemy_detained')
offer_to_sort_by = (('Date', 'date'), ('Casualties', 'total_casualties')) # (display, field) pairs

min_date = datetime.datetime(2004, 1, 1, 0, 0, 0)
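Because the new release field is declared with faceted=True, it participates in faceting like the existing fields. A rough sketch of querying it through Haystack (the release_exact name for narrowing is an assumption about the generated shadow field; values are illustrative):

    # Sketch: count entries per release, then narrow to a single one.
    from haystack.query import SearchQuerySet

    sqs = SearchQuerySet().facet('release')
    print sqs.facet_counts()   # {'fields': {'release': [(value, count), ...]}, ...}
    one_release = sqs.narrow('release_exact:"2010 July 25"')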
2 changes: 1 addition & 1 deletion afg/views.py
@@ -204,9 +204,9 @@ def search(request, about=False, api=False):
field_name, lookup = (key + "__exact").rsplit(r'__')[0:2]
# "type" is a reserved name for Solr, so munge it to "type_"
field_name = "type_" if field_name == "type" else field_name
# Dates are handled specially below
field = DiaryEntryIndex.fields.get(field_name, None)
if field and field.faceted:
# Dates are handled specially below
if isinstance(field, haystack.fields.DateTimeField):
continue
elif isinstance(field, haystack.fields.IntegerField):
2 changes: 1 addition & 1 deletion reset_afgexplorer.sh
@@ -6,7 +6,7 @@ DBUSER=afg2
sudo su postgres -c "dropdb $DBNAME"
sudo su postgres -c "createdb -O $DBUSER $DBNAME"
python manage.py syncdb --noinput
python manage.py import_wikileaks data/afg.csv
python manage.py import_wikileaks data/afg.csv "2010 July 25"
python manage.py build_solr_schema > schema.xml
echo "Please reset Solr now to reflect the new schema, then press [Enter]"
read foo
4 changes: 4 additions & 0 deletions schema.xml
@@ -192,6 +192,10 @@

<field name="tracking_number" type="text" indexed="true" stored="true" multiValued="false" />

<field name="release" type="text" indexed="true" stored="true" multiValued="false" />

<field name="release_exact" type="string" indexed="true" stored="true" multiValued="false" />

<field name="sigact" type="text" indexed="true" stored="true" multiValued="false" />

<field name="sigact_exact" type="string" indexed="true" stored="true" multiValued="false" />
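As with the other faceted fields, Haystack pairs the analyzed release field with an untokenized release_exact string copy for exact matching and facet counts. A hedged sketch of faceting on it directly against Solr with pysolr (the Solr URL is an assumption):

    # Sketch: raw Solr facet query on the string copy of the new field.
    import pysolr

    solr = pysolr.Solr('http://localhost:8983/solr')
    results = solr.search('*:*', **{'facet': 'true', 'facet.field': 'release_exact', 'rows': 0})
    print results.facets['facet_fields']['release_exact']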
